Added crawl page for organizations and services.

This commit is contained in:
Christian P. MOMON 2021-05-18 19:40:29 +02:00
parent bf81404746
commit 68906ed88a
18 changed files with 371 additions and 149 deletions

View file

@ -27,9 +27,6 @@ import fr.devinsy.statoolinfos.core.Factory;
import fr.devinsy.statoolinfos.core.Federation; import fr.devinsy.statoolinfos.core.Federation;
import fr.devinsy.statoolinfos.core.StatoolInfosException; import fr.devinsy.statoolinfos.core.StatoolInfosException;
import fr.devinsy.statoolinfos.crawl.CrawlCache; import fr.devinsy.statoolinfos.crawl.CrawlCache;
import fr.devinsy.statoolinfos.crawl.CrawlJournal;
import fr.devinsy.statoolinfos.crawl.CrawlJournalFile;
import fr.devinsy.statoolinfos.crawl.Crawler;
/** /**
* The Class Manager. * The Class Manager.
@ -47,7 +44,6 @@ public class HtmlizerContext
private Federation federation; private Federation federation;
private Categories categories; private Categories categories;
private CrawlCache cache; private CrawlCache cache;
private CrawlJournal crawlJournal;
/** /**
* Instantiates a new manager. * Instantiates a new manager.
@ -73,23 +69,9 @@ public class HtmlizerContext
logger.info("Htmlize directory setting: {}", this.configuration.getHtmlizeDirectoryPath()); logger.info("Htmlize directory setting: {}", this.configuration.getHtmlizeDirectoryPath());
this.cache = new CrawlCache(this.configuration.getCrawlCacheDirectory()); this.cache = new CrawlCache(this.configuration.getCrawlCacheDirectory());
this.crawlJournal = CrawlJournalFile.load(this.cache.restoreFile(Crawler.getJournalURL()));
File htmlizeInputFile = this.cache.restoreFile(this.configuration.getHtmlizeInputURL());
File htmlizeDirectory = this.configuration.getHtmlizeDirectory(); File htmlizeDirectory = this.configuration.getHtmlizeDirectory();
if (htmlizeInputFile == null) if (htmlizeDirectory == null)
{
throw new IllegalArgumentException("Htmlize input undefined.");
}
else if (!htmlizeInputFile.exists())
{
throw new IllegalArgumentException("Htmlize input is missing.");
}
else if (htmlizeInputFile.isDirectory())
{
throw new IllegalArgumentException("Htmlize input is a directory.");
}
else if (htmlizeDirectory == null)
{ {
throw new IllegalArgumentException("Htmlize directory undefined."); throw new IllegalArgumentException("Htmlize directory undefined.");
} }
@ -105,7 +87,7 @@ public class HtmlizerContext
{ {
if (this.configuration.isFederation()) if (this.configuration.isFederation())
{ {
this.federation = Factory.loadFederation(htmlizeInputFile, this.cache); this.federation = Factory.loadFederation(this.configuration.getHtmlizeInputURL(), this.cache);
this.categories = Factory.loadCategories(this.configuration.getCategoryFile(), this.federation); this.categories = Factory.loadCategories(this.configuration.getCategoryFile(), this.federation);
} }
else else
@ -160,11 +142,6 @@ public class HtmlizerContext
return result; return result;
} }
/**
 * Gets the crawl journal.
 *
 * @return the crawl journal referenced by this context
 */
public CrawlJournal getCrawlJournal()
{
    CrawlJournal result;

    result = this.crawlJournal;

    //
    return result;
}
/** /**
* Gets the federation. * Gets the federation.
* *

View file

@ -29,6 +29,7 @@ import org.slf4j.LoggerFactory;
import fr.devinsy.statoolinfos.checker.PropertyChecker; import fr.devinsy.statoolinfos.checker.PropertyChecker;
import fr.devinsy.statoolinfos.checker.PropertyChecks; import fr.devinsy.statoolinfos.checker.PropertyChecks;
import fr.devinsy.statoolinfos.crawl.CrawlCache; import fr.devinsy.statoolinfos.crawl.CrawlCache;
import fr.devinsy.statoolinfos.crawl.CrawlJournal;
import fr.devinsy.statoolinfos.properties.PathProperties; import fr.devinsy.statoolinfos.properties.PathProperties;
import fr.devinsy.statoolinfos.properties.PathProperty; import fr.devinsy.statoolinfos.properties.PathProperty;
import fr.devinsy.statoolinfos.properties.PathPropertyUtils; import fr.devinsy.statoolinfos.properties.PathPropertyUtils;
@ -159,67 +160,63 @@ public class Factory
* @throws IOException * @throws IOException
* Signals that an I/O exception has occurred. * Signals that an I/O exception has occurred.
*/ */
public static Federation loadFederation(final File federationFile, final CrawlCache cache) throws StatoolInfosException, IOException public static Federation loadFederation(final URL inputURL, final CrawlCache cache) throws StatoolInfosException, IOException
{ {
Federation result; Federation result;
PathProperties properties = PathPropertyUtils.load(federationFile); if (inputURL == null)
result = new Federation(properties);
result.setInputFile(federationFile);
result.setLogoFileName(result.getTechnicalName() + "-logo" + StringUtils.defaultIfBlank(cache.getExtension(result.getLogoURL()), ".png"));
PropertyChecker checker = new PropertyChecker();
PropertyChecks checks = checker.checkFederation(result.getInputFile());
result.getInputChecks().addAll(checks);
result.getInputChecks().setFileName(result.getLocalFileName());
PathProperties subs = result.getByPrefix("subs");
for (PathProperty property : subs)
{ {
if (StringUtils.startsWith(property.getValue(), "http")) throw new IllegalArgumentException("Null input URL.");
{
URL inputURL = new URL(property.getValue());
Organization organization = loadOrganization(inputURL, cache);
if (organization != null)
{
organization.setFederation(result);
result.getOrganizations().add(organization);
}
}
} }
else if (cache == null)
//
return result;
}
/**
* Load organization.
*
* @param organizationFile
* the organization file
* @param cache
* the cache
* @return the organization
* @throws IOException
* Signals that an I/O exception has occurred.
*/
public static Organization loadOrganization(final File organizationFile, final CrawlCache cache) throws IOException
{
Organization result;
PathProperties properties = PathPropertyUtils.load(organizationFile);
result = new Organization(properties);
result.setInputFile(organizationFile);
PathProperties subs = result.getByPrefix("subs");
for (PathProperty property : subs)
{ {
if (StringUtils.startsWith(property.getValue(), "http")) throw new IllegalArgumentException("Null cache URL.");
}
else
{
File federationFile = cache.restoreFile(inputURL);
if (federationFile == null)
{ {
URL serviceInputFile = new URL(property.getValue()); throw new IllegalArgumentException("Htmlize input file undefined.");
Service service = loadService(serviceInputFile, cache); }
service.setOrganization(result); else if (!federationFile.exists())
result.getServices().add(service); {
throw new IllegalArgumentException("Htmlize input file is missing.");
}
else if (federationFile.isDirectory())
{
throw new IllegalArgumentException("Htmlize input file is a directory.");
}
else
{
PathProperties properties = PathPropertyUtils.load(federationFile);
result = new Federation(properties);
result.setInputURL(inputURL);
result.setInputFile(federationFile);
result.setLogoFileName(result.getTechnicalName() + "-logo" + StringUtils.defaultIfBlank(cache.getExtension(result.getLogoURL()), ".png"));
PropertyChecker checker = new PropertyChecker();
PropertyChecks checks = checker.checkFederation(result.getInputFile());
result.getInputChecks().addAll(checks);
result.getInputChecks().setFileName(result.getLocalFileName());
PathProperties subs = result.getByPrefix("subs");
for (PathProperty property : subs)
{
if (StringUtils.startsWith(property.getValue(), "http"))
{
URL subInputURL = new URL(property.getValue());
Organization organization = loadOrganization(subInputURL, cache);
if (organization != null)
{
organization.setFederation(result);
result.getOrganizations().add(organization);
}
}
}
//
result.getCrawlJournal().addAll(cache.restoreJournal());
} }
} }
@ -285,6 +282,14 @@ public class Factory
} }
} }
} }
//
CrawlJournal journal = cache.restoreJournal();
result.getCrawlJournal().addAll(journal.searchByParent(result.getInputURL()));
for (Service service : result.getServices())
{
result.getCrawlJournal().addAll(journal.searchByParent(service.getInputURL()));
}
} }
else else
{ {
@ -321,6 +326,10 @@ public class Factory
result.setInputFile(inputFile); result.setInputFile(inputFile);
result.setInputURL(inputURL); result.setInputURL(inputURL);
result.setLogoFileName(result.getTechnicalName() + "-logo" + StringUtils.defaultIfBlank(cache.getExtension(result.getLogoURL()), ".png")); result.setLogoFileName(result.getTechnicalName() + "-logo" + StringUtils.defaultIfBlank(cache.getExtension(result.getLogoURL()), ".png"));
//
CrawlJournal journal = cache.restoreJournal();
result.getCrawlJournal().addAll(journal.searchByParent(result.getInputURL()));
} }
// //

View file

@ -26,6 +26,7 @@ import java.time.LocalDateTime;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import fr.devinsy.statoolinfos.checker.PropertyChecks; import fr.devinsy.statoolinfos.checker.PropertyChecks;
import fr.devinsy.statoolinfos.crawl.CrawlJournal;
import fr.devinsy.statoolinfos.properties.PathProperties; import fr.devinsy.statoolinfos.properties.PathProperties;
import fr.devinsy.statoolinfos.properties.PathPropertyList; import fr.devinsy.statoolinfos.properties.PathPropertyList;
@ -36,9 +37,11 @@ public class Federation extends PathPropertyList
{ {
private static final long serialVersionUID = -8970835291634661580L; private static final long serialVersionUID = -8970835291634661580L;
private Organizations organizations; private Organizations organizations;
private URL inputURL;
private File inputFile; private File inputFile;
private String logoFileName; private String logoFileName;
private PropertyChecks inputChecks; private PropertyChecks inputChecks;
private CrawlJournal crawlJournal;
/** /**
* Instantiates a new federation. * Instantiates a new federation.
@ -48,6 +51,7 @@ public class Federation extends PathPropertyList
super(); super();
this.inputChecks = new PropertyChecks(); this.inputChecks = new PropertyChecks();
this.organizations = new Organizations(); this.organizations = new Organizations();
this.crawlJournal = new CrawlJournal();
} }
/** /**
@ -68,7 +72,7 @@ public class Federation extends PathPropertyList
else else
{ {
this.organizations = new Organizations(); this.organizations = new Organizations();
this.crawlJournal = new CrawlJournal();
} }
} }
@ -157,6 +161,11 @@ public class Federation extends PathPropertyList
return result; return result;
} }
/**
 * Gets the crawl journal.
 *
 * @return the crawl journal attached to this federation
 */
public CrawlJournal getCrawlJournal()
{
    CrawlJournal result;

    result = this.crawlJournal;

    //
    return result;
}
/** /**
* Gets the description. * Gets the description.
* *
@ -204,6 +213,11 @@ public class Federation extends PathPropertyList
return this.inputFile; return this.inputFile;
} }
/**
 * Gets the input URL.
 *
 * @return the input URL recorded for this federation, as last given to
 *         {@code setInputURL}
 */
public URL getInputURL()
{
    URL result;

    result = this.inputURL;

    //
    return result;
}
/** /**
* Gets the legal website. * Gets the legal website.
* *
@ -435,6 +449,11 @@ public class Federation extends PathPropertyList
this.inputFile = inputFile; this.inputFile = inputFile;
} }
/**
 * Sets the input URL.
 *
 * @param inputURL
 *            the URL to record as the input URL of this federation
 */
public void setInputURL(final URL inputURL)
{
this.inputURL = inputURL;
}
public void setLogoFileName(final String logoFileName) public void setLogoFileName(final String logoFileName)
{ {
this.logoFileName = logoFileName; this.logoFileName = logoFileName;

View file

@ -28,6 +28,7 @@ import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import fr.devinsy.statoolinfos.checker.PropertyChecks; import fr.devinsy.statoolinfos.checker.PropertyChecks;
import fr.devinsy.statoolinfos.crawl.CrawlJournal;
import fr.devinsy.statoolinfos.properties.PathProperties; import fr.devinsy.statoolinfos.properties.PathProperties;
import fr.devinsy.statoolinfos.properties.PathPropertyList; import fr.devinsy.statoolinfos.properties.PathPropertyList;
@ -43,6 +44,7 @@ public class Organization extends PathPropertyList
private URL inputURL; private URL inputURL;
private String logoFileName; private String logoFileName;
private PropertyChecks inputChecks; private PropertyChecks inputChecks;
private CrawlJournal crawlJournal;
/** /**
* Instantiates a new organization. * Instantiates a new organization.
@ -52,6 +54,7 @@ public class Organization extends PathPropertyList
super(); super();
this.inputChecks = new PropertyChecks(); this.inputChecks = new PropertyChecks();
this.services = new Services(); this.services = new Services();
this.crawlJournal = new CrawlJournal();
} }
/** /**
@ -65,6 +68,7 @@ public class Organization extends PathPropertyList
super(properties); super(properties);
this.inputChecks = new PropertyChecks(); this.inputChecks = new PropertyChecks();
this.services = new Services(); this.services = new Services();
this.crawlJournal = new CrawlJournal();
} }
/** /**
@ -165,6 +169,11 @@ public class Organization extends PathPropertyList
return result; return result;
} }
/**
 * Gets the crawl journal.
 *
 * @return the crawl journal attached to this organization
 */
public CrawlJournal getCrawlJournal()
{
    CrawlJournal result;

    result = this.crawlJournal;

    //
    return result;
}
public String getDescription() public String getDescription()
{ {
String result; String result;

View file

@ -33,6 +33,7 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import fr.devinsy.statoolinfos.checker.PropertyChecks; import fr.devinsy.statoolinfos.checker.PropertyChecks;
import fr.devinsy.statoolinfos.crawl.CrawlJournal;
import fr.devinsy.statoolinfos.metrics.Metric; import fr.devinsy.statoolinfos.metrics.Metric;
import fr.devinsy.statoolinfos.properties.PathProperties; import fr.devinsy.statoolinfos.properties.PathProperties;
import fr.devinsy.statoolinfos.properties.PathProperty; import fr.devinsy.statoolinfos.properties.PathProperty;
@ -104,6 +105,7 @@ public class Service extends PathPropertyList
private URL inputURL; private URL inputURL;
private String logoFileName; private String logoFileName;
private PropertyChecks inputChecks; private PropertyChecks inputChecks;
private CrawlJournal crawlJournal;
/** /**
* Instantiates a new service. * Instantiates a new service.
@ -123,6 +125,7 @@ public class Service extends PathPropertyList
{ {
super(properties); super(properties);
this.inputChecks = new PropertyChecks(); this.inputChecks = new PropertyChecks();
this.crawlJournal = new CrawlJournal();
} }
/** /**
@ -223,6 +226,11 @@ public class Service extends PathPropertyList
return result; return result;
} }
/**
 * Gets the crawl journal.
 *
 * @return the crawl journal attached to this service
 */
public CrawlJournal getCrawlJournal()
{
    CrawlJournal result;

    result = this.crawlJournal;

    //
    return result;
}
/** /**
* Gets the description. * Gets the description.
* *

View file

@ -20,7 +20,9 @@ package fr.devinsy.statoolinfos.crawl;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.nio.file.Files;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
@ -189,6 +191,22 @@ public class CrawlCache
} }
} }
/**
 * Restores the crawl journal from this cache.
 *
 * The journal file is looked up in the cache under the URL returned by
 * {@link #getJournalURL()} and then handed to {@code CrawlJournalFile.load}.
 *
 * @return the crawl journal loaded from the cached journal file
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 */
public CrawlJournal restoreJournal() throws IOException
{
    return CrawlJournalFile.load(restoreFile(getJournalURL()));
}
/** /**
* Restore logo to. * Restore logo to.
* *
@ -292,6 +310,24 @@ public class CrawlCache
return result; return result;
} }
/**
 * Stores the crawl journal in this cache.
 *
 * The journal is first saved to a temporary file, which is then stored in
 * the cache under the URL returned by {@link #getJournalURL()}. The
 * temporary file is always removed afterwards, including when saving or
 * storing fails. An {@link IOException} is not propagated to the caller:
 * its stack trace is printed and the method returns normally.
 *
 * @param journal
 *            the journal to store
 */
public void storeJournal(final CrawlJournal journal)
{
    File file = null;
    try
    {
        file = Files.createTempFile("tmp-", ".statoolsinfos").toFile();
        CrawlJournalFile.save(file, journal);
        store(getJournalURL(), file);
    }
    catch (IOException exception)
    {
        exception.printStackTrace();
    }
    finally
    {
        // Clean up in every case so that a failed save/store does not
        // leave an orphan temporary file behind.
        if (file != null)
        {
            file.delete();
        }
    }
}
/** /**
* Store. * Store.
* *
@ -356,4 +392,20 @@ public class CrawlCache
// //
return result; return result;
} }
/**
 * Gets the journal URL.
 *
 * This is the fixed URL {@code http://localhost/crawl.journal}, used by
 * this class as the cache key of the crawl journal file.
 *
 * @return the journal URL
 * @throws MalformedURLException
 *             declared because the URL is built with {@code new URL(String)}
 */
public static URL getJournalURL() throws MalformedURLException
{
    return new URL("http://localhost/crawl.journal");
}
} }

View file

@ -34,6 +34,7 @@ import java.time.LocalDateTime;
import java.time.ZoneOffset; import java.time.ZoneOffset;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -187,14 +188,33 @@ public class CrawlJournalFile
} }
else else
{ {
String[] tokens = line.split(" ", 2); String[] tokens = line.split(" ", 3);
CrawlStatus status = CrawlStatus.valueOf(tokens[0].toUpperCase()); CrawlStatus status = CrawlStatus.valueOf(tokens[0].toUpperCase());
URL parentURL;
try
{
if (StringUtils.equals(tokens[1], "null"))
{
parentURL = null;
}
else
{
parentURL = new URL(tokens[1].trim());
}
}
catch (MalformedURLException exception)
{
logger.error("Error valuing [{}]", line);
exception.printStackTrace();
parentURL = null;
}
URL url; URL url;
try try
{ {
url = new URL(tokens[1].trim()); url = new URL(tokens[2].trim());
} }
catch (MalformedURLException exception) catch (MalformedURLException exception)
{ {
@ -203,7 +223,7 @@ public class CrawlJournalFile
url = null; url = null;
} }
result = new CrawlLog(url, status); result = new CrawlLog(url, parentURL, status);
} }
// //
@ -226,7 +246,7 @@ public class CrawlJournalFile
{ {
for (CrawlLog log : journal) for (CrawlLog log : journal)
{ {
String line = log.getStatus() + " " + log.getUrl(); String line = String.format("%s %s %s", log.getStatus(), log.getParentUrl(), log.getUrl());
out.write(line); out.write(line);
out.write("\n"); out.write("\n");
} }

View file

@ -26,6 +26,7 @@ import java.net.URL;
public class CrawlLog public class CrawlLog
{ {
private URL url; private URL url;
private URL parentUrl;
private CrawlStatus status; private CrawlStatus status;
/** /**
@ -36,12 +37,40 @@ public class CrawlLog
* @param status * @param status
* the status * the status
*/ */
public CrawlLog(final URL url, final CrawlStatus status) public CrawlLog(final URL url, final URL parentUrl, final CrawlStatus status)
{ {
this.url = url; this.url = url;
this.parentUrl = parentUrl;
this.status = status; this.status = status;
} }
/**
 * Gets the parent url.
 *
 * @return the parent URL given at construction, possibly {@code null}
 */
public URL getParentUrl()
{
    URL result;

    result = this.parentUrl;

    //
    return result;
}
/**
 * Gets the parent url as a string.
 *
 * @return the string form of the parent URL, or {@code null} when this log
 *         has no parent URL
 */
public String getParentUrlValue()
{
    return (this.parentUrl == null) ? null : this.parentUrl.toString();
}
public CrawlStatus getStatus() public CrawlStatus getStatus()
{ {
return this.status; return this.status;

View file

@ -24,6 +24,8 @@ import java.util.Collections;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import fr.devinsy.statoolinfos.util.URLUtils;
/** /**
* The Class CrawlLogs. * The Class CrawlLogs.
*/ */
@ -39,6 +41,27 @@ public class CrawlLogs extends ArrayList<CrawlLog>
super(); super();
} }
/**
 * Appends a log to this list, ignoring {@code null}.
 *
 * @param log
 *            the log to add; a {@code null} value is not stored
 * @return {@code false} when {@code log} is {@code null}, otherwise the
 *         value returned by {@link java.util.ArrayList#add(Object)}
 */
@Override
public boolean add(final CrawlLog log)
{
    // Short-circuit keeps null entries out of the list.
    return (log != null) && super.add(log);
}
/** /**
* Adds the. * Adds the.
* *
@ -47,9 +70,9 @@ public class CrawlLogs extends ArrayList<CrawlLog>
* @param status * @param status
* the status * the status
*/ */
public void add(final URL url, final CrawlStatus status) public void add(final URL url, final URL parentUrl, final CrawlStatus status)
{ {
this.add(new CrawlLog(url, status)); this.add(new CrawlLog(url, parentUrl, status));
} }
/** /**
@ -139,4 +162,29 @@ public class CrawlLogs extends ArrayList<CrawlLog>
// //
return result; return result;
} }
/**
 * Searches the logs whose parent URL matches the given one.
 *
 * Each log of this list is compared through {@code URLUtils.equals}; the
 * matching logs are returned in their original order.
 *
 * @param parentURL
 *            the parent URL to look for
 * @return a new list holding the matching logs, empty when none matches
 */
public CrawlLogs searchByParent(final URL parentURL)
{
    CrawlLogs matches = new CrawlLogs();

    for (CrawlLog candidate : this)
    {
        URL candidateParent = candidate.getParentUrl();
        if (!URLUtils.equals(candidateParent, parentURL))
        {
            continue;
        }

        matches.add(candidate);
    }

    return matches;
}
} }

View file

@ -21,7 +21,6 @@ package fr.devinsy.statoolinfos.crawl;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.nio.file.Files; import java.nio.file.Files;
@ -89,7 +88,7 @@ public class Crawler
*/ */
public void crawl(final URL url) throws StatoolInfosException, IOException public void crawl(final URL url) throws StatoolInfosException, IOException
{ {
crawl(url, null); crawl(url, null, null);
} }
/** /**
@ -104,7 +103,7 @@ public class Crawler
* @throws IOException * @throws IOException
* Signals that an I/O exception has occurred. * Signals that an I/O exception has occurred.
*/ */
public void crawl(final URL url, final PropertyClassType parent) public void crawl(final URL url, final URL parentURL, final PropertyClassType parent)
{ {
logger.info("Crawling {}", url); logger.info("Crawling {}", url);
@ -118,21 +117,21 @@ public class Crawler
catch (java.net.ConnectException exception) catch (java.net.ConnectException exception)
{ {
logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage()); logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage());
this.journal.add(url, CrawlStatus.CONNECTERROR); this.journal.add(url, parentURL, CrawlStatus.CONNECTERROR);
downloadFile = null; downloadFile = null;
exception.printStackTrace(); exception.printStackTrace();
} }
catch (FileNotFoundException exception) catch (FileNotFoundException exception)
{ {
logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage()); logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage());
this.journal.add(url, CrawlStatus.URLNOTFOUND); this.journal.add(url, parentURL, CrawlStatus.URLNOTFOUND);
downloadFile = null; downloadFile = null;
exception.printStackTrace(); exception.printStackTrace();
} }
catch (IOException exception) catch (IOException exception)
{ {
logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage()); logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage());
this.journal.add(url, CrawlStatus.DOWNLOADERROR); this.journal.add(url, parentURL, CrawlStatus.DOWNLOADERROR);
downloadFile = null; downloadFile = null;
exception.printStackTrace(); exception.printStackTrace();
} }
@ -142,12 +141,12 @@ public class Crawler
if (!downloadFile.exists()) if (!downloadFile.exists())
{ {
logger.error("ERROR: download missing."); logger.error("ERROR: download missing.");
this.journal.add(url, CrawlStatus.MISSING); this.journal.add(url, parentURL, CrawlStatus.MISSING);
} }
else if (downloadFile.length() == 0) else if (downloadFile.length() == 0)
{ {
logger.error("ERROR: download empty."); logger.error("ERROR: download empty.");
this.journal.add(url, CrawlStatus.EMPTY); this.journal.add(url, parentURL, CrawlStatus.EMPTY);
} }
else else
{ {
@ -157,7 +156,7 @@ public class Crawler
if ((downloadClass == null) || (!downloadClass.isChildOf(parent))) if ((downloadClass == null) || (!downloadClass.isChildOf(parent)))
{ {
logger.error("ERROR: bad child class [{}][{}].", downloadClass, parent); logger.error("ERROR: bad child class [{}][{}].", downloadClass, parent);
this.journal.add(url, CrawlStatus.BADCHILDCLASS); this.journal.add(url, parentURL, CrawlStatus.BADCHILDCLASS);
} }
else else
{ {
@ -176,7 +175,7 @@ public class Crawler
String downloadSha = StatoolInfosUtils.sha1sum(downloadFile); String downloadSha = StatoolInfosUtils.sha1sum(downloadFile);
if (StringUtils.equals(downloadSha, storedSha)) if (StringUtils.equals(downloadSha, storedSha))
{ {
this.journal.add(url, CrawlStatus.SUCCESS); this.journal.add(url, parentURL, CrawlStatus.SUCCESS);
} }
else else
{ {
@ -199,13 +198,13 @@ public class Crawler
downloadFile.delete(); downloadFile.delete();
// //
this.journal.add(url, CrawlStatus.UPDATED); this.journal.add(url, parentURL, CrawlStatus.UPDATED);
} }
// Cache another resources. // Cache another resources.
crawlLogo(downloadProperties.getURL("federation.logo")); crawlLogo(downloadProperties.getURL("federation.logo"), url);
crawlLogo(downloadProperties.getURL("organization.logo")); crawlLogo(downloadProperties.getURL("organization.logo"), url);
crawlLogo(downloadProperties.getURL("service.logo")); crawlLogo(downloadProperties.getURL("service.logo"), url);
// Do subs. // Do subs.
PathProperties subs = downloadProperties.getByPrefix("subs"); PathProperties subs = downloadProperties.getByPrefix("subs");
@ -216,12 +215,12 @@ public class Crawler
try try
{ {
URL subUrl = new URL(property.getValue()); URL subUrl = new URL(property.getValue());
crawl(subUrl, downloadClass); crawl(subUrl, url, downloadClass);
} }
catch (java.net.MalformedURLException exception) catch (java.net.MalformedURLException exception)
{ {
logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage()); logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage());
this.journal.add(url, CrawlStatus.BADURLFORMAT); this.journal.add(url, parentURL, CrawlStatus.BADURLFORMAT);
exception.printStackTrace(); exception.printStackTrace();
} }
} }
@ -232,7 +231,7 @@ public class Crawler
} }
catch (IOException exception) catch (IOException exception)
{ {
this.journal.add(url, CrawlStatus.IOERROR); this.journal.add(url, parentURL, CrawlStatus.IOERROR);
} }
} }
@ -243,7 +242,7 @@ public class Crawler
* the url * the url
* @return the file * @return the file
*/ */
public File crawlLogo(final URL url) public File crawlLogo(final URL url, final URL parentURL)
{ {
File result; File result;
@ -265,19 +264,19 @@ public class Crawler
catch (java.net.ConnectException exception) catch (java.net.ConnectException exception)
{ {
logger.error("ERROR: crawl failed (1) for [{}]: {}", url.toString(), exception.getMessage()); logger.error("ERROR: crawl failed (1) for [{}]: {}", url.toString(), exception.getMessage());
this.journal.add(url, CrawlStatus.CONNECTERROR); this.journal.add(url, parentURL, CrawlStatus.CONNECTERROR);
logoFile = null; logoFile = null;
} }
catch (FileNotFoundException exception) catch (FileNotFoundException exception)
{ {
logger.error("ERROR: crawl failed (2) for [{}]: {}", url.toString(), exception.getMessage()); logger.error("ERROR: crawl failed (2) for [{}]: {}", url.toString(), exception.getMessage());
this.journal.add(url, CrawlStatus.URLNOTFOUND); this.journal.add(url, parentURL, CrawlStatus.URLNOTFOUND);
logoFile = null; logoFile = null;
} }
catch (IOException exception) catch (IOException exception)
{ {
logger.error("ERROR: crawl failed (3) for [{}]: {}", url.toString(), exception.getMessage()); logger.error("ERROR: crawl failed (3) for [{}]: {}", url.toString(), exception.getMessage());
this.journal.add(url, CrawlStatus.DOWNLOADERROR); this.journal.add(url, parentURL, CrawlStatus.DOWNLOADERROR);
logoFile = null; logoFile = null;
} }
@ -288,7 +287,7 @@ public class Crawler
else else
{ {
result = this.cache.store(url, logoFile); result = this.cache.store(url, logoFile);
this.journal.add(url, CrawlStatus.SUCCESS); this.journal.add(url, parentURL, CrawlStatus.SUCCESS);
logoFile.delete(); logoFile.delete();
} }
} }
@ -344,9 +343,7 @@ public class Crawler
logger.info("Restoring crawl journal."); logger.info("Restoring crawl journal.");
File journalFile = this.cache.restoreFile(getJournalURL()); result = this.cache.restoreJournal();
result = CrawlJournalFile.load(journalFile);
// //
return result; return result;
@ -357,34 +354,7 @@ public class Crawler
*/ */
public void storeJournal() public void storeJournal()
{ {
try logger.info("Storing crawl journal.");
{ this.cache.storeJournal(this.journal);
logger.info("Storing crawl journal.");
File file = Files.createTempFile("tmp-", ".statoolsinfos").toFile();
CrawlJournalFile.save(file, this.journal);
this.cache.store(getJournalURL(), file);
file.delete();
}
catch (IOException exception)
{
exception.printStackTrace();
}
}
/**
* Gets the journal URL.
*
* @return the journal URL
* @throws MalformedURLException
*/
public static URL getJournalURL() throws MalformedURLException
{
URL result;
result = new URL("http://localhost/crawl.journal");
//
return result;
} }
} }

View file

@ -21,13 +21,17 @@ package fr.devinsy.statoolinfos.htmlize;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.time.format.DateTimeFormatter;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import fr.devinsy.statoolinfos.HtmlizerContext; import fr.devinsy.statoolinfos.HtmlizerContext;
import fr.devinsy.statoolinfos.core.Federation; import fr.devinsy.statoolinfos.core.Federation;
import fr.devinsy.statoolinfos.core.Organization;
import fr.devinsy.statoolinfos.core.Service;
import fr.devinsy.statoolinfos.core.StatoolInfosException; import fr.devinsy.statoolinfos.core.StatoolInfosException;
import fr.devinsy.statoolinfos.crawl.CrawlCache; import fr.devinsy.statoolinfos.crawl.CrawlCache;
import fr.devinsy.statoolinfos.crawl.CrawlJournal; import fr.devinsy.statoolinfos.crawl.CrawlJournal;
@ -56,9 +60,20 @@ public class CrawlJournalPage
File htmlizeDirectory = HtmlizerContext.instance().getHtmlizeDirectory(); File htmlizeDirectory = HtmlizerContext.instance().getHtmlizeDirectory();
logger.info("Htmlize Crawl Journal pages."); logger.info("Htmlize Crawl Journal pages.");
CrawlJournal journal = HtmlizerContext.instance().getCrawlJournal(); String page = htmlize("Journal des téléchargements", federation.getCrawlJournal());
String page = htmlize("Journal des téléchargements", journal);
FileUtils.write(new File(htmlizeDirectory, federation.getTechnicalName() + "-crawl.xhtml"), page, StandardCharsets.UTF_8); FileUtils.write(new File(htmlizeDirectory, federation.getTechnicalName() + "-crawl.xhtml"), page, StandardCharsets.UTF_8);
for (Organization organization : federation.getOrganizations())
{
page = htmlize("Journal des téléchargements de " + organization.getName(), organization.getCrawlJournal());
FileUtils.write(new File(htmlizeDirectory, organization.getTechnicalName() + "-crawl.xhtml"), page, StandardCharsets.UTF_8);
}
for (Service service : federation.getAllServices())
{
page = htmlize("Journal des téléchargements de " + service.getName(), service.getCrawlJournal());
FileUtils.write(new File(htmlizeDirectory, service.getOrganization().getTechnicalName() + "-" + service.getTechnicalName() + "-crawl.xhtml"), page, StandardCharsets.UTF_8);
}
} }
/** /**
@ -83,7 +98,7 @@ public class CrawlJournalPage
TagDataManager data = new TagDataManager(); TagDataManager data = new TagDataManager();
data.setEscapedContent("title", title); data.setEscapedContent("title", title);
data.setContent("date", journal.getDatetime().toString()); data.setContent("date", journal.getDatetime().format(DateTimeFormatter.ofPattern("dd/MM/YYYY HH:mm")));
data.setContent("totalCount", journal.size()); data.setContent("totalCount", journal.size());
data.setContent("errorCount", journal.getErrors().size()); data.setContent("errorCount", journal.getErrors().size());
@ -92,6 +107,8 @@ public class CrawlJournalPage
{ {
data.setEscapedContent("crawlLogLine", index, "crawlLogLineUrlLink", log.getUrl().toString()); data.setEscapedContent("crawlLogLine", index, "crawlLogLineUrlLink", log.getUrl().toString());
data.setEscapedAttribute("crawlLogLine", index, "crawlLogLineUrlLink", "href", log.getUrl().toString()); data.setEscapedAttribute("crawlLogLine", index, "crawlLogLineUrlLink", "href", log.getUrl().toString());
data.setEscapedContent("crawlLogLine", index, "crawlLogLineParentUrlLink", StringUtils.abbreviate(log.getParentUrlValue(), 35));
data.setEscapedAttribute("crawlLogLine", index, "crawlLogLineParentUrlLink", "href", StringUtils.defaultString(log.getParentUrlValue(), "#"));
data.setContent("crawlLogLine", index, "crawlLogLineStatus", log.getStatus().toString()); data.setContent("crawlLogLine", index, "crawlLogLineStatus", log.getStatus().toString());
if (log.getStatus().isError()) if (log.getStatus().isError())

View file

@ -85,8 +85,9 @@ public class FederationPage
* @return the string * @return the string
* @throws StatoolInfosException * @throws StatoolInfosException
* the statool infos exception * the statool infos exception
* @throws IOException
*/ */
public static String htmlize(final Federation federation) throws StatoolInfosException public static String htmlize(final Federation federation) throws StatoolInfosException, IOException
{ {
String result; String result;
@ -110,8 +111,7 @@ public class FederationPage
data.setAttribute("statsLink", "href", federation.getTechnicalName() + "-stats.xhtml"); data.setAttribute("statsLink", "href", federation.getTechnicalName() + "-stats.xhtml");
data.setAttribute("crawlLink", "href", federation.getTechnicalName() + "-crawl.xhtml"); data.setAttribute("crawlLink", "href", federation.getTechnicalName() + "-crawl.xhtml");
if (federation.getCrawlJournal().getErrors().isEmpty())
if (HtmlizerContext.instance().getCrawlJournal().getErrors().isEmpty())
{ {
data.setAttribute("crawlLinkImg", "src", "circle-icons/download-mono.svg"); data.setAttribute("crawlLinkImg", "src", "circle-icons/download-mono.svg");
} }

View file

@ -152,6 +152,16 @@ public class OrganizationPage
data.setAttribute("statsLink", "href", organization.getTechnicalName() + "-stats.xhtml"); data.setAttribute("statsLink", "href", organization.getTechnicalName() + "-stats.xhtml");
data.setAttribute("crawlLink", "href", organization.getTechnicalName() + "-crawl.xhtml");
if (organization.getCrawlJournal().getErrors().isEmpty())
{
data.setAttribute("crawlLinkImg", "src", "circle-icons/download-mono.svg");
}
else
{
data.setAttribute("crawlLinkImg", "src", "circle-icons/download.svg");
}
{ {
PropertyChecks checks = organization.getInputChecksAll(); PropertyChecks checks = organization.getInputChecksAll();
@ -239,4 +249,5 @@ public class OrganizationPage
FileUtils.copyFile(logoFile, target); FileUtils.copyFile(logoFile, target);
} }
} }
} }

View file

@ -221,6 +221,16 @@ public class ServicePage
data.getIdData("softwareSourceLinkImg").getAttribute("class").setMode(DisplayMode.REPLACE); data.getIdData("softwareSourceLinkImg").getAttribute("class").setMode(DisplayMode.REPLACE);
} }
data.setAttribute("crawlLink", "href", service.getOrganization().getTechnicalName() + "-" + service.getTechnicalName() + "-crawl.xhtml");
if (service.getCrawlJournal().getErrors().isEmpty())
{
data.setAttribute("crawlLinkImg", "src", "circle-icons/download-mono.svg");
}
else
{
data.setAttribute("crawlLinkImg", "src", "circle-icons/download.svg");
}
{ {
PropertyChecks checks = service.getInputChecks(); PropertyChecks checks = service.getInputChecks();
data.setContent("errorCount", checks.getErrorCount()); data.setContent("errorCount", checks.getErrorCount());

View file

@ -20,16 +20,18 @@
<div>Date : <span id="date">n/a</span></div> <div>Date : <span id="date">n/a</span></div>
</div> </div>
<br/> <br/>
<div class="center_table" style="width: 900px;"> <div class="center_table" style="width: 1000px;">
<table id="crawlLogs" class="table_classic left"> <table id="crawlLogs" class="table_classic left">
<thead> <thead>
<tr> <tr>
<th style="width: 200px;">Parent</th>
<th>URL</th> <th>URL</th>
<th style="width: 200px;">Statut</th> <th style="width: 150px;">Statut</th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
<tr id="crawlLogLine"> <tr id="crawlLogLine">
<td id="crawlLogLineParentUrl"><a href="#" id="crawlLogLineParentUrlLink">n/a</a></td>
<td id="crawlLogLineUrl"><a href="#" id="crawlLogLineUrlLink">n/a</a></td> <td id="crawlLogLineUrl"><a href="#" id="crawlLogLineUrlLink">n/a</a></td>
<td id="crawlLogLineStatus" class="td_center center">n/a</td> <td id="crawlLogLineStatus" class="td_center center">n/a</td>
</tr> </tr>

View file

@ -33,6 +33,7 @@
<a id="technicalDocLink" href="#"><img id="technicalDocLinkImg" src="circle-icons/tools.svg" class="disabled" title="Documentation technique"/></a> <a id="technicalDocLink" href="#"><img id="technicalDocLinkImg" src="circle-icons/tools.svg" class="disabled" title="Documentation technique"/></a>
<a id="rawCheckLink" href="#"><img id="rawCheckLinkImg" src="circle-icons/clipboard-mono.svg" title="Fichier propriétés analysé"/></a> <a id="rawCheckLink" href="#"><img id="rawCheckLinkImg" src="circle-icons/clipboard-mono.svg" title="Fichier propriétés analysé"/></a>
<a id="rawLink" href="#"><img id="rawLinkImg" src="circle-icons/document-mono.svg" title="Fichier propriétés"/></a> <a id="rawLink" href="#"><img id="rawLinkImg" src="circle-icons/document-mono.svg" title="Fichier propriétés"/></a>
<a id="crawlLink" href="#"><img id="crawlLinkImg" src="circle-icons/download-mono.svg" title="Statut des téléchargements"/></a>
<a id="statsLink" href="#"><img id="statsLinkImg" src="circle-icons/barchart-mono.svg" title="Statistiques"/></a> <a id="statsLink" href="#"><img id="statsLinkImg" src="circle-icons/barchart-mono.svg" title="Statistiques"/></a>
<div style="display: inline-block; vertical-align: middle; font-size: smaller; margin-left: 2px; width: 35px;"> <div style="display: inline-block; vertical-align: middle; font-size: smaller; margin-left: 2px; width: 35px;">
<a id="alertLink" href="#" style="text-decoration: none;"> <a id="alertLink" href="#" style="text-decoration: none;">

View file

@ -41,6 +41,7 @@
<a id="technicalDocLink" href="#"><img id="technicalDocLinkImg" src="circle-icons/tools.svg" class="disabled" title="Documentation technique"/></a> <a id="technicalDocLink" href="#"><img id="technicalDocLinkImg" src="circle-icons/tools.svg" class="disabled" title="Documentation technique"/></a>
<a id="rawCheckLink" href="#"><img id="rawCheckLinkImg" src="circle-icons/clipboard-mono.svg" title="Fichier propriétés analysé"/></a> <a id="rawCheckLink" href="#"><img id="rawCheckLinkImg" src="circle-icons/clipboard-mono.svg" title="Fichier propriétés analysé"/></a>
<a id="rawLink" href="#"><img id="rawLinkImg" src="circle-icons/document-mono.svg" title="Fichier propriétés"/></a> <a id="rawLink" href="#"><img id="rawLinkImg" src="circle-icons/document-mono.svg" title="Fichier propriétés"/></a>
<a id="crawlLink" href="#"><img id="crawlLinkImg" src="circle-icons/download-mono.svg" title="Statut des téléchargements"/></a>
<a id="statsLink" href="#"><img id="statsLinkImg" src="circle-icons/barchart-mono.svg" title="Statistiques"/></a> <a id="statsLink" href="#"><img id="statsLinkImg" src="circle-icons/barchart-mono.svg" title="Statistiques"/></a>
<div style="display: inline-block; vertical-align: middle; font-size: smaller; margin-left: 2px; width: 35px;"> <div style="display: inline-block; vertical-align: middle; font-size: smaller; margin-left: 2px; width: 35px;">
<a id="alertLink" href="#" style="text-decoration: none;"> <a id="alertLink" href="#" style="text-decoration: none;">

View file

@ -172,4 +172,43 @@ public final class URLUtils
// //
return result; return result;
} }
/**
 * Compares two URLs through their string forms rather than through
 * {@code URL.equals}.
 *
 * @param alpha
 *            the first URL, may be null
 * @param bravo
 *            the second URL, may be null
 * @return the outcome of {@code StringUtils.equals} applied to the string
 *         form of each URL, a null URL being passed on as a null string
 */
public static boolean equals(final URL alpha, final URL bravo)
{
    // A null URL is kept as a null string so the string comparison settles the null cases.
    final String left = (alpha == null) ? null : alpha.toString();
    final String right = (bravo == null) ? null : bravo.toString();

    return StringUtils.equals(left, right);
}
} }