Added crawl page for organizations and services.
This commit is contained in:
parent
bf81404746
commit
68906ed88a
18 changed files with 371 additions and 149 deletions
|
@ -27,9 +27,6 @@ import fr.devinsy.statoolinfos.core.Factory;
|
|||
import fr.devinsy.statoolinfos.core.Federation;
|
||||
import fr.devinsy.statoolinfos.core.StatoolInfosException;
|
||||
import fr.devinsy.statoolinfos.crawl.CrawlCache;
|
||||
import fr.devinsy.statoolinfos.crawl.CrawlJournal;
|
||||
import fr.devinsy.statoolinfos.crawl.CrawlJournalFile;
|
||||
import fr.devinsy.statoolinfos.crawl.Crawler;
|
||||
|
||||
/**
|
||||
* The Class Manager.
|
||||
|
@ -47,7 +44,6 @@ public class HtmlizerContext
|
|||
private Federation federation;
|
||||
private Categories categories;
|
||||
private CrawlCache cache;
|
||||
private CrawlJournal crawlJournal;
|
||||
|
||||
/**
|
||||
* Instantiates a new manager.
|
||||
|
@ -73,23 +69,9 @@ public class HtmlizerContext
|
|||
logger.info("Htmlize directory setting: {}", this.configuration.getHtmlizeDirectoryPath());
|
||||
|
||||
this.cache = new CrawlCache(this.configuration.getCrawlCacheDirectory());
|
||||
this.crawlJournal = CrawlJournalFile.load(this.cache.restoreFile(Crawler.getJournalURL()));
|
||||
|
||||
File htmlizeInputFile = this.cache.restoreFile(this.configuration.getHtmlizeInputURL());
|
||||
File htmlizeDirectory = this.configuration.getHtmlizeDirectory();
|
||||
if (htmlizeInputFile == null)
|
||||
{
|
||||
throw new IllegalArgumentException("Htmlize input undefined.");
|
||||
}
|
||||
else if (!htmlizeInputFile.exists())
|
||||
{
|
||||
throw new IllegalArgumentException("Htmlize input is missing.");
|
||||
}
|
||||
else if (htmlizeInputFile.isDirectory())
|
||||
{
|
||||
throw new IllegalArgumentException("Htmlize input is a directory.");
|
||||
}
|
||||
else if (htmlizeDirectory == null)
|
||||
if (htmlizeDirectory == null)
|
||||
{
|
||||
throw new IllegalArgumentException("Htmlize directory undefined.");
|
||||
}
|
||||
|
@ -105,7 +87,7 @@ public class HtmlizerContext
|
|||
{
|
||||
if (this.configuration.isFederation())
|
||||
{
|
||||
this.federation = Factory.loadFederation(htmlizeInputFile, this.cache);
|
||||
this.federation = Factory.loadFederation(this.configuration.getHtmlizeInputURL(), this.cache);
|
||||
this.categories = Factory.loadCategories(this.configuration.getCategoryFile(), this.federation);
|
||||
}
|
||||
else
|
||||
|
@ -160,11 +142,6 @@ public class HtmlizerContext
|
|||
return result;
|
||||
}
|
||||
|
||||
public CrawlJournal getCrawlJournal()
|
||||
{
|
||||
return this.crawlJournal;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the federation.
|
||||
*
|
||||
|
|
|
@ -29,6 +29,7 @@ import org.slf4j.LoggerFactory;
|
|||
import fr.devinsy.statoolinfos.checker.PropertyChecker;
|
||||
import fr.devinsy.statoolinfos.checker.PropertyChecks;
|
||||
import fr.devinsy.statoolinfos.crawl.CrawlCache;
|
||||
import fr.devinsy.statoolinfos.crawl.CrawlJournal;
|
||||
import fr.devinsy.statoolinfos.properties.PathProperties;
|
||||
import fr.devinsy.statoolinfos.properties.PathProperty;
|
||||
import fr.devinsy.statoolinfos.properties.PathPropertyUtils;
|
||||
|
@ -159,12 +160,38 @@ public class Factory
|
|||
* @throws IOException
|
||||
* Signals that an I/O exception has occurred.
|
||||
*/
|
||||
public static Federation loadFederation(final File federationFile, final CrawlCache cache) throws StatoolInfosException, IOException
|
||||
public static Federation loadFederation(final URL inputURL, final CrawlCache cache) throws StatoolInfosException, IOException
|
||||
{
|
||||
Federation result;
|
||||
|
||||
if (inputURL == null)
|
||||
{
|
||||
throw new IllegalArgumentException("Null input URL.");
|
||||
}
|
||||
else if (cache == null)
|
||||
{
|
||||
throw new IllegalArgumentException("Null cache URL.");
|
||||
}
|
||||
else
|
||||
{
|
||||
File federationFile = cache.restoreFile(inputURL);
|
||||
if (federationFile == null)
|
||||
{
|
||||
throw new IllegalArgumentException("Htmlize input file undefined.");
|
||||
}
|
||||
else if (!federationFile.exists())
|
||||
{
|
||||
throw new IllegalArgumentException("Htmlize input file is missing.");
|
||||
}
|
||||
else if (federationFile.isDirectory())
|
||||
{
|
||||
throw new IllegalArgumentException("Htmlize input file is a directory.");
|
||||
}
|
||||
else
|
||||
{
|
||||
PathProperties properties = PathPropertyUtils.load(federationFile);
|
||||
result = new Federation(properties);
|
||||
result.setInputURL(inputURL);
|
||||
result.setInputFile(federationFile);
|
||||
result.setLogoFileName(result.getTechnicalName() + "-logo" + StringUtils.defaultIfBlank(cache.getExtension(result.getLogoURL()), ".png"));
|
||||
|
||||
|
@ -178,8 +205,8 @@ public class Factory
|
|||
{
|
||||
if (StringUtils.startsWith(property.getValue(), "http"))
|
||||
{
|
||||
URL inputURL = new URL(property.getValue());
|
||||
Organization organization = loadOrganization(inputURL, cache);
|
||||
URL subInputURL = new URL(property.getValue());
|
||||
Organization organization = loadOrganization(subInputURL, cache);
|
||||
if (organization != null)
|
||||
{
|
||||
organization.setFederation(result);
|
||||
|
@ -189,37 +216,7 @@ public class Factory
|
|||
}
|
||||
|
||||
//
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Load organization.
|
||||
*
|
||||
* @param organizationFile
|
||||
* the organization file
|
||||
* @param cache
|
||||
* the cache
|
||||
* @return the organization
|
||||
* @throws IOException
|
||||
* Signals that an I/O exception has occurred.
|
||||
*/
|
||||
public static Organization loadOrganization(final File organizationFile, final CrawlCache cache) throws IOException
|
||||
{
|
||||
Organization result;
|
||||
|
||||
PathProperties properties = PathPropertyUtils.load(organizationFile);
|
||||
result = new Organization(properties);
|
||||
result.setInputFile(organizationFile);
|
||||
|
||||
PathProperties subs = result.getByPrefix("subs");
|
||||
for (PathProperty property : subs)
|
||||
{
|
||||
if (StringUtils.startsWith(property.getValue(), "http"))
|
||||
{
|
||||
URL serviceInputFile = new URL(property.getValue());
|
||||
Service service = loadService(serviceInputFile, cache);
|
||||
service.setOrganization(result);
|
||||
result.getServices().add(service);
|
||||
result.getCrawlJournal().addAll(cache.restoreJournal());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -285,6 +282,14 @@ public class Factory
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
CrawlJournal journal = cache.restoreJournal();
|
||||
result.getCrawlJournal().addAll(journal.searchByParent(result.getInputURL()));
|
||||
for (Service service : result.getServices())
|
||||
{
|
||||
result.getCrawlJournal().addAll(journal.searchByParent(service.getInputURL()));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -321,6 +326,10 @@ public class Factory
|
|||
result.setInputFile(inputFile);
|
||||
result.setInputURL(inputURL);
|
||||
result.setLogoFileName(result.getTechnicalName() + "-logo" + StringUtils.defaultIfBlank(cache.getExtension(result.getLogoURL()), ".png"));
|
||||
|
||||
//
|
||||
CrawlJournal journal = cache.restoreJournal();
|
||||
result.getCrawlJournal().addAll(journal.searchByParent(result.getInputURL()));
|
||||
}
|
||||
|
||||
//
|
||||
|
|
|
@ -26,6 +26,7 @@ import java.time.LocalDateTime;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import fr.devinsy.statoolinfos.checker.PropertyChecks;
|
||||
import fr.devinsy.statoolinfos.crawl.CrawlJournal;
|
||||
import fr.devinsy.statoolinfos.properties.PathProperties;
|
||||
import fr.devinsy.statoolinfos.properties.PathPropertyList;
|
||||
|
||||
|
@ -36,9 +37,11 @@ public class Federation extends PathPropertyList
|
|||
{
|
||||
private static final long serialVersionUID = -8970835291634661580L;
|
||||
private Organizations organizations;
|
||||
private URL inputURL;
|
||||
private File inputFile;
|
||||
private String logoFileName;
|
||||
private PropertyChecks inputChecks;
|
||||
private CrawlJournal crawlJournal;
|
||||
|
||||
/**
|
||||
* Instantiates a new federation.
|
||||
|
@ -48,6 +51,7 @@ public class Federation extends PathPropertyList
|
|||
super();
|
||||
this.inputChecks = new PropertyChecks();
|
||||
this.organizations = new Organizations();
|
||||
this.crawlJournal = new CrawlJournal();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -68,7 +72,7 @@ public class Federation extends PathPropertyList
|
|||
else
|
||||
{
|
||||
this.organizations = new Organizations();
|
||||
|
||||
this.crawlJournal = new CrawlJournal();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -157,6 +161,11 @@ public class Federation extends PathPropertyList
|
|||
return result;
|
||||
}
|
||||
|
||||
public CrawlJournal getCrawlJournal()
|
||||
{
|
||||
return this.crawlJournal;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the description.
|
||||
*
|
||||
|
@ -204,6 +213,11 @@ public class Federation extends PathPropertyList
|
|||
return this.inputFile;
|
||||
}
|
||||
|
||||
public URL getInputURL()
|
||||
{
|
||||
return this.inputURL;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the legal website.
|
||||
*
|
||||
|
@ -435,6 +449,11 @@ public class Federation extends PathPropertyList
|
|||
this.inputFile = inputFile;
|
||||
}
|
||||
|
||||
public void setInputURL(final URL inputURL)
|
||||
{
|
||||
this.inputURL = inputURL;
|
||||
}
|
||||
|
||||
public void setLogoFileName(final String logoFileName)
|
||||
{
|
||||
this.logoFileName = logoFileName;
|
||||
|
|
|
@ -28,6 +28,7 @@ import org.apache.commons.codec.digest.DigestUtils;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import fr.devinsy.statoolinfos.checker.PropertyChecks;
|
||||
import fr.devinsy.statoolinfos.crawl.CrawlJournal;
|
||||
import fr.devinsy.statoolinfos.properties.PathProperties;
|
||||
import fr.devinsy.statoolinfos.properties.PathPropertyList;
|
||||
|
||||
|
@ -43,6 +44,7 @@ public class Organization extends PathPropertyList
|
|||
private URL inputURL;
|
||||
private String logoFileName;
|
||||
private PropertyChecks inputChecks;
|
||||
private CrawlJournal crawlJournal;
|
||||
|
||||
/**
|
||||
* Instantiates a new organization.
|
||||
|
@ -52,6 +54,7 @@ public class Organization extends PathPropertyList
|
|||
super();
|
||||
this.inputChecks = new PropertyChecks();
|
||||
this.services = new Services();
|
||||
this.crawlJournal = new CrawlJournal();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -65,6 +68,7 @@ public class Organization extends PathPropertyList
|
|||
super(properties);
|
||||
this.inputChecks = new PropertyChecks();
|
||||
this.services = new Services();
|
||||
this.crawlJournal = new CrawlJournal();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -165,6 +169,11 @@ public class Organization extends PathPropertyList
|
|||
return result;
|
||||
}
|
||||
|
||||
public CrawlJournal getCrawlJournal()
|
||||
{
|
||||
return this.crawlJournal;
|
||||
}
|
||||
|
||||
public String getDescription()
|
||||
{
|
||||
String result;
|
||||
|
|
|
@ -33,6 +33,7 @@ import org.slf4j.Logger;
|
|||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import fr.devinsy.statoolinfos.checker.PropertyChecks;
|
||||
import fr.devinsy.statoolinfos.crawl.CrawlJournal;
|
||||
import fr.devinsy.statoolinfos.metrics.Metric;
|
||||
import fr.devinsy.statoolinfos.properties.PathProperties;
|
||||
import fr.devinsy.statoolinfos.properties.PathProperty;
|
||||
|
@ -104,6 +105,7 @@ public class Service extends PathPropertyList
|
|||
private URL inputURL;
|
||||
private String logoFileName;
|
||||
private PropertyChecks inputChecks;
|
||||
private CrawlJournal crawlJournal;
|
||||
|
||||
/**
|
||||
* Instantiates a new service.
|
||||
|
@ -123,6 +125,7 @@ public class Service extends PathPropertyList
|
|||
{
|
||||
super(properties);
|
||||
this.inputChecks = new PropertyChecks();
|
||||
this.crawlJournal = new CrawlJournal();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -223,6 +226,11 @@ public class Service extends PathPropertyList
|
|||
return result;
|
||||
}
|
||||
|
||||
public CrawlJournal getCrawlJournal()
|
||||
{
|
||||
return this.crawlJournal;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the description.
|
||||
*
|
||||
|
|
|
@ -20,7 +20,9 @@ package fr.devinsy.statoolinfos.crawl;
|
|||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
|
@ -189,6 +191,22 @@ public class CrawlCache
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public CrawlJournal restoreJournal() throws IOException
|
||||
{
|
||||
CrawlJournal result;
|
||||
|
||||
File journalFile = restoreFile(getJournalURL());
|
||||
|
||||
result = CrawlJournalFile.load(journalFile);
|
||||
|
||||
//
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Restore logo to.
|
||||
*
|
||||
|
@ -292,6 +310,24 @@ public class CrawlCache
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Store journal.
|
||||
*/
|
||||
public void storeJournal(final CrawlJournal journal)
|
||||
{
|
||||
try
|
||||
{
|
||||
File file = Files.createTempFile("tmp-", ".statoolsinfos").toFile();
|
||||
CrawlJournalFile.save(file, journal);
|
||||
store(getJournalURL(), file);
|
||||
file.delete();
|
||||
}
|
||||
catch (IOException exception)
|
||||
{
|
||||
exception.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Store.
|
||||
*
|
||||
|
@ -356,4 +392,20 @@ public class CrawlCache
|
|||
//
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the journal URL.
|
||||
*
|
||||
* @return the journal URL
|
||||
* @throws MalformedURLException
|
||||
*/
|
||||
public static URL getJournalURL() throws MalformedURLException
|
||||
{
|
||||
URL result;
|
||||
|
||||
result = new URL("http://localhost/crawl.journal");
|
||||
|
||||
//
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -34,6 +34,7 @@ import java.time.LocalDateTime;
|
|||
import java.time.ZoneOffset;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -187,14 +188,33 @@ public class CrawlJournalFile
|
|||
}
|
||||
else
|
||||
{
|
||||
String[] tokens = line.split(" ", 2);
|
||||
String[] tokens = line.split(" ", 3);
|
||||
|
||||
CrawlStatus status = CrawlStatus.valueOf(tokens[0].toUpperCase());
|
||||
|
||||
URL parentURL;
|
||||
try
|
||||
{
|
||||
if (StringUtils.equals(tokens[1], "null"))
|
||||
{
|
||||
parentURL = null;
|
||||
}
|
||||
else
|
||||
{
|
||||
parentURL = new URL(tokens[1].trim());
|
||||
}
|
||||
}
|
||||
catch (MalformedURLException exception)
|
||||
{
|
||||
logger.error("Error valuing [{}]", line);
|
||||
exception.printStackTrace();
|
||||
parentURL = null;
|
||||
}
|
||||
|
||||
URL url;
|
||||
try
|
||||
{
|
||||
url = new URL(tokens[1].trim());
|
||||
url = new URL(tokens[2].trim());
|
||||
}
|
||||
catch (MalformedURLException exception)
|
||||
{
|
||||
|
@ -203,7 +223,7 @@ public class CrawlJournalFile
|
|||
url = null;
|
||||
}
|
||||
|
||||
result = new CrawlLog(url, status);
|
||||
result = new CrawlLog(url, parentURL, status);
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -226,7 +246,7 @@ public class CrawlJournalFile
|
|||
{
|
||||
for (CrawlLog log : journal)
|
||||
{
|
||||
String line = log.getStatus() + " " + log.getUrl();
|
||||
String line = String.format("%s %s %s", log.getStatus(), log.getParentUrl(), log.getUrl());
|
||||
out.write(line);
|
||||
out.write("\n");
|
||||
}
|
||||
|
|
|
@ -26,6 +26,7 @@ import java.net.URL;
|
|||
public class CrawlLog
|
||||
{
|
||||
private URL url;
|
||||
private URL parentUrl;
|
||||
private CrawlStatus status;
|
||||
|
||||
/**
|
||||
|
@ -36,12 +37,40 @@ public class CrawlLog
|
|||
* @param status
|
||||
* the status
|
||||
*/
|
||||
public CrawlLog(final URL url, final CrawlStatus status)
|
||||
public CrawlLog(final URL url, final URL parentUrl, final CrawlStatus status)
|
||||
{
|
||||
this.url = url;
|
||||
this.parentUrl = parentUrl;
|
||||
this.status = status;
|
||||
}
|
||||
|
||||
public URL getParentUrl()
|
||||
{
|
||||
return this.parentUrl;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the parent url value.
|
||||
*
|
||||
* @return the parent url value
|
||||
*/
|
||||
public String getParentUrlValue()
|
||||
{
|
||||
String result;
|
||||
|
||||
if (this.parentUrl == null)
|
||||
{
|
||||
result = null;
|
||||
}
|
||||
else
|
||||
{
|
||||
result = this.parentUrl.toString();
|
||||
}
|
||||
|
||||
//
|
||||
return result;
|
||||
}
|
||||
|
||||
public CrawlStatus getStatus()
|
||||
{
|
||||
return this.status;
|
||||
|
|
|
@ -24,6 +24,8 @@ import java.util.Collections;
|
|||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import fr.devinsy.statoolinfos.util.URLUtils;
|
||||
|
||||
/**
|
||||
* The Class CrawlLogs.
|
||||
*/
|
||||
|
@ -39,6 +41,27 @@ public class CrawlLogs extends ArrayList<CrawlLog>
|
|||
super();
|
||||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see java.util.ArrayList#add(java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
public boolean add(final CrawlLog log)
|
||||
{
|
||||
boolean result;
|
||||
|
||||
if (log == null)
|
||||
{
|
||||
result = false;
|
||||
}
|
||||
else
|
||||
{
|
||||
result = super.add(log);
|
||||
}
|
||||
|
||||
//
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the.
|
||||
*
|
||||
|
@ -47,9 +70,9 @@ public class CrawlLogs extends ArrayList<CrawlLog>
|
|||
* @param status
|
||||
* the status
|
||||
*/
|
||||
public void add(final URL url, final CrawlStatus status)
|
||||
public void add(final URL url, final URL parentUrl, final CrawlStatus status)
|
||||
{
|
||||
this.add(new CrawlLog(url, status));
|
||||
this.add(new CrawlLog(url, parentUrl, status));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -139,4 +162,29 @@ public class CrawlLogs extends ArrayList<CrawlLog>
|
|||
//
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the by parent.
|
||||
*
|
||||
* @param parentURL
|
||||
* the parent URL
|
||||
* @return the by parent
|
||||
*/
|
||||
public CrawlLogs searchByParent(final URL parentURL)
|
||||
{
|
||||
CrawlLogs result;
|
||||
|
||||
result = new CrawlLogs();
|
||||
|
||||
for (CrawlLog log : this)
|
||||
{
|
||||
if (URLUtils.equals(log.getParentUrl(), parentURL))
|
||||
{
|
||||
result.add(log);
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,7 +21,6 @@ package fr.devinsy.statoolinfos.crawl;
|
|||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
|
@ -89,7 +88,7 @@ public class Crawler
|
|||
*/
|
||||
public void crawl(final URL url) throws StatoolInfosException, IOException
|
||||
{
|
||||
crawl(url, null);
|
||||
crawl(url, null, null);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -104,7 +103,7 @@ public class Crawler
|
|||
* @throws IOException
|
||||
* Signals that an I/O exception has occurred.
|
||||
*/
|
||||
public void crawl(final URL url, final PropertyClassType parent)
|
||||
public void crawl(final URL url, final URL parentURL, final PropertyClassType parent)
|
||||
{
|
||||
logger.info("Crawling {}", url);
|
||||
|
||||
|
@ -118,21 +117,21 @@ public class Crawler
|
|||
catch (java.net.ConnectException exception)
|
||||
{
|
||||
logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage());
|
||||
this.journal.add(url, CrawlStatus.CONNECTERROR);
|
||||
this.journal.add(url, parentURL, CrawlStatus.CONNECTERROR);
|
||||
downloadFile = null;
|
||||
exception.printStackTrace();
|
||||
}
|
||||
catch (FileNotFoundException exception)
|
||||
{
|
||||
logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage());
|
||||
this.journal.add(url, CrawlStatus.URLNOTFOUND);
|
||||
this.journal.add(url, parentURL, CrawlStatus.URLNOTFOUND);
|
||||
downloadFile = null;
|
||||
exception.printStackTrace();
|
||||
}
|
||||
catch (IOException exception)
|
||||
{
|
||||
logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage());
|
||||
this.journal.add(url, CrawlStatus.DOWNLOADERROR);
|
||||
this.journal.add(url, parentURL, CrawlStatus.DOWNLOADERROR);
|
||||
downloadFile = null;
|
||||
exception.printStackTrace();
|
||||
}
|
||||
|
@ -142,12 +141,12 @@ public class Crawler
|
|||
if (!downloadFile.exists())
|
||||
{
|
||||
logger.error("ERROR: download missing.");
|
||||
this.journal.add(url, CrawlStatus.MISSING);
|
||||
this.journal.add(url, parentURL, CrawlStatus.MISSING);
|
||||
}
|
||||
else if (downloadFile.length() == 0)
|
||||
{
|
||||
logger.error("ERROR: download empty.");
|
||||
this.journal.add(url, CrawlStatus.EMPTY);
|
||||
this.journal.add(url, parentURL, CrawlStatus.EMPTY);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -157,7 +156,7 @@ public class Crawler
|
|||
if ((downloadClass == null) || (!downloadClass.isChildOf(parent)))
|
||||
{
|
||||
logger.error("ERROR: bad child class [{}][{}].", downloadClass, parent);
|
||||
this.journal.add(url, CrawlStatus.BADCHILDCLASS);
|
||||
this.journal.add(url, parentURL, CrawlStatus.BADCHILDCLASS);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -176,7 +175,7 @@ public class Crawler
|
|||
String downloadSha = StatoolInfosUtils.sha1sum(downloadFile);
|
||||
if (StringUtils.equals(downloadSha, storedSha))
|
||||
{
|
||||
this.journal.add(url, CrawlStatus.SUCCESS);
|
||||
this.journal.add(url, parentURL, CrawlStatus.SUCCESS);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -199,13 +198,13 @@ public class Crawler
|
|||
downloadFile.delete();
|
||||
|
||||
//
|
||||
this.journal.add(url, CrawlStatus.UPDATED);
|
||||
this.journal.add(url, parentURL, CrawlStatus.UPDATED);
|
||||
}
|
||||
|
||||
// Cache another resources.
|
||||
crawlLogo(downloadProperties.getURL("federation.logo"));
|
||||
crawlLogo(downloadProperties.getURL("organization.logo"));
|
||||
crawlLogo(downloadProperties.getURL("service.logo"));
|
||||
crawlLogo(downloadProperties.getURL("federation.logo"), url);
|
||||
crawlLogo(downloadProperties.getURL("organization.logo"), url);
|
||||
crawlLogo(downloadProperties.getURL("service.logo"), url);
|
||||
|
||||
// Do subs.
|
||||
PathProperties subs = downloadProperties.getByPrefix("subs");
|
||||
|
@ -216,12 +215,12 @@ public class Crawler
|
|||
try
|
||||
{
|
||||
URL subUrl = new URL(property.getValue());
|
||||
crawl(subUrl, downloadClass);
|
||||
crawl(subUrl, url, downloadClass);
|
||||
}
|
||||
catch (java.net.MalformedURLException exception)
|
||||
{
|
||||
logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage());
|
||||
this.journal.add(url, CrawlStatus.BADURLFORMAT);
|
||||
this.journal.add(url, parentURL, CrawlStatus.BADURLFORMAT);
|
||||
exception.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
@ -232,7 +231,7 @@ public class Crawler
|
|||
}
|
||||
catch (IOException exception)
|
||||
{
|
||||
this.journal.add(url, CrawlStatus.IOERROR);
|
||||
this.journal.add(url, parentURL, CrawlStatus.IOERROR);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -243,7 +242,7 @@ public class Crawler
|
|||
* the url
|
||||
* @return the file
|
||||
*/
|
||||
public File crawlLogo(final URL url)
|
||||
public File crawlLogo(final URL url, final URL parentURL)
|
||||
{
|
||||
File result;
|
||||
|
||||
|
@ -265,19 +264,19 @@ public class Crawler
|
|||
catch (java.net.ConnectException exception)
|
||||
{
|
||||
logger.error("ERROR: crawl failed (1) for [{}]: {}", url.toString(), exception.getMessage());
|
||||
this.journal.add(url, CrawlStatus.CONNECTERROR);
|
||||
this.journal.add(url, parentURL, CrawlStatus.CONNECTERROR);
|
||||
logoFile = null;
|
||||
}
|
||||
catch (FileNotFoundException exception)
|
||||
{
|
||||
logger.error("ERROR: crawl failed (2) for [{}]: {}", url.toString(), exception.getMessage());
|
||||
this.journal.add(url, CrawlStatus.URLNOTFOUND);
|
||||
this.journal.add(url, parentURL, CrawlStatus.URLNOTFOUND);
|
||||
logoFile = null;
|
||||
}
|
||||
catch (IOException exception)
|
||||
{
|
||||
logger.error("ERROR: crawl failed (3) for [{}]: {}", url.toString(), exception.getMessage());
|
||||
this.journal.add(url, CrawlStatus.DOWNLOADERROR);
|
||||
this.journal.add(url, parentURL, CrawlStatus.DOWNLOADERROR);
|
||||
logoFile = null;
|
||||
}
|
||||
|
||||
|
@ -288,7 +287,7 @@ public class Crawler
|
|||
else
|
||||
{
|
||||
result = this.cache.store(url, logoFile);
|
||||
this.journal.add(url, CrawlStatus.SUCCESS);
|
||||
this.journal.add(url, parentURL, CrawlStatus.SUCCESS);
|
||||
logoFile.delete();
|
||||
}
|
||||
}
|
||||
|
@ -344,9 +343,7 @@ public class Crawler
|
|||
|
||||
logger.info("Restoring crawl journal.");
|
||||
|
||||
File journalFile = this.cache.restoreFile(getJournalURL());
|
||||
|
||||
result = CrawlJournalFile.load(journalFile);
|
||||
result = this.cache.restoreJournal();
|
||||
|
||||
//
|
||||
return result;
|
||||
|
@ -356,35 +353,8 @@ public class Crawler
|
|||
* Store journal.
|
||||
*/
|
||||
public void storeJournal()
|
||||
{
|
||||
try
|
||||
{
|
||||
logger.info("Storing crawl journal.");
|
||||
File file = Files.createTempFile("tmp-", ".statoolsinfos").toFile();
|
||||
|
||||
CrawlJournalFile.save(file, this.journal);
|
||||
this.cache.store(getJournalURL(), file);
|
||||
file.delete();
|
||||
}
|
||||
catch (IOException exception)
|
||||
{
|
||||
exception.printStackTrace();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the journal URL.
|
||||
*
|
||||
* @return the journal URL
|
||||
* @throws MalformedURLException
|
||||
*/
|
||||
public static URL getJournalURL() throws MalformedURLException
|
||||
{
|
||||
URL result;
|
||||
|
||||
result = new URL("http://localhost/crawl.journal");
|
||||
|
||||
//
|
||||
return result;
|
||||
this.cache.storeJournal(this.journal);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,13 +21,17 @@ package fr.devinsy.statoolinfos.htmlize;
|
|||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import fr.devinsy.statoolinfos.HtmlizerContext;
|
||||
import fr.devinsy.statoolinfos.core.Federation;
|
||||
import fr.devinsy.statoolinfos.core.Organization;
|
||||
import fr.devinsy.statoolinfos.core.Service;
|
||||
import fr.devinsy.statoolinfos.core.StatoolInfosException;
|
||||
import fr.devinsy.statoolinfos.crawl.CrawlCache;
|
||||
import fr.devinsy.statoolinfos.crawl.CrawlJournal;
|
||||
|
@ -56,9 +60,20 @@ public class CrawlJournalPage
|
|||
File htmlizeDirectory = HtmlizerContext.instance().getHtmlizeDirectory();
|
||||
|
||||
logger.info("Htmlize Crawl Journal pages.");
|
||||
CrawlJournal journal = HtmlizerContext.instance().getCrawlJournal();
|
||||
String page = htmlize("Journal des téléchargements", journal);
|
||||
String page = htmlize("Journal des téléchargements", federation.getCrawlJournal());
|
||||
FileUtils.write(new File(htmlizeDirectory, federation.getTechnicalName() + "-crawl.xhtml"), page, StandardCharsets.UTF_8);
|
||||
|
||||
for (Organization organization : federation.getOrganizations())
|
||||
{
|
||||
page = htmlize("Journal des téléchargements de " + organization.getName(), organization.getCrawlJournal());
|
||||
FileUtils.write(new File(htmlizeDirectory, organization.getTechnicalName() + "-crawl.xhtml"), page, StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
for (Service service : federation.getAllServices())
|
||||
{
|
||||
page = htmlize("Journal des téléchargements de " + service.getName(), service.getCrawlJournal());
|
||||
FileUtils.write(new File(htmlizeDirectory, service.getOrganization().getTechnicalName() + "-" + service.getTechnicalName() + "-crawl.xhtml"), page, StandardCharsets.UTF_8);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -83,7 +98,7 @@ public class CrawlJournalPage
|
|||
TagDataManager data = new TagDataManager();
|
||||
|
||||
data.setEscapedContent("title", title);
|
||||
data.setContent("date", journal.getDatetime().toString());
|
||||
// Use "yyyy" (year of era): "YYYY" is the week-based year and shows the wrong year for dates around New Year.
data.setContent("date", journal.getDatetime().format(DateTimeFormatter.ofPattern("dd/MM/yyyy HH:mm")));
|
||||
data.setContent("totalCount", journal.size());
|
||||
data.setContent("errorCount", journal.getErrors().size());
|
||||
|
||||
|
@ -92,6 +107,8 @@ public class CrawlJournalPage
|
|||
{
|
||||
data.setEscapedContent("crawlLogLine", index, "crawlLogLineUrlLink", log.getUrl().toString());
|
||||
data.setEscapedAttribute("crawlLogLine", index, "crawlLogLineUrlLink", "href", log.getUrl().toString());
|
||||
data.setEscapedContent("crawlLogLine", index, "crawlLogLineParentUrlLink", StringUtils.abbreviate(log.getParentUrlValue(), 35));
|
||||
data.setEscapedAttribute("crawlLogLine", index, "crawlLogLineParentUrlLink", "href", StringUtils.defaultString(log.getParentUrlValue(), "#"));
|
||||
data.setContent("crawlLogLine", index, "crawlLogLineStatus", log.getStatus().toString());
|
||||
|
||||
if (log.getStatus().isError())
|
||||
|
|
|
@ -85,8 +85,9 @@ public class FederationPage
|
|||
* @return the string
|
||||
* @throws StatoolInfosException
|
||||
* the statool infos exception
|
||||
* @throws IOException
|
||||
*/
|
||||
public static String htmlize(final Federation federation) throws StatoolInfosException
|
||||
public static String htmlize(final Federation federation) throws StatoolInfosException, IOException
|
||||
{
|
||||
String result;
|
||||
|
||||
|
@ -110,8 +111,7 @@ public class FederationPage
|
|||
data.setAttribute("statsLink", "href", federation.getTechnicalName() + "-stats.xhtml");
|
||||
|
||||
data.setAttribute("crawlLink", "href", federation.getTechnicalName() + "-crawl.xhtml");
|
||||
|
||||
if (HtmlizerContext.instance().getCrawlJournal().getErrors().isEmpty())
|
||||
if (federation.getCrawlJournal().getErrors().isEmpty())
|
||||
{
|
||||
data.setAttribute("crawlLinkImg", "src", "circle-icons/download-mono.svg");
|
||||
}
|
||||
|
|
|
@ -152,6 +152,16 @@ public class OrganizationPage
|
|||
|
||||
data.setAttribute("statsLink", "href", organization.getTechnicalName() + "-stats.xhtml");
|
||||
|
||||
data.setAttribute("crawlLink", "href", organization.getTechnicalName() + "-crawl.xhtml");
|
||||
if (organization.getCrawlJournal().getErrors().isEmpty())
|
||||
{
|
||||
data.setAttribute("crawlLinkImg", "src", "circle-icons/download-mono.svg");
|
||||
}
|
||||
else
|
||||
{
|
||||
data.setAttribute("crawlLinkImg", "src", "circle-icons/download.svg");
|
||||
}
|
||||
|
||||
{
|
||||
PropertyChecks checks = organization.getInputChecksAll();
|
||||
|
||||
|
@ -239,4 +249,5 @@ public class OrganizationPage
|
|||
FileUtils.copyFile(logoFile, target);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -221,6 +221,16 @@ public class ServicePage
|
|||
data.getIdData("softwareSourceLinkImg").getAttribute("class").setMode(DisplayMode.REPLACE);
|
||||
}
|
||||
|
||||
data.setAttribute("crawlLink", "href", service.getOrganization().getTechnicalName() + "-" + service.getTechnicalName() + "-crawl.xhtml");
|
||||
if (service.getCrawlJournal().getErrors().isEmpty())
|
||||
{
|
||||
data.setAttribute("crawlLinkImg", "src", "circle-icons/download-mono.svg");
|
||||
}
|
||||
else
|
||||
{
|
||||
data.setAttribute("crawlLinkImg", "src", "circle-icons/download.svg");
|
||||
}
|
||||
|
||||
{
|
||||
PropertyChecks checks = service.getInputChecks();
|
||||
data.setContent("errorCount", checks.getErrorCount());
|
||||
|
|
|
@ -20,16 +20,18 @@
|
|||
<div>Date : <span id="date">n/a</span></div>
|
||||
</div>
|
||||
<br/>
|
||||
<div class="center_table" style="width: 900px;">
|
||||
<div class="center_table" style="width: 1000px;">
|
||||
<table id="crawlLogs" class="table_classic left">
|
||||
<thead>
|
||||
<tr>
|
||||
<th style="width: 200px;">Parent</th>
|
||||
<th>URL</th>
|
||||
<th style="width: 200px;">Statut</th>
|
||||
<th style="width: 150px;">Statut</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr id="crawlLogLine">
|
||||
<td id="crawlLogLineParentUrl"><a href="#" id="crawlLogLineParentUrlLink">n/a</a></td>
|
||||
<td id="crawlLogLineUrl"><a href="#" id="crawlLogLineUrlLink">n/a</a></td>
|
||||
<td id="crawlLogLineStatus" class="td_center center">n/a</td>
|
||||
</tr>
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
<a id="technicalDocLink" href="#"><img id="technicalDocLinkImg" src="circle-icons/tools.svg" class="disabled" title="Documentation technique"/></a>
|
||||
<a id="rawCheckLink" href="#"><img id="rawCheckLinkImg" src="circle-icons/clipboard-mono.svg" title="Fichier propriétés analysé"/></a>
|
||||
<a id="rawLink" href="#"><img id="rawLinkImg" src="circle-icons/document-mono.svg" title="Fichier propriétés"/></a>
|
||||
<a id="crawlLink" href="#"><img id="crawlLinkImg" src="circle-icons/download-mono.svg" title="Statut des téléchargements"/></a>
|
||||
<a id="statsLink" href="#"><img id="statsLinkImg" src="circle-icons/barchart-mono.svg" title="Statistiques"/></a>
|
||||
<div style="display: inline-block; vertical-align: middle; font-size: smaller; margin-left: 2px; width: 35px;">
|
||||
<a id="alertLink" href="#" style="text-decoration: none;">
|
||||
|
|
|
@ -41,6 +41,7 @@
|
|||
<a id="technicalDocLink" href="#"><img id="technicalDocLinkImg" src="circle-icons/tools.svg" class="disabled" title="Documentation technique"/></a>
|
||||
<a id="rawCheckLink" href="#"><img id="rawCheckLinkImg" src="circle-icons/clipboard-mono.svg" title="Fichier propriétés analysé"/></a>
|
||||
<a id="rawLink" href="#"><img id="rawLinkImg" src="circle-icons/document-mono.svg" title="Fichier propriétés"/></a>
|
||||
<a id="crawlLink" href="#"><img id="crawlLinkImg" src="circle-icons/download-mono.svg" title="Statut des téléchargements"/></a>
|
||||
<a id="statsLink" href="#"><img id="statsLinkImg" src="circle-icons/barchart-mono.svg" title="Statistiques"/></a>
|
||||
<div style="display: inline-block; vertical-align: middle; font-size: smaller; margin-left: 2px; width: 35px;">
|
||||
<a id="alertLink" href="#" style="text-decoration: none;">
|
||||
|
|
|
@ -172,4 +172,43 @@ public final class URLUtils
|
|||
//
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Equals.
|
||||
*
|
||||
* @param alpha
|
||||
* the alpha
|
||||
* @param bravo
|
||||
* the bravo
|
||||
* @return true, if successful
|
||||
*/
|
||||
public static boolean equals(final URL alpha, final URL bravo)
|
||||
{
|
||||
boolean result;
|
||||
|
||||
String alphaValue;
|
||||
if (alpha == null)
|
||||
{
|
||||
alphaValue = null;
|
||||
}
|
||||
else
|
||||
{
|
||||
alphaValue = alpha.toString();
|
||||
}
|
||||
|
||||
String bravoValue;
|
||||
if (bravo == null)
|
||||
{
|
||||
bravoValue = null;
|
||||
}
|
||||
else
|
||||
{
|
||||
bravoValue = bravo.toString();
|
||||
}
|
||||
|
||||
result = StringUtils.equals(alphaValue, bravoValue);
|
||||
|
||||
//
|
||||
return result;
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue