Improved logs and sub error management.
commit a653627030
parent 35bddb9f26

3 changed files with 60 additions and 42 deletions
@@ -173,11 +173,7 @@ public class Factory
         {
             URL inputURL = new URL(property.getValue());
             Organization organization = loadOrganization(inputURL, cache);
-            if (organization == null)
-            {
-                logger.error("Loading organization failed for [{}]", property.getValue());
-            }
-            else
+            if (organization != null)
             {
                 result.getOrganizations().add(organization);
             }
@@ -243,6 +239,7 @@ public class Factory
         if (inputFile == null)
         {
             result = null;
+            logger.warn("WARNING: organization not found in cache [{}]", inputURL);
         }
         else
         {
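Taken together, the two Factory hunks move the failure report next to the cache lookup: the caller now keeps only the non-null path, and a null return has already been logged where the miss was detected. A minimal self-contained sketch of that division of responsibility (all names besides loadOrganization are hypothetical stand-ins for the real Factory types):

import java.net.URL;
import java.util.ArrayList;
import java.util.List;

public class FactorySketch
{
    /** Stand-in for Factory.loadOrganization: reports its own failure and returns null. */
    static String loadOrganization(final URL inputURL)
    {
        // Simulated cache miss: the warning is logged here, at the source.
        System.out.println("WARNING: organization not found in cache [" + inputURL + "]");
        return null;
    }

    public static void main(String[] args) throws Exception
    {
        List<String> organizations = new ArrayList<>();

        URL inputURL = new URL("https://example.org/organization.properties");
        String organization = loadOrganization(inputURL);

        // The caller keeps only the happy path, as in the first hunk.
        if (organization != null)
        {
            organizations.add(organization);
        }

        System.out.println("organizations loaded: " + organizations.size());
    }
}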
@@ -487,7 +487,7 @@ public class CrawlCache
         }
         catch (IOException exception)
         {
-            logger.info("Store faile for {}: {}", url, exception.getMessage());
+            logger.info("Store failed for {}: {}", url, exception.getMessage());
             result = null;
         }
@@ -110,8 +110,24 @@ public class Crawler
         PathProperties subs = input.getByPrefix("subs");
         for (PathProperty property : subs)
         {
-            URL url = new URL(property.getValue());
-            crawl(url, cache);
+            if (StringUtils.isNotBlank(property.getValue()))
+            {
+                try
+                {
+                    URL subUrl = new URL(property.getValue());
+                    crawl(subUrl, cache);
+                }
+                catch (java.net.MalformedURLException exception)
+                {
+                    logger.error("ERROR: subcrawl failed for [{}][{}]: {}", property.getPath(), property.getValue(), exception.getMessage());
+                    exception.printStackTrace();
+                }
+                catch (java.net.ConnectException | FileNotFoundException exception)
+                {
+                    logger.error("ERROR: subcrawl failed for [{}][{}]: {}", property.getPath(), property.getValue(), exception.getMessage());
+                    exception.printStackTrace();
+                }
+            }
         }
     }
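The two catch blocks in this hunk have identical bodies; since Java 7 they could be folded into one multi-catch (the three exception types are unrelated siblings under IOException, so the syntax is legal). A self-contained sketch of that equivalent consolidation, using stand-ins for the project's crawl and logging calls:

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.ConnectException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

public class SubCrawlDemo
{
    /** Stand-in for Crawler.crawl(URL, CrawlCache). */
    static void crawl(final URL url) throws IOException
    {
        System.out.println("crawling " + url);
    }

    public static void main(String[] args) throws IOException
    {
        // Hypothetical sub entries; the second one is malformed on purpose.
        List<String> subs = List.of("https://example.org/subs/a.properties", "not a url");

        for (String value : subs)
        {
            // Plain-Java equivalent of StringUtils.isNotBlank(value).
            if (value != null && !value.isBlank())
            {
                try
                {
                    URL subUrl = new URL(value);
                    crawl(subUrl);
                }
                catch (MalformedURLException | ConnectException | FileNotFoundException exception)
                {
                    // One handler instead of two duplicates: log and continue with the next sub.
                    // Any other IOException still propagates, as in the committed code.
                    System.err.println("ERROR: subcrawl failed for [" + value + "]: " + exception.getMessage());
                }
            }
        }
    }
}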
@@ -148,47 +164,52 @@ public class Crawler
      */
     public static void crawl(final URL url, final CrawlCache cache) throws StatoolInfosException, IOException
     {
-        logger.info("Crawling " + url);
-
-        // Crawl.
-        File file = cache.store(url);
-
-        // Build crawl data.
-        PathProperties crawlSection = new PathPropertyList();
-        crawlSection.put("crawl.crawler", "StatoolInfos");
-        crawlSection.put("crawl.datetime", LocalDateTime.now().toString());
-        crawlSection.put("crawl.url", url.toString());
-        crawlSection.put("crawl.file.size", FileUtils.sizeOf(file));
-        crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString());
-        crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file)));
-
-        // Add crawl data in crawled file.
-        String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n');
-        FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8);
-
-        // Crawl another resources.
-        PathProperties properties = PathPropertyUtils.load(file);
-
-        cache.storeQuietly(properties.getURL("organization.logo"));
-        cache.storeQuietly(properties.getURL("service.logo"));
-
-        // Crawl subs.
-        PathProperties subs = properties.getByPrefix("subs");
-        for (PathProperty property : subs)
-        {
-            if (StringUtils.isNotBlank(property.getValue()))
-            {
-                URL subUrl = new URL(property.getValue());
-                crawl(subUrl, cache);
-            }
-        }
+        try
+        {
+            logger.info("Crawling " + url);
+
+            // Crawl.
+            File file = cache.store(url);
+
+            // Build crawl data.
+            PathProperties crawlSection = new PathPropertyList();
+            crawlSection.put("crawl.crawler", "StatoolInfos");
+            crawlSection.put("crawl.datetime", LocalDateTime.now().toString());
+            crawlSection.put("crawl.url", url.toString());
+            crawlSection.put("crawl.file.size", FileUtils.sizeOf(file));
+            crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString());
+            crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file)));
+
+            // Add crawl data in crawled file.
+            String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n');
+            FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8);
+
+            // Crawl another resources.
+            PathProperties properties = PathPropertyUtils.load(file);
+
+            cache.storeQuietly(properties.getURL("organization.logo"));
+            cache.storeQuietly(properties.getURL("service.logo"));
+
+            // Crawl subs.
+            PathProperties subs = properties.getByPrefix("subs");
+            for (PathProperty property : subs)
+            {
+                if (StringUtils.isNotBlank(property.getValue()))
+                {
+                    try
+                    {
+                        URL subUrl = new URL(property.getValue());
+                        crawl(subUrl, cache);
+                    }
+                    catch (java.net.MalformedURLException exception)
+                    {
+                        logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage());
+                        exception.printStackTrace();
+                    }
+                    catch (java.net.ConnectException | FileNotFoundException exception)
+                    {
+                        logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage());
+                        exception.printStackTrace();
+                    }
+                }
+            }
+        }
+        catch (java.net.ConnectException | FileNotFoundException exception)
+        {
+            logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage());
+            exception.printStackTrace();
+        }
+    }
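The outer try/catch added here gives each crawl two containment layers: a bad sub URL is absorbed inside the loop, and a refused connection or missing remote file on the current URL is absorbed at the method boundary, so a recursive parent crawl keeps going. Both caught types extend IOException, so anything else I/O-related (an UnknownHostException, say) still escapes through the method's throws clause. A small self-contained demonstration of that layering, with stand-in names:

import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.ConnectException;
import java.net.UnknownHostException;

public class ContainmentDemo
{
    /** Stand-in for the crawl body: fails differently depending on the input. */
    static void crawlBody(final String url) throws IOException
    {
        if (url.contains("refused"))
        {
            throw new ConnectException("Connection refused");
        }
        if (url.contains("missing"))
        {
            throw new FileNotFoundException(url);
        }
        if (url.contains("nohost"))
        {
            throw new UnknownHostException(url);
        }
        System.out.println("crawled " + url);
    }

    /** Mirrors the method-level catch added in this hunk. */
    static void crawl(final String url) throws IOException
    {
        try
        {
            crawlBody(url);
        }
        catch (ConnectException | FileNotFoundException exception)
        {
            // Logged and contained: the caller's loop keeps going.
            System.err.println("ERROR: crawl failed for [" + url + "]: " + exception.getMessage());
        }
    }

    public static void main(String[] args) throws IOException
    {
        crawl("https://refused.example");  // contained
        crawl("https://missing.example");  // contained
        crawl("https://ok.example");       // succeeds
        crawl("https://nohost.example");   // UnknownHostException still escapes
    }
}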