diff --git a/src/fr/devinsy/statoolinfos/core/Factory.java b/src/fr/devinsy/statoolinfos/core/Factory.java index bd5841d..e81ccf7 100644 --- a/src/fr/devinsy/statoolinfos/core/Factory.java +++ b/src/fr/devinsy/statoolinfos/core/Factory.java @@ -173,11 +173,7 @@ public class Factory { URL inputURL = new URL(property.getValue()); Organization organization = loadOrganization(inputURL, cache); - if (organization == null) - { - logger.error("Loading organization failed for [{}]", property.getValue()); - } - else + if (organization != null) { result.getOrganizations().add(organization); } @@ -243,6 +239,7 @@ public class Factory if (inputFile == null) { result = null; + logger.warn("WARNING: organization not found in cache [{}]", inputURL); } else { diff --git a/src/fr/devinsy/statoolinfos/crawl/CrawlCache.java b/src/fr/devinsy/statoolinfos/crawl/CrawlCache.java index 01c9f7f..ef2bd3f 100644 --- a/src/fr/devinsy/statoolinfos/crawl/CrawlCache.java +++ b/src/fr/devinsy/statoolinfos/crawl/CrawlCache.java @@ -487,7 +487,7 @@ public class CrawlCache } catch (IOException exception) { - logger.info("Store faile for {}: {}", url, exception.getMessage()); + logger.info("Store failed for {}: {}", url, exception.getMessage()); result = null; } diff --git a/src/fr/devinsy/statoolinfos/crawl/Crawler.java b/src/fr/devinsy/statoolinfos/crawl/Crawler.java index b44c5eb..5d055cc 100644 --- a/src/fr/devinsy/statoolinfos/crawl/Crawler.java +++ b/src/fr/devinsy/statoolinfos/crawl/Crawler.java @@ -110,8 +110,24 @@ public class Crawler PathProperties subs = input.getByPrefix("subs"); for (PathProperty property : subs) { - URL url = new URL(property.getValue()); - crawl(url, cache); + if (StringUtils.isNotBlank(property.getValue())) + { + try + { + URL subUrl = new URL(property.getValue()); + crawl(subUrl, cache); + } + catch (java.net.MalformedURLException exception) + { + logger.error("ERROR: subcrawl failed for [{}][{}]: {}", property.getPath(), property.getValue(), exception.getMessage()); + exception.printStackTrace(); + } + catch (java.net.ConnectException | FileNotFoundException exception) + { + logger.error("ERROR: subcrawl failed for [{}][{}]: {}", property.getPath(), property.getValue(), exception.getMessage()); + exception.printStackTrace(); + } + } } } @@ -148,47 +164,52 @@ public class Crawler */ public static void crawl(final URL url, final CrawlCache cache) throws StatoolInfosException, IOException { - try + logger.info("Crawling " + url); + + // Crawl. + File file = cache.store(url); + + // Build crawl data. + PathProperties crawlSection = new PathPropertyList(); + crawlSection.put("crawl.crawler", "StatoolInfos"); + crawlSection.put("crawl.datetime", LocalDateTime.now().toString()); + crawlSection.put("crawl.url", url.toString()); + crawlSection.put("crawl.file.size", FileUtils.sizeOf(file)); + crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString()); + crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file))); + + // Add crawl data in crawled file. + String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n'); + FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8); + + // Crawl another resources. + PathProperties properties = PathPropertyUtils.load(file); + + cache.storeQuietly(properties.getURL("organization.logo")); + cache.storeQuietly(properties.getURL("service.logo")); + + // Crawl subs. + PathProperties subs = properties.getByPrefix("subs"); + for (PathProperty property : subs) { - logger.info("Crawling " + url); - - // Crawl. - File file = cache.store(url); - - // Build crawl data. - PathProperties crawlSection = new PathPropertyList(); - crawlSection.put("crawl.crawler", "StatoolInfos"); - crawlSection.put("crawl.datetime", LocalDateTime.now().toString()); - crawlSection.put("crawl.url", url.toString()); - crawlSection.put("crawl.file.size", FileUtils.sizeOf(file)); - crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString()); - crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file))); - - // Add crawl data in crawled file. - String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n'); - FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8); - - // Crawl another resources. - PathProperties properties = PathPropertyUtils.load(file); - - cache.storeQuietly(properties.getURL("organization.logo")); - cache.storeQuietly(properties.getURL("service.logo")); - - // Crawl subs. - PathProperties subs = properties.getByPrefix("subs"); - for (PathProperty property : subs) + if (StringUtils.isNotBlank(property.getValue())) { - if (StringUtils.isNotBlank(property.getValue())) + try { URL subUrl = new URL(property.getValue()); crawl(subUrl, cache); } + catch (java.net.MalformedURLException exception) + { + logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage()); + exception.printStackTrace(); + } + catch (java.net.ConnectException | FileNotFoundException exception) + { + logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage()); + exception.printStackTrace(); + } } } - catch (java.net.ConnectException | FileNotFoundException exception) - { - logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage()); - exception.printStackTrace(); - } } }