From 51a46a5ed494901354dae7363c60b3523b16c419 Mon Sep 17 00:00:00 2001 From: "Christian P. MOMON" Date: Tue, 12 Jan 2021 00:24:26 +0100 Subject: [PATCH] Improved empty crawled file case. --- .../statoolinfos/crawl/CrawlCache.java | 14 +++- .../devinsy/statoolinfos/crawl/Crawler.java | 76 ++++++++++--------- 2 files changed, 51 insertions(+), 39 deletions(-) diff --git a/src/fr/devinsy/statoolinfos/crawl/CrawlCache.java b/src/fr/devinsy/statoolinfos/crawl/CrawlCache.java index e83c6d1..ec6193e 100644 --- a/src/fr/devinsy/statoolinfos/crawl/CrawlCache.java +++ b/src/fr/devinsy/statoolinfos/crawl/CrawlCache.java @@ -394,16 +394,26 @@ public class CrawlCache FileUtils.copyURLToFile(url, temp, TIMEOUT, TIMEOUT); if (temp.length() == 0) { - logger.warn("WARNING: empty file crawled for [{}]", url); + if (result.exists()) + { + logger.warn("WARNING: empty file crawled and ignored for [{}]", url); + result = null; + } + else + { + logger.warn("WARNING: empty file crawled and copied for [{}]", url); + FileUtils.copyFile(temp, result); + } } else { - temp.renameTo(result); + FileUtils.copyFile(temp, result); } temp.delete(); } else { + logger.warn("WARNING: crawl failed because bad http+ protocol for [{}]", url); result = null; } diff --git a/src/fr/devinsy/statoolinfos/crawl/Crawler.java b/src/fr/devinsy/statoolinfos/crawl/Crawler.java index 5d055cc..bd30884 100644 --- a/src/fr/devinsy/statoolinfos/crawl/Crawler.java +++ b/src/fr/devinsy/statoolinfos/crawl/Crawler.java @@ -168,46 +168,48 @@ public class Crawler // Crawl. File file = cache.store(url); - - // Build crawl data. - PathProperties crawlSection = new PathPropertyList(); - crawlSection.put("crawl.crawler", "StatoolInfos"); - crawlSection.put("crawl.datetime", LocalDateTime.now().toString()); - crawlSection.put("crawl.url", url.toString()); - crawlSection.put("crawl.file.size", FileUtils.sizeOf(file)); - crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString()); - crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file))); - - // Add crawl data in crawled file. - String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n'); - FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8); - - // Crawl another resources. - PathProperties properties = PathPropertyUtils.load(file); - - cache.storeQuietly(properties.getURL("organization.logo")); - cache.storeQuietly(properties.getURL("service.logo")); - - // Crawl subs. - PathProperties subs = properties.getByPrefix("subs"); - for (PathProperty property : subs) + if (file != null) { - if (StringUtils.isNotBlank(property.getValue())) + // Build crawl data. + PathProperties crawlSection = new PathPropertyList(); + crawlSection.put("crawl.crawler", "StatoolInfos"); + crawlSection.put("crawl.datetime", LocalDateTime.now().toString()); + crawlSection.put("crawl.url", url.toString()); + crawlSection.put("crawl.file.size", FileUtils.sizeOf(file)); + crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString()); + crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file))); + + // Add crawl data in crawled file. + String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n'); + FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8); + + // Crawl another resources. + PathProperties properties = PathPropertyUtils.load(file); + + cache.storeQuietly(properties.getURL("organization.logo")); + cache.storeQuietly(properties.getURL("service.logo")); + + // Crawl subs. + PathProperties subs = properties.getByPrefix("subs"); + for (PathProperty property : subs) { - try + if (StringUtils.isNotBlank(property.getValue())) { - URL subUrl = new URL(property.getValue()); - crawl(subUrl, cache); - } - catch (java.net.MalformedURLException exception) - { - logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage()); - exception.printStackTrace(); - } - catch (java.net.ConnectException | FileNotFoundException exception) - { - logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage()); - exception.printStackTrace(); + try + { + URL subUrl = new URL(property.getValue()); + crawl(subUrl, cache); + } + catch (java.net.MalformedURLException exception) + { + logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage()); + exception.printStackTrace(); + } + catch (java.net.ConnectException | FileNotFoundException exception) + { + logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage()); + exception.printStackTrace(); + } } } }