Improved handling of the empty crawled file case.

This commit is contained in:
Christian P. MOMON 2021-01-12 00:24:26 +01:00
parent 80fce490ce
commit 51a46a5ed4
2 changed files with 51 additions and 39 deletions

View file

@ -394,16 +394,26 @@ public class CrawlCache
FileUtils.copyURLToFile(url, temp, TIMEOUT, TIMEOUT); FileUtils.copyURLToFile(url, temp, TIMEOUT, TIMEOUT);
if (temp.length() == 0) if (temp.length() == 0)
{ {
logger.warn("WARNING: empty file crawled for [{}]", url); if (result.exists())
{
logger.warn("WARNING: empty file crawled and ignored for [{}]", url);
result = null;
}
else
{
logger.warn("WARNING: empty file crawled and copied for [{}]", url);
FileUtils.copyFile(temp, result);
}
} }
else else
{ {
temp.renameTo(result); FileUtils.copyFile(temp, result);
} }
temp.delete(); temp.delete();
} }
else else
{ {
logger.warn("WARNING: crawl failed because bad http+ protocol for [{}]", url);
result = null; result = null;
} }

View file

@ -168,46 +168,48 @@ public class Crawler
// Crawl. // Crawl.
File file = cache.store(url); File file = cache.store(url);
if (file != null)
// Build crawl data.
PathProperties crawlSection = new PathPropertyList();
crawlSection.put("crawl.crawler", "StatoolInfos");
crawlSection.put("crawl.datetime", LocalDateTime.now().toString());
crawlSection.put("crawl.url", url.toString());
crawlSection.put("crawl.file.size", FileUtils.sizeOf(file));
crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString());
crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file)));
// Add crawl data in crawled file.
String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n');
FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8);
// Crawl another resources.
PathProperties properties = PathPropertyUtils.load(file);
cache.storeQuietly(properties.getURL("organization.logo"));
cache.storeQuietly(properties.getURL("service.logo"));
// Crawl subs.
PathProperties subs = properties.getByPrefix("subs");
for (PathProperty property : subs)
{ {
if (StringUtils.isNotBlank(property.getValue())) // Build crawl data.
PathProperties crawlSection = new PathPropertyList();
crawlSection.put("crawl.crawler", "StatoolInfos");
crawlSection.put("crawl.datetime", LocalDateTime.now().toString());
crawlSection.put("crawl.url", url.toString());
crawlSection.put("crawl.file.size", FileUtils.sizeOf(file));
crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString());
crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file)));
// Add crawl data in crawled file.
String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n');
FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8);
// Crawl another resources.
PathProperties properties = PathPropertyUtils.load(file);
cache.storeQuietly(properties.getURL("organization.logo"));
cache.storeQuietly(properties.getURL("service.logo"));
// Crawl subs.
PathProperties subs = properties.getByPrefix("subs");
for (PathProperty property : subs)
{ {
try if (StringUtils.isNotBlank(property.getValue()))
{ {
URL subUrl = new URL(property.getValue()); try
crawl(subUrl, cache); {
} URL subUrl = new URL(property.getValue());
catch (java.net.MalformedURLException exception) crawl(subUrl, cache);
{ }
logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage()); catch (java.net.MalformedURLException exception)
exception.printStackTrace(); {
} logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage());
catch (java.net.ConnectException | FileNotFoundException exception) exception.printStackTrace();
{ }
logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage()); catch (java.net.ConnectException | FileNotFoundException exception)
exception.printStackTrace(); {
logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage());
exception.printStackTrace();
}
} }
} }
} }