Improved handling of the empty crawled file case.

This commit is contained in:
Christian P. MOMON 2021-01-12 00:24:26 +01:00
parent 80fce490ce
commit 51a46a5ed4
2 changed files with 51 additions and 39 deletions

View file

@ -394,16 +394,26 @@ public class CrawlCache
FileUtils.copyURLToFile(url, temp, TIMEOUT, TIMEOUT);
if (temp.length() == 0)
{
logger.warn("WARNING: empty file crawled for [{}]", url);
if (result.exists())
{
logger.warn("WARNING: empty file crawled and ignored for [{}]", url);
result = null;
}
else
{
logger.warn("WARNING: empty file crawled and copied for [{}]", url);
FileUtils.copyFile(temp, result);
}
}
else
{
temp.renameTo(result);
FileUtils.copyFile(temp, result);
}
temp.delete();
}
else
{
logger.warn("WARNING: crawl failed because bad http+ protocol for [{}]", url);
result = null;
}

View file

@ -168,46 +168,48 @@ public class Crawler
// Crawl.
File file = cache.store(url);
// Build crawl data.
PathProperties crawlSection = new PathPropertyList();
crawlSection.put("crawl.crawler", "StatoolInfos");
crawlSection.put("crawl.datetime", LocalDateTime.now().toString());
crawlSection.put("crawl.url", url.toString());
crawlSection.put("crawl.file.size", FileUtils.sizeOf(file));
crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString());
crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file)));
// Add crawl data in crawled file.
String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n');
FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8);
// Crawl another resources.
PathProperties properties = PathPropertyUtils.load(file);
cache.storeQuietly(properties.getURL("organization.logo"));
cache.storeQuietly(properties.getURL("service.logo"));
// Crawl subs.
PathProperties subs = properties.getByPrefix("subs");
for (PathProperty property : subs)
if (file != null)
{
if (StringUtils.isNotBlank(property.getValue()))
// Build crawl data.
PathProperties crawlSection = new PathPropertyList();
crawlSection.put("crawl.crawler", "StatoolInfos");
crawlSection.put("crawl.datetime", LocalDateTime.now().toString());
crawlSection.put("crawl.url", url.toString());
crawlSection.put("crawl.file.size", FileUtils.sizeOf(file));
crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString());
crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file)));
// Add crawl data in crawled file.
String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n');
FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8);
// Crawl another resources.
PathProperties properties = PathPropertyUtils.load(file);
cache.storeQuietly(properties.getURL("organization.logo"));
cache.storeQuietly(properties.getURL("service.logo"));
// Crawl subs.
PathProperties subs = properties.getByPrefix("subs");
for (PathProperty property : subs)
{
try
if (StringUtils.isNotBlank(property.getValue()))
{
URL subUrl = new URL(property.getValue());
crawl(subUrl, cache);
}
catch (java.net.MalformedURLException exception)
{
logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage());
exception.printStackTrace();
}
catch (java.net.ConnectException | FileNotFoundException exception)
{
logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage());
exception.printStackTrace();
try
{
URL subUrl = new URL(property.getValue());
crawl(subUrl, cache);
}
catch (java.net.MalformedURLException exception)
{
logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage());
exception.printStackTrace();
}
catch (java.net.ConnectException | FileNotFoundException exception)
{
logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage());
exception.printStackTrace();
}
}
}
}