Improved handling of the empty crawled file case.

This commit is contained in:
Christian P. MOMON 2021-01-12 00:24:26 +01:00
parent 80fce490ce
commit 51a46a5ed4
2 changed files with 51 additions and 39 deletions

View file

@@ -394,16 +394,26 @@ public class CrawlCache
FileUtils.copyURLToFile(url, temp, TIMEOUT, TIMEOUT); FileUtils.copyURLToFile(url, temp, TIMEOUT, TIMEOUT);
if (temp.length() == 0) if (temp.length() == 0)
{ {
logger.warn("WARNING: empty file crawled for [{}]", url); if (result.exists())
{
logger.warn("WARNING: empty file crawled and ignored for [{}]", url);
result = null;
} }
else else
{ {
temp.renameTo(result); logger.warn("WARNING: empty file crawled and copied for [{}]", url);
FileUtils.copyFile(temp, result);
}
}
else
{
FileUtils.copyFile(temp, result);
} }
temp.delete(); temp.delete();
} }
else else
{ {
logger.warn("WARNING: crawl failed because bad http+ protocol for [{}]", url);
result = null; result = null;
} }

View file

@@ -168,7 +168,8 @@ public class Crawler
// Crawl. // Crawl.
File file = cache.store(url); File file = cache.store(url);
if (file != null)
{
// Build crawl data. // Build crawl data.
PathProperties crawlSection = new PathPropertyList(); PathProperties crawlSection = new PathPropertyList();
crawlSection.put("crawl.crawler", "StatoolInfos"); crawlSection.put("crawl.crawler", "StatoolInfos");
@@ -213,3 +214,4 @@ public class Crawler
} }
} }
} }
}