Improved handling of the empty crawled file case.

This commit is contained in:
Christian P. MOMON 2021-01-12 00:24:26 +01:00
parent 80fce490ce
commit 51a46a5ed4
2 changed files with 51 additions and 39 deletions

View file

@@ -394,16 +394,26 @@ public class CrawlCache
FileUtils.copyURLToFile(url, temp, TIMEOUT, TIMEOUT);
if (temp.length() == 0)
{
logger.warn("WARNING: empty file crawled for [{}]", url);
if (result.exists())
{
logger.warn("WARNING: empty file crawled and ignored for [{}]", url);
result = null;
}
else
{
temp.renameTo(result);
logger.warn("WARNING: empty file crawled and copied for [{}]", url);
FileUtils.copyFile(temp, result);
}
}
else
{
FileUtils.copyFile(temp, result);
}
temp.delete();
}
else
{
logger.warn("WARNING: crawl failed because bad http+ protocol for [{}]", url);
result = null;
}

View file

@@ -168,7 +168,8 @@ public class Crawler
// Crawl.
File file = cache.store(url);
if (file != null)
{
// Build crawl data.
PathProperties crawlSection = new PathPropertyList();
crawlSection.put("crawl.crawler", "StatoolInfos");
@@ -212,4 +213,5 @@ public class Crawler
}
}
}
}
}