Improved empty crawled file case.
This commit is contained in:
parent
80fce490ce
commit
51a46a5ed4
2 changed files with 51 additions and 39 deletions
|
@ -394,16 +394,26 @@ public class CrawlCache
|
|||
FileUtils.copyURLToFile(url, temp, TIMEOUT, TIMEOUT);
|
||||
if (temp.length() == 0)
|
||||
{
|
||||
logger.warn("WARNING: empty file crawled for [{}]", url);
|
||||
if (result.exists())
|
||||
{
|
||||
logger.warn("WARNING: empty file crawled and ignored for [{}]", url);
|
||||
result = null;
|
||||
}
|
||||
else
|
||||
{
|
||||
temp.renameTo(result);
|
||||
logger.warn("WARNING: empty file crawled and copied for [{}]", url);
|
||||
FileUtils.copyFile(temp, result);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
FileUtils.copyFile(temp, result);
|
||||
}
|
||||
temp.delete();
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.warn("WARNING: crawl failed because bad http+ protocol for [{}]", url);
|
||||
result = null;
|
||||
}
|
||||
|
||||
|
|
|
@ -168,7 +168,8 @@ public class Crawler
|
|||
|
||||
// Crawl.
|
||||
File file = cache.store(url);
|
||||
|
||||
if (file != null)
|
||||
{
|
||||
// Build crawl data.
|
||||
PathProperties crawlSection = new PathPropertyList();
|
||||
crawlSection.put("crawl.crawler", "StatoolInfos");
|
||||
|
@ -212,4 +213,5 @@ public class Crawler
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue