Improved handling of the empty crawled file case.
parent 80fce490ce
commit 51a46a5ed4

2 changed files with 51 additions and 39 deletions
@@ -394,16 +394,26 @@ public class CrawlCache
         FileUtils.copyURLToFile(url, temp, TIMEOUT, TIMEOUT);
         if (temp.length() == 0)
         {
-            logger.warn("WARNING: empty file crawled for [{}]", url);
+            if (result.exists())
+            {
+                logger.warn("WARNING: empty file crawled and ignored for [{}]", url);
+                result = null;
+            }
+            else
+            {
+                logger.warn("WARNING: empty file crawled and copied for [{}]", url);
+                FileUtils.copyFile(temp, result);
+            }
         }
         else
         {
-            temp.renameTo(result);
+            FileUtils.copyFile(temp, result);
         }
         temp.delete();
     }
     else
     {
+        logger.warn("WARNING: crawl failed because bad http+ protocol for [{}]", url);
         result = null;
     }
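For readers outside the codebase, a compact, self-contained sketch of the behaviour this hunk introduces: download into a temporary file, and never let an empty download overwrite an already cached result. The class name EmptyCrawlSketch, the fetchToCache method and the TIMEOUT value are hypothetical stand-ins (only org.apache.commons.io.FileUtils is a real dependency of the code above); the control flow mirrors the new branch, it is not the project's actual method.

import java.io.File;
import java.io.IOException;
import java.net.URL;

import org.apache.commons.io.FileUtils;

public final class EmptyCrawlSketch
{
    // Hypothetical timeout in milliseconds; CrawlCache defines its own TIMEOUT constant.
    private static final int TIMEOUT = 30000;

    /**
     * Downloads url into a temporary file and then decides what to keep,
     * following the branch added by this commit: an empty download never
     * overwrites an existing cached file.
     */
    public static File fetchToCache(URL url, File cached) throws IOException
    {
        File temp = File.createTempFile("crawl-", ".tmp");
        try
        {
            FileUtils.copyURLToFile(url, temp, TIMEOUT, TIMEOUT);

            if (temp.length() == 0)
            {
                if (cached.exists())
                {
                    // A previous crawl is already cached: ignore the empty download.
                    return null;
                }

                // Nothing cached yet: keep the empty file so the cache entry exists.
                FileUtils.copyFile(temp, cached);
                return cached;
            }

            // Normal case: copy (rather than rename) the download into the cache.
            FileUtils.copyFile(temp, cached);
            return cached;
        }
        finally
        {
            FileUtils.deleteQuietly(temp);
        }
    }
}

Returning null for an ignored empty crawl matches the hunk's result = null. The move from File.renameTo to FileUtils.copyFile looks like a hardening step as well: renameTo reports failure only through its boolean return value and can fail when the temporary and cache directories sit on different filesystems, whereas copyFile throws an IOException.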
@@ -168,46 +168,48 @@ public class Crawler
 
         // Crawl.
         File file = cache.store(url);
-
+        if (file != null)
+        {
             // Build crawl data.
             PathProperties crawlSection = new PathPropertyList();
             crawlSection.put("crawl.crawler", "StatoolInfos");
             crawlSection.put("crawl.datetime", LocalDateTime.now().toString());
             crawlSection.put("crawl.url", url.toString());
             crawlSection.put("crawl.file.size", FileUtils.sizeOf(file));
             crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString());
             crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file)));
 
             // Add crawl data in crawled file.
             String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n');
             FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8);
 
             // Crawl another resources.
             PathProperties properties = PathPropertyUtils.load(file);
 
             cache.storeQuietly(properties.getURL("organization.logo"));
             cache.storeQuietly(properties.getURL("service.logo"));
 
             // Crawl subs.
             PathProperties subs = properties.getByPrefix("subs");
             for (PathProperty property : subs)
             {
                 if (StringUtils.isNotBlank(property.getValue()))
                 {
                     try
                     {
                         URL subUrl = new URL(property.getValue());
                         crawl(subUrl, cache);
                     }
                     catch (java.net.MalformedURLException exception)
                     {
                         logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage());
                         exception.printStackTrace();
                     }
                     catch (java.net.ConnectException | FileNotFoundException exception)
                     {
                         logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage());
                         exception.printStackTrace();
                     }
                 }
             }
+        }
     }
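The substantive change in this hunk is the new if (file != null) guard: since CrawlCache.store can now return null (see the hunk above), building the crawl.* metadata and following the subs entries must be skipped when nothing was cached. A minimal sketch of that guarded, per-entry fault-tolerant recursion, with hypothetical stand-ins (store and subValues below are placeholders for CrawlCache.store and the PathProperties "subs" lookup, not the project's API):

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.ConnectException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.List;

public final class SubCrawlSketch
{
    // Placeholder for CrawlCache.store(url): may return null when the crawl
    // produced nothing usable (for example an ignored empty download).
    static File store(URL url) throws IOException
    {
        return null;
    }

    // Placeholder for reading the "subs" entries out of the crawled properties file.
    static List<String> subValues(File file)
    {
        return List.of();
    }

    static void crawl(URL url) throws IOException
    {
        File file = store(url);
        if (file == null)
        {
            // The commit's new guard: without a crawled file there is no metadata
            // to append and no subs section to follow.
            return;
        }

        for (String value : subValues(file))
        {
            if (value == null || value.isBlank())
            {
                continue;
            }

            try
            {
                // Each sub entry is parsed and crawled inside its own try/catch,
                // so one malformed URL or unreachable host only skips that entry.
                URL subUrl = new URL(value);
                crawl(subUrl);
            }
            catch (MalformedURLException | ConnectException | FileNotFoundException exception)
            {
                System.err.printf("subcrawl failed for [%s][%s]: %s%n", url, value, exception.getMessage());
            }
        }
    }
}

The real method keeps the MalformedURLException and ConnectException/FileNotFoundException handlers separate and logs through its SLF4J logger; the sketch merges them only to stay short.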