Added URL connection error management.
parent 2166819c24
commit 0a5fba2ea5
3 changed files with 84 additions and 53 deletions

@@ -173,7 +173,14 @@ public class Factory
             {
                 URL inputURL = new URL(property.getValue());
                 Organization organization = loadOrganization(inputURL, cache);
-                result.getOrganizations().add(organization);
+                if (organization == null)
+                {
+                    logger.error("Loading organization failed for [{}]", property.getValue());
+                }
+                else
+                {
+                    result.getOrganizations().add(organization);
+                }
             }
         }
 
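
The Factory change above makes the caller tolerate a failed organization load instead of adding null to the federation. Below is a minimal, self-contained sketch of the same guard-and-log pattern; the names are hypothetical stand-ins for Factory.loadOrganization() and the SLF4J logger used in the patch.

import java.util.ArrayList;
import java.util.List;

public class NullGuardSketch
{
    // Stands in for Factory.loadOrganization(inputURL, cache); returns null on failure.
    static String loadOrganization(String value)
    {
        return value.contains("unreachable") ? null : "organization@" + value;
    }

    public static void main(String[] args)
    {
        List<String> organizations = new ArrayList<>();
        for (String value : new String[] { "https://example.org/org.properties",
                "https://unreachable.invalid/org.properties" })
        {
            String organization = loadOrganization(value);
            if (organization == null)
            {
                // The patch logs logger.error("Loading organization failed for [{}]", value).
                System.err.println("Loading organization failed for [" + value + "]");
            }
            else
            {
                organizations.add(organization);
            }
        }
        System.out.println(organizations);
    }
}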

@@ -233,22 +240,29 @@ public class Factory
 
         File inputFile = cache.restoreFile(inputURL);
 
-        PathProperties properties = PathPropertyUtils.load(inputFile);
-        result = new Organization(properties);
-        result.setInputFile(inputFile);
-        result.setInputURL(inputURL);
-        result.setLogoFileName(result.getTechnicalName() + "-logo" + StringUtils.defaultIfBlank(cache.getExtension(result.getLogoURL()), ".png"));
-
-        PathProperties subs = result.getByPrefix("subs");
-        for (PathProperty property : subs)
+        if (inputFile == null)
         {
-            if (StringUtils.startsWith(property.getValue(), "http"))
+            result = null;
+        }
+        else
+        {
+            PathProperties properties = PathPropertyUtils.load(inputFile);
+            result = new Organization(properties);
+            result.setInputFile(inputFile);
+            result.setInputURL(inputURL);
+            result.setLogoFileName(result.getTechnicalName() + "-logo" + StringUtils.defaultIfBlank(cache.getExtension(result.getLogoURL()), ".png"));
+
+            PathProperties subs = result.getByPrefix("subs");
+            for (PathProperty property : subs)
             {
-                URL serviceInputURL = new URL(property.getValue());
-                Service service = loadService(serviceInputURL, cache);
-                service.setOrganization(result);
-                service.setLogoFileName(result.getTechnicalName() + "-" + service.getLogoFileName());
-                result.getServices().add(service);
+                if (StringUtils.startsWith(property.getValue(), "http"))
+                {
+                    URL serviceInputURL = new URL(property.getValue());
+                    Service service = loadService(serviceInputURL, cache);
+                    service.setOrganization(result);
+                    service.setLogoFileName(result.getTechnicalName() + "-" + service.getLogoFileName());
+                    result.getServices().add(service);
+                }
             }
         }
 
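
This is the producer side of the same change: loadOrganization() now returns null when the crawl cache cannot restore the remote file, rather than handing a null File to the properties loader. A short sketch of the idea, with hypothetical stand-ins for CrawlCache.restoreFile() and the organization type:

import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Properties;

public class RestoreGuardSketch
{
    // Stands in for CrawlCache.restoreFile(URL); null means "nothing cached for this URL".
    static File restoreFile(URL inputURL)
    {
        return null;
    }

    // Simplified loadOrganization(): bail out early instead of loading from a null file.
    static Properties loadOrganization(URL inputURL)
    {
        File inputFile = restoreFile(inputURL);
        if (inputFile == null)
        {
            return null; // the caller logs the failure and skips this organization
        }

        Properties result = new Properties();
        // ... load properties from inputFile, set input URL, logo file name, sub-services ...
        return result;
    }

    public static void main(String[] args) throws MalformedURLException
    {
        System.out.println(loadOrganization(new URL("https://example.org/org.properties")));
    }
}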

@@ -19,6 +19,7 @@
 package fr.devinsy.statoolinfos.crawl;
 
 import java.io.File;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;

@@ -147,39 +148,48 @@ public class Crawler
      */
     public static void crawl(final URL url, final CrawlCache cache) throws StatoolInfosException, IOException
     {
-        logger.info("Crawling " + url);
-
-        // Crawl.
-        File file = cache.store(url);
-
-        // Build crawl data.
-        PathProperties crawlSection = new PathPropertyList();
-        crawlSection.put("crawl.crawler", "StatoolInfos");
-        crawlSection.put("crawl.datetime", LocalDateTime.now().toString());
-        crawlSection.put("crawl.url", url.toString());
-        crawlSection.put("crawl.file.size", FileUtils.sizeOf(file));
-        crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString());
-        crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file)));
-
-        // Add crawl data in crawled file.
-        String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n');
-        FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8);
-
-        // Crawl another resources.
-        PathProperties properties = PathPropertyUtils.load(file);
-
-        cache.storeQuietly(properties.getURL("organization.logo"));
-        cache.storeQuietly(properties.getURL("service.logo"));
-
-        // Crawl subs.
-        PathProperties subs = properties.getByPrefix("subs");
-        for (PathProperty property : subs)
+        try
         {
-            if (StringUtils.isNotBlank(property.getValue()))
+            logger.info("Crawling " + url);
+
+            // Crawl.
+            File file = cache.store(url);
+
+            // Build crawl data.
+            PathProperties crawlSection = new PathPropertyList();
+            crawlSection.put("crawl.crawler", "StatoolInfos");
+            crawlSection.put("crawl.datetime", LocalDateTime.now().toString());
+            crawlSection.put("crawl.url", url.toString());
+            crawlSection.put("crawl.file.size", FileUtils.sizeOf(file));
+            crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString());
+            crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file)));
+
+            // Add crawl data in crawled file.
+            String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n');
+            FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8);
+
+            // Crawl another resources.
+            PathProperties properties = PathPropertyUtils.load(file);
+
+            cache.storeQuietly(properties.getURL("organization.logo"));
+            cache.storeQuietly(properties.getURL("service.logo"));
+
+            // Crawl subs.
+            PathProperties subs = properties.getByPrefix("subs");
+            for (PathProperty property : subs)
             {
-                URL subUrl = new URL(property.getValue());
-                crawl(subUrl, cache);
+                if (StringUtils.isNotBlank(property.getValue()))
+                {
+                    URL subUrl = new URL(property.getValue());
+                    crawl(subUrl, cache);
+                }
             }
         }
+        catch (java.net.ConnectException | FileNotFoundException exception)
+        {
+            logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage());
+            exception.printStackTrace();
+        }
     }
 }
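
The heart of the commit: the body of crawl() is wrapped in a try block, and connection or missing-resource failures are logged instead of aborting the whole crawl. Below is a self-contained sketch of the same multi-catch pattern around a URL fetch; the URL and the local file name are illustrative, and other IOExceptions still propagate, as in the patch.

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.ConnectException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

public class CrawlErrorSketch
{
    public static void main(String[] args) throws IOException
    {
        // Port 9 (discard) is usually closed, so this is likely to raise ConnectException.
        URL url = new URL("http://localhost:9/federation.properties");
        try
        {
            // Stands in for cache.store(url): download the resource to a local file.
            try (InputStream in = url.openStream())
            {
                Files.copy(in, Paths.get("cached.properties"), StandardCopyOption.REPLACE_EXISTING);
            }
            System.out.println("Crawled " + url);
        }
        catch (ConnectException | FileNotFoundException exception)
        {
            // Same spirit as the patch: report the failed URL and keep going.
            System.err.println("ERROR: crawl failed for [" + url + "]: " + exception.getMessage());
        }
    }
}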

@@ -103,17 +103,24 @@ public class PathPropertyUtils
     {
         PathProperties result;
 
-        result = new PathPropertyList();
-
-        BufferedReader in = null;
-        try
+        if (file == null)
         {
-            in = new BufferedReader(new InputStreamReader(new FileInputStream(file), charsetName));
-            result = read(in);
+            throw new IllegalArgumentException("File parameter is null.");
         }
-        finally
+        else
         {
-            IOUtils.closeQuietly(in);
+            result = new PathPropertyList();
+
+            BufferedReader in = null;
+            try
+            {
+                in = new BufferedReader(new InputStreamReader(new FileInputStream(file), charsetName));
+                result = read(in);
+            }
+            finally
+            {
+                IOUtils.closeQuietly(in);
+            }
         }
 
         //
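
PathPropertyUtils.load() now fails fast with an explicit IllegalArgumentException when it receives a null File, instead of letting FileInputStream throw an obscure NullPointerException later. A small sketch of the same guard; it uses try-with-resources rather than the patch's try/finally with IOUtils.closeQuietly(), and the reading logic is simplified to line collection.

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public class LoadGuardSketch
{
    static List<String> load(File file) throws IOException
    {
        if (file == null)
        {
            throw new IllegalArgumentException("File parameter is null.");
        }

        List<String> result = new ArrayList<>();
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8)))
        {
            String line;
            while ((line = in.readLine()) != null)
            {
                result.add(line);
            }
        }
        return result;
    }

    public static void main(String[] args) throws IOException
    {
        try
        {
            load(null);
        }
        catch (IllegalArgumentException exception)
        {
            System.err.println(exception.getMessage()); // prints "File parameter is null."
        }
    }
}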