diff --git a/src/fr/devinsy/statoolinfos/HtmlizerContext.java b/src/fr/devinsy/statoolinfos/HtmlizerContext.java index b3b374f..2081f42 100644 --- a/src/fr/devinsy/statoolinfos/HtmlizerContext.java +++ b/src/fr/devinsy/statoolinfos/HtmlizerContext.java @@ -27,6 +27,9 @@ import fr.devinsy.statoolinfos.core.Factory; import fr.devinsy.statoolinfos.core.Federation; import fr.devinsy.statoolinfos.core.StatoolInfosException; import fr.devinsy.statoolinfos.crawl.CrawlCache; +import fr.devinsy.statoolinfos.crawl.CrawlJournal; +import fr.devinsy.statoolinfos.crawl.CrawlJournalFile; +import fr.devinsy.statoolinfos.crawl.Crawler; /** * The Class Manager. @@ -44,6 +47,7 @@ public class HtmlizerContext private Federation federation; private Categories categories; private CrawlCache cache; + private CrawlJournal crawlJournal; /** * Instantiates a new manager. @@ -65,20 +69,23 @@ public class HtmlizerContext this.configuration = Factory.loadConfiguration(configurationFile); logger.info("Cache setting: {}", this.configuration.getCrawlCachePath()); - logger.info("Htmlize input setting: {}", this.configuration.getHtmlizeInputPath()); + logger.info("Htmlize input setting: {}", this.configuration.getHtmlizeInputURL()); logger.info("Htmlize directory setting: {}", this.configuration.getHtmlizeDirectoryPath()); - File htmlizeInput = this.configuration.getHtmlizeInput(); + this.cache = new CrawlCache(this.configuration.getCrawlCacheDirectory()); + this.crawlJournal = CrawlJournalFile.load(this.cache.restoreFile(Crawler.getJournalURL())); + + File htmlizeInputFile = this.cache.restoreFile(this.configuration.getHtmlizeInputURL()); File htmlizeDirectory = this.configuration.getHtmlizeDirectory(); - if (htmlizeInput == null) + if (htmlizeInputFile == null) { throw new IllegalArgumentException("Htmlize input undefined."); } - else if (!htmlizeInput.exists()) + else if (!htmlizeInputFile.exists()) { throw new IllegalArgumentException("Htmlize input is missing."); } - else if 
(htmlizeInput.isDirectory()) + else if (htmlizeInputFile.isDirectory()) { throw new IllegalArgumentException("Htmlize input is a directory."); } @@ -98,8 +105,7 @@ public class HtmlizerContext { if (this.configuration.isFederation()) { - this.cache = this.configuration.getCrawlCache(); - this.federation = Factory.loadFederation(this.configuration.getHtmlizeInput(), this.cache); + this.federation = Factory.loadFederation(htmlizeInputFile, this.cache); this.categories = Factory.loadCategories(this.configuration.getCategoryFile(), this.federation); } else @@ -154,6 +160,11 @@ public class HtmlizerContext return result; } + public CrawlJournal getCrawlJournal() + { + return this.crawlJournal; + } + /** * Gets the federation. * diff --git a/src/fr/devinsy/statoolinfos/checker/PropertyChecker.java b/src/fr/devinsy/statoolinfos/checker/PropertyChecker.java index d0329f3..df2ce4c 100644 --- a/src/fr/devinsy/statoolinfos/checker/PropertyChecker.java +++ b/src/fr/devinsy/statoolinfos/checker/PropertyChecker.java @@ -95,7 +95,7 @@ public class PropertyChecker this.federationRules.add(METRICS_WEEKS, WEEKS, PropertyMode.OPTIONAL); this.federationRules.add(METRICS_DAYS, DAYS, PropertyMode.OPTIONAL); - // this.federationRules.add(CRAWL, ALL, PropertyMode.MANDATORY); + this.federationRules.add(CRAWL, ALL, PropertyMode.MANDATORY); // this.organizationRules = new PropertyRules(); diff --git a/src/fr/devinsy/statoolinfos/core/Configuration.java b/src/fr/devinsy/statoolinfos/core/Configuration.java index b8d82a7..ca26c5f 100644 --- a/src/fr/devinsy/statoolinfos/core/Configuration.java +++ b/src/fr/devinsy/statoolinfos/core/Configuration.java @@ -19,10 +19,11 @@ package fr.devinsy.statoolinfos.core; import java.io.File; +import java.net.MalformedURLException; +import java.net.URL; import org.apache.commons.lang3.StringUtils; -import fr.devinsy.statoolinfos.crawl.CrawlCache; import fr.devinsy.statoolinfos.properties.PathProperties; import 
fr.devinsy.statoolinfos.properties.PathPropertyList; import fr.devinsy.strings.StringList; @@ -177,18 +178,15 @@ public class Configuration extends PathPropertyList } /** - * Gets the cache. + * Gets the crawl cache directory. * - * @return the cache - * @throws StatoolInfosException + * @return the crawl cache directory */ - public CrawlCache getCrawlCache() throws StatoolInfosException + public File getCrawlCacheDirectory() { - CrawlCache result; + File result; - String path = getCrawlCachePath(); - - result = new CrawlCache(new File(path)); + result = new File(get("conf.crawl.cache")); // return result; @@ -209,29 +207,6 @@ public class Configuration extends PathPropertyList return result; } - /** - * Gets the crawl input. - * - * @return the crawl input - */ - public File getCrawlInputFile() - { - File result; - - String path = getCrawlInputPath(); - if (StringUtils.isBlank(path)) - { - result = null; - } - else - { - result = new File(path); - } - - // - return result; - } - /** * Gets the crawl input path. * @@ -247,6 +222,37 @@ public class Configuration extends PathPropertyList return result; } + /** + * Gets the crawl input. + * + * @return the crawl input + */ + public URL getCrawlInputURL() + { + URL result; + + try + { + String path = getCrawlInputPath(); + if (StringUtils.isBlank(path)) + { + result = null; + } + else + { + result = new URL(path); + } + } + catch (MalformedURLException exception) + { + exception.printStackTrace(); + result = null; + } + + // + return result; + } + /** * Gets the edito directory. * @@ -309,39 +315,17 @@ public class Configuration extends PathPropertyList return result; } - /** - * Gets the htmlize input. - * - * @return the htmlize input - */ - public File getHtmlizeInput() - { - File result; - - String path = getHtmlizeInputPath(); - if (StringUtils.isBlank(path)) - { - result = null; - } - else - { - result = new File(path); - } - - // - return result; - } - /** * Gets the htmlize input path. 
* * @return the htmlize input path + * @throws MalformedURLException */ - public String getHtmlizeInputPath() + public URL getHtmlizeInputURL() throws MalformedURLException { - String result; + URL result; - result = get("conf.htmlize.input"); + result = new URL(get("conf.htmlize.input")); // return result; diff --git a/src/fr/devinsy/statoolinfos/core/PropertyClassType.java b/src/fr/devinsy/statoolinfos/core/PropertyClassType.java new file mode 100644 index 0000000..c6020ad --- /dev/null +++ b/src/fr/devinsy/statoolinfos/core/PropertyClassType.java @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2021 Christian Pierre MOMON + * + * This file is part of StatoolInfos, simple service statistics tool. + * + * StatoolInfos is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * StatoolInfos is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with StatoolInfos. If not, see . + */ +package fr.devinsy.statoolinfos.core; + +import org.apache.commons.lang3.StringUtils; + +public enum PropertyClassType +{ + FEDERATION, + ORGANIZATION, + SERVICE, + METRICS; + + /** + * Checks if is child of. 
+ * + * @param parent + * the parent + * @return true, if is child of + */ + public boolean isChildOf(final PropertyClassType parent) + { + boolean result; + + switch (this) + { + case FEDERATION: + if (parent == null) + { + result = true; + } + else + { + result = false; + } + break; + case ORGANIZATION: + if (parent == FEDERATION) + { + result = true; + } + else + { + result = false; + } + break; + case SERVICE: + if (parent == ORGANIZATION) + { + result = true; + } + else + { + result = false; + } + break; + case METRICS: + result = true; + break; + default: + result = false; + } + + // + return result; + } + + /** + * Of. + * + * @param value + * the value + * @return the property class type + */ + public static PropertyClassType of(final String value) + { + PropertyClassType result; + + String target = StringUtils.trim(StringUtils.toRootLowerCase(value)); + + if (target == null) + { + result = null; + } + else if (StringUtils.equals(target, "federation")) + { + result = FEDERATION; + } + else if (StringUtils.equals(target, "organization")) + { + result = ORGANIZATION; + } + else if (StringUtils.equals(target, "service")) + { + result = SERVICE; + } + else if (StringUtils.equals(target, "metrics")) + { + result = METRICS; + } + else + { + result = null; + } + + // + return result; + } +} \ No newline at end of file diff --git a/src/fr/devinsy/statoolinfos/core/StatoolInfos.java b/src/fr/devinsy/statoolinfos/core/StatoolInfos.java index ed9b010..fd76a27 100644 --- a/src/fr/devinsy/statoolinfos/core/StatoolInfos.java +++ b/src/fr/devinsy/statoolinfos/core/StatoolInfos.java @@ -74,7 +74,7 @@ public class StatoolInfos Configuration configuration = Factory.loadConfiguration(configurationFile); Builder.clear(configuration); - Crawler.clear(configuration); + new Crawler(configuration.getCrawlCacheDirectory()).clear(); Htmlizer.clear(configuration); } @@ -90,7 +90,10 @@ public class StatoolInfos */ public static void crawl(final File configurationFile) throws 
StatoolInfosException, IOException { - Crawler.crawl(configurationFile); + Configuration configuration = Factory.loadConfiguration(configurationFile); + Crawler crawler = new Crawler(configuration.getCrawlCacheDirectory()); + crawler.crawl(configuration.getCrawlInputURL()); + crawler.storeJournal(); } /** diff --git a/src/fr/devinsy/statoolinfos/core/StatoolInfosUtils.java b/src/fr/devinsy/statoolinfos/core/StatoolInfosUtils.java index c6084c2..2aba2a7 100644 --- a/src/fr/devinsy/statoolinfos/core/StatoolInfosUtils.java +++ b/src/fr/devinsy/statoolinfos/core/StatoolInfosUtils.java @@ -22,6 +22,7 @@ import java.io.File; import java.io.IOException; import java.net.HttpURLConnection; import java.net.URL; +import java.nio.charset.StandardCharsets; import java.time.Instant; import java.time.LocalDate; import java.time.LocalDateTime; @@ -40,6 +41,7 @@ import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; import javax.net.ssl.X509TrustManager; +import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.math.NumberUtils; @@ -119,7 +121,7 @@ public class StatoolInfosUtils // Because Tika failed to recognize SVG file without xml header // line. - if (result.equals(".txt") && (StringUtils.startsWithIgnoreCase(FileUtils.readFileToString(file, "UTF8"), " + * + * This file is part of StatoolInfos, simple service statistics tool. + * + * StatoolInfos is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * StatoolInfos is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with StatoolInfos. If not, see . + */ +package fr.devinsy.statoolinfos.crawl; + +import java.time.LocalDateTime; + +/** + * The Class CrawlJournal. + */ +public class CrawlJournal extends CrawlLogs +{ + private static final long serialVersionUID = -7855320365496351766L; + + private LocalDateTime datetime; + + /** + * Instantiates a new crawl journal. + */ + public CrawlJournal() + { + super(); + this.datetime = LocalDateTime.now(); + } + + /** + * Gets the date. + * + * @return the date + */ + public LocalDateTime getDatetime() + { + return this.datetime; + } + + /** + * Sets the date. + * + * @param date + * the new date + */ + public void setDatetime(final LocalDateTime date) + { + this.datetime = date; + } + +} \ No newline at end of file diff --git a/src/fr/devinsy/statoolinfos/crawl/CrawlJournalFile.java b/src/fr/devinsy/statoolinfos/crawl/CrawlJournalFile.java new file mode 100644 index 0000000..bb0d25b --- /dev/null +++ b/src/fr/devinsy/statoolinfos/crawl/CrawlJournalFile.java @@ -0,0 +1,236 @@ +/* + * Copyright (C) 2021 Christian Pierre MOMON + * + * This file is part of StatoolInfos, simple service statistics tool. + * + * StatoolInfos is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * StatoolInfos is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with StatoolInfos. If not, see . 
+ */ +package fr.devinsy.statoolinfos.crawl; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.time.LocalDateTime; +import java.time.ZoneOffset; + +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * The Class CrawlFile. + */ +public class CrawlJournalFile +{ + private static Logger logger = LoggerFactory.getLogger(CrawlJournalFile.class); + + /** + * Instantiates a new crawl file. + */ + private CrawlJournalFile() + { + super(); + } + + /** + * Load. + * + * @param file + * the file + * @return the path property list + * @throws IOException + * Signals that an I/O exception has occurred. + */ + public static CrawlJournal load(final File file) throws IOException + { + CrawlJournal result; + + result = load(file, StandardCharsets.UTF_8); + + // + return result; + } + + /** + * Load. + * + * @param file + * the file + * @param charset + * the charset name + * @return the path properties + * @throws IOException + * Signals that an I/O exception has occurred. + */ + public static CrawlJournal load(final File file, final Charset charset) throws IOException + { + CrawlJournal result; + + if (file == null) + { + throw new IllegalArgumentException("File parameter is null."); + } + else + { + BufferedReader in = null; + try + { + in = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)); + result = read(in); + } + finally + { + IOUtils.closeQuietly(in); + } + + result.setDatetime(LocalDateTime.ofEpochSecond(file.lastModified() / 1000, 0, ZoneOffset.UTC)); + } + + // + return result; + } + + /** + * Read. 
+ * + * @param in + * the in + * @return the crawl logs + * @throws IOException + * Signals that an I/O exception has occurred. + */ + public static CrawlJournal read(final BufferedReader in) throws IOException + { + CrawlJournal result; + + result = new CrawlJournal(); + + boolean ended = false; + while (!ended) + { + String line = in.readLine(); + + if (line == null) + { + ended = true; + } + else + { + CrawlLog log = valueOf(line); + result.add(log); + } + } + + // + return result; + } + + /** + * Save. + * + * @param file + * the file + * @param source + * the source + * @throws IOException + * Signals that an I/O exception has occurred. + */ + public static void save(final File file, final CrawlJournal source) throws IOException + { + PrintWriter out = null; + try + { + out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(file), "UTF-8")); + write(out, source); + } + finally + { + // + IOUtils.closeQuietly(out); + } + } + + /** + * Value of. + * + * @param line + * the line + * @return the path property + */ + public static CrawlLog valueOf(final String line) + { + CrawlLog result; + + if (line == null) + { + result = null; + } + else + { + String[] tokens = line.split(" ", 2); + + CrawlStatus status = CrawlStatus.valueOf(tokens[0].toUpperCase()); + + URL url; + try + { + url = new URL(tokens[1].trim()); + } + catch (MalformedURLException exception) + { + logger.error("Error valuing [{}]", line); + exception.printStackTrace(); + url = null; + } + + result = new CrawlLog(url, status); + } + + // + return result; + } + + /** + * Write. + * + * @param out + * the out + * @param source + * the source + * @throws IOException + * Signals that an I/O exception has occurred. 
+ */ + public static void write(final PrintWriter out, final CrawlJournal journal) throws IOException + { + if (journal != null) + { + for (CrawlLog log : journal) + { + String line = log.getStatus() + " " + log.getUrl(); + out.write(line); + out.write("\n"); + } + } + } + +} \ No newline at end of file diff --git a/src/fr/devinsy/statoolinfos/crawl/CrawlLog.java b/src/fr/devinsy/statoolinfos/crawl/CrawlLog.java new file mode 100644 index 0000000..fba147f --- /dev/null +++ b/src/fr/devinsy/statoolinfos/crawl/CrawlLog.java @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2021 Christian Pierre MOMON + * + * This file is part of StatoolInfos, simple service statistics tool. + * + * StatoolInfos is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * StatoolInfos is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with StatoolInfos. If not, see . + */ +package fr.devinsy.statoolinfos.crawl; + +import java.net.URL; + +/** + * The Class CrawlLog. + */ +public class CrawlLog +{ + private URL url; + private CrawlStatus status; + + /** + * Instantiates a new crawl log. 
+ * + * @param url + * the url + * @param status + * the status + */ + public CrawlLog(final URL url, final CrawlStatus status) + { + this.url = url; + this.status = status; + } + + public CrawlStatus getStatus() + { + return this.status; + } + + public URL getUrl() + { + return this.url; + } +} \ No newline at end of file diff --git a/src/fr/devinsy/statoolinfos/crawl/CrawlLogs.java b/src/fr/devinsy/statoolinfos/crawl/CrawlLogs.java new file mode 100644 index 0000000..f80f898 --- /dev/null +++ b/src/fr/devinsy/statoolinfos/crawl/CrawlLogs.java @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2021 Christian Pierre MOMON + * + * This file is part of StatoolInfos, simple service statistics tool. + * + * StatoolInfos is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * StatoolInfos is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with StatoolInfos. If not, see . + */ +package fr.devinsy.statoolinfos.crawl; + +import java.net.URL; +import java.util.ArrayList; +import java.util.Collections; + +import org.apache.commons.lang3.StringUtils; + +/** + * The Class CrawlLogs. + */ +public class CrawlLogs extends ArrayList +{ + private static final long serialVersionUID = -8749217049690008582L; + + /** + * Instantiates a new crawl logs. + */ + public CrawlLogs() + { + super(); + } + + /** + * Adds the. + * + * @param url + * the url + * @param status + * the status + */ + public void add(final URL url, final CrawlStatus status) + { + this.add(new CrawlLog(url, status)); + } + + /** + * Find by software. 
+ * + * @param softwareName + * the software name + * @return the category + */ + public CrawlLogs findByUrl(final URL url) + { + CrawlLogs result; + + result = new CrawlLogs(); + + for (CrawlLog log : this) + { + if (StringUtils.equals(log.getUrl().toString(), url.toString())) + { + result.add(log); + } + } + + // + return result; + } + + /** + * Gets the errors. + * + * @return the errors + */ + public CrawlLogs getErrors() + { + CrawlLogs result; + + result = new CrawlLogs(); + + for (CrawlLog log : this) + { + if (log.getStatus().isError()) + { + result.add(log); + } + } + + // + return result; + } + + /** + * Gets the success. + * + * @return the success + */ + public CrawlLogs getSuccess() + { + CrawlLogs result; + + result = new CrawlLogs(); + + for (CrawlLog log : this) + { + if (!log.getStatus().isError()) + { + result.add(log); + } + } + + // + return result; + } + + /** + * Reverse. + * + * @return the categories + */ + public CrawlLogs reverse() + { + CrawlLogs result; + + Collections.reverse(this); + + result = this; + + // + return result; + } +} diff --git a/src/fr/devinsy/statoolinfos/crawl/CrawlStatus.java b/src/fr/devinsy/statoolinfos/crawl/CrawlStatus.java new file mode 100644 index 0000000..893a574 --- /dev/null +++ b/src/fr/devinsy/statoolinfos/crawl/CrawlStatus.java @@ -0,0 +1,50 @@ +/* + * Copyright (C) 2021 Christian Pierre MOMON + * + * This file is part of StatoolInfos, simple service statistics tool. + * + * StatoolInfos is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * StatoolInfos is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with StatoolInfos. If not, see . + */ +package fr.devinsy.statoolinfos.crawl; + +public enum CrawlStatus +{ + BADCHILDCLASS, + BADURLFORMAT, + CONNECTERROR, + DOWNLOADERROR, + EMPTY, + IOERROR, + MISSING, + SUCCESS, + UPDATED, + URLNOTFOUND; + + public boolean isError() + { + boolean result; + + if ((this == CrawlStatus.SUCCESS) || (this == CrawlStatus.UPDATED)) + { + result = false; + } + else + { + result = true; + } + + // + return result; + } +} \ No newline at end of file diff --git a/src/fr/devinsy/statoolinfos/crawl/Crawler.java b/src/fr/devinsy/statoolinfos/crawl/Crawler.java index 8ad199a..d8384d2 100644 --- a/src/fr/devinsy/statoolinfos/crawl/Crawler.java +++ b/src/fr/devinsy/statoolinfos/crawl/Crawler.java @@ -21,18 +21,19 @@ package fr.devinsy.statoolinfos.crawl; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; +import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; -import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import fr.devinsy.statoolinfos.core.Configuration; -import fr.devinsy.statoolinfos.core.Factory; +import fr.devinsy.statoolinfos.core.PropertyClassType; import fr.devinsy.statoolinfos.core.StatoolInfosException; import fr.devinsy.statoolinfos.core.StatoolInfosUtils; import fr.devinsy.statoolinfos.properties.PathProperties; @@ -47,107 +48,48 @@ public class Crawler { private static Logger logger = LoggerFactory.getLogger(Crawler.class); + private CrawlCache cache; + private CrawlJournal journal; + + /** + * Instantiates a new crawler. 
+ * + * @param rootDirectory + * the root directory + * @throws StatoolInfosException + * the statool infos exception + */ + public Crawler(final File rootDirectory) throws StatoolInfosException + { + logger.info("Crawl cache setting: {}", rootDirectory); + this.cache = new CrawlCache(rootDirectory); + this.journal = new CrawlJournal(); + } + /** * Clear. * - * @param configuration - * the configuration * @throws StatoolInfosException * the statool infos exception - * @throws IOException - * Signals that an I/O exception has occurred. */ - public static void clear(final Configuration configuration) throws StatoolInfosException, IOException + public void clear() throws StatoolInfosException { - logger.info("Cache setting: {}", configuration.getCrawlCachePath()); - - String path = configuration.getCrawlCachePath(); - if (StringUtils.isBlank(path)) - { - logger.warn("Undefined crawl cache."); - } - else if (!new File(path).exists()) - { - logger.warn("Crawl cache does not exist: {}.", path); - } - else - { - CrawlCache cache = configuration.getCrawlCache(); - cache.clear(); - } + this.cache.clear(); } /** * Crawl. * - * @param configuration - * the configuration + * @param url + * the input url * @throws StatoolInfosException * the statool infos exception * @throws IOException * Signals that an I/O exception has occurred. 
*/ - public static void crawl(final Configuration configuration) throws StatoolInfosException, IOException + public void crawl(final URL url) throws StatoolInfosException, IOException { - logger.info("Crawl input setting: {}", configuration.getCrawlInputPath()); - logger.info("Crawl cache setting: {}", configuration.getCrawlCachePath()); - - CrawlCache cache = configuration.getCrawlCache(); - - PathProperties input = PathPropertyUtils.load(configuration.getCrawlInputFile()); - - if (configuration.isFederation()) - { - cache.store(input.get("federation.name"), configuration.getCrawlInputFile()); - cache.storeQuietly(input.getURL("federation.logo")); - } - else if (configuration.isOrganization()) - { - cache.store(input.get("organization.name"), configuration.getCrawlInputFile()); - cache.storeQuietly(input.getURL("organization.logo")); - } - - PathProperties subs = input.getByPrefix("subs"); - for (PathProperty property : subs) - { - if (StringUtils.isNotBlank(property.getValue())) - { - try - { - URL subUrl = new URL(property.getValue()); - crawl(subUrl, cache, input.get("file.class")); - } - catch (java.net.MalformedURLException exception) - { - logger.error("ERROR: subcrawl failed for [{}][{}]: {}", property.getPath(), property.getValue(), exception.getMessage()); - exception.printStackTrace(); - } - catch (IOException exception) - { - logger.error("ERROR: subcrawl failed for [{}][{}]: {}", property.getPath(), property.getValue(), exception.getMessage()); - exception.printStackTrace(); - } - } - } - } - - /** - * Crawl. - * - * @param configurationFile - * the input - * @throws StatoolInfosException - * the statool infos exception - * @throws IOException - * Signals that an I/O exception has occurred. 
- */ - public static void crawl(final File configurationFile) throws StatoolInfosException, IOException - { - logger.info("Crawl {}", configurationFile.getAbsolutePath()); - - Configuration configuration = Factory.loadConfiguration(configurationFile); - - crawl(configuration); + crawl(url, null); } /** @@ -162,64 +104,287 @@ public class Crawler * @throws IOException * Signals that an I/O exception has occurred. */ - public static void crawl(final URL url, final CrawlCache cache, final String parentFileClass) throws StatoolInfosException, IOException + public void crawl(final URL url, final PropertyClassType parent) { - logger.info("Crawling " + url); + logger.info("Crawling {}", url); - // Crawl. - File file = cache.store(url); - if (file != null) + try { - // Build crawl data. - PathProperties crawlSection = new PathPropertyList(); - crawlSection.put("crawl.crawler", "StatoolInfos"); - crawlSection.put("crawl.datetime", LocalDateTime.now().toString()); - crawlSection.put("crawl.url", url.toString()); - crawlSection.put("crawl.file.size", FileUtils.sizeOf(file)); - crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString()); - crawlSection.put("crawl.file.sha1", DigestUtils.sha1Hex(FileUtils.readFileToByteArray(file))); - - // Add crawl data in crawled file. - String lines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n'); - FileUtils.write(file, FileUtils.readFileToString(file, StandardCharsets.UTF_8) + "\n" + lines, StandardCharsets.UTF_8); - - // Crawl another resources. - PathProperties properties = PathPropertyUtils.load(file); - - cache.storeQuietly(properties.getURL("organization.logo")); - cache.storeQuietly(properties.getURL("service.logo")); - - // Crawl subs. 
- String fileClass = properties.get("file.class"); - if (StringUtils.equalsIgnoreCase(fileClass, parentFileClass)) + File downloadFile; + try { - logger.warn("WARNING: file class same than parent for [{}]", url); + downloadFile = download(url); } - else + catch (java.net.ConnectException exception) { - PathProperties subs = properties.getByPrefix("subs"); - for (PathProperty property : subs) + logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage()); + this.journal.add(url, CrawlStatus.CONNECTERROR); + downloadFile = null; + exception.printStackTrace(); + } + catch (FileNotFoundException exception) + { + logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage()); + this.journal.add(url, CrawlStatus.URLNOTFOUND); + downloadFile = null; + exception.printStackTrace(); + } + catch (IOException exception) + { + logger.error("ERROR: crawl failed for [{}]: {}", url.toString(), exception.getMessage()); + this.journal.add(url, CrawlStatus.DOWNLOADERROR); + downloadFile = null; + exception.printStackTrace(); + } + + if (downloadFile != null) + { + if (!downloadFile.exists()) { - if (StringUtils.isNotBlank(property.getValue())) + logger.error("ERROR: download missing."); + this.journal.add(url, CrawlStatus.MISSING); + } + else if (downloadFile.length() == 0) + { + logger.error("ERROR: download empty."); + this.journal.add(url, CrawlStatus.EMPTY); + } + else + { + PathProperties downloadProperties = PathPropertyUtils.load(downloadFile); + PropertyClassType downloadClass = PropertyClassType.of(downloadProperties.get("file.class")); + + if ((downloadClass == null) || (!downloadClass.isChildOf(parent))) { - try + logger.error("ERROR: bad child class [{}][{}].", downloadClass, parent); + this.journal.add(url, CrawlStatus.BADCHILDCLASS); + } + else + { + File storedFile = this.cache.restoreFile(url); + String storedSha; + if (storedFile == null) { - URL subUrl = new URL(property.getValue()); - crawl(subUrl, cache, 
fileClass); + storedSha = null; } - catch (java.net.MalformedURLException exception) + else { - logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage()); - exception.printStackTrace(); + PathProperties storedProperties = PathPropertyUtils.load(storedFile); + storedSha = storedProperties.get("crawl.file.sha1"); } - catch (java.net.ConnectException | FileNotFoundException exception) + + String downloadSha = StatoolInfosUtils.sha1sum(downloadFile); + if (StringUtils.equals(downloadSha, storedSha)) { - logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage()); - exception.printStackTrace(); + this.journal.add(url, CrawlStatus.SUCCESS); + } + else + { + // Build crawl data. + PathProperties crawlSection = new PathPropertyList(); + crawlSection.put("crawl.crawler", "StatoolInfos"); + crawlSection.put("crawl.datetime", LocalDateTime.now().format(DateTimeFormatter.ofPattern("YYYY-MM-dd'T'HH:mm:ss"))); + crawlSection.put("crawl.url", url.toString()); + crawlSection.put("crawl.file.size", FileUtils.sizeOf(downloadFile)); + crawlSection.put("crawl.file.datetime", StatoolInfosUtils.urlLastModified(url).toString()); + crawlSection.put("crawl.file.sha1", downloadSha); + String crawlSectionLines = crawlSection.toStringListFormatted().toStringSeparatedBy('\n'); + + // Add crawl data in crawled file. + String downloadExtendedLines = FileUtils.readFileToString(downloadFile, StandardCharsets.UTF_8) + "\n" + crawlSectionLines; + FileUtils.write(downloadFile, downloadExtendedLines, StandardCharsets.UTF_8); + + // Store in cache. + this.cache.store(url, downloadFile); + downloadFile.delete(); + + // + this.journal.add(url, CrawlStatus.UPDATED); + } + + // Cache another resources. 
+ crawlLogo(downloadProperties.getURL("federation.logo")); + crawlLogo(downloadProperties.getURL("organization.logo")); + crawlLogo(downloadProperties.getURL("service.logo")); + + // Do subs. + PathProperties subs = downloadProperties.getByPrefix("subs"); + for (PathProperty property : subs) + { + if (StringUtils.isNotBlank(property.getValue())) + { + try + { + URL subUrl = new URL(property.getValue()); + crawl(subUrl, downloadClass); + } + catch (java.net.MalformedURLException exception) + { + logger.error("ERROR: subcrawl failed for [{}][{}][{}]: {}", url.toString(), property.getPath(), property.getValue(), exception.getMessage()); + this.journal.add(url, CrawlStatus.BADURLFORMAT); + exception.printStackTrace(); + } + } } } } } } + catch (IOException exception) + { + this.journal.add(url, CrawlStatus.IOERROR); + } + }
+
+    /**
+     * Downloads a logo, stores it in the crawl cache and records the outcome
+     * in the crawl journal.
+     *
+     * @param url
+     *            the logo URL; a null URL or a protocol not starting with
+     *            "http" (case-insensitive) is ignored without journal entry
+     * @return the file returned by the cache store, or null when the URL is
+     *         ignored, the download fails or the store fails
+     */
+    public File crawlLogo(final URL url)
+    {
+        File result;
+
+        try
+        {
+            if ((url == null) || (!StringUtils.startsWithIgnoreCase(url.getProtocol(), "http")))
+            {
+                result = null;
+            }
+            else
+            {
+                logger.info("Crawling {}", url);
+
+                File logoFile;
+                try
+                {
+                    logoFile = download(url);
+                }
+                catch (java.net.ConnectException exception)
+                {
+                    logger.error("ERROR: crawl failed (1) for [{}]: {}", url.toString(), exception.getMessage());
+                    this.journal.add(url, CrawlStatus.CONNECTERROR);
+                    logoFile = null;
+                }
+                catch (FileNotFoundException exception)
+                {
+                    logger.error("ERROR: crawl failed (2) for [{}]: {}", url.toString(), exception.getMessage());
+                    this.journal.add(url, CrawlStatus.URLNOTFOUND);
+                    logoFile = null;
+                }
+                catch (IOException exception)
+                {
+                    logger.error("ERROR: crawl failed (3) for [{}]: {}", url.toString(), exception.getMessage());
+                    this.journal.add(url, CrawlStatus.DOWNLOADERROR);
+                    logoFile = null;
+                }
+
+                if (logoFile == null)
+                {
+                    result = null;
+                }
+                else
+                {
+                    try
+                    {
+                        result = this.cache.store(url, logoFile);
+                        this.journal.add(url, CrawlStatus.SUCCESS);
+                    }
+                    finally
+                    {
+                        // Remove the temporary download even when the store fails.
+                        logoFile.delete();
+                    }
+                }
+            }
+        }
+        catch (IOException exception)
+        {
+            // A store failure is an error and must be visible in the journal too.
+            logger.error("Store failed for {}: {}", url, exception.getMessage());
+            this.journal.add(url, CrawlStatus.IOERROR);
+            result = null;
+        }
+
+        //
+        return result;
+    }
+
+    /**
+     * Downloads the content of a URL into a new temporary file, using a
+     * 5000 ms connection timeout and a 5000 ms read timeout.
+     *
+     * @param url
+     *            the URL to download
+     * @return the temporary file holding the download (the caller has to
+     *         delete it), or null when the URL protocol does not start with
+     *         "http"
+     * @throws IOException
+     *             if the temporary file cannot be created or the download
+     *             fails; in the latter case the temporary file is removed
+     */
+    public File download(final URL url) throws IOException
+    {
+        File result;
+
+        if (!StringUtils.startsWith(url.getProtocol(), "http"))
+        {
+            logger.warn("WARNING: crawl failed because bad http+ protocol for [{}]", url);
+            result = null;
+        }
+        else
+        {
+            final int TIMEOUT = 5000;
+            result = Files.createTempFile("tmp-", ".statoolsinfos").toFile();
+            try
+            {
+                FileUtils.copyURLToFile(url, result, TIMEOUT, TIMEOUT);
+            }
+            catch (IOException exception)
+            {
+                // Callers only get the exception: do not leave an orphan temporary file.
+                result.delete();
+                throw exception;
+            }
+        }
+
+        //
+        return result;
+    }
+
+    /**
+     * Loads the crawl journal from the file the cache restores for the
+     * journal URL.
+     *
+     * @return the crawl journal
+     * @throws IOException
+     *             if the journal cannot be restored or loaded
+     */
+    public CrawlJournal restoreJournal() throws IOException
+    {
+        CrawlJournal result;
+
+        logger.info("Restoring crawl journal.");
+
+        File journalFile = this.cache.restoreFile(getJournalURL());
+
+        result = CrawlJournalFile.load(journalFile);
+
+        //
+        return result;
+    }
+
+    /**
+     * Saves the current journal into a temporary file and stores that file in
+     * the cache under the journal URL. A failure is logged, not propagated.
+     */
+    public void storeJournal()
+    {
+        File file = null;
+        try
+        {
+            logger.info("Storing crawl journal.");
+            file = Files.createTempFile("tmp-", ".statoolsinfos").toFile();
+
+            CrawlJournalFile.save(file, this.journal);
+            this.cache.store(getJournalURL(), file);
+        }
+        catch (IOException exception)
+        {
+            logger.error("ERROR: crawl journal store failed: {}", exception.getMessage(), exception);
+        }
+        finally
+        {
+            // Remove the temporary file whether the store succeeded or not.
+            if (file != null)
+            {
+                file.delete();
+            }
+        }
+    }
+
+    /**
+     * Gets the fixed pseudo URL used as cache key to store and restore the
+     * crawl journal.
+     *
+     * @return the journal URL
+     * @throws MalformedURLException
+     *             declared by the URL constructor
+     */
+    public static URL getJournalURL() throws MalformedURLException
+    {
+        URL result;
+
+        result = new URL("http://localhost/crawl.journal");
+
+        //
+        return result;
     }
 }
diff --git a/src/fr/devinsy/statoolinfos/htmlize/CrawlJournalPage.java b/src/fr/devinsy/statoolinfos/htmlize/CrawlJournalPage.java
new file mode 100644
index 0000000..3b649e7
--- /dev/null
+++ b/src/fr/devinsy/statoolinfos/htmlize/CrawlJournalPage.java
@@ -0,0 +1,122 @@
+/*
+ * Copyright (C) 2021 Christian Pierre MOMON
+ *
+ * This file is part of StatoolInfos, simple service statistics tool.
+ *
+ * StatoolInfos is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * StatoolInfos is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with StatoolInfos. If not, see .
+ */ +package fr.devinsy.statoolinfos.htmlize; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; + +import org.apache.commons.io.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import fr.devinsy.statoolinfos.HtmlizerContext; +import fr.devinsy.statoolinfos.core.Federation; +import fr.devinsy.statoolinfos.core.StatoolInfosException; +import fr.devinsy.statoolinfos.crawl.CrawlCache; +import fr.devinsy.statoolinfos.crawl.CrawlJournal; +import fr.devinsy.statoolinfos.crawl.CrawlLog; +import fr.devinsy.xidyn.XidynException; +import fr.devinsy.xidyn.data.TagDataManager; +import fr.devinsy.xidyn.presenters.PresenterUtils; + +/** + * The Class CrawlJournalPage. + */ +public class CrawlJournalPage +{ + private static Logger logger = LoggerFactory.getLogger(CrawlJournalPage.class); + + /** + * Builds the all. + * + * @throws StatoolInfosException + * @throws IOException + */ + public static void buildAll() throws StatoolInfosException, IOException + { + Federation federation = HtmlizerContext.instance().getFederation(); + CrawlCache cache = HtmlizerContext.instance().getCache(); + File htmlizeDirectory = HtmlizerContext.instance().getHtmlizeDirectory(); + + logger.info("Htmlize Crawl Journal pages."); + CrawlJournal journal = HtmlizerContext.instance().getCrawlJournal(); + String page = htmlize("Journal des téléchargements", journal); + FileUtils.write(new File(htmlizeDirectory, federation.getTechnicalName() + "-crawl.xhtml"), page, StandardCharsets.UTF_8); + } + + /** + * Htmlize. 
+ * + * @param title + * the title + * @param journal + * the journal + * @return the string + * @throws StatoolInfosException + * the statool infos exception + */ + public static String htmlize(final String title, final CrawlJournal journal) throws StatoolInfosException + { + String result; + + try + { + logger.debug("Building Crawl journal page…"); + + TagDataManager data = new TagDataManager(); + + data.setEscapedContent("title", title); + data.setContent("date", journal.getDatetime().toString()); + data.setContent("totalCount", journal.size()); + data.setContent("errorCount", journal.getErrors().size()); + + int index = 0; + for (CrawlLog log : journal) + { + data.setEscapedContent("crawlLogLine", index, "crawlLogLineUrlLink", log.getUrl().toString()); + data.setEscapedAttribute("crawlLogLine", index, "crawlLogLineUrlLink", "href", log.getUrl().toString()); + data.setContent("crawlLogLine", index, "crawlLogLineStatus", log.getStatus().toString()); + + if (log.getStatus().isError()) + { + data.setAttribute("crawlLogLine", index, "crawlLogLineStatus", "style", "background-color: red;"); + } + else + { + data.setAttribute("crawlLogLine", index, "crawlLogLineStatus", "style", "background-color: lime;"); + } + + index += 1; + } + + String content = PresenterUtils.dynamize("/fr/devinsy/statoolinfos/htmlize/crawlJournal.xhtml", data).toString(); + + BreadcrumbTrail trail = new BreadcrumbTrail(); + result = WebCharterView.build(content, trail); + } + catch (XidynException exception) + { + throw new StatoolInfosException("Error building crawl journal page: " + exception.getMessage(), exception); + } + + // + return result; + } +} diff --git a/src/fr/devinsy/statoolinfos/htmlize/FederationPage.java b/src/fr/devinsy/statoolinfos/htmlize/FederationPage.java index 125d5b9..6540f5d 100644 --- a/src/fr/devinsy/statoolinfos/htmlize/FederationPage.java +++ b/src/fr/devinsy/statoolinfos/htmlize/FederationPage.java @@ -107,9 +107,19 @@ public class FederationPage 
data.setAttribute("rawLink", "href", federation.getTechnicalName() + ".properties"); data.setAttribute("rawCheckLink", "href", federation.getTechnicalName() + "-check.xhtml"); - data.setAttribute("statsLink", "href", federation.getTechnicalName() + "-stats.xhtml"); + data.setAttribute("crawlLink", "href", federation.getTechnicalName() + "-crawl.xhtml"); + + if (HtmlizerContext.instance().getCrawlJournal().getErrors().isEmpty()) + { + data.setAttribute("crawlLinkImg", "src", "circle-icons/download-mono.svg"); + } + else + { + data.setAttribute("crawlLinkImg", "src", "circle-icons/download.svg"); + } + { PropertyChecks checks = federation.getInputChecksAll(); diff --git a/src/fr/devinsy/statoolinfos/htmlize/Htmlizer.java b/src/fr/devinsy/statoolinfos/htmlize/Htmlizer.java index e5f41a7..f45ad4b 100644 --- a/src/fr/devinsy/statoolinfos/htmlize/Htmlizer.java +++ b/src/fr/devinsy/statoolinfos/htmlize/Htmlizer.java @@ -193,6 +193,7 @@ public class Htmlizer AboutPage.build(); CategoriesPage.build(); CategoryPage.buildAll(); + CrawlJournalPage.buildAll(); EditoPage.build(); ExportsPage.build(); FederationPage.build(); diff --git a/src/fr/devinsy/statoolinfos/htmlize/crawlJournal.xhtml b/src/fr/devinsy/statoolinfos/htmlize/crawlJournal.xhtml new file mode 100644 index 0000000..d7dd066 --- /dev/null +++ b/src/fr/devinsy/statoolinfos/htmlize/crawlJournal.xhtml @@ -0,0 +1,56 @@ + + + + + StatoolInfos + + + + + + + +
+ +
+

Journal des téléchargements

+
Nombre de téléchargements : n/a
+
Nombre d'erreurs : n/a
+
Date : n/a
+
+
+
+ + + + + + + + + + + + + +
URLStatut
n/an/a
+
+
+ + + diff --git a/src/fr/devinsy/statoolinfos/htmlize/federation.xhtml b/src/fr/devinsy/statoolinfos/htmlize/federation.xhtml index e388340..22c466c 100644 --- a/src/fr/devinsy/statoolinfos/htmlize/federation.xhtml +++ b/src/fr/devinsy/statoolinfos/htmlize/federation.xhtml @@ -31,6 +31,7 @@ +
diff --git a/src/fr/devinsy/statoolinfos/htmlize/stuff/circle-icons/download-ko.svg b/src/fr/devinsy/statoolinfos/htmlize/stuff/circle-icons/download-ko.svg new file mode 100644 index 0000000..f16fdeb --- /dev/null +++ b/src/fr/devinsy/statoolinfos/htmlize/stuff/circle-icons/download-ko.svg @@ -0,0 +1,81 @@ + +image/svg+xml \ No newline at end of file diff --git a/src/fr/devinsy/statoolinfos/htmlize/stuff/circle-icons/download-mono.svg b/src/fr/devinsy/statoolinfos/htmlize/stuff/circle-icons/download-mono.svg new file mode 100644 index 0000000..00dcd20 --- /dev/null +++ b/src/fr/devinsy/statoolinfos/htmlize/stuff/circle-icons/download-mono.svg @@ -0,0 +1,46 @@ + +image/svg+xml \ No newline at end of file diff --git a/src/fr/devinsy/statoolinfos/htmlize/stuff/circle-icons/download-ok.svg b/src/fr/devinsy/statoolinfos/htmlize/stuff/circle-icons/download-ok.svg new file mode 100644 index 0000000..95e6fca --- /dev/null +++ b/src/fr/devinsy/statoolinfos/htmlize/stuff/circle-icons/download-ok.svg @@ -0,0 +1,110 @@ + +image/svg+xml \ No newline at end of file diff --git a/src/fr/devinsy/statoolinfos/htmlize/stuff/circle-icons/download.svg b/src/fr/devinsy/statoolinfos/htmlize/stuff/circle-icons/download.svg new file mode 100644 index 0000000..f16fdeb --- /dev/null +++ b/src/fr/devinsy/statoolinfos/htmlize/stuff/circle-icons/download.svg @@ -0,0 +1,81 @@ + +image/svg+xml \ No newline at end of file diff --git a/src/fr/devinsy/statoolinfos/properties/PathProperties.java b/src/fr/devinsy/statoolinfos/properties/PathProperties.java index 5dd801c..b3cfedf 100644 --- a/src/fr/devinsy/statoolinfos/properties/PathProperties.java +++ b/src/fr/devinsy/statoolinfos/properties/PathProperties.java @@ -35,7 +35,7 @@ public interface PathProperties extends Iterable boolean add(PathProperty property); /** - * h Gets the. + * Gets the. 
* * @param path * the path diff --git a/src/fr/devinsy/statoolinfos/properties/PathPropertyUtils.java b/src/fr/devinsy/statoolinfos/properties/PathPropertyUtils.java index cd64ff6..c0cb72e 100644 --- a/src/fr/devinsy/statoolinfos/properties/PathPropertyUtils.java +++ b/src/fr/devinsy/statoolinfos/properties/PathPropertyUtils.java @@ -27,6 +27,8 @@ import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.net.URL; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -43,8 +45,6 @@ public class PathPropertyUtils { private static final Logger logger = LoggerFactory.getLogger(PathPropertyUtils.class); - public static final String DEFAULT_CHARSET_NAME = "UTF-8"; - /** * Checks if is property line. * @@ -82,7 +82,7 @@ public class PathPropertyUtils { PathProperties result; - result = load(file, DEFAULT_CHARSET_NAME); + result = load(file, StandardCharsets.UTF_8); // return result; @@ -93,13 +93,13 @@ public class PathPropertyUtils * * @param file * the file - * @param charsetName + * @param charset * the charset name * @return the path properties * @throws IOException * Signals that an I/O exception has occurred. */ - public static PathProperties load(final File file, final String charsetName) throws IOException + public static PathProperties load(final File file, final Charset charset) throws IOException { PathProperties result; @@ -114,7 +114,7 @@ public class PathPropertyUtils BufferedReader in = null; try { - in = new BufferedReader(new InputStreamReader(new FileInputStream(file), charsetName)); + in = new BufferedReader(new InputStreamReader(new FileInputStream(file), charset)); result = read(in); } finally