From a1e1a60c5aa638ec66ab57243d59bb71ef309c6a Mon Sep 17 00:00:00 2001 From: "Christian P. MOMON" Date: Mon, 7 Jun 2021 03:41:48 +0200 Subject: [PATCH] Refactored user agent bot detection feature. --- .../metrics/http/HttpAccessLog.java | 25 +--- .../metrics/http/UserAgentBotDetector.java | 108 ++++++++++++++++++ .../metrics/http/userAgentBotDetectorData.txt | 34 ++++++ .../http/UserAgentBotDetectorTest.java | 78 +++++++++++++ 4 files changed, 224 insertions(+), 21 deletions(-) create mode 100644 src/fr/devinsy/statoolinfos/metrics/http/UserAgentBotDetector.java create mode 100644 src/fr/devinsy/statoolinfos/metrics/http/userAgentBotDetectorData.txt create mode 100644 test/fr/devinsy/statoolinfos/metrics/http/UserAgentBotDetectorTest.java diff --git a/src/fr/devinsy/statoolinfos/metrics/http/HttpAccessLog.java b/src/fr/devinsy/statoolinfos/metrics/http/HttpAccessLog.java index 7462e3c..dafc30b 100644 --- a/src/fr/devinsy/statoolinfos/metrics/http/HttpAccessLog.java +++ b/src/fr/devinsy/statoolinfos/metrics/http/HttpAccessLog.java @@ -23,13 +23,11 @@ import java.time.format.DateTimeFormatter; import java.util.Locale; import java.util.regex.Pattern; -import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import fr.devinsy.statoolinfos.metrics.TimeMarkUtils; import fr.devinsy.strings.StringList; -import fr.devinsy.strings.StringsUtils; /** * The Class HttpAccessLog. @@ -179,30 +177,15 @@ public class HttpAccessLog } /** - * @return + * Checks if is bot. + * + * @return true, if is bot */ public boolean isBot() { boolean result; - if (StringsUtils.containsAnyIgnoreCase(this.userAgent.toString(), "BingPreview", "bot", "crawler", "HeadlessChrome/", " - Mobilizon ", "monitoring", "YisouSpider")) - { - result = true; - } - else if (StringUtils.startsWithAny(this.userAgent.toString(), "Apache-HttpClient/", "curl/", "Friendica ", "git/", "github-camo", "hackney/", "http.rb/", "FediList ", "Go-http-client", - "GoModuleMirror/", "HotJava/", "Java/", "JGit/", "MastoPeek ", "mattermost-", "Misskey/", "newspaper/", "node-fetch/", "okhttp/", "PeerTube/", "PHP/", "Pleroma ", "python-requests/", - "python/", "Python/", "Synapse/", "Tusky/")) - { - result = true; - } - else if (StringUtils.equalsAnyIgnoreCase(this.userAgent.toString(), "-")) - { - result = true; - } - else - { - result = false; - } + result = UserAgentBotDetector.isBot(this.userAgent.toString()); // return result; diff --git a/src/fr/devinsy/statoolinfos/metrics/http/UserAgentBotDetector.java b/src/fr/devinsy/statoolinfos/metrics/http/UserAgentBotDetector.java new file mode 100644 index 0000000..79fc930 --- /dev/null +++ b/src/fr/devinsy/statoolinfos/metrics/http/UserAgentBotDetector.java @@ -0,0 +1,108 @@ +/* + * Copyright (C) 2021 Christian Pierre MOMON + * + * This file is part of StatoolInfos, simple service statistics tool. + * + * StatoolInfos is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * StatoolInfos is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with StatoolInfos. If not, see . + */ +package fr.devinsy.statoolinfos.metrics.http; + +import java.io.IOException; + +import org.apache.commons.lang3.StringUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import fr.devinsy.strings.StringList; +import fr.devinsy.strings.StringsUtils; + +/** + * The Class UserAgentBotDetector. + */ +public class UserAgentBotDetector +{ + private static Logger logger = LoggerFactory.getLogger(UserAgentBotDetector.class); + + private static final StringList startList = new StringList(); + private static final StringList containList = new StringList(); + + static + { + StringList lines; + try + { + lines = StringsUtils.load(UserAgentBotDetector.class.getResource("/fr/devinsy/statoolinfos/metrics/http/userAgentBotDetectorData.txt")); + } + catch (IOException exception) + { + exception.printStackTrace(); + lines = new StringList(); + } + + for (String line : lines) + { + if (line.startsWith("^")) + { + startList.add(line.substring(1)); + } + else + { + containList.add(line); + } + } + } + + /** + * Instantiates a new user agent bot detector. + */ + private UserAgentBotDetector() + { + } + + /** + * Checks if is bot. + * + * @param userAgent + * the user agent + * @return true, if is bot + */ + public static boolean isBot(final String userAgent) + { + boolean result; + + if (StringUtils.isBlank(userAgent)) + { + result = true; + } + else if (StringUtils.equalsAny(userAgent.trim(), "-")) + { + result = true; + } + else if (StringsUtils.containsAnyIgnoreCase(userAgent, containList)) + { + result = true; + } + else if (StringsUtils.startsWithAny(userAgent, startList)) + { + result = true; + } + else + { + result = false; + } + + // + return result; + } +} diff --git a/src/fr/devinsy/statoolinfos/metrics/http/userAgentBotDetectorData.txt b/src/fr/devinsy/statoolinfos/metrics/http/userAgentBotDetectorData.txt new file mode 100644 index 0000000..e31369a --- /dev/null +++ b/src/fr/devinsy/statoolinfos/metrics/http/userAgentBotDetectorData.txt @@ -0,0 +1,34 @@ +^Apache-HttpClient/ +bot +BingPreview +crawler +^curl/ +^Friendica +^git/ +^github-camo +^hackney/ +HeadlessChrome/ +^http.rb/ +^FediList +^Go-http-client +^GoModuleMirror/ +^HotJava/ +^Java/ +^JGit/ +^MastoPeek +^mattermost- +^Misskey/ + - Mobilizon +monitoring +^newspaper/ +^node-fetch/ +^okhttp/ +^PeerTube/ +^PHP/ +^Pleroma +^python-requests/ +^python/ +^Python/ +^Synapse/ +^Tusky/ +YisouSpider diff --git a/test/fr/devinsy/statoolinfos/metrics/http/UserAgentBotDetectorTest.java b/test/fr/devinsy/statoolinfos/metrics/http/UserAgentBotDetectorTest.java new file mode 100644 index 0000000..5b6a3ae --- /dev/null +++ b/test/fr/devinsy/statoolinfos/metrics/http/UserAgentBotDetectorTest.java @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2021 Christian Pierre MOMON + * + * This file is part of StatoolInfos, simple key value database. + * + * StatoolInfos is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * StatoolInfos is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with StatoolInfos. If not, see . + */ +package fr.devinsy.statoolinfos.metrics.http; + +import org.apache.log4j.BasicConfigurator; +import org.apache.log4j.Level; +import org.apache.log4j.Logger; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import fr.devinsy.statoolinfos.core.StatoolInfosException; + +/** + * The Class UserAgentBotDetectorTest. + */ +public class UserAgentBotDetectorTest +{ + private static org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(UserAgentBotDetectorTest.class); + + /** + * Test 01. + * + * @throws Exception + * the exception + */ + @Test + public void test01() throws Exception + { + Assert.assertTrue(UserAgentBotDetector.isBot("")); + Assert.assertTrue(UserAgentBotDetector.isBot("-")); + Assert.assertTrue(UserAgentBotDetector.isBot("mobilizon.zapashcanon.fr - Mobilizon 1.1.2")); + Assert.assertTrue(UserAgentBotDetector.isBot("Pleroma 2.3.50-242-g8e9f032f-develop; https://soc.abcdefg.club ")); + + Assert.assertFalse(UserAgentBotDetector.isBot("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36")); + } + + /** + * After class. + * + * @throws StatoolInfosException + * the Juga exception + */ + @AfterClass + public static void afterClass() throws StatoolInfosException + { + } + + /** + * Before class. + * + * @throws StatoolInfosException + * the Juga exception + */ + @BeforeClass + public static void beforeClass() throws StatoolInfosException + { + BasicConfigurator.configure(); + Logger.getRootLogger().setLevel(Level.DEBUG); + } +}