Refactored user agent bot detection feature.

This commit is contained in:
Christian P. MOMON 2021-06-07 03:41:48 +02:00
parent 440d973624
commit a1e1a60c5a
4 changed files with 224 additions and 21 deletions

View file

@ -23,13 +23,11 @@ import java.time.format.DateTimeFormatter;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import fr.devinsy.statoolinfos.metrics.TimeMarkUtils;
import fr.devinsy.strings.StringList;
import fr.devinsy.strings.StringsUtils;
/**
* The Class HttpAccessLog.
@ -179,30 +177,15 @@ public class HttpAccessLog
}
/**
* @return
* Checks if is bot.
*
* @return true, if is bot
*/
public boolean isBot()
{
boolean result;
if (StringsUtils.containsAnyIgnoreCase(this.userAgent.toString(), "BingPreview", "bot", "crawler", "HeadlessChrome/", " - Mobilizon ", "monitoring", "YisouSpider"))
{
result = true;
}
else if (StringUtils.startsWithAny(this.userAgent.toString(), "Apache-HttpClient/", "curl/", "Friendica ", "git/", "github-camo", "hackney/", "http.rb/", "FediList ", "Go-http-client",
"GoModuleMirror/", "HotJava/", "Java/", "JGit/", "MastoPeek ", "mattermost-", "Misskey/", "newspaper/", "node-fetch/", "okhttp/", "PeerTube/", "PHP/", "Pleroma ", "python-requests/",
"python/", "Python/", "Synapse/", "Tusky/"))
{
result = true;
}
else if (StringUtils.equalsAnyIgnoreCase(this.userAgent.toString(), "-"))
{
result = true;
}
else
{
result = false;
}
result = UserAgentBotDetector.isBot(this.userAgent.toString());
//
return result;

View file

@ -0,0 +1,108 @@
/*
* Copyright (C) 2021 Christian Pierre MOMON <christian@momon.org>
*
* This file is part of StatoolInfos, simple service statistics tool.
*
* StatoolInfos is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* StatoolInfos is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with StatoolInfos. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.devinsy.statoolinfos.metrics.http;
import java.io.IOException;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import fr.devinsy.strings.StringList;
import fr.devinsy.strings.StringsUtils;
/**
 * Detects bots from the user agent string of an HTTP request.
 *
 * <p>
 * The detection rules are loaded once, at class initialization, from the
 * classpath resource {@value #DATA_RESOURCE}, one token per line:
 * </p>
 * <ul>
 * <li>a line starting with {@code ^} is a "start" rule: the token after the
 * {@code ^} is checked with {@code StringsUtils.startsWithAny};</li>
 * <li>any other line is a "contain" rule, checked with
 * {@code StringsUtils.containsAnyIgnoreCase}.</li>
 * </ul>
 *
 * <p>
 * Lines are deliberately NOT trimmed, because leading or trailing spaces can
 * be part of a token. Blank lines and empty tokens are ignored, otherwise an
 * empty token would match every user agent.
 * </p>
 */
public class UserAgentBotDetector
{
    private static final Logger logger = LoggerFactory.getLogger(UserAgentBotDetector.class);

    /** Classpath location of the rule file. */
    private static final String DATA_RESOURCE = "/fr/devinsy/statoolinfos/metrics/http/userAgentBotDetectorData.txt";

    private static final StringList startList = new StringList();
    private static final StringList containList = new StringList();

    static
    {
        StringList lines;
        try
        {
            // Class.getResource returns null when the resource is missing:
            // check it here rather than passing null to the loader inside a
            // static initializer.
            java.net.URL source = UserAgentBotDetector.class.getResource(DATA_RESOURCE);
            if (source == null)
            {
                logger.error("Bot detection data not found in classpath: {}", DATA_RESOURCE);
                lines = new StringList();
            }
            else
            {
                lines = StringsUtils.load(source);
            }
        }
        catch (IOException exception)
        {
            // Best effort: keep running with no rule rather than failing
            // the class initialization.
            logger.error("Error loading bot detection data: " + DATA_RESOURCE, exception);
            lines = new StringList();
        }

        for (String line : lines)
        {
            if (StringUtils.isBlank(line))
            {
                // A blank "contain" token would flag every user agent.
                continue;
            }

            if (line.startsWith("^"))
            {
                String token = line.substring(1);
                if (!token.isEmpty())
                {
                    startList.add(token);
                }
            }
            else
            {
                containList.add(line);
            }
        }
    }

    /**
     * Not instantiable: static utility class.
     */
    private UserAgentBotDetector()
    {
    }

    /**
     * Checks if a user agent belongs to a bot.
     *
     * <p>
     * A null, blank or {@code "-"} user agent is considered as a bot. Otherwise
     * the user agent is a bot when it matches one of the "contain" rules or
     * one of the "start" rules.
     * </p>
     *
     * @param userAgent
     *            the user agent, may be null
     * @return true, if the user agent is considered as a bot
     */
    public static boolean isBot(final String userAgent)
    {
        boolean result;

        if (StringUtils.isBlank(userAgent))
        {
            result = true;
        }
        else if ("-".equals(userAgent.trim()))
        {
            result = true;
        }
        else if (StringsUtils.containsAnyIgnoreCase(userAgent, containList))
        {
            result = true;
        }
        else if (StringsUtils.startsWithAny(userAgent, startList))
        {
            result = true;
        }
        else
        {
            result = false;
        }

        //
        return result;
    }
}

View file

@ -0,0 +1,34 @@
^Apache-HttpClient/
bot
BingPreview
crawler
^curl/
^Friendica
^git/
^github-camo
^hackney/
HeadlessChrome/
^http.rb/
^FediList
^Go-http-client
^GoModuleMirror/
^HotJava/
^Java/
^JGit/
^MastoPeek
^mattermost-
^Misskey/
- Mobilizon
monitoring
^newspaper/
^node-fetch/
^okhttp/
^PeerTube/
^PHP/
^Pleroma
^python-requests/
^python/
^Python/
^Synapse/
^Tusky/
YisouSpider

View file

@ -0,0 +1,78 @@
/*
* Copyright (C) 2021 Christian Pierre MOMON <christian@momon.org>
*
* This file is part of StatoolInfos, simple service statistics tool.
*
* StatoolInfos is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* StatoolInfos is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with StatoolInfos. If not, see <http://www.gnu.org/licenses/>.
*/
package fr.devinsy.statoolinfos.metrics.http;
import org.apache.log4j.BasicConfigurator;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import fr.devinsy.statoolinfos.core.StatoolInfosException;
/**
 * Unit tests of {@link UserAgentBotDetector}.
 */
public class UserAgentBotDetectorTest
{
    private static org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(UserAgentBotDetectorTest.class);

    /**
     * Runs once before the tests: applies the basic log4j configuration and
     * sets the root logger level to DEBUG.
     *
     * @throws StatoolInfosException
     *             declared by the signature
     */
    @BeforeClass
    public static void beforeClass() throws StatoolInfosException
    {
        BasicConfigurator.configure();

        final Logger rootLogger = Logger.getRootLogger();
        rootLogger.setLevel(Level.DEBUG);
    }

    /**
     * Runs once after the tests: nothing to release.
     *
     * @throws StatoolInfosException
     *             declared by the signature
     */
    @AfterClass
    public static void afterClass() throws StatoolInfosException
    {
    }

    /**
     * Checks the detection on an empty user agent, on the "-" placeholder, on
     * two fediverse software user agents and on a desktop browser user agent.
     *
     * @throws Exception
     *             the exception
     */
    @Test
    public void test01() throws Exception
    {
        final String emptyAgent = "";
        final String dashAgent = "-";
        final String mobilizonAgent = "mobilizon.zapashcanon.fr - Mobilizon 1.1.2";
        final String pleromaAgent = "Pleroma 2.3.50-242-g8e9f032f-develop; https://soc.abcdefg.club <ary@abcdefg.club>";
        final String browserAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36";

        // Expected to be detected as bots.
        Assert.assertTrue(UserAgentBotDetector.isBot(emptyAgent));
        Assert.assertTrue(UserAgentBotDetector.isBot(dashAgent));
        Assert.assertTrue(UserAgentBotDetector.isBot(mobilizonAgent));
        Assert.assertTrue(UserAgentBotDetector.isBot(pleromaAgent));

        // Expected to be detected as a human visitor.
        Assert.assertFalse(UserAgentBotDetector.isBot(browserAgent));
    }
}