Refactored user agent bot detection feature.
This commit is contained in:
parent
440d973624
commit
a1e1a60c5a
4 changed files with 224 additions and 21 deletions
|
@ -23,13 +23,11 @@ import java.time.format.DateTimeFormatter;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import fr.devinsy.statoolinfos.metrics.TimeMarkUtils;
|
import fr.devinsy.statoolinfos.metrics.TimeMarkUtils;
|
||||||
import fr.devinsy.strings.StringList;
|
import fr.devinsy.strings.StringList;
|
||||||
import fr.devinsy.strings.StringsUtils;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The Class HttpAccessLog.
|
* The Class HttpAccessLog.
|
||||||
|
@ -179,30 +177,15 @@ public class HttpAccessLog
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return
|
* Checks if is bot.
|
||||||
|
*
|
||||||
|
* @return true, if is bot
|
||||||
*/
|
*/
|
||||||
public boolean isBot()
|
public boolean isBot()
|
||||||
{
|
{
|
||||||
boolean result;
|
boolean result;
|
||||||
|
|
||||||
if (StringsUtils.containsAnyIgnoreCase(this.userAgent.toString(), "BingPreview", "bot", "crawler", "HeadlessChrome/", " - Mobilizon ", "monitoring", "YisouSpider"))
|
result = UserAgentBotDetector.isBot(this.userAgent.toString());
|
||||||
{
|
|
||||||
result = true;
|
|
||||||
}
|
|
||||||
else if (StringUtils.startsWithAny(this.userAgent.toString(), "Apache-HttpClient/", "curl/", "Friendica ", "git/", "github-camo", "hackney/", "http.rb/", "FediList ", "Go-http-client",
|
|
||||||
"GoModuleMirror/", "HotJava/", "Java/", "JGit/", "MastoPeek ", "mattermost-", "Misskey/", "newspaper/", "node-fetch/", "okhttp/", "PeerTube/", "PHP/", "Pleroma ", "python-requests/",
|
|
||||||
"python/", "Python/", "Synapse/", "Tusky/"))
|
|
||||||
{
|
|
||||||
result = true;
|
|
||||||
}
|
|
||||||
else if (StringUtils.equalsAnyIgnoreCase(this.userAgent.toString(), "-"))
|
|
||||||
{
|
|
||||||
result = true;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
result = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
//
|
//
|
||||||
return result;
|
return result;
|
||||||
|
|
|
@ -0,0 +1,108 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2021 Christian Pierre MOMON <christian@momon.org>
|
||||||
|
*
|
||||||
|
* This file is part of StatoolInfos, simple service statistics tool.
|
||||||
|
*
|
||||||
|
* StatoolInfos is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License as
|
||||||
|
* published by the Free Software Foundation, either version 3 of the
|
||||||
|
* License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* StatoolInfos is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Affero General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with StatoolInfos. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package fr.devinsy.statoolinfos.metrics.http;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import fr.devinsy.strings.StringList;
|
||||||
|
import fr.devinsy.strings.StringsUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The Class UserAgentBotDetector.
|
||||||
|
*/
|
||||||
|
public class UserAgentBotDetector
|
||||||
|
{
|
||||||
|
private static Logger logger = LoggerFactory.getLogger(UserAgentBotDetector.class);
|
||||||
|
|
||||||
|
private static final StringList startList = new StringList();
|
||||||
|
private static final StringList containList = new StringList();
|
||||||
|
|
||||||
|
static
|
||||||
|
{
|
||||||
|
StringList lines;
|
||||||
|
try
|
||||||
|
{
|
||||||
|
lines = StringsUtils.load(UserAgentBotDetector.class.getResource("/fr/devinsy/statoolinfos/metrics/http/userAgentBotDetectorData.txt"));
|
||||||
|
}
|
||||||
|
catch (IOException exception)
|
||||||
|
{
|
||||||
|
exception.printStackTrace();
|
||||||
|
lines = new StringList();
|
||||||
|
}
|
||||||
|
|
||||||
|
for (String line : lines)
|
||||||
|
{
|
||||||
|
if (line.startsWith("^"))
|
||||||
|
{
|
||||||
|
startList.add(line.substring(1));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
containList.add(line);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Instantiates a new user agent bot detector.
|
||||||
|
*/
|
||||||
|
private UserAgentBotDetector()
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if is bot.
|
||||||
|
*
|
||||||
|
* @param userAgent
|
||||||
|
* the user agent
|
||||||
|
* @return true, if is bot
|
||||||
|
*/
|
||||||
|
public static boolean isBot(final String userAgent)
|
||||||
|
{
|
||||||
|
boolean result;
|
||||||
|
|
||||||
|
if (StringUtils.isBlank(userAgent))
|
||||||
|
{
|
||||||
|
result = true;
|
||||||
|
}
|
||||||
|
else if (StringUtils.equalsAny(userAgent.trim(), "-"))
|
||||||
|
{
|
||||||
|
result = true;
|
||||||
|
}
|
||||||
|
else if (StringsUtils.containsAnyIgnoreCase(userAgent, containList))
|
||||||
|
{
|
||||||
|
result = true;
|
||||||
|
}
|
||||||
|
else if (StringsUtils.startsWithAny(userAgent, startList))
|
||||||
|
{
|
||||||
|
result = true;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
result = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,34 @@
|
||||||
|
^Apache-HttpClient/
|
||||||
|
bot
|
||||||
|
BingPreview
|
||||||
|
crawler
|
||||||
|
^curl/
|
||||||
|
^Friendica
|
||||||
|
^git/
|
||||||
|
^github-camo
|
||||||
|
^hackney/
|
||||||
|
HeadlessChrome/
|
||||||
|
^http.rb/
|
||||||
|
^FediList
|
||||||
|
^Go-http-client
|
||||||
|
^GoModuleMirror/
|
||||||
|
^HotJava/
|
||||||
|
^Java/
|
||||||
|
^JGit/
|
||||||
|
^MastoPeek
|
||||||
|
^mattermost-
|
||||||
|
^Misskey/
|
||||||
|
- Mobilizon
|
||||||
|
monitoring
|
||||||
|
^newspaper/
|
||||||
|
^node-fetch/
|
||||||
|
^okhttp/
|
||||||
|
^PeerTube/
|
||||||
|
^PHP/
|
||||||
|
^Pleroma
|
||||||
|
^python-requests/
|
||||||
|
^python/
|
||||||
|
^Python/
|
||||||
|
^Synapse/
|
||||||
|
^Tusky/
|
||||||
|
YisouSpider
|
|
@ -0,0 +1,78 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2021 Christian Pierre MOMON <christian@momon.org>
|
||||||
|
*
|
||||||
|
* This file is part of StatoolInfos, simple service statistics tool.
|
||||||
|
*
|
||||||
|
* StatoolInfos is free software: you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU Affero General Public License as
|
||||||
|
* published by the Free Software Foundation, either version 3 of the
|
||||||
|
* License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* StatoolInfos is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU Affero General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU Affero General Public License
|
||||||
|
* along with StatoolInfos. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
package fr.devinsy.statoolinfos.metrics.http;
|
||||||
|
|
||||||
|
import org.apache.log4j.BasicConfigurator;
|
||||||
|
import org.apache.log4j.Level;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.junit.AfterClass;
|
||||||
|
import org.junit.Assert;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
import fr.devinsy.statoolinfos.core.StatoolInfosException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The Class UserAgentBotDetectorTest.
|
||||||
|
*/
|
||||||
|
public class UserAgentBotDetectorTest
|
||||||
|
{
|
||||||
|
private static org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(UserAgentBotDetectorTest.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test 01.
|
||||||
|
*
|
||||||
|
* @throws Exception
|
||||||
|
* the exception
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void test01() throws Exception
|
||||||
|
{
|
||||||
|
Assert.assertTrue(UserAgentBotDetector.isBot(""));
|
||||||
|
Assert.assertTrue(UserAgentBotDetector.isBot("-"));
|
||||||
|
Assert.assertTrue(UserAgentBotDetector.isBot("mobilizon.zapashcanon.fr - Mobilizon 1.1.2"));
|
||||||
|
Assert.assertTrue(UserAgentBotDetector.isBot("Pleroma 2.3.50-242-g8e9f032f-develop; https://soc.abcdefg.club <ary@abcdefg.club>"));
|
||||||
|
|
||||||
|
Assert.assertFalse(UserAgentBotDetector.isBot("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36"));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* After class.
|
||||||
|
*
|
||||||
|
* @throws StatoolInfosException
|
||||||
|
* the Juga exception
|
||||||
|
*/
|
||||||
|
@AfterClass
|
||||||
|
public static void afterClass() throws StatoolInfosException
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Before class.
|
||||||
|
*
|
||||||
|
* @throws StatoolInfosException
|
||||||
|
* the Juga exception
|
||||||
|
*/
|
||||||
|
@BeforeClass
|
||||||
|
public static void beforeClass() throws StatoolInfosException
|
||||||
|
{
|
||||||
|
BasicConfigurator.configure();
|
||||||
|
Logger.getRootLogger().setLevel(Level.DEBUG);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue