From b370bc4c44158ce1449ac080456f4ae1f4bf1358 Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Tue, 26 Nov 2024 13:29:43 +0100 Subject: [PATCH] Sites checks fixes (#1896) Fixed incorrect site names, added method to compare sites --- maigret/resources/data.json | 35 +++++++++++++++++++++++------ maigret/sites.py | 44 +++++++++++++++++++++++++++++++++++++ maigret/submit.py | 5 +++++ sites.md | 30 +++++++++++++------------ tests/test_sites.py | 17 ++++++++++++++ 5 files changed, 110 insertions(+), 21 deletions(-) diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 1814779..78ee883 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -31025,7 +31025,7 @@ "qa-part-form-profile" ] }, - ".com": { + "{username}.com": { "protocol": "dns", "url": "{username}.com", "urlMain": "{username}.com", @@ -31033,7 +31033,7 @@ "usernameUnclaimed": "noonewouldeverusethis7", "checkType": "status_code" }, - ".pro": { + "{username}.pro": { "protocol": "dns", "url": "{username}.pro", "urlMain": "{username}.pro", @@ -31041,7 +31041,7 @@ "usernameUnclaimed": "noonewouldeverusethis7", "checkType": "status_code" }, - ".me": { + "{username}.me": { "protocol": "dns", "url": "{username}.me", "urlMain": "{username}.me", @@ -31049,7 +31049,7 @@ "usernameUnclaimed": "noonewouldeverusethis7", "checkType": "status_code" }, - ".biz": { + "{username}.biz": { "protocol": "dns", "url": "{username}.biz", "urlMain": "{username}.biz", @@ -31057,7 +31057,7 @@ "usernameUnclaimed": "noonewouldeverusethis7", "checkType": "status_code" }, - ".email": { + "{username}.email": { "protocol": "dns", "url": "{username}.email", "urlMain": "{username}.email", @@ -31065,7 +31065,7 @@ "usernameUnclaimed": "noonewouldeverusethis7", "checkType": "status_code" }, - ".guru": { + "{username}.guru": { "protocol": "dns", "url": "{username}.guru", "urlMain": "{username}.guru", @@ -31073,7 +31073,7 @@ "usernameUnclaimed": "noonewouldeverusethis7", "checkType": "status_code" }, - ".ddns.net": { + "{username}.ddns.net": { "protocol": "dns", "url": "{username}.ddns.net", "urlMain": "{username}.ddns.net", @@ -35201,6 +35201,27 @@ "urlMain": "https://massagerepublic.com", "usernameClaimed": "lily88", "usernameUnclaimed": "xzhsxfyfzi" + }, + "mynickname.com": { + "checkType": "message", + "absenceStrs": [ + "

Error 404: Page not found

", + "Nickname , certificate for username ", + "btn green", + "mailto:info@mynickname.com", + ">Register nickname

" + ], + "presenseStrs": [ + " title=", + "bold", + "title-line", + "codehtml", + "User offline" + ], + "url": "https://mynickname.com/{username}", + "urlMain": "https://mynickname.com", + "usernameClaimed": "godbrithil", + "usernameUnclaimed": "fqiakbtdhu" } }, "engines": { diff --git a/maigret/sites.py b/maigret/sites.py index bc71f84..267928f 100644 --- a/maigret/sites.py +++ b/maigret/sites.py @@ -80,6 +80,36 @@ class MaigretSite: def __str__(self): return f"{self.name} ({self.url_main})" + def __is_equal_by_url_or_name(self, url_or_name_str: str): + lower_url_or_name_str = url_or_name_str.lower() + lower_url = self.url.lower() + lower_name = self.name.lower() + lower_url_main = self.url_main.lower() + + return \ + lower_name == lower_url_or_name_str or \ + (lower_url_main and lower_url_main == lower_url_or_name_str) or \ + (lower_url_main and lower_url_main in lower_url_or_name_str) or \ + (lower_url_main and lower_url_or_name_str in lower_url_main) or \ + (lower_url and lower_url_or_name_str in lower_url) + + def __eq__(self, other): + if isinstance(other, MaigretSite): + # Compare only relevant attributes, not internal state like request_future + attrs_to_compare = ['name', 'url_main', 'url_subpath', 'type', 'headers', + 'errors', 'activation', 'regex_check', 'url_probe', + 'check_type', 'request_head_only', 'get_params', + 'presense_strs', 'absence_strs', 'stats', 'engine', + 'engine_data', 'alexa_rank', 'source', 'protocol'] + + return all(getattr(self, attr) == getattr(other, attr) + for attr in attrs_to_compare) + elif isinstance(other, str): + # Compare only by name (exactly) or url_main (partial similarity) + return self.__is_equal_by_url_or_name(other) + return False + + def update_detectors(self): if "url" in self.__dict__: url = self.url @@ -101,6 +131,10 @@ class MaigretSite: return None def extract_id_from_url(self, url: str) -> Optional[Tuple[str, str]]: + """ + Extracts username from url. + It's outdated, detects only a format of https://example.com/{username} + """ if not self.url_regexp: return None @@ -223,6 +257,16 @@ class MaigretDatabase: def sites_dict(self): return {site.name: site for site in self._sites} + def has_site(self, site: MaigretSite): + for s in self._sites: + if site == s: + print(f"input == site: {site} == {s}") + return True + return False + + def __contains__(self, site): + return self.has_site(site) + def ranked_sites_dict( self, reverse=False, diff --git a/maigret/submit.py b/maigret/submit.py index e623ed7..f980082 100644 --- a/maigret/submit.py +++ b/maigret/submit.py @@ -154,6 +154,11 @@ class Submitter: self.logger.info(f"Site {site.name} checking is finished") + # remove service tag "unchecked" + if "unchecked" in site.tags: + site.tags.remove("unchecked") + changes["tags"] = site.tags + return changes def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog): diff --git a/sites.md b/sites.md index 7c6c26a..3481cb1 100644 --- a/sites.md +++ b/sites.md @@ -1,5 +1,5 @@ -## List of supported sites (search methods): total 3125 +## List of supported sites (search methods): total 3126 Rank data fetched from Alexa by domains. @@ -2864,13 +2864,13 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://ovnl.in) [ovnl.in (https://ovnl.in)](https://ovnl.in)*: top 100M, forum*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://wls.social) [wls.social (https://wls.social)](https://wls.social)*: top 100M, blog*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion) [HiddenAnswers (http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion)](http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion)*: top 100M, q&a, tor* -1. ![](https://www.google.com/s2/favicons?domain={username}.com) [.com ({username}.com)]({username}.com)*: top 100M* -1. ![](https://www.google.com/s2/favicons?domain={username}.pro) [.pro ({username}.pro)]({username}.pro)*: top 100M* -1. ![](https://www.google.com/s2/favicons?domain={username}.me) [.me ({username}.me)]({username}.me)*: top 100M* -1. ![](https://www.google.com/s2/favicons?domain={username}.biz) [.biz ({username}.biz)]({username}.biz)*: top 100M* -1. ![](https://www.google.com/s2/favicons?domain={username}.email) [.email ({username}.email)]({username}.email)*: top 100M* -1. ![](https://www.google.com/s2/favicons?domain={username}.guru) [.guru ({username}.guru)]({username}.guru)*: top 100M* -1. ![](https://www.google.com/s2/favicons?domain={username}.ddns.net) [.ddns.net ({username}.ddns.net)]({username}.ddns.net)*: top 100M* +1. ![](https://www.google.com/s2/favicons?domain={username}.com) [{username}.com ({username}.com)]({username}.com)*: top 100M* +1. ![](https://www.google.com/s2/favicons?domain={username}.pro) [{username}.pro ({username}.pro)]({username}.pro)*: top 100M* +1. ![](https://www.google.com/s2/favicons?domain={username}.me) [{username}.me ({username}.me)]({username}.me)*: top 100M* +1. ![](https://www.google.com/s2/favicons?domain={username}.biz) [{username}.biz ({username}.biz)]({username}.biz)*: top 100M* +1. ![](https://www.google.com/s2/favicons?domain={username}.email) [{username}.email ({username}.email)]({username}.email)*: top 100M* +1. ![](https://www.google.com/s2/favicons?domain={username}.guru) [{username}.guru ({username}.guru)]({username}.guru)*: top 100M* +1. ![](https://www.google.com/s2/favicons?domain={username}.ddns.net) [{username}.ddns.net ({username}.ddns.net)]({username}.ddns.net)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=http://forum-history.ru) [forum-history.ru (http://forum-history.ru)](http://forum-history.ru)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://forum.alconar.ru) [forum.alconar.ru (https://forum.alconar.ru)](https://forum.alconar.ru)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://krskforum.com) [krskforum.com (https://krskforum.com)](https://krskforum.com)*: top 100M* @@ -3117,6 +3117,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://www.stopstalk.com) [www.stopstalk.com (https://www.stopstalk.com)](https://www.stopstalk.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://www.polywork.com) [www.polywork.com (https://www.polywork.com)](https://www.polywork.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://oshwlab.com) [oshwlab.com (https://oshwlab.com)](https://oshwlab.com)*: top 100M* +1. ![](https://www.google.com/s2/favicons?domain=https://www.xshaker.net) [www.xshaker.net (https://www.xshaker.net)](https://www.xshaker.net)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://chaturbator.su) [chaturbator.su (https://chaturbator.su)](https://chaturbator.su)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://imgflip.com) [imgflip.com (https://imgflip.com)](https://imgflip.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://www.flickr.com) [www.flickr.com (https://www.flickr.com)](https://www.flickr.com)*: top 100M* @@ -3127,21 +3128,22 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://archive.transformativeworks.org) [archive.transformativeworks.org (https://archive.transformativeworks.org)](https://archive.transformativeworks.org)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://www.tnaflix.com) [www.tnaflix.com (https://www.tnaflix.com)](https://www.tnaflix.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://massagerepublic.com) [massagerepublic.com (https://massagerepublic.com)](https://massagerepublic.com)*: top 100M* +1. ![](https://www.google.com/s2/favicons?domain=https://mynickname.com) [mynickname.com (https://mynickname.com)](https://mynickname.com)*: top 100M, unchecked* -The list was updated at (2024-11-25 17:22:43.959448+00:00 UTC) +The list was updated at (2024-11-26 10:27:01.383232+00:00 UTC) ## Statistics -Enabled/total sites: 2693/3125 = 86.18% +Enabled/total sites: 2694/3126 = 86.18% -Incomplete message checks: 405/2693 = 15.04% (false positive risks) +Incomplete message checks: 405/2694 = 15.03% (false positive risks) -Status code checks: 720/2693 = 26.74% (false positive risks) +Status code checks: 720/2694 = 26.73% (false positive risks) -False positive risk (total): 41.78% +False positive risk (total): 41.76% Top 20 profile URLs: - (796) `{urlMain}/index/8-0-{username} (uCoz)` -- (301) `/{username}` +- (302) `/{username}` - (221) `{urlMain}{urlSubpath}/members/?username={username} (XenForo)` - (160) `/user/{username}` - (133) `{urlMain}{urlSubpath}/member.php?username={username} (vBulletin)` diff --git a/tests/test_sites.py b/tests/test_sites.py index f0a4092..7b386cf 100644 --- a/tests/test_sites.py +++ b/tests/test_sites.py @@ -202,3 +202,20 @@ def test_get_url_template(): }, ) assert site.get_url_template() == "SUBDOMAIN" + + +def test_has_site_url_or_name(default_db): + # by the same url or partial match + assert default_db.has_site("https://aback.com.ua/user/") == True + assert default_db.has_site("https://aback.com.ua") == True + + # acceptable partial match + assert default_db.has_site("https://aback.com.ua/use") == True + assert default_db.has_site("https://aback.com") == True + + # by name + assert default_db.has_site("Aback") == True + + # false + assert default_db.has_site("https://aeifgoai3h4g8a3u4g5") == False + assert default_db.has_site("aeifgoai3h4g8a3u4g5") == False