Sites checks fixes (#1896)

Fixed incorrect site names, added method to compare sites
This commit is contained in:
Soxoj
2024-11-26 13:29:43 +01:00
committed by GitHub
parent f529d16c62
commit b370bc4c44
5 changed files with 110 additions and 21 deletions
+28 -7
View File
@@ -31025,7 +31025,7 @@
"qa-part-form-profile" "qa-part-form-profile"
] ]
}, },
".com": { "{username}.com": {
"protocol": "dns", "protocol": "dns",
"url": "{username}.com", "url": "{username}.com",
"urlMain": "{username}.com", "urlMain": "{username}.com",
@@ -31033,7 +31033,7 @@
"usernameUnclaimed": "noonewouldeverusethis7", "usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "status_code" "checkType": "status_code"
}, },
".pro": { "{username}.pro": {
"protocol": "dns", "protocol": "dns",
"url": "{username}.pro", "url": "{username}.pro",
"urlMain": "{username}.pro", "urlMain": "{username}.pro",
@@ -31041,7 +31041,7 @@
"usernameUnclaimed": "noonewouldeverusethis7", "usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "status_code" "checkType": "status_code"
}, },
".me": { "{username}.me": {
"protocol": "dns", "protocol": "dns",
"url": "{username}.me", "url": "{username}.me",
"urlMain": "{username}.me", "urlMain": "{username}.me",
@@ -31049,7 +31049,7 @@
"usernameUnclaimed": "noonewouldeverusethis7", "usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "status_code" "checkType": "status_code"
}, },
".biz": { "{username}.biz": {
"protocol": "dns", "protocol": "dns",
"url": "{username}.biz", "url": "{username}.biz",
"urlMain": "{username}.biz", "urlMain": "{username}.biz",
@@ -31057,7 +31057,7 @@
"usernameUnclaimed": "noonewouldeverusethis7", "usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "status_code" "checkType": "status_code"
}, },
".email": { "{username}.email": {
"protocol": "dns", "protocol": "dns",
"url": "{username}.email", "url": "{username}.email",
"urlMain": "{username}.email", "urlMain": "{username}.email",
@@ -31065,7 +31065,7 @@
"usernameUnclaimed": "noonewouldeverusethis7", "usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "status_code" "checkType": "status_code"
}, },
".guru": { "{username}.guru": {
"protocol": "dns", "protocol": "dns",
"url": "{username}.guru", "url": "{username}.guru",
"urlMain": "{username}.guru", "urlMain": "{username}.guru",
@@ -31073,7 +31073,7 @@
"usernameUnclaimed": "noonewouldeverusethis7", "usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "status_code" "checkType": "status_code"
}, },
".ddns.net": { "{username}.ddns.net": {
"protocol": "dns", "protocol": "dns",
"url": "{username}.ddns.net", "url": "{username}.ddns.net",
"urlMain": "{username}.ddns.net", "urlMain": "{username}.ddns.net",
@@ -35201,6 +35201,27 @@
"urlMain": "https://massagerepublic.com", "urlMain": "https://massagerepublic.com",
"usernameClaimed": "lily88", "usernameClaimed": "lily88",
"usernameUnclaimed": "xzhsxfyfzi" "usernameUnclaimed": "xzhsxfyfzi"
},
"mynickname.com": {
"checkType": "message",
"absenceStrs": [
"<h1>Error 404: Page not found</h1>",
"Nickname , certificate for username ",
"btn green",
"mailto:info@mynickname.com",
">Register nickname</span></a></p>"
],
"presenseStrs": [
" title=",
"bold",
"title-line",
"codehtml",
"User offline"
],
"url": "https://mynickname.com/{username}",
"urlMain": "https://mynickname.com",
"usernameClaimed": "godbrithil",
"usernameUnclaimed": "fqiakbtdhu"
} }
}, },
"engines": { "engines": {
+44
View File
@@ -80,6 +80,36 @@ class MaigretSite:
def __str__(self): def __str__(self):
return f"{self.name} ({self.url_main})" return f"{self.name} ({self.url_main})"
def __is_equal_by_url_or_name(self, url_or_name_str: str) -> bool:
    """Return True if the given string identifies this site.

    The comparison is case-insensitive. A string matches when it:

    * equals the site name exactly, or
    * contains the main URL, or is contained in it, or
    * is contained in the profile URL template.

    Empty ``url_main`` / ``url`` values never match. An empty query
    never matches either: ``"" in some_string`` is always True, so
    without the guard an empty string would be "equal" to every site.

    :param url_or_name_str: site name, or a full or partial URL.
    :return: ``True`` on a match, ``False`` otherwise (always a bool).
    """
    query = url_or_name_str.lower()
    site_url = self.url.lower()
    site_name = self.name.lower()
    site_url_main = self.url_main.lower()

    if not query:
        return False

    if site_name == query:
        return True

    # Exact equality with url_main is covered by the substring checks.
    if site_url_main and (site_url_main in query or query in site_url_main):
        return True

    return bool(site_url) and query in site_url
def __eq__(self, other):
    """Compare this site with another site or with a name/URL string.

    * Another ``MaigretSite`` is equal when every attribute listed in
      ``compared_attrs`` is equal on both objects. Internal state such
      as ``request_future`` is deliberately left out.
    * A ``str`` is delegated to ``__is_equal_by_url_or_name``.
    * Any other type compares unequal.

    NOTE(review): a class that defines ``__eq__`` without ``__hash__``
    gets ``__hash__`` set to ``None`` and becomes unhashable. Confirm
    that no caller puts sites into sets or uses them as dict keys.
    """
    if isinstance(other, MaigretSite):
        compared_attrs = (
            'name', 'url_main', 'url_subpath', 'type', 'headers',
            'errors', 'activation', 'regex_check', 'url_probe',
            'check_type', 'request_head_only', 'get_params',
            'presense_strs', 'absence_strs', 'stats', 'engine',
            'engine_data', 'alexa_rank', 'source', 'protocol',
        )
        for attr in compared_attrs:
            # Stop at the first attribute that differs.
            if not getattr(self, attr) == getattr(other, attr):
                return False
        return True

    if isinstance(other, str):
        return self.__is_equal_by_url_or_name(other)

    return False
def update_detectors(self): def update_detectors(self):
if "url" in self.__dict__: if "url" in self.__dict__:
url = self.url url = self.url
@@ -101,6 +131,10 @@ class MaigretSite:
return None return None
def extract_id_from_url(self, url: str) -> Optional[Tuple[str, str]]: def extract_id_from_url(self, url: str) -> Optional[Tuple[str, str]]:
"""
Extracts username from url.
It's outdated, detects only a format of https://example.com/{username}
"""
if not self.url_regexp: if not self.url_regexp:
return None return None
@@ -223,6 +257,16 @@ class MaigretDatabase:
def sites_dict(self): def sites_dict(self):
return {site.name: site for site in self._sites} return {site.name: site for site in self._sites}
def has_site(self, site: "MaigretSite | str") -> bool:
    """Return True if the database already contains an equal site.

    Every stored site is compared to ``site`` with ``==``, with ``site``
    on the left-hand side. ``site`` may be a ``MaigretSite`` or a plain
    string (a site name or URL, as the tests for this method pass).

    No output is written; the previous debug ``print`` on every match
    has been removed.

    :param site: site object, or a name/URL string, to look for.
    :return: ``True`` if any stored site compares equal, else ``False``.
    """
    return any(site == known for known in self._sites)
def __contains__(self, site):
    """Support ``site in database`` by delegating to ``has_site``."""
    found = self.has_site(site)
    return found
def ranked_sites_dict( def ranked_sites_dict(
self, self,
reverse=False, reverse=False,
+5
View File
@@ -154,6 +154,11 @@ class Submitter:
self.logger.info(f"Site {site.name} checking is finished") self.logger.info(f"Site {site.name} checking is finished")
# remove service tag "unchecked"
if "unchecked" in site.tags:
site.tags.remove("unchecked")
changes["tags"] = site.tags
return changes return changes
def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog): def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog):
+16 -14
View File
@@ -1,5 +1,5 @@
## List of supported sites (search methods): total 3125 ## List of supported sites (search methods): total 3126
Rank data fetched from Alexa by domains. Rank data fetched from Alexa by domains.
@@ -2864,13 +2864,13 @@ Rank data fetched from Alexa by domains.
1. ![](https://www.google.com/s2/favicons?domain=https://ovnl.in) [ovnl.in (https://ovnl.in)](https://ovnl.in)*: top 100M, forum*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://ovnl.in) [ovnl.in (https://ovnl.in)](https://ovnl.in)*: top 100M, forum*, search is disabled
1. ![](https://www.google.com/s2/favicons?domain=https://wls.social) [wls.social (https://wls.social)](https://wls.social)*: top 100M, blog*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://wls.social) [wls.social (https://wls.social)](https://wls.social)*: top 100M, blog*, search is disabled
1. ![](https://www.google.com/s2/favicons?domain=http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion) [HiddenAnswers (http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion)](http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion)*: top 100M, q&a, tor* 1. ![](https://www.google.com/s2/favicons?domain=http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion) [HiddenAnswers (http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion)](http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion)*: top 100M, q&a, tor*
1. ![](https://www.google.com/s2/favicons?domain={username}.com) [.com ({username}.com)]({username}.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain={username}.com) [{username}.com ({username}.com)]({username}.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.pro) [.pro ({username}.pro)]({username}.pro)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain={username}.pro) [{username}.pro ({username}.pro)]({username}.pro)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.me) [.me ({username}.me)]({username}.me)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain={username}.me) [{username}.me ({username}.me)]({username}.me)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.biz) [.biz ({username}.biz)]({username}.biz)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain={username}.biz) [{username}.biz ({username}.biz)]({username}.biz)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.email) [.email ({username}.email)]({username}.email)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain={username}.email) [{username}.email ({username}.email)]({username}.email)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.guru) [.guru ({username}.guru)]({username}.guru)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain={username}.guru) [{username}.guru ({username}.guru)]({username}.guru)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain={username}.ddns.net) [.ddns.net ({username}.ddns.net)]({username}.ddns.net)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain={username}.ddns.net) [{username}.ddns.net ({username}.ddns.net)]({username}.ddns.net)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=http://forum-history.ru) [forum-history.ru (http://forum-history.ru)](http://forum-history.ru)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=http://forum-history.ru) [forum-history.ru (http://forum-history.ru)](http://forum-history.ru)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://forum.alconar.ru) [forum.alconar.ru (https://forum.alconar.ru)](https://forum.alconar.ru)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://forum.alconar.ru) [forum.alconar.ru (https://forum.alconar.ru)](https://forum.alconar.ru)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://krskforum.com) [krskforum.com (https://krskforum.com)](https://krskforum.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://krskforum.com) [krskforum.com (https://krskforum.com)](https://krskforum.com)*: top 100M*
@@ -3117,6 +3117,7 @@ Rank data fetched from Alexa by domains.
1. ![](https://www.google.com/s2/favicons?domain=https://www.stopstalk.com) [www.stopstalk.com (https://www.stopstalk.com)](https://www.stopstalk.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://www.stopstalk.com) [www.stopstalk.com (https://www.stopstalk.com)](https://www.stopstalk.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://www.polywork.com) [www.polywork.com (https://www.polywork.com)](https://www.polywork.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://www.polywork.com) [www.polywork.com (https://www.polywork.com)](https://www.polywork.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://oshwlab.com) [oshwlab.com (https://oshwlab.com)](https://oshwlab.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://oshwlab.com) [oshwlab.com (https://oshwlab.com)](https://oshwlab.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://www.xshaker.net) [www.xshaker.net (https://www.xshaker.net)](https://www.xshaker.net)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://chaturbator.su) [chaturbator.su (https://chaturbator.su)](https://chaturbator.su)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://chaturbator.su) [chaturbator.su (https://chaturbator.su)](https://chaturbator.su)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://imgflip.com) [imgflip.com (https://imgflip.com)](https://imgflip.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://imgflip.com) [imgflip.com (https://imgflip.com)](https://imgflip.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://www.flickr.com) [www.flickr.com (https://www.flickr.com)](https://www.flickr.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://www.flickr.com) [www.flickr.com (https://www.flickr.com)](https://www.flickr.com)*: top 100M*
@@ -3127,21 +3128,22 @@ Rank data fetched from Alexa by domains.
1. ![](https://www.google.com/s2/favicons?domain=https://archive.transformativeworks.org) [archive.transformativeworks.org (https://archive.transformativeworks.org)](https://archive.transformativeworks.org)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://archive.transformativeworks.org) [archive.transformativeworks.org (https://archive.transformativeworks.org)](https://archive.transformativeworks.org)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://www.tnaflix.com) [www.tnaflix.com (https://www.tnaflix.com)](https://www.tnaflix.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://www.tnaflix.com) [www.tnaflix.com (https://www.tnaflix.com)](https://www.tnaflix.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://massagerepublic.com) [massagerepublic.com (https://massagerepublic.com)](https://massagerepublic.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://massagerepublic.com) [massagerepublic.com (https://massagerepublic.com)](https://massagerepublic.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://mynickname.com) [mynickname.com (https://mynickname.com)](https://mynickname.com)*: top 100M, unchecked*
The list was updated at (2024-11-25 17:22:43.959448+00:00 UTC) The list was updated at (2024-11-26 10:27:01.383232+00:00 UTC)
## Statistics ## Statistics
Enabled/total sites: 2693/3125 = 86.18% Enabled/total sites: 2694/3126 = 86.18%
Incomplete message checks: 405/2693 = 15.04% (false positive risks) Incomplete message checks: 405/2694 = 15.03% (false positive risks)
Status code checks: 720/2693 = 26.74% (false positive risks) Status code checks: 720/2694 = 26.73% (false positive risks)
False positive risk (total): 41.78% False positive risk (total): 41.76%
Top 20 profile URLs: Top 20 profile URLs:
- (796) `{urlMain}/index/8-0-{username} (uCoz)` - (796) `{urlMain}/index/8-0-{username} (uCoz)`
- (301) `/{username}` - (302) `/{username}`
- (221) `{urlMain}{urlSubpath}/members/?username={username} (XenForo)` - (221) `{urlMain}{urlSubpath}/members/?username={username} (XenForo)`
- (160) `/user/{username}` - (160) `/user/{username}`
- (133) `{urlMain}{urlSubpath}/member.php?username={username} (vBulletin)` - (133) `{urlMain}{urlSubpath}/member.php?username={username} (vBulletin)`
+17
View File
@@ -202,3 +202,20 @@ def test_get_url_template():
}, },
) )
assert site.get_url_template() == "SUBDOMAIN" assert site.get_url_template() == "SUBDOMAIN"
def test_has_site_url_or_name(default_db):
    """Check that ``has_site`` accepts a site name or a full/partial URL."""
    # by the same url or partial match
    assert default_db.has_site("https://aback.com.ua/user/")
    assert default_db.has_site("https://aback.com.ua")

    # acceptable partial match
    assert default_db.has_site("https://aback.com.ua/use")
    assert default_db.has_site("https://aback.com")

    # by name
    assert default_db.has_site("Aback")

    # strings that match no site
    assert not default_db.has_site("https://aeifgoai3h4g8a3u4g5")
    assert not default_db.has_site("aeifgoai3h4g8a3u4g5")