mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-06 14:08:59 +00:00
Sites checks fixes (#1896)
Fixed incorrect site names, added method to compare sites
This commit is contained in:
@@ -31025,7 +31025,7 @@
|
||||
"qa-part-form-profile"
|
||||
]
|
||||
},
|
||||
".com": {
|
||||
"{username}.com": {
|
||||
"protocol": "dns",
|
||||
"url": "{username}.com",
|
||||
"urlMain": "{username}.com",
|
||||
@@ -31033,7 +31033,7 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"checkType": "status_code"
|
||||
},
|
||||
".pro": {
|
||||
"{username}.pro": {
|
||||
"protocol": "dns",
|
||||
"url": "{username}.pro",
|
||||
"urlMain": "{username}.pro",
|
||||
@@ -31041,7 +31041,7 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"checkType": "status_code"
|
||||
},
|
||||
".me": {
|
||||
"{username}.me": {
|
||||
"protocol": "dns",
|
||||
"url": "{username}.me",
|
||||
"urlMain": "{username}.me",
|
||||
@@ -31049,7 +31049,7 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"checkType": "status_code"
|
||||
},
|
||||
".biz": {
|
||||
"{username}.biz": {
|
||||
"protocol": "dns",
|
||||
"url": "{username}.biz",
|
||||
"urlMain": "{username}.biz",
|
||||
@@ -31057,7 +31057,7 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"checkType": "status_code"
|
||||
},
|
||||
".email": {
|
||||
"{username}.email": {
|
||||
"protocol": "dns",
|
||||
"url": "{username}.email",
|
||||
"urlMain": "{username}.email",
|
||||
@@ -31065,7 +31065,7 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"checkType": "status_code"
|
||||
},
|
||||
".guru": {
|
||||
"{username}.guru": {
|
||||
"protocol": "dns",
|
||||
"url": "{username}.guru",
|
||||
"urlMain": "{username}.guru",
|
||||
@@ -31073,7 +31073,7 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"checkType": "status_code"
|
||||
},
|
||||
".ddns.net": {
|
||||
"{username}.ddns.net": {
|
||||
"protocol": "dns",
|
||||
"url": "{username}.ddns.net",
|
||||
"urlMain": "{username}.ddns.net",
|
||||
@@ -35201,6 +35201,27 @@
|
||||
"urlMain": "https://massagerepublic.com",
|
||||
"usernameClaimed": "lily88",
|
||||
"usernameUnclaimed": "xzhsxfyfzi"
|
||||
},
|
||||
"mynickname.com": {
|
||||
"checkType": "message",
|
||||
"absenceStrs": [
|
||||
"<h1>Error 404: Page not found</h1>",
|
||||
"Nickname , certificate for username ",
|
||||
"btn green",
|
||||
"mailto:info@mynickname.com",
|
||||
">Register nickname</span></a></p>"
|
||||
],
|
||||
"presenseStrs": [
|
||||
" title=",
|
||||
"bold",
|
||||
"title-line",
|
||||
"codehtml",
|
||||
"User offline"
|
||||
],
|
||||
"url": "https://mynickname.com/{username}",
|
||||
"urlMain": "https://mynickname.com",
|
||||
"usernameClaimed": "godbrithil",
|
||||
"usernameUnclaimed": "fqiakbtdhu"
|
||||
}
|
||||
},
|
||||
"engines": {
|
||||
|
||||
@@ -80,6 +80,36 @@ class MaigretSite:
|
||||
def __str__(self):
    """
    Return a short human-readable label: the site name followed by its
    main URL in parentheses.
    """
    label, home = self.name, self.url_main
    return f"{label} ({home})"
|
||||
|
||||
def __is_equal_by_url_or_name(self, url_or_name_str: str) -> bool:
    """
    Check whether a plain string refers to this site.

    The comparison is case-insensitive. The string matches when:

    - it equals the site name, or
    - it contains ``url_main`` or is contained in ``url_main``, or
    - it is contained in ``url``.

    Empty ``url_main`` / ``url`` values never match, and an empty query
    string never matches any site.

    :param url_or_name_str: site name or (partial) URL to look for
    :return: True if the string matches this site, False otherwise
    """
    needle = url_or_name_str.lower()
    if not needle:
        # "" is a substring of every string, so without this guard an empty
        # query would match every site that has a URL.
        return False

    if self.name.lower() == needle:
        return True

    # Containment in either direction also covers exact equality.
    url_main = self.url_main.lower()
    if url_main and (url_main in needle or needle in url_main):
        return True

    url = self.url.lower()
    # bool() keeps the result a real boolean even when ``url`` is empty.
    return bool(url) and needle in url
|
||||
|
||||
def __eq__(self, other):
    """
    Compare this site with another site or with a name/URL string.

    - Against another ``MaigretSite``: equal when every attribute listed in
      ``attrs_to_compare`` is equal on both objects.
    - Against a ``str``: delegates to the name/URL matcher, which compares
      the name exactly and the URLs by containment.
    - Against anything else: returns ``NotImplemented`` so Python can try
      the reflected comparison and finally fall back to ``False``.

    NOTE(review): a class that defines ``__eq__`` without ``__hash__``
    becomes unhashable; confirm no caller stores sites in sets or uses them
    as dict keys.
    """
    if isinstance(other, MaigretSite):
        # Compare only relevant attributes, not internal state like request_future
        attrs_to_compare = (
            'name', 'url_main', 'url_subpath', 'type', 'headers',
            'errors', 'activation', 'regex_check', 'url_probe',
            'check_type', 'request_head_only', 'get_params',
            'presense_strs', 'absence_strs', 'stats', 'engine',
            'engine_data', 'alexa_rank', 'source', 'protocol',
        )
        return all(
            getattr(self, attr) == getattr(other, attr)
            for attr in attrs_to_compare
        )

    if isinstance(other, str):
        # Name is compared exactly, URLs by partial (substring) similarity.
        # bool() guarantees a real boolean whatever the helper returns.
        return bool(self.__is_equal_by_url_or_name(other))

    # Unsupported operand type: let the interpreter handle the fallback.
    return NotImplemented
|
||||
|
||||
|
||||
def update_detectors(self):
|
||||
if "url" in self.__dict__:
|
||||
url = self.url
|
||||
@@ -101,6 +131,10 @@ class MaigretSite:
|
||||
return None
|
||||
|
||||
def extract_id_from_url(self, url: str) -> Optional[Tuple[str, str]]:
|
||||
"""
|
||||
Extracts username from url.
|
||||
It's outdated, detects only a format of https://example.com/{username}
|
||||
"""
|
||||
if not self.url_regexp:
|
||||
return None
|
||||
|
||||
@@ -223,6 +257,16 @@ class MaigretDatabase:
|
||||
def sites_dict(self):
|
||||
return {site.name: site for site in self._sites}
|
||||
|
||||
def has_site(self, site: "MaigretSite | str") -> bool:
    """
    Tell whether the database holds an entry equal to ``site``.

    Every element of ``self._sites`` is compared with ``==``, so the
    matching rules are the ones implemented by the stored entries'
    equality. The lookup produces no output.

    :param site: a site object, or a site name / URL given as a string
    :return: True if at least one stored entry compares equal
    """
    return any(site == known for known in self._sites)


def __contains__(self, site):
    """
    Support ``site in db`` by delegating to ``has_site``.
    """
    return self.has_site(site)
|
||||
|
||||
def ranked_sites_dict(
|
||||
self,
|
||||
reverse=False,
|
||||
|
||||
@@ -154,6 +154,11 @@ class Submitter:
|
||||
|
||||
self.logger.info(f"Site {site.name} checking is finished")
|
||||
|
||||
# remove service tag "unchecked"
|
||||
if "unchecked" in site.tags:
|
||||
site.tags.remove("unchecked")
|
||||
changes["tags"] = site.tags
|
||||
|
||||
return changes
|
||||
|
||||
def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog):
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
## List of supported sites (search methods): total 3125
|
||||
## List of supported sites (search methods): total 3126
|
||||
|
||||
Rank data fetched from Alexa by domains.
|
||||
|
||||
@@ -2864,13 +2864,13 @@ Rank data fetched from Alexa by domains.
|
||||
1.  [ovnl.in (https://ovnl.in)](https://ovnl.in)*: top 100M, forum*, search is disabled
|
||||
1.  [wls.social (https://wls.social)](https://wls.social)*: top 100M, blog*, search is disabled
|
||||
1.  [HiddenAnswers (http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion)](http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion)*: top 100M, q&a, tor*
|
||||
1.  [.com ({username}.com)]({username}.com)*: top 100M*
|
||||
1.  [.pro ({username}.pro)]({username}.pro)*: top 100M*
|
||||
1.  [.me ({username}.me)]({username}.me)*: top 100M*
|
||||
1.  [.biz ({username}.biz)]({username}.biz)*: top 100M*
|
||||
1.  [.email ({username}.email)]({username}.email)*: top 100M*
|
||||
1.  [.guru ({username}.guru)]({username}.guru)*: top 100M*
|
||||
1.  [.ddns.net ({username}.ddns.net)]({username}.ddns.net)*: top 100M*
|
||||
1.  [{username}.com ({username}.com)]({username}.com)*: top 100M*
|
||||
1.  [{username}.pro ({username}.pro)]({username}.pro)*: top 100M*
|
||||
1.  [{username}.me ({username}.me)]({username}.me)*: top 100M*
|
||||
1.  [{username}.biz ({username}.biz)]({username}.biz)*: top 100M*
|
||||
1.  [{username}.email ({username}.email)]({username}.email)*: top 100M*
|
||||
1.  [{username}.guru ({username}.guru)]({username}.guru)*: top 100M*
|
||||
1.  [{username}.ddns.net ({username}.ddns.net)]({username}.ddns.net)*: top 100M*
|
||||
1.  [forum-history.ru (http://forum-history.ru)](http://forum-history.ru)*: top 100M*
|
||||
1.  [forum.alconar.ru (https://forum.alconar.ru)](https://forum.alconar.ru)*: top 100M*
|
||||
1.  [krskforum.com (https://krskforum.com)](https://krskforum.com)*: top 100M*
|
||||
@@ -3117,6 +3117,7 @@ Rank data fetched from Alexa by domains.
|
||||
1.  [www.stopstalk.com (https://www.stopstalk.com)](https://www.stopstalk.com)*: top 100M*
|
||||
1.  [www.polywork.com (https://www.polywork.com)](https://www.polywork.com)*: top 100M*
|
||||
1.  [oshwlab.com (https://oshwlab.com)](https://oshwlab.com)*: top 100M*
|
||||
1.  [www.xshaker.net (https://www.xshaker.net)](https://www.xshaker.net)*: top 100M*
|
||||
1.  [chaturbator.su (https://chaturbator.su)](https://chaturbator.su)*: top 100M*
|
||||
1.  [imgflip.com (https://imgflip.com)](https://imgflip.com)*: top 100M*
|
||||
1.  [www.flickr.com (https://www.flickr.com)](https://www.flickr.com)*: top 100M*
|
||||
@@ -3127,21 +3128,22 @@ Rank data fetched from Alexa by domains.
|
||||
1.  [archive.transformativeworks.org (https://archive.transformativeworks.org)](https://archive.transformativeworks.org)*: top 100M*
|
||||
1.  [www.tnaflix.com (https://www.tnaflix.com)](https://www.tnaflix.com)*: top 100M*
|
||||
1.  [massagerepublic.com (https://massagerepublic.com)](https://massagerepublic.com)*: top 100M*
|
||||
1.  [mynickname.com (https://mynickname.com)](https://mynickname.com)*: top 100M, unchecked*
|
||||
|
||||
The list was updated at (2024-11-25 17:22:43.959448+00:00 UTC)
|
||||
The list was updated at (2024-11-26 10:27:01.383232+00:00 UTC)
|
||||
## Statistics
|
||||
|
||||
Enabled/total sites: 2693/3125 = 86.18%
|
||||
Enabled/total sites: 2694/3126 = 86.18%
|
||||
|
||||
Incomplete message checks: 405/2693 = 15.04% (false positive risks)
|
||||
Incomplete message checks: 405/2694 = 15.03% (false positive risks)
|
||||
|
||||
Status code checks: 720/2693 = 26.74% (false positive risks)
|
||||
Status code checks: 720/2694 = 26.73% (false positive risks)
|
||||
|
||||
False positive risk (total): 41.78%
|
||||
False positive risk (total): 41.76%
|
||||
|
||||
Top 20 profile URLs:
|
||||
- (796) `{urlMain}/index/8-0-{username} (uCoz)`
|
||||
- (301) `/{username}`
|
||||
- (302) `/{username}`
|
||||
- (221) `{urlMain}{urlSubpath}/members/?username={username} (XenForo)`
|
||||
- (160) `/user/{username}`
|
||||
- (133) `{urlMain}{urlSubpath}/member.php?username={username} (vBulletin)`
|
||||
|
||||
@@ -202,3 +202,20 @@ def test_get_url_template():
|
||||
},
|
||||
)
|
||||
assert site.get_url_template() == "SUBDOMAIN"
|
||||
|
||||
|
||||
def test_has_site_url_or_name(default_db):
    """
    ``has_site`` accepts a plain string and matches it against site names
    and URLs, including partial URL matches.
    """
    # by the same url or partial match
    assert default_db.has_site("https://aback.com.ua/user/")
    assert default_db.has_site("https://aback.com.ua")

    # acceptable partial match
    assert default_db.has_site("https://aback.com.ua/use")
    assert default_db.has_site("https://aback.com")

    # by name
    assert default_db.has_site("Aback")

    # false
    assert not default_db.has_site("https://aeifgoai3h4g8a3u4g5")
    assert not default_db.has_site("aeifgoai3h4g8a3u4g5")
|
||||
|
||||
Reference in New Issue
Block a user