From d15e12750b7635d8d4bdaff5a39b639f8d106a91 Mon Sep 17 00:00:00 2001
From: Soxoj <31013580+soxoj@users.noreply.github.com>
Date: Sun, 1 Dec 2024 03:19:36 +0100
Subject: [PATCH] Sites fixes (#1917)
* Some sites fixes
* Sites stats updated
---
maigret/resources/data.json | 45 ++++++++++++++++++++++---------------
maigret/sites.py | 12 +++++++++-
sites.md | 12 +++++-----
3 files changed, 44 insertions(+), 25 deletions(-)
diff --git a/maigret/resources/data.json b/maigret/resources/data.json
index 78ee883..b4af630 100644
--- a/maigret/resources/data.json
+++ b/maigret/resources/data.json
@@ -10640,10 +10640,11 @@
"type": "ok_id",
"checkType": "message",
"presenceStrs": [
- "profile__menu"
+ "profile__content_header_user"
],
"absenceStrs": [
- "mm-profile_not-found_content"
+ "mm-profile_not-found_content",
+ "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru"
],
"alexaRank": 49,
"urlMain": "https://my.mail.ru/",
@@ -10658,10 +10659,11 @@
"type": "vk_id",
"checkType": "message",
"presenceStrs": [
- "profile__menu"
+ "profile__content_header_user"
],
"absenceStrs": [
- "mm-profile_not-found_content"
+ "mm-profile_not-found_content",
+ "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru"
],
"alexaRank": 49,
"urlMain": "https://my.mail.ru/",
@@ -10675,10 +10677,11 @@
],
"checkType": "message",
"presenceStrs": [
- "profile__menu"
+ "profile__content_header_user"
],
"absenceStrs": [
- "mm-profile_not-found_content"
+ "mm-profile_not-found_content",
+ "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru"
],
"alexaRank": 49,
"urlMain": "https://my.mail.ru/",
@@ -10692,10 +10695,11 @@
],
"checkType": "message",
"presenceStrs": [
- "profile__menu"
+ "profile__content_header_user"
],
"absenceStrs": [
- "mm-profile_not-found_content"
+ "mm-profile_not-found_content",
+ "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru"
],
"alexaRank": 49,
"urlMain": "https://my.mail.ru/",
@@ -10709,10 +10713,11 @@
],
"checkType": "message",
"presenceStrs": [
- "profile__menu"
+ "profile__content_header_user"
],
"absenceStrs": [
- "mm-profile_not-found_content"
+ "mm-profile_not-found_content",
+ "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru"
],
"alexaRank": 49,
"urlMain": "https://my.mail.ru/",
@@ -10726,10 +10731,11 @@
],
"checkType": "message",
"presenceStrs": [
- "profile__menu"
+ "profile__content_header_user"
],
"absenceStrs": [
- "mm-profile_not-found_content"
+ "mm-profile_not-found_content",
+ "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru"
],
"alexaRank": 49,
"urlMain": "https://my.mail.ru/",
@@ -10743,16 +10749,17 @@
],
"checkType": "message",
"presenceStrs": [
- "profile__menu"
+ "profile__content_header_user"
],
"absenceStrs": [
- "mm-profile_not-found_content"
+ "mm-profile_not-found_content",
+ "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru"
],
"alexaRank": 49,
"urlMain": "https://my.mail.ru/",
"url": "https://my.mail.ru/ya.ru/{username}/",
"usernameClaimed": "hovsepovich",
- "usernameUnclaimed": "noonewouldeverusethis7"
+ "usernameUnclaimed": "MAlKOVyd"
},
"My.Mail.ru@yandex.ru": {
"tags": [
@@ -10760,10 +10767,11 @@
],
"checkType": "message",
"presenceStrs": [
- "profile__menu"
+ "profile__content_header_user"
],
"absenceStrs": [
- "mm-profile_not-found_content"
+ "mm-profile_not-found_content",
+ "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru"
],
"alexaRank": 49,
"urlMain": "https://my.mail.ru/",
@@ -18773,7 +18781,8 @@
"\u7528\u6237\u4e0d\u5b58\u5728"
],
"usernameClaimed": "blue",
- "usernameUnclaimed": "noonewouldeverusethis7"
+ "usernameUnclaimed": "noonewouldeverusethis7",
+ "disabled": true
},
"Zhyk": {
"disabled": true,
diff --git a/maigret/sites.py b/maigret/sites.py
index 267928f..dc4cb50 100644
--- a/maigret/sites.py
+++ b/maigret/sites.py
@@ -260,7 +260,6 @@ class MaigretDatabase:
def has_site(self, site: MaigretSite):
for s in self._sites:
if site == s:
- print(f"input == site: {site} == {s}")
return True
return False
@@ -278,6 +277,17 @@ class MaigretDatabase:
):
"""
Ranking and filtering of the sites list
+
+ Args:
+ reverse (bool, optional): Reverse the sorting order. Defaults to False.
+ top (int, optional): Maximum number of sites to return. Defaults to sys.maxsize.
+ tags (list, optional): List of tags to filter sites by. Defaults to empty list.
+ names (list, optional): List of site names (or urls, see MaigretSite.__eq__) to filter by. Defaults to empty list.
+ disabled (bool, optional): Whether to include disabled sites. Defaults to True.
+ id_type (str, optional): Type of identifier to filter by. Defaults to "username".
+
+ Returns:
+ dict: Dictionary of filtered and ranked sites, with site names as keys and MaigretSite objects as values
"""
normalized_names = list(map(str.lower, names))
normalized_tags = list(map(str.lower, tags))
diff --git a/sites.md b/sites.md
index 3c982fd..0f90057 100644
--- a/sites.md
+++ b/sites.md
@@ -84,7 +84,7 @@ Rank data fetched from Alexa by domains.
1.  [discourse.mozilla.org (https://discourse.mozilla.org)](https://discourse.mozilla.org)*: top 500*
1.  [linktr.ee (https://linktr.ee)](https://linktr.ee)*: top 500, links*
1.  [xHamster (https://xhamster.com)](https://xhamster.com)*: top 500, porn, us*
-1.  [Zhihu (https://www.zhihu.com/)](https://www.zhihu.com/)*: top 500, cn*
+1.  [Zhihu (https://www.zhihu.com/)](https://www.zhihu.com/)*: top 500, cn*, search is disabled
1.  [Blogger (by GAIA id) (https://www.blogger.com)](https://www.blogger.com)*: top 500, blog*
1.  [ResearchGate (https://www.researchgate.net/)](https://www.researchgate.net/)*: top 500, in, us*
1.  [Freepik (https://www.freepik.com)](https://www.freepik.com)*: top 500, art, photo, stock*
@@ -3130,16 +3130,16 @@ Rank data fetched from Alexa by domains.
1.  [massagerepublic.com (https://massagerepublic.com)](https://massagerepublic.com)*: top 100M*
1.  [mynickname.com (https://mynickname.com)](https://mynickname.com)*: top 100M*
-The list was updated at (2024-11-27 UTC)
+The list was updated at (2024-11-29 UTC)
## Statistics
-Enabled/total sites: 2694/3126 = 86.18%
+Enabled/total sites: 2693/3126 = 86.15%
-Incomplete message checks: 405/2694 = 15.03% (false positive risks)
+Incomplete message checks: 404/2693 = 15.0% (false positive risks)
-Status code checks: 720/2694 = 26.73% (false positive risks)
+Status code checks: 720/2693 = 26.74% (false positive risks)
-False positive risk (total): 41.76%
+False positive risk (total): 41.74%
Top 20 profile URLs:
- (796) `{urlMain}/index/8-0-{username} (uCoz)`