From d15e12750b7635d8d4bdaff5a39b639f8d106a91 Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Sun, 1 Dec 2024 03:19:36 +0100 Subject: [PATCH] Sites fixes (#1917) * Some sites fixes * Sites stats updated --- maigret/resources/data.json | 45 ++++++++++++++++++++++--------------- maigret/sites.py | 12 +++++++++- sites.md | 12 +++++----- 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 78ee883..b4af630 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -10640,10 +10640,11 @@ "type": "ok_id", "checkType": "message", "presenceStrs": [ - "profile__menu" + "profile__content_header_user" ], "absenceStrs": [ - "mm-profile_not-found_content" + "mm-profile_not-found_content", + "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru" ], "alexaRank": 49, "urlMain": "https://my.mail.ru/", @@ -10658,10 +10659,11 @@ "type": "vk_id", "checkType": "message", "presenceStrs": [ - "profile__menu" + "profile__content_header_user" ], "absenceStrs": [ - "mm-profile_not-found_content" + "mm-profile_not-found_content", + "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru" ], "alexaRank": 49, "urlMain": "https://my.mail.ru/", @@ -10675,10 +10677,11 @@ ], "checkType": "message", "presenceStrs": [ - "profile__menu" + "profile__content_header_user" ], "absenceStrs": [ - "mm-profile_not-found_content" + "mm-profile_not-found_content", + "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru" ], "alexaRank": 49, "urlMain": "https://my.mail.ru/", @@ -10692,10 +10695,11 @@ ], "checkType": "message", "presenceStrs": [ - "profile__menu" + "profile__content_header_user" ], "absenceStrs": [ - "mm-profile_not-found_content" + "mm-profile_not-found_content", + "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru" ], "alexaRank": 49, "urlMain": "https://my.mail.ru/", @@ -10709,10 +10713,11 @@ ], "checkType": "message", "presenceStrs": [ - "profile__menu" + "profile__content_header_user" ], "absenceStrs": [ - "mm-profile_not-found_content" + "mm-profile_not-found_content", + "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru" ], "alexaRank": 49, "urlMain": "https://my.mail.ru/", @@ -10726,10 +10731,11 @@ ], "checkType": "message", "presenceStrs": [ - "profile__menu" + "profile__content_header_user" ], "absenceStrs": [ - "mm-profile_not-found_content" + "mm-profile_not-found_content", + "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru" ], "alexaRank": 49, "urlMain": "https://my.mail.ru/", @@ -10743,16 +10749,17 @@ ], "checkType": "message", "presenceStrs": [ - "profile__menu" + "profile__content_header_user" ], "absenceStrs": [ - "mm-profile_not-found_content" + "mm-profile_not-found_content", + "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru" ], "alexaRank": 49, "urlMain": "https://my.mail.ru/", "url": "https://my.mail.ru/ya.ru/{username}/", "usernameClaimed": "hovsepovich", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "MAlKOVyd" }, "My.Mail.ru@yandex.ru": { "tags": [ @@ -10760,10 +10767,11 @@ ], "checkType": "message", "presenceStrs": [ - "profile__menu" + "profile__content_header_user" ], "absenceStrs": [ - "mm-profile_not-found_content" + "mm-profile_not-found_content", + "\u041c\u043e\u0439 \u041c\u0438\u0440@Mail.Ru" ], "alexaRank": 49, "urlMain": "https://my.mail.ru/", @@ -18773,7 +18781,8 @@ "\u7528\u6237\u4e0d\u5b58\u5728" ], "usernameClaimed": "blue", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "noonewouldeverusethis7", + "disabled": true }, "Zhyk": { "disabled": true, diff --git a/maigret/sites.py b/maigret/sites.py index 267928f..dc4cb50 100644 --- a/maigret/sites.py +++ b/maigret/sites.py @@ -260,7 +260,6 @@ class MaigretDatabase: def has_site(self, site: MaigretSite): for s in self._sites: if site == s: - print(f"input == site: {site} == {s}") return True return False @@ -278,6 +277,17 @@ class MaigretDatabase: ): """ Ranking and filtering of the sites list + + Args: + reverse (bool, optional): Reverse the sorting order. Defaults to False. + top (int, optional): Maximum number of sites to return. Defaults to sys.maxsize. + tags (list, optional): List of tags to filter sites by. Defaults to empty list. + names (list, optional): List of site names (or urls, see MaigretSite.__eq__) to filter by. Defaults to empty list. + disabled (bool, optional): Whether to include disabled sites. Defaults to True. + id_type (str, optional): Type of identifier to filter by. Defaults to "username". + + Returns: + dict: Dictionary of filtered and ranked sites, with site names as keys and MaigretSite objects as values """ normalized_names = list(map(str.lower, names)) normalized_tags = list(map(str.lower, tags)) diff --git a/sites.md b/sites.md index 3c982fd..0f90057 100644 --- a/sites.md +++ b/sites.md @@ -84,7 +84,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://discourse.mozilla.org) [discourse.mozilla.org (https://discourse.mozilla.org)](https://discourse.mozilla.org)*: top 500* 1. ![](https://www.google.com/s2/favicons?domain=https://linktr.ee) [linktr.ee (https://linktr.ee)](https://linktr.ee)*: top 500, links* 1. ![](https://www.google.com/s2/favicons?domain=https://xhamster.com) [xHamster (https://xhamster.com)](https://xhamster.com)*: top 500, porn, us* -1. ![](https://www.google.com/s2/favicons?domain=https://www.zhihu.com/) [Zhihu (https://www.zhihu.com/)](https://www.zhihu.com/)*: top 500, cn* +1. ![](https://www.google.com/s2/favicons?domain=https://www.zhihu.com/) [Zhihu (https://www.zhihu.com/)](https://www.zhihu.com/)*: top 500, cn*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://www.blogger.com) [Blogger (by GAIA id) (https://www.blogger.com)](https://www.blogger.com)*: top 500, blog* 1. ![](https://www.google.com/s2/favicons?domain=https://www.researchgate.net/) [ResearchGate (https://www.researchgate.net/)](https://www.researchgate.net/)*: top 500, in, us* 1. ![](https://www.google.com/s2/favicons?domain=https://www.freepik.com) [Freepik (https://www.freepik.com)](https://www.freepik.com)*: top 500, art, photo, stock* @@ -3130,16 +3130,16 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://massagerepublic.com) [massagerepublic.com (https://massagerepublic.com)](https://massagerepublic.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://mynickname.com) [mynickname.com (https://mynickname.com)](https://mynickname.com)*: top 100M* -The list was updated at (2024-11-27 UTC) +The list was updated at (2024-11-29 UTC) ## Statistics -Enabled/total sites: 2694/3126 = 86.18% +Enabled/total sites: 2693/3126 = 86.15% -Incomplete message checks: 405/2694 = 15.03% (false positive risks) +Incomplete message checks: 404/2693 = 15.0% (false positive risks) -Status code checks: 720/2694 = 26.73% (false positive risks) +Status code checks: 720/2693 = 26.74% (false positive risks) -False positive risk (total): 41.76% +False positive risk (total): 41.74% Top 20 profile URLs: - (796) `{urlMain}/index/8-0-{username} (uCoz)`