From dcf5181e2851e02e0cfb001520d3ef979fb9208e Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Sat, 26 Feb 2022 15:31:15 +0300 Subject: [PATCH] Fixed several false positives, improved statistics info (#368) * Fixed several false positives, improved statistics info * Updated site list and statistics --- maigret/resources/data.json | 42 +++++++++++++++++++++++++------------ maigret/sites.py | 7 +++++-- sites.md | 12 +++++------ 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 6ce0b6a..6d521b1 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -5203,11 +5203,14 @@ ], "checkType": "message", "presenceStrs": [ - "userStatsTitle" + "Foursquare " ], "alexaRank": 3413, - "urlMain": "https://ru.foursquare.com/", - "url": "https://ru.foursquare.com/{username}", + "urlMain": "https://foursquare.com/", + "url": "https://foursquare.com/{username}", "usernameClaimed": "adam", "usernameUnclaimed": "noonewouldeverusethis7" }, @@ -6310,7 +6313,10 @@ ], "checkType": "message", "absenceStrs": [ - "Page not found." + "Page not found" + ], + "presenseStrs": [ + "title=\"Gumroad\"" ], "alexaRank": 4728, "urlMain": "https://www.gumroad.com/", @@ -8857,7 +8863,10 @@ ], "checkType": "message", "absenceStrs": [ - "\u0417\u0434\u0435\u0441\u044c \u043f\u043e\u043a\u0430 \u043d\u0438\u0447\u0435\u0433\u043e \u043d\u0435\u0442" + "\u041f\u043e \u0412\u0430\u0448\u0435\u043c\u0443 \u0437\u0430\u043f\u0440\u043e\u0441\u0443 \u043d\u0438\u0447\u0435\u0433\u043e \u043d\u0435 \u043d\u0430\u0439\u0434\u0435\u043d\u043e" + ], + "presenseStrs": [ + "\u041b\u044e\u0434\u0438" ], "alexaRank": 6409, "urlMain": "https://mirtesen.ru", @@ -10166,10 +10175,7 @@ "tags": [ "ru" ], - "checkType": "message", - "absenceStrs": [ - "404 - Not Found" - ], + "checkType": "status_code", "alexaRank": 25200, "urlMain": "https://overclockers.ru", "url": "https://overclockers.ru/cpubase/user/{username}", @@ -10714,7 +10720,11 @@ "checkType": "message", "absenceStrs": [ "Hmm, it seems that you've come across an invalid username", - "404 Not Found" + "404 Not Found", + "Member Not Found" + ], + "presenseStrs": [ + "profile on Planet Minecraft to see their public Minecraft community activity" ], "alexaRank": 9050, "urlMain": "https://www.planetminecraft.com", @@ -12851,7 +12861,13 @@ "tags": [ "music" ], - "checkType": "status_code", + "checkType": "message", + "presenseStrs": [ + "Profile: " + ], + "absenceStrs": [ + "Smule | Page Not Found (404)" + ], "alexaRank": 11742, "urlMain": "https://www.smule.com/", "url": "https://www.smule.com/{username}", @@ -13117,7 +13133,7 @@ "us" ], "headers": { - "authorization": "Bearer BQC-v69M-AcXsPLrSktz0Era-J2P1SXWB42HLKRHnCNpj00jLEbbbDFpIFo1UhBKrHrL7FqLQd-X4MIuhFo" + "authorization": "Bearer BQBFTijjpshGAhX7n9-sO46wb8zJIkhu6TT3Ss7b-0V1dw_jXZhcff1agUpqRgbhznOG8pSIRlHtJAtd2TU" }, "errors": { "Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn" @@ -14973,7 +14989,7 @@ "video" ], "headers": { - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2NDExNzg4NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.9rznMue0JmX9SAPuWQDIYR-mmsozFq5PoKUvlvElpkQ" + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2NDU4Nzg1NDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.Bs6VBcKPsl-5dqoThdAImBIex1mas1UcyG2pSnIYqYk" }, "activation": { "url": "https://vimeo.com/_rv/viewer", diff --git a/maigret/sites.py b/maigret/sites.py index 9ea540d..402eb29 100644 --- a/maigret/sites.py +++ b/maigret/sites.py @@ -450,8 +450,11 @@ class MaigretDatabase: for tag in filter(lambda x: not is_country_tag(x), site.tags): tags[tag] = tags.get(tag, 0) + 1 - output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n\n" - output += f"Incomplete checks: {message_checks_one_factor}/{message_checks} (false positive risks)\n\n" + enabled_perc = round(100*(total_count-disabled_count)/total_count, 2) + output += f"Enabled/total sites: {total_count - disabled_count}/{total_count} = {enabled_perc}%\n\n" + + checks_perc = round(100*message_checks_one_factor/message_checks, 2) + output += f"Incomplete checks: {message_checks_one_factor}/{message_checks} = {checks_perc}% (false positive risks)\n\n" top_urls_count = 20 output += f"Top {top_urls_count} profile URLs:\n" diff --git a/sites.md b/sites.md index f035818..90f9c46 100644 --- a/sites.md +++ b/sites.md @@ -249,7 +249,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://forum.xda-developers.com) [XDA (https://forum.xda-developers.com)](https://forum.xda-developers.com)*: top 5K, apps, forum*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://i.thechive.com/) [Thechive (https://i.thechive.com/)](https://i.thechive.com/)*: top 5K, us* 1. ![](https://www.google.com/s2/favicons?domain=https://999.md) [999.md (https://999.md)](https://999.md)*: top 5K, freelance, md, shopping* -1. ![](https://www.google.com/s2/favicons?domain=https://ru.foursquare.com/) [Foursquare (https://ru.foursquare.com/)](https://ru.foursquare.com/)*: top 5K, geosocial, in* +1. ![](https://www.google.com/s2/favicons?domain=https://foursquare.com/) [Foursquare (https://foursquare.com/)](https://foursquare.com/)*: top 5K, geosocial, in* 1. ![](https://www.google.com/s2/favicons?domain=https://4pda.ru/) [4pda (https://4pda.ru/)](https://4pda.ru/)*: top 5K, ru* 1. ![](https://www.google.com/s2/favicons?domain=https://www.weforum.org) [Weforum (https://www.weforum.org)](https://www.weforum.org)*: top 5K, forum, us* 1. ![](https://www.google.com/s2/favicons?domain=http://www.techspot.com/community/) [techspot.com (http://www.techspot.com/community/)](http://www.techspot.com/community/)*: top 5K, forum, us* @@ -2599,12 +2599,12 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://www.hozpitality.com) [hozpitality (https://www.hozpitality.com)](https://www.hozpitality.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://kazanlashkigalab.com) [kazanlashkigalab.com (https://kazanlashkigalab.com)](https://kazanlashkigalab.com)*: top 100M, kz* -Alexa.com rank data fetched at (2022-02-26 11:41:48.847517 UTC) +Alexa.com rank data fetched at (2022-02-26 12:19:53.127789 UTC) ## Statistics -Enabled/total sites: 2447/2595 +Enabled/total sites: 2447/2595 = 94.3% -Incomplete checks: 586/1978 (false positive risks) +Incomplete checks: 582/1978 = 29.42% (false positive risks) Top 20 profile URLs: - (796) `{urlMain}/index/8-0-{username} (uCoz)` @@ -2634,9 +2634,9 @@ Top 20 tags: - (40) `NO_TAGS` (non-standard) - (24) `coding` - (23) `photo` -- (19) `news` +- (18) `news` - (18) `blog` -- (18) `music` +- (17) `music` - (15) `tech` - (13) `freelance` - (12) `sharing`