From dcf5181e2851e02e0cfb001520d3ef979fb9208e Mon Sep 17 00:00:00 2001
From: Soxoj <31013580+soxoj@users.noreply.github.com>
Date: Sat, 26 Feb 2022 15:31:15 +0300
Subject: [PATCH] Fixed several false positives, improved statistics info
(#368)
* Fixed several false positives, improved statistics info
* Updated site list and statistics
---
maigret/resources/data.json | 42 +++++++++++++++++++++++++------------
maigret/sites.py | 7 +++++--
sites.md | 12 +++++------
3 files changed, 40 insertions(+), 21 deletions(-)
diff --git a/maigret/resources/data.json b/maigret/resources/data.json
index 6ce0b6a..6d521b1 100644
--- a/maigret/resources/data.json
+++ b/maigret/resources/data.json
@@ -5203,11 +5203,14 @@
],
"checkType": "message",
"presenceStrs": [
- "userStatsTitle"
+ "Foursquare "
],
"alexaRank": 3413,
- "urlMain": "https://ru.foursquare.com/",
- "url": "https://ru.foursquare.com/{username}",
+ "urlMain": "https://foursquare.com/",
+ "url": "https://foursquare.com/{username}",
"usernameClaimed": "adam",
"usernameUnclaimed": "noonewouldeverusethis7"
},
@@ -6310,7 +6313,10 @@
],
"checkType": "message",
"absenceStrs": [
- "Page not found."
+ "Page not found"
+ ],
+ "presenseStrs": [
+ "title=\"Gumroad\""
],
"alexaRank": 4728,
"urlMain": "https://www.gumroad.com/",
@@ -8857,7 +8863,10 @@
],
"checkType": "message",
"absenceStrs": [
- "\u0417\u0434\u0435\u0441\u044c \u043f\u043e\u043a\u0430 \u043d\u0438\u0447\u0435\u0433\u043e \u043d\u0435\u0442"
+ "\u041f\u043e \u0412\u0430\u0448\u0435\u043c\u0443 \u0437\u0430\u043f\u0440\u043e\u0441\u0443 \u043d\u0438\u0447\u0435\u0433\u043e \u043d\u0435 \u043d\u0430\u0439\u0434\u0435\u043d\u043e"
+ ],
+ "presenseStrs": [
+ "\u041b\u044e\u0434\u0438"
],
"alexaRank": 6409,
"urlMain": "https://mirtesen.ru",
@@ -10166,10 +10175,7 @@
"tags": [
"ru"
],
- "checkType": "message",
- "absenceStrs": [
- "404 - Not Found"
- ],
+ "checkType": "status_code",
"alexaRank": 25200,
"urlMain": "https://overclockers.ru",
"url": "https://overclockers.ru/cpubase/user/{username}",
@@ -10714,7 +10720,11 @@
"checkType": "message",
"absenceStrs": [
"Hmm, it seems that you've come across an invalid username",
- "404 Not Found"
+ "404 Not Found",
+ "Member Not Found"
+ ],
+ "presenseStrs": [
+ "profile on Planet Minecraft to see their public Minecraft community activity"
],
"alexaRank": 9050,
"urlMain": "https://www.planetminecraft.com",
@@ -12851,7 +12861,13 @@
"tags": [
"music"
],
- "checkType": "status_code",
+ "checkType": "message",
+ "presenseStrs": [
+ "Profile: "
+ ],
+ "absenceStrs": [
+ "Smule | Page Not Found (404)"
+ ],
"alexaRank": 11742,
"urlMain": "https://www.smule.com/",
"url": "https://www.smule.com/{username}",
@@ -13117,7 +13133,7 @@
"us"
],
"headers": {
- "authorization": "Bearer BQC-v69M-AcXsPLrSktz0Era-J2P1SXWB42HLKRHnCNpj00jLEbbbDFpIFo1UhBKrHrL7FqLQd-X4MIuhFo"
+ "authorization": "Bearer BQBFTijjpshGAhX7n9-sO46wb8zJIkhu6TT3Ss7b-0V1dw_jXZhcff1agUpqRgbhznOG8pSIRlHtJAtd2TU"
},
"errors": {
"Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
@@ -14973,7 +14989,7 @@
"video"
],
"headers": {
- "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2NDExNzg4NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.9rznMue0JmX9SAPuWQDIYR-mmsozFq5PoKUvlvElpkQ"
+ "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2NDU4Nzg1NDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.Bs6VBcKPsl-5dqoThdAImBIex1mas1UcyG2pSnIYqYk"
},
"activation": {
"url": "https://vimeo.com/_rv/viewer",
diff --git a/maigret/sites.py b/maigret/sites.py
index 9ea540d..402eb29 100644
--- a/maigret/sites.py
+++ b/maigret/sites.py
@@ -450,8 +450,11 @@ class MaigretDatabase:
for tag in filter(lambda x: not is_country_tag(x), site.tags):
tags[tag] = tags.get(tag, 0) + 1
- output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n\n"
- output += f"Incomplete checks: {message_checks_one_factor}/{message_checks} (false positive risks)\n\n"
+ enabled_perc = round(100*(total_count-disabled_count)/total_count, 2)
+ output += f"Enabled/total sites: {total_count - disabled_count}/{total_count} = {enabled_perc}%\n\n"
+
+ checks_perc = round(100*message_checks_one_factor/message_checks, 2)
+ output += f"Incomplete checks: {message_checks_one_factor}/{message_checks} = {checks_perc}% (false positive risks)\n\n"
top_urls_count = 20
output += f"Top {top_urls_count} profile URLs:\n"
diff --git a/sites.md b/sites.md
index f035818..90f9c46 100644
--- a/sites.md
+++ b/sites.md
@@ -249,7 +249,7 @@ Rank data fetched from Alexa by domains.
1.  [XDA (https://forum.xda-developers.com)](https://forum.xda-developers.com)*: top 5K, apps, forum*, search is disabled
1.  [Thechive (https://i.thechive.com/)](https://i.thechive.com/)*: top 5K, us*
1.  [999.md (https://999.md)](https://999.md)*: top 5K, freelance, md, shopping*
-1.  [Foursquare (https://ru.foursquare.com/)](https://ru.foursquare.com/)*: top 5K, geosocial, in*
+1.  [Foursquare (https://foursquare.com/)](https://foursquare.com/)*: top 5K, geosocial, in*
1.  [4pda (https://4pda.ru/)](https://4pda.ru/)*: top 5K, ru*
1.  [Weforum (https://www.weforum.org)](https://www.weforum.org)*: top 5K, forum, us*
1.  [techspot.com (http://www.techspot.com/community/)](http://www.techspot.com/community/)*: top 5K, forum, us*
@@ -2599,12 +2599,12 @@ Rank data fetched from Alexa by domains.
1.  [hozpitality (https://www.hozpitality.com)](https://www.hozpitality.com)*: top 100M*
1.  [kazanlashkigalab.com (https://kazanlashkigalab.com)](https://kazanlashkigalab.com)*: top 100M, kz*
-Alexa.com rank data fetched at (2022-02-26 11:41:48.847517 UTC)
+Alexa.com rank data fetched at (2022-02-26 12:19:53.127789 UTC)
## Statistics
-Enabled/total sites: 2447/2595
+Enabled/total sites: 2447/2595 = 94.3%
-Incomplete checks: 586/1978 (false positive risks)
+Incomplete checks: 582/1978 = 29.42% (false positive risks)
Top 20 profile URLs:
- (796) `{urlMain}/index/8-0-{username} (uCoz)`
@@ -2634,9 +2634,9 @@ Top 20 tags:
- (40) `NO_TAGS` (non-standard)
- (24) `coding`
- (23) `photo`
-- (19) `news`
+- (18) `news`
- (18) `blog`
-- (18) `music`
+- (17) `music`
- (15) `tech`
- (13) `freelance`
- (12) `sharing`