From a29c3c6abe9d83efe3c59a795fe85e522a2b0aac Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Sat, 26 Feb 2022 13:38:15 +0300 Subject: [PATCH 1/8] CI autoupdate (#359) * CI autoupdate * Updated site list and statistics --- .github/workflows/update-site-data.yml | 24 ++++++++++-- sites.md | 51 +++++++++++++++++++++++++- 2 files changed, 71 insertions(+), 4 deletions(-) diff --git a/.github/workflows/update-site-data.yml b/.github/workflows/update-site-data.yml index d4689da..f0321e1 100644 --- a/.github/workflows/update-site-data.yml +++ b/.github/workflows/update-site-data.yml @@ -1,18 +1,36 @@ -name: Update sites rating +name: Update sites rating and statistics on: push: branches: [ main ] pull_request: branches: [ main ] + types: [opened, synchronize] jobs: build: runs-on: ubuntu-latest steps: - - name: checkout repo - uses: actions/checkout@v2 + - name: Checkout repository + uses: actions/checkout@v2.3.2 + with: + ref: ${{ github.event.pull_request.head.sha }} + fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. + - name: build application run: | pip3 install . python3 ./utils/update_site_data.py --empty-only + + - name: Commit and push changes + run: | + git config --global user.name "Maigret autoupdate" + git config --global user.email "soxoj@protonmail.com" + echo `git name-rev ${{ github.event.pull_request.head.sha }} --name-only` + export BRANCH=`git name-rev ${{ github.event.pull_request.head.sha }} --name-only | sed 's/remotes\/origin\///'` + echo $BRANCH + git remote -v + git checkout $BRANCH + git add sites.md + git commit -m "Updated site list and statistics" + git push origin $BRANCH \ No newline at end of file diff --git a/sites.md b/sites.md index e2b7d20..6bd93d0 100644 --- a/sites.md +++ b/sites.md @@ -2599,4 +2599,53 @@ Rank data fetched from Alexa by domains. 1. 
![](https://www.google.com/s2/favicons?domain=https://www.hozpitality.com) [hozpitality (https://www.hozpitality.com)](https://www.hozpitality.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://kazanlashkigalab.com) [kazanlashkigalab.com (https://kazanlashkigalab.com)](https://kazanlashkigalab.com)*: top 100M, kz* -Alexa.com rank data fetched at (2022-01-21 21:28:53.863014 UTC) +Alexa.com rank data fetched at (2022-02-23 22:14:26.029891 UTC) +## Statistics + +Enabled/total sites: 2449/2595 + +Incomplete checks: 586/1978 (false positive risks) + +Top 20 profile URLs: +- (796) `{urlMain}/index/8-0-{username} (uCoz)` +- (221) `{urlMain}{urlSubpath}/members/?username={username} (XenForo)` +- (221) `/{username}` +- (138) `/user/{username}` +- (134) `{urlMain}{urlSubpath}/member.php?username={username} (vBulletin)` +- (97) `/profile/{username}` +- (87) `{urlMain}/u/{username}/summary (Discourse)` +- (74) `/users/{username}` +- (44) `{urlMain}{urlSubpath}/search.php?author={username} (phpBB/Search)` +- (41) `/members/?username={username}` +- (39) `SUBDOMAIN` +- (36) `/@{username}` +- (28) `/u/{username}` +- (27) `{urlMain}{urlSubpath}/memberlist.php?username={username} (phpBB)` +- (24) `/members/{username}` +- (18) `/forum/members/?username={username}` +- (18) `/forum/search.php?keywords=&terms=all&author={username}` +- (17) `/search.php?keywords=&terms=all&author={username}` +- (15) `/author/{username}` +- (14) `/profile.php?mode=viewprofile&u={username}` + +Top 20 tags: +- (255) `forum` +- (50) `gaming` +- (40) `NO_TAGS` (non-standard) +- (24) `coding` +- (23) `photo` +- (19) `news` +- (18) `blog` +- (18) `music` +- (15) `tech` +- (13) `freelance` +- (12) `sharing` +- (12) `finance` +- (11) `shopping` +- (10) `dating` +- (10) `art` +- (9) `hobby` +- (8) `movies` +- (7) `sport` +- (7) `hacking` +- (5) `stock` From 8a865a1ce6dc85887fb29881672714a6cd3cdea3 Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Sat, 26 
Feb 2022 14:16:13 +0300 Subject: [PATCH 2/8] Op.gg fixes (#363) * Fixed op.gg sites * Added testing docs, fixed some error * Updated site list and statistics --- docs/source/development.rst | 31 +++++++++++++++ maigret/checking.py | 2 +- maigret/resources/data.json | 76 +++++++++++-------------------------- sites.md | 2 +- 4 files changed, 55 insertions(+), 56 deletions(-) diff --git a/docs/source/development.rst b/docs/source/development.rst index 95ae37a..c17ed63 100644 --- a/docs/source/development.rst +++ b/docs/source/development.rst @@ -3,6 +3,37 @@ Development ============== +Testing +------- + +It is recommended to use Python 3.7/3.8 for testing due to some conflicts in 3.9. + +Install test requirements: + +.. code-block:: console + + pip install -r test-requirements.txt + + +Use the following commands to check Maigret: + +.. code-block:: console + + # run linter and typing checks + # order of checks: + # - critical syntax errors or undefined names + # - flake checks + # - mypy checks + make lint + + # run testing with coverage html report + # current test coverage is 60% + make test + + # open html report + open htmlcov/index.html + + How to publish new version of Maigret ------------------------------------- diff --git a/maigret/checking.py b/maigret/checking.py index a196cfc..b281287 100644 --- a/maigret/checking.py +++ b/maigret/checking.py @@ -132,7 +132,7 @@ class SimpleAiohttpChecker(CheckerBase): error = CheckError("Unexpected", str(e)) if error == "Invalid proxy response": - self.logger.debug(e, exc_info=True) + self.logger.debug(error, exc_info=True) return str(html_text), status_code, error diff --git a/maigret/resources/data.json b/maigret/resources/data.json index d2673a8..c9fb0a9 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -16574,12 +16574,7 @@ "br", "us" ], - "checkType": "message", - "presenseStrs": "common.SummonerHistory.Favorite", - "absenceStrs": [ - "SummonerNotFoundLayout" - ], - "alexaRank": 331, + 
"engine": "op.gg", "urlMain": "https://br.op.gg/", "url": "https://br.op.gg/summoner/userName={username}", "usernameClaimed": "adam", @@ -17120,12 +17115,7 @@ "gaming", "us" ], - "checkType": "message", - "presenseStrs": "common.SummonerHistory.Favorite", - "absenceStrs": [ - "SummonerNotFoundLayout" - ], - "alexaRank": 331, + "engine": "op.gg", "urlMain": "https://eune.op.gg/", "url": "https://eune.op.gg/summoner/userName={username}", "usernameClaimed": "adam", @@ -17136,12 +17126,7 @@ "gaming", "us" ], - "checkType": "message", - "presenseStrs": "common.SummonerHistory.Favorite", - "absenceStrs": [ - "SummonerNotFoundLayout" - ], - "alexaRank": 331, + "engine": "op.gg", "urlMain": "https://euw.op.gg/", "url": "https://euw.op.gg/summoner/userName={username}", "usernameClaimed": "blue", @@ -18080,12 +18065,7 @@ "tags": [ "us" ], - "checkType": "message", - "presenseStrs": "common.SummonerHistory.Favorite", - "absenceStrs": [ - "SummonerNotFoundLayout" - ], - "alexaRank": 331, + "engine": "op.gg", "urlMain": "https://lan.op.gg/", "url": "https://lan.op.gg/summoner/userName={username}", "usernameClaimed": "adam", @@ -18096,12 +18076,7 @@ "gaming", "us" ], - "checkType": "message", - "presenseStrs": "common.SummonerHistory.Favorite", - "absenceStrs": [ - "SummonerNotFoundLayout" - ], - "alexaRank": 331, + "engine": "op.gg", "urlMain": "https://las.op.gg/", "url": "https://las.op.gg/summoner/userName={username}", "usernameClaimed": "adam", @@ -18485,12 +18460,7 @@ "tags": [ "gaming" ], - "checkType": "message", - "presenseStrs": "common.SummonerHistory.Favorite", - "absenceStrs": [ - "SummonerNotFoundLayout" - ], - "alexaRank": 331, + "engine": "op.gg", "urlMain": "https://na.op.gg/", "url": "https://na.op.gg/summoner/userName={username}", "usernameClaimed": "adam", @@ -18670,12 +18640,7 @@ "gaming", "us" ], - "checkType": "message", - "presenseStrs": "common.SummonerHistory.Favorite", - "absenceStrs": [ - "SummonerNotFoundLayout" - ], - "alexaRank": 331, + "engine": 
"op.gg", "urlMain": "https://oce.op.gg/", "url": "https://oce.op.gg/summoner/userName={username}", "usernameClaimed": "adam", @@ -19142,12 +19107,7 @@ "ru", "us" ], - "checkType": "message", - "presenseStrs": "common.SummonerHistory.Favorite", - "absenceStrs": [ - "SummonerNotFoundLayout" - ], - "alexaRank": 331, + "engine": "op.gg", "urlMain": "https://ru.op.gg/", "url": "https://ru.op.gg/summoner/userName={username}", "usernameClaimed": "adam", @@ -19694,12 +19654,7 @@ "tr", "us" ], - "checkType": "message", - "presenseStrs": "common.SummonerHistory.Favorite", - "absenceStrs": [ - "SummonerNotFoundLayout" - ], - "alexaRank": 331, + "engine": "op.gg", "urlMain": "https://tr.op.gg/", "url": "https://tr.op.gg/summoner/userName={username}", "usernameClaimed": "adam", @@ -29054,6 +29009,19 @@ "404" ] } + }, + "op.gg": { + "name": "op.gg", + "site": { + "checkType": "message", + "presenseStrs": [ + "
" + ], + "absenceStrs": [ + "

This summoner is not registered" + ], + "alexaRank": 331 + } } }, "tags": [ diff --git a/sites.md b/sites.md index 6bd93d0..37523c4 100644 --- a/sites.md +++ b/sites.md @@ -2599,7 +2599,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://www.hozpitality.com) [hozpitality (https://www.hozpitality.com)](https://www.hozpitality.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://kazanlashkigalab.com) [kazanlashkigalab.com (https://kazanlashkigalab.com)](https://kazanlashkigalab.com)*: top 100M, kz* -Alexa.com rank data fetched at (2022-02-23 22:14:26.029891 UTC) +Alexa.com rank data fetched at (2022-02-26 11:12:43.279519 UTC) ## Statistics Enabled/total sites: 2449/2595 From be204ff11923de9a049137ece9c97cdeed249be4 Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Sat, 26 Feb 2022 14:27:08 +0300 Subject: [PATCH 3/8] Wikipedia fix (#365) * Fixed op.gg sites * Added testing docs, fixed some error * Fixed Wikipedia --- .github/workflows/update-site-data.yml | 1 - maigret/resources/data.json | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/update-site-data.yml b/.github/workflows/update-site-data.yml index f0321e1..2d2c81e 100644 --- a/.github/workflows/update-site-data.yml +++ b/.github/workflows/update-site-data.yml @@ -5,7 +5,6 @@ on: branches: [ main ] pull_request: branches: [ main ] - types: [opened, synchronize] jobs: build: diff --git a/maigret/resources/data.json b/maigret/resources/data.json index c9fb0a9..60dd729 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -15518,7 +15518,8 @@ ], "checkType": "message", "absenceStrs": [ - "is not registered" + "is not registered", + "Wikipedia does not have a" ], "alexaRank": 12, "urlMain": "https://www.wikipedia.org/", From 61452d56d394789d32bc0f811b8787de45cc0a8b Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: 
Sat, 26 Feb 2022 14:49:43 +0300 Subject: [PATCH 4/8] Disabled Netvibes and LeetCode (#366) * Disabled Netvibes and LeetCode * Specified types of PR for tests in CI * Updated site list and statistics --- .github/workflows/python-package.yml | 1 + maigret/resources/data.json | 10 +++++++--- sites.md | 8 ++++---- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 295160b..132fe8e 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -5,6 +5,7 @@ on: branches: [ main ] pull_request: branches: [ main ] + types: [opened, synchronize] jobs: build: diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 60dd729..6ce0b6a 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -7760,6 +7760,7 @@ "tags": [ "coding" ], + "disabled": true, "checkType": "status_code", "alexaRank": 1859, "urlMain": "https://leetcode.com/", @@ -9711,12 +9712,15 @@ }, "Netvibes": { "tags": [ - "de", - "fr", - "jp" + "business", + "fr" ], + "disabled": true, "checkType": "status_code", "alexaRank": 5730, + "headers": { + "User-Agent": "curl/7.64.1" + }, "urlMain": "https://www.netvibes.com", "url": "https://www.netvibes.com/{username}#General", "usernameClaimed": "blue", diff --git a/sites.md b/sites.md index 37523c4..f035818 100644 --- a/sites.md +++ b/sites.md @@ -197,7 +197,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://www.polygon.com/) [Polygon (https://www.polygon.com/)](https://www.polygon.com/)*: top 5K, us* 1. ![](https://www.google.com/s2/favicons?domain=https://otzovik.com/) [Otzovik (https://otzovik.com/)](https://otzovik.com/)*: top 5K, ru* 1. ![](https://www.google.com/s2/favicons?domain=https://www.liveinternet.ru) [LiveInternet (https://www.liveinternet.ru)](https://www.liveinternet.ru)*: top 5K, ru* -1. 
![](https://www.google.com/s2/favicons?domain=https://leetcode.com/) [LeetCode (https://leetcode.com/)](https://leetcode.com/)*: top 5K, coding* +1. ![](https://www.google.com/s2/favicons?domain=https://leetcode.com/) [LeetCode (https://leetcode.com/)](https://leetcode.com/)*: top 5K, coding*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://www.kaggle.com/) [Kaggle (https://www.kaggle.com/)](https://www.kaggle.com/)*: top 5K, tech* 1. ![](https://www.google.com/s2/favicons?domain=https://codepen.io/) [Codepen (https://codepen.io/)](https://codepen.io/)*: top 5K, coding, in* 1. ![](https://www.google.com/s2/favicons?domain=https://www.rajce.idnes.cz/) [Rajce.net (https://www.rajce.idnes.cz/)](https://www.rajce.idnes.cz/)*: top 5K, cz* @@ -300,7 +300,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://forums.digitalspy.com/) [Digitalspy (https://forums.digitalspy.com/)](https://forums.digitalspy.com/)*: top 10K, forum, gb, us* 1. ![](https://www.google.com/s2/favicons?domain=https://www.bibsonomy.org) [Bibsonomy (https://www.bibsonomy.org)](https://www.bibsonomy.org)*: top 10K, in* 1. ![](https://www.google.com/s2/favicons?domain=https://slashdot.org) [Slashdot (https://slashdot.org)](https://slashdot.org)*: top 10K, news* -1. ![](https://www.google.com/s2/favicons?domain=https://www.netvibes.com) [Netvibes (https://www.netvibes.com)](https://www.netvibes.com)*: top 10K, de, fr, jp* +1. ![](https://www.google.com/s2/favicons?domain=https://www.netvibes.com) [Netvibes (https://www.netvibes.com)](https://www.netvibes.com)*: top 10K, business, fr*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://opensource.com/) [opensource (https://opensource.com/)](https://opensource.com/)*: top 10K, in, us* 1. 
![](https://www.google.com/s2/favicons?domain=https://discuss.elastic.co/) [Discuss.Elastic.co (https://discuss.elastic.co/)](https://discuss.elastic.co/)*: top 10K, forum, tech, us* 1. ![](https://www.google.com/s2/favicons?domain=https://www.baby.ru/) [Baby.ru (https://www.baby.ru/)](https://www.baby.ru/)*: top 10K, ru* @@ -2599,10 +2599,10 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://www.hozpitality.com) [hozpitality (https://www.hozpitality.com)](https://www.hozpitality.com)*: top 100M* 1. ![](https://www.google.com/s2/favicons?domain=https://kazanlashkigalab.com) [kazanlashkigalab.com (https://kazanlashkigalab.com)](https://kazanlashkigalab.com)*: top 100M, kz* -Alexa.com rank data fetched at (2022-02-26 11:12:43.279519 UTC) +Alexa.com rank data fetched at (2022-02-26 11:41:48.847517 UTC) ## Statistics -Enabled/total sites: 2449/2595 +Enabled/total sites: 2447/2595 Incomplete checks: 586/1978 (false positive risks) From dcf5181e2851e02e0cfb001520d3ef979fb9208e Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Sat, 26 Feb 2022 15:31:15 +0300 Subject: [PATCH 5/8] Fixed several false positives, improved statistics info (#368) * Fixed several false positives, improved statistics info * Updated site list and statistics --- maigret/resources/data.json | 42 +++++++++++++++++++++++++------------ maigret/sites.py | 7 +++++-- sites.md | 12 +++++------ 3 files changed, 40 insertions(+), 21 deletions(-) diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 6ce0b6a..6d521b1 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -5203,11 +5203,14 @@ ], "checkType": "message", "presenceStrs": [ - "userStatsTitle" + "Foursquare " ], "alexaRank": 3413, - "urlMain": "https://ru.foursquare.com/", - "url": "https://ru.foursquare.com/{username}", + "urlMain": "https://foursquare.com/", + "url": "https://foursquare.com/{username}", 
"usernameClaimed": "adam", "usernameUnclaimed": "noonewouldeverusethis7" }, @@ -6310,7 +6313,10 @@ ], "checkType": "message", "absenceStrs": [ - "Page not found." + "Page not found" + ], + "presenseStrs": [ + "title=\"Gumroad\"" ], "alexaRank": 4728, "urlMain": "https://www.gumroad.com/", @@ -8857,7 +8863,10 @@ ], "checkType": "message", "absenceStrs": [ - "\u0417\u0434\u0435\u0441\u044c \u043f\u043e\u043a\u0430 \u043d\u0438\u0447\u0435\u0433\u043e \u043d\u0435\u0442" + "\u041f\u043e \u0412\u0430\u0448\u0435\u043c\u0443 \u0437\u0430\u043f\u0440\u043e\u0441\u0443 \u043d\u0438\u0447\u0435\u0433\u043e \u043d\u0435 \u043d\u0430\u0439\u0434\u0435\u043d\u043e" + ], + "presenseStrs": [ + "\u041b\u044e\u0434\u0438" ], "alexaRank": 6409, "urlMain": "https://mirtesen.ru", @@ -10166,10 +10175,7 @@ "tags": [ "ru" ], - "checkType": "message", - "absenceStrs": [ - "404 - Not Found" - ], + "checkType": "status_code", "alexaRank": 25200, "urlMain": "https://overclockers.ru", "url": "https://overclockers.ru/cpubase/user/{username}", @@ -10714,7 +10720,11 @@ "checkType": "message", "absenceStrs": [ "Hmm, it seems that you've come across an invalid username", - "404 Not Found" + "404 Not Found", + "Member Not Found" + ], + "presenseStrs": [ + "profile on Planet Minecraft to see their public Minecraft community activity" ], "alexaRank": 9050, "urlMain": "https://www.planetminecraft.com", @@ -12851,7 +12861,13 @@ "tags": [ "music" ], - "checkType": "status_code", + "checkType": "message", + "presenseStrs": [ + "Profile: " + ], + "absenceStrs": [ + "Smule | Page Not Found (404)" + ], "alexaRank": 11742, "urlMain": "https://www.smule.com/", "url": "https://www.smule.com/{username}", @@ -13117,7 +13133,7 @@ "us" ], "headers": { - "authorization": "Bearer BQC-v69M-AcXsPLrSktz0Era-J2P1SXWB42HLKRHnCNpj00jLEbbbDFpIFo1UhBKrHrL7FqLQd-X4MIuhFo" + "authorization": "Bearer BQBFTijjpshGAhX7n9-sO46wb8zJIkhu6TT3Ss7b-0V1dw_jXZhcff1agUpqRgbhznOG8pSIRlHtJAtd2TU" }, "errors": { "Spotify is currently not 
available in your country.": "Access denied in your country, use proxy/vpn" @@ -14973,7 +14989,7 @@ "video" ], "headers": { - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2NDExNzg4NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.9rznMue0JmX9SAPuWQDIYR-mmsozFq5PoKUvlvElpkQ" + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2NDU4Nzg1NDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.Bs6VBcKPsl-5dqoThdAImBIex1mas1UcyG2pSnIYqYk" }, "activation": { "url": "https://vimeo.com/_rv/viewer", diff --git a/maigret/sites.py b/maigret/sites.py index 9ea540d..402eb29 100644 --- a/maigret/sites.py +++ b/maigret/sites.py @@ -450,8 +450,11 @@ class MaigretDatabase: for tag in filter(lambda x: not is_country_tag(x), site.tags): tags[tag] = tags.get(tag, 0) + 1 - output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n\n" - output += f"Incomplete checks: {message_checks_one_factor}/{message_checks} (false positive risks)\n\n" + enabled_perc = round(100*(total_count-disabled_count)/total_count, 2) + output += f"Enabled/total sites: {total_count - disabled_count}/{total_count} = {enabled_perc}%\n\n" + + checks_perc = round(100*message_checks_one_factor/message_checks, 2) + output += f"Incomplete checks: {message_checks_one_factor}/{message_checks} = {checks_perc}% (false positive risks)\n\n" top_urls_count = 20 output += f"Top {top_urls_count} profile URLs:\n" diff --git a/sites.md b/sites.md index f035818..90f9c46 100644 --- a/sites.md +++ b/sites.md @@ -249,7 +249,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://forum.xda-developers.com) [XDA (https://forum.xda-developers.com)](https://forum.xda-developers.com)*: top 5K, apps, forum*, search is disabled 1. 
![](https://www.google.com/s2/favicons?domain=https://i.thechive.com/) [Thechive (https://i.thechive.com/)](https://i.thechive.com/)*: top 5K, us* 1. ![](https://www.google.com/s2/favicons?domain=https://999.md) [999.md (https://999.md)](https://999.md)*: top 5K, freelance, md, shopping* -1. ![](https://www.google.com/s2/favicons?domain=https://ru.foursquare.com/) [Foursquare (https://ru.foursquare.com/)](https://ru.foursquare.com/)*: top 5K, geosocial, in* +1. ![](https://www.google.com/s2/favicons?domain=https://foursquare.com/) [Foursquare (https://foursquare.com/)](https://foursquare.com/)*: top 5K, geosocial, in* 1. ![](https://www.google.com/s2/favicons?domain=https://4pda.ru/) [4pda (https://4pda.ru/)](https://4pda.ru/)*: top 5K, ru* 1. ![](https://www.google.com/s2/favicons?domain=https://www.weforum.org) [Weforum (https://www.weforum.org)](https://www.weforum.org)*: top 5K, forum, us* 1. ![](https://www.google.com/s2/favicons?domain=http://www.techspot.com/community/) [techspot.com (http://www.techspot.com/community/)](http://www.techspot.com/community/)*: top 5K, forum, us* @@ -2599,12 +2599,12 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://www.hozpitality.com) [hozpitality (https://www.hozpitality.com)](https://www.hozpitality.com)*: top 100M* 1. 
![](https://www.google.com/s2/favicons?domain=https://kazanlashkigalab.com) [kazanlashkigalab.com (https://kazanlashkigalab.com)](https://kazanlashkigalab.com)*: top 100M, kz* -Alexa.com rank data fetched at (2022-02-26 11:41:48.847517 UTC) +Alexa.com rank data fetched at (2022-02-26 12:19:53.127789 UTC) ## Statistics -Enabled/total sites: 2447/2595 +Enabled/total sites: 2447/2595 = 94.3% -Incomplete checks: 586/1978 (false positive risks) +Incomplete checks: 582/1978 = 29.42% (false positive risks) Top 20 profile URLs: - (796) `{urlMain}/index/8-0-{username} (uCoz)` @@ -2634,9 +2634,9 @@ Top 20 tags: - (40) `NO_TAGS` (non-standard) - (24) `coding` - (23) `photo` -- (19) `news` +- (18) `news` - (18) `blog` -- (18) `music` +- (17) `music` - (15) `tech` - (13) `freelance` - (12) `sharing` From bc787cdf519e4c4fa417c135f059fad2230948ac Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Sat, 26 Feb 2022 16:01:22 +0300 Subject: [PATCH 6/8] Fix false positives (#370) * Fixed several false positives, improved statistics info * Disabled some sites, fixed fp percent count method * Updated site list and statistics --- maigret/resources/data.json | 32 +++++++++++++++++++------------- maigret/sites.py | 2 +- sites.md | 30 +++++++++++++++--------------- 3 files changed, 35 insertions(+), 29 deletions(-) diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 6d521b1..124c439 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -1880,7 +1880,7 @@ ], "checkType": "message", "absenceStrs": [ - "\u041f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044f \u0441 \u0442\u0430\u043a\u0438\u043c \u0438\u043c\u0435\u043d\u0435\u043c \u043d\u0435 \u0441\u0443\u0449\u0435\u0441\u0442\u0432\u0443\u0435\u0442!" 
+ "\u041e\u0448\u0438\u0431\u043a\u0430 / \u041f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044f \u0441 \u0442\u0430\u043a\u0438\u043c \u0438\u043c\u0435\u043d\u0435\u043c \u043d\u0435 \u0441\u0443\u0449\u0435\u0441\u0442\u0432\u0443\u0435\u0442" ], "alexaRank": 2303903, "urlMain": "https://bgforum.ru", @@ -3598,6 +3598,7 @@ "tags": [ "ru" ], + "disabled": true, "checkType": "status_code", "urlMain": "https://dinsk.su", "url": "https://dinsk.su/user/{username}", @@ -4571,6 +4572,7 @@ "tags": [ "ru" ], + "disabled": true, "checkType": "status_code", "alexaRank": 1225740, "urlMain": "https://favera.ru", @@ -8546,6 +8548,7 @@ "tags": [ "forum" ], + "disabled": true, "checkType": "message", "absenceStrs": [ "The specified member cannot be found" @@ -9082,12 +9085,9 @@ }, "Movescount": { "tags": [ - "es", - "in", - "pk", - "ru", - "us" + "maps" ], + "disabled": true, "checkType": "message", "absenceStrs": [ "error=4&" @@ -12891,16 +12891,19 @@ }, "Snooth": { "tags": [ - "in" + "news" ], "checkType": "message", "absenceStrs": [ - "Profiles on Snooth" + "Page not found" + ], + "presenseStrs": [ + "content=\"https://www.snooth.com/author/" ], "alexaRank": 4088489, "urlMain": "https://www.snooth.com/", - "url": "https://www.snooth.com/profiles/{username}/", - "usernameClaimed": "GregT", + "url": "https://www.snooth.com/author/{username}/", + "usernameClaimed": "joshua", "usernameUnclaimed": "noonewouldeverusethis7" }, "SocialLibremOne": { @@ -14989,7 +14992,7 @@ "video" ], "headers": { - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2NDU4Nzg1NDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.Bs6VBcKPsl-5dqoThdAImBIex1mas1UcyG2pSnIYqYk" + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2NDU4Nzk3NDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.T8E8Vrx0sO-9WP4RdZGNQZw2EB1hYTIXbIguXIZbfNQ" }, "activation": { "url": 
"https://vimeo.com/_rv/viewer", @@ -24833,7 +24836,7 @@ "usernameClaimed": "alex", "usernameUnclaimed": "noonewouldeverusethis7" }, - "hashnode.com": { + "hashnode": { "tags": [ "in" ], @@ -24846,7 +24849,7 @@ " name=" ], "absenceStrs": [ - "> Date: Sat, 26 Feb 2022 16:43:40 +0300 Subject: [PATCH 7/8] Fixed the rest of false positives for now (#371) * Fixed the rest of false positives for now * Fixed tag * Updated site list and statistics --- maigret/maigret.py | 2 +- maigret/resources/data.json | 35 ++++++++++++++++++++++++----------- sites.md | 18 +++++++++--------- utils/update_site_data.py | 2 +- 4 files changed, 35 insertions(+), 22 deletions(-) diff --git a/maigret/maigret.py b/maigret/maigret.py index 224b33c..17700d4 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -48,7 +48,7 @@ def notify_about_errors(search_results: QueryResultWrapper, query_notify): for e in errs: if not errors.is_important(e): continue - text = f'Too many errors of type "{e["err"]}" ({e["perc"]}%)' + text = f'Too many errors of type "{e["err"]}" ({round(e["perc"],2)}%)' solution = errors.solution_of(e['err']) if solution: text = '. 
'.join([text, solution.capitalize()]) diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 124c439..dd58ebf 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -1462,6 +1462,7 @@ "forum", "ru" ], + "disabled": true, "checkType": "message", "absenceStrs": [ "\u041f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044c \u043d\u0435 \u043d\u0430\u0439\u0434\u0435\u043d" @@ -18145,6 +18146,7 @@ "tags": [ "ru" ], + "disabled": true, "checkType": "status_code", "alexaRank": 6054365, "urlMain": "http://linuxmint.info", @@ -24487,23 +24489,35 @@ }, "thoughts.com": { "tags": [ - "in" + "blog" + ], + "checkType": "message", + "absenceStrs": [ + "Start a Blog" + ], + "presenseStrs": [ + "– Thoughts.com" ], - "engine": "engine404get", "urlMain": "http://thoughts.com", - "url": "http://thoughts.com/profile/{username}", + "url": "http://thoughts.com/members/{username}", "usernameUnclaimed": "noonewouldeverusethis7", - "usernameClaimed": "red" + "usernameClaimed": "alicia12" }, "hackernoon.com": { "tags": [ - "in", - "us" + "us", + "news" + ], + "checkType": "message", + "absenceStrs": [ + "HackerNoon" + ], + "presenseStrs": [ + " | HackerNoon" ], - "engine": "engine404message", "urlMain": "https://hackernoon.com", "url": "https://hackernoon.com/u/{username}", - "usernameUnclaimed": "noonewouldeverusethis7", + "usernameUnclaimed": "noonewouldeverusethis71", "usernameClaimed": "god" }, "Intigriti": { @@ -28479,10 +28493,9 @@ }, "photoshop-kopona.com": { "absenceStrs": [ - "noonewouldeverusethis7 » \u0420\u0435\u0441\u0443\u0440\u0441\u044b \u0434\u043b\u044f \u0424\u043e\u0442\u043e\u0448\u043e\u043f\u0430" + "