From e962b8c693814bc7d375e2ee503b68f62b9037f6 Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Sat, 25 Apr 2026 18:15:38 +0200 Subject: [PATCH] Fix site checks: 5 fixed; readme fix (#2562) * Fix site checks: 5 fixed; readme fix * Logging improvements * Improve YouTube data extraction --- README.md | 2 +- maigret/checking.py | 8 +++++++- maigret/resources/data.json | 16 ++++++++++------ maigret/resources/db_meta.json | 4 ++-- sites.md | 6 +++--- 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 6086204..8cb4c9c 100644 --- a/README.md +++ b/README.md @@ -109,7 +109,7 @@ Download a standalone EXE from [Releases](https://github.com/soxoj/maigret/relea Run Maigret in the browser via cloud shells or Jupyter notebooks: -[![Open in Cloud Shell](https://user-images.githubusercontent.com/27065646/92304704-8d146d80-ef80-11ea-8c29-0deaabb1c702.png)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=cloudshell-tutorial.md) +Open in Cloud Shell Run on Replit Open In Colab diff --git a/maigret/checking.py b/maigret/checking.py index 6e8cc73..97b542d 100644 --- a/maigret/checking.py +++ b/maigret/checking.py @@ -345,7 +345,11 @@ def process_site_result( username = results_info["username"] is_parsing_enabled = results_info["parsing_enabled"] url = results_info.get("url_user") - logger.info(url) + url_probe = results_info.get("url_probe") or url + if url_probe != url: + logger.info(f"{url_probe} (display: {url})") + else: + logger.info(url) status = results_info.get("status") if status is not None: @@ -603,6 +607,8 @@ def make_site_result( for k, v in site.get_params.items(): url_probe += f"&{k}={v}" + results_site["url_probe"] = url_probe + if site.request_method: request_method = site.request_method.lower() elif site.check_type == "status_code" and site.request_head_only: diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 5ef5dc1..b089018 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -40,7 +40,7 @@ ], "alexaRank": 3, "urlMain": "https://www.youtube.com/", - "url": "https://www.youtube.com/@{username}", + "url": "https://www.youtube.com/@{username}/about", "usernameClaimed": "test", "usernameUnclaimed": "noonewouldeverusethis777" }, @@ -63,7 +63,7 @@ ], "alexaRank": 3, "urlMain": "https://www.youtube.com/", - "url": "https://www.youtube.com/@{username}", + "url": "https://www.youtube.com/@{username}/about", "usernameClaimed": "test", "usernameUnclaimed": "noonewouldeverusethis777" }, @@ -100,7 +100,7 @@ "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"", "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA", "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36", - "x-guest-token": "2045154491230572773" + "x-guest-token": "2048070238281826593" }, "errors": { "Bad guest token": "x-guest-token update required" @@ -296,7 +296,7 @@ "method": "vimeo" }, "headers": { - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzY0Mzg3MjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiNjY0OWY3ZWItMThjZS00ODU1LWIzNmEtNWY3MzRkOGZhNjAyIn0.l1SRcr5UqvxqYLveW7MTECKSfkgsbh1y9QZqZmBX1EI" + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzcxMzM4ODAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZjFiMGJjNWUtMjIyOC00Y2I1LWFlNmItODk0YjZhNGNmODJhIn0.YCPXekRrHnJy8iH1gX4iVuNURiw6sU_FlmsfHnM2oug" }, "urlProbe": "https://api.vimeo.com/users/{username}?fields=name%2Cgender%2Cbio%2Curi%2Clink%2Cbackground_video%2Clocation_details%2Cpictures%2Cverified%2Cmetadata.public_videos.total%2Cavailable_for_hire%2Ccan_work_remotely%2Cmetadata.connections.videos.total%2Cmetadata.connections.albums.total%2Cmetadata.connections.followers.total%2Cmetadata.connections.following.total%2Cmetadata.public_videos.total%2Cmetadata.connections.vimeo_experts.is_enrolled%2Ctotal_collection_count%2Ccreated_time%2Cprofile_preferences%2Cmembership%2Cclients%2Cskills%2Cproject_types%2Crates%2Ccategories%2Cis_expert%2Cprofile_discovery%2Cwebsites%2Ccontact_emails&fetch_user_profile=1", "checkType": "status_code", @@ -1339,6 +1339,9 @@ "did not match any articles", "not match" ], + "errors": { + "Our systems have detected unusual traffic": "Google rate-limit / captcha" + }, "tags": [ "education", "research" @@ -5639,8 +5642,8 @@ "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36", "user-id": "0", "x-bc": "0a106d301866494c873ae3a05bc3c97cee59a749", - "time": "1776959404882", - "sign": "57203:46ddb95bceab303946739ba884f008f6a2118657:646:69cfa6d8", + "time": "1777132991121", + "sign": "57203:3723aa7d500e76eabca29df74e4e97c483f14204:66d:69cfa6d8", "referer": "https://onlyfans.com/", "cookie": "__cf_bm=YovfzPN0T_wg6F60L5eZKPOQvlGESws3UDGgEkmPb9A-1776790253-1.0.1.1-KRZgptNe5P9epBZSdITa12VfTEDlDdLckPY3I2FDAacvCPxOj0PqeK86J5mcC7UQ_TM8_O24bAh21ElYINovqk2386EoJYyLmknHJ5UsFts" }, @@ -11185,6 +11188,7 @@ "alexaRank": 14969, "urlMain": "https://www.vivino.com/", "url": "https://www.vivino.com/users/{username}", + "urlProbe": "https://api.vivino.com/users/{username}", "usernameClaimed": "adam", "usernameUnclaimed": "noonewouldeverusethis7" }, diff --git a/maigret/resources/db_meta.json b/maigret/resources/db_meta.json index b11ade7..3974a67 100644 --- a/maigret/resources/db_meta.json +++ b/maigret/resources/db_meta.json @@ -1,8 +1,8 @@ { "version": 1, - "updated_at": "2026-04-23T19:44:45Z", + "updated_at": "2026-04-25T16:11:27Z", "sites_count": 3139, "min_maigret_version": "0.6.0", - "data_sha256": "35bfbb1271a50890c78a03d8e9d9f8d07f78de0e140c8232626de2f6eb124bae", + "data_sha256": "c51ecaa6c0736c5e1e7ca91aaf111445b3ac9ce9541a472d97db2dcc3ff8aa17", "data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json" } \ No newline at end of file diff --git a/sites.md b/sites.md index 75a757e..a8c7555 100644 --- a/sites.md +++ b/sites.md @@ -3143,7 +3143,7 @@ Rank data fetched from Majestic Million by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://flarum.es) [flarum.es (https://flarum.es)](https://flarum.es)*: top 100M, es, forum* 1. ![](https://www.google.com/s2/favicons?domain=https://forum.fibra.click) [forum.fibra.click (https://forum.fibra.click)](https://forum.fibra.click)*: top 100M, forum, it* -The list was updated at (2026-04-23) +The list was updated at (2026-04-25) ## Statistics Enabled/total sites: 2510/3139 = 79.96% @@ -3154,7 +3154,7 @@ Status code checks: 625/2510 = 24.9% (false positive risks) False positive risk (total): 37.53% -Sites with probing: 500px, Armchairgm, BinarySearch (disabled), BleachFandom, Bluesky, BongaCams, Boosty, BuyMeACoffee, Calendly, Cent, Chess, Code Sandbox, Code Snippet Wiki, DailyMotion, Discord, Diskusjon.no, Disqus, Docker Hub, Duolingo, FandomCommunityCentral, GitHub, GitLab, Google Plus (archived), Gravatar, HackTheBox, Hackerrank, Hashnode, Holopin, Imgur, Issuu, Keybase, Kick, Kvinneguiden, LeetCode, Lesswrong, Livejasmin, LocalCryptos (disabled), Medium, MicrosoftLearn, MixCloud, Monkeytype, NPM, Niftygateway, Omg.lol, OnlyFans, Paragraph, Picsart, Plurk, Polarsteps, Rarible, Reddit (disabled), Reddit Search (Pushshift) (disabled), Revolut.me, RoyalCams, Scratch, Soop, SportsTracker, Spotify, StackOverflow, Substack, TAP'D, Topcoder, Trello, Twitch, Twitter, Twitter Shadowban (disabled), UnstoppableDomains, Vimeo, Warframe Market, Warpcast, Weibo, Wikipedia, Yapisal (disabled), YouNow, en.brickimedia.org, nightbot, notabug.org, qiwi.me (disabled) +Sites with probing: 500px, Armchairgm, BinarySearch (disabled), BleachFandom, Bluesky, BongaCams, Boosty, BuyMeACoffee, Calendly, Cent, Chess, Code Sandbox, Code Snippet Wiki, DailyMotion, Discord, Diskusjon.no, Disqus, Docker Hub, Duolingo, FandomCommunityCentral, GitHub, GitLab, Google Plus (archived), Gravatar, HackTheBox, Hackerrank, Hashnode, Holopin, Imgur, Issuu, Keybase, Kick, Kvinneguiden, LeetCode, Lesswrong, Livejasmin, LocalCryptos (disabled), Medium, MicrosoftLearn, MixCloud, Monkeytype, NPM, Niftygateway, Omg.lol, OnlyFans, Paragraph, Picsart, Plurk, Polarsteps, Rarible, Reddit (disabled), Reddit Search (Pushshift) (disabled), Revolut.me, RoyalCams, Scratch, Soop, SportsTracker, Spotify, StackOverflow, Substack, TAP'D, Topcoder, Trello, Twitch, Twitter, Twitter Shadowban (disabled), UnstoppableDomains, Vimeo, Vivino, Warframe Market, Warpcast, Weibo, Wikipedia, Yapisal (disabled), YouNow, en.brickimedia.org, nightbot, notabug.org, qiwi.me (disabled) Sites with activation: OnlyFans, Twitter, Vimeo, Weibo @@ -3169,7 +3169,7 @@ Top 20 profile URLs: - (116) `/u/{username}` - (93) `/users/{username}` - (87) `{urlMain}/u/{username}/summary (Discourse)` -- (70) `/@{username}` +- (68) `/@{username}` - (55) `/wiki/User:{username}` - (45) `SUBDOMAIN` - (38) `/members/?username={username}`