From 55fc8694ed4743033be518ef969398e0490e76bb Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Tue, 24 Mar 2026 23:01:11 +0100 Subject: [PATCH] Fix false-positive site checks reported by Maigret Bot (#2376) --- LLM/site-checks-guide.md | 20 ++++++++++ LLM/site-checks-playbook.md | 1 + maigret/resources/data.json | 80 ++++++++++++++++++++++--------------- 3 files changed, 68 insertions(+), 33 deletions(-) diff --git a/LLM/site-checks-guide.md b/LLM/site-checks-guide.md index f5ad1a8..b0ab2c6 100644 --- a/LLM/site-checks-guide.md +++ b/LLM/site-checks-guide.md @@ -427,6 +427,26 @@ https://gql.hashnode.com?query=%7Buser(username%3A%20%22melwinalm%22)%20%7B%20na **Lesson:** When a `urlProbe` needs literal curly braces (GraphQL, JSON in URL, etc.), percent-encode them. This is a general technique for any `data.json` URL field processed by `.format()`. +### 7.15 Rate-limit responses belong in `errors`, not `absenceStrs` + +When a site's API returns a rate-limit response, the text may **not** match the `absenceStrs` entry — either because the wording varies between API versions (`"The resource is being rate limited"` vs `"You are being rate limited."`) or because the JSON structure differs entirely. If the rate-limit string is in `absenceStrs` and the actual response uses a different phrasing, **no** absence string matches. With empty `presenseStrs` (presence always true), the result is a false **CLAIMED**. + +**Fix:** Move rate-limit strings out of `absenceStrs` and into `errors` (mapping to `"Rate limited"` or similar). The `errors` mechanism produces an **UNKNOWN** result instead of CLAIMED or NOT FOUND, which is the correct semantic: rate limiting means "we don't know", not "user exists" or "user doesn't exist". + +```json +{ + "absenceStrs": ["{\"taken\":false}"], + "errors": { + "The resource is being rate limited": "Rate limited", + "You are being rate limited": "Rate limited" + } +} +``` + +**General rule:** Any response that means "I can't answer right now" (rate limit, maintenance page, CAPTCHA, temporary ban) should go into `errors`, never into `absenceStrs` or `presenseStrs`. Only strings that reliably indicate "user does / does not exist" belong in the presence/absence lists. + +**Discord example (2026-03-24):** The POST API at `discord.com/api/v9/unique-username/username-attempt-unauthed` returns `{"taken":true}` / `{"taken":false}` normally, but under load returns varying rate-limit messages. Keeping only `{"taken":false}` in `absenceStrs` and all rate-limit variants in `errors` eliminates the transient false positives the Maigret bot was reporting. + ### 7.7 The playbook classification works The decision tree from the documentation accurately describes real-world cases: diff --git a/LLM/site-checks-playbook.md b/LLM/site-checks-playbook.md index c8e33e6..02ab330 100644 --- a/LLM/site-checks-playbook.md +++ b/LLM/site-checks-playbook.md @@ -81,6 +81,7 @@ Practical observations from fixing top-ranked sites. Full details: section **7** | **GraphQL supports GET too** | hashnode GraphQL works via `GET ?query=...` (URL-encoded). You can use either native POST payloads or GET `urlProbe` for GraphQL. | | **URL-encode braces for template safety** | GraphQL `{...}` conflicts with Maigret's `{username}`. Use `%7B`/`%7D` for literal braces in `urlProbe` — `.format()` ignores percent-encoded chars. | | **Anti-bot bypass via simple UA** | "Anubis" anti-bot PoW screens (like on Weblate) intercept modern browser UAs via HTTP 307. Hardcoding `"headers": {"User-Agent": "python-requests/2.25.1"}` circumvents the scraper filter and restores default detection logic. | +| **Rate-limit → `errors`, not `absenceStrs`** | Rate-limit wording varies across API versions. If the phrasing doesn't match `absenceStrs` and `presenseStrs` is empty, the result is a false CLAIMED. Put all "can't answer right now" strings (rate limit, CAPTCHA, maintenance) in `errors` so the result is UNKNOWN. | ## 8. Documentation maintenance diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 3cc72e2..d426c72 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -911,8 +911,7 @@ "urlProbe": "https://discord.com/api/v9/unique-username/username-attempt-unauthed", "checkType": "message", "absenceStrs": [ - "{\"taken\":false}", - "The resource is being rate limited" + "{\"taken\":false}" ], "headers": { "Content-Type": "application/json" @@ -923,6 +922,11 @@ "requestMethod": "POST", "requestPayload": { "username": "{username}" + }, + "errors": { + "The resource is being rate limited": "Rate limited", + "You are being rate limited": "Rate limited", + "rate_limited": "Rate limited" } }, "Unsplash": { @@ -10728,12 +10732,19 @@ ] }, "Rarible": { - "url": "https://rarible.com/marketplace/api/v4/urls/{username}", + "url": "https://rarible.com/{username}", "urlMain": "https://rarible.com/", - "checkType": "status_code", + "checkType": "message", "usernameClaimed": "blue", "usernameUnclaimed": "noonewouldeverusethis7", - "alexaRank": 10488 + "alexaRank": 10488, + "presenseStrs": [ + "\"type\":\"USER\"" + ], + "absenceStrs": [ + "Page Not Found" + ], + "urlProbe": "https://rarible.com/marketplace/api/v4/urls/{username}" }, "Computerbase": { "disabled": true, @@ -16328,7 +16339,8 @@ ], "usernameClaimed": "Aimilios", "usernameUnclaimed": "noonewouldeverusethis7", - "alexaRank": 89235 + "alexaRank": 89235, + "disabled": true }, "Icheckmovies": { "tags": [ @@ -17125,21 +17137,7 @@ "usernameUnclaimed": "noonewouldeverusethis7" }, "massagerepublic.com": { - "checkType": "message", - "absenceStrs": [ - "", - "", - " ", - "replaceState", - "

404 - Page not found

" - ], - "presenseStrs": [ - " title=", - " style=", - "og:title", - "page-title", - "female" - ], + "checkType": "status_code", "url": "https://massagerepublic.com/u/{username}", "urlMain": "https://massagerepublic.com", "usernameClaimed": "lily88", @@ -17170,7 +17168,8 @@ "urlMain": "https://mama.ru", "url": "https://mama.ru/members/{username}", "usernameClaimed": "irina", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "noonewouldeverusethis7", + "disabled": true }, "W7forums": { "engine": "XenForo", @@ -20007,14 +20006,20 @@ "CSSBattle": { "url": "https://cssbattle.dev/player/{username}", "urlMain": "https://cssbattle.dev", - "checkType": "status_code", + "checkType": "message", "usernameClaimed": "beo", "usernameUnclaimed": "noonewouldeverusethis7", "tags": [ "coding", "de" ], - "alexaRank": 317659 + "alexaRank": 317659, + "presenseStrs": [ + "\"player\":{\"id\"" + ], + "absenceStrs": [ + "\"player\":null" + ] }, "jeepgarage.org": { "urlMain": "https://jeepgarage.org", @@ -21979,7 +21984,8 @@ "urlMain": "https://nekto.me", "url": "https://nekto.me/{username}/", "usernameClaimed": "green", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "noonewouldeverusethis7", + "disabled": true }, "Play.md": { "tags": [ @@ -22510,7 +22516,8 @@ "urlMain": "http://www.volgogradru.com", "url": "http://www.volgogradru.com/users/{username}/", "usernameClaimed": "rezook", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "noonewouldeverusethis7", + "disabled": true }, "intoclassics.net": { "tags": [ @@ -22558,7 +22565,8 @@ "urlMain": "https://www.freelanced.com", "url": "https://www.freelanced.com/{username}", "usernameClaimed": "mattphilleo", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "noonewouldeverusethis7", + "disabled": true }, "samesound.ru": { "tags": [ @@ -23234,7 +23242,8 @@ "urlMain": "http://mathhelpplanet.com", "url": "http://mathhelpplanet.com/search.php?keywords=&terms=all&author={username}", "usernameClaimed": "adam", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "noonewouldeverusethis7", + "disabled": true }, "wasm.in": { "urlMain": "https://wasm.in", @@ -23399,7 +23408,8 @@ "tags": [ "video" ], - "alexaRank": 777866 + "alexaRank": 777866, + "disabled": true }, "Caduser": { "tags": [ @@ -23475,7 +23485,8 @@ "engine": "vBulletin", "usernameClaimed": "alex", "usernameUnclaimed": "noonewouldeverusethis7", - "alexaRank": 803230 + "alexaRank": 803230, + "disabled": true }, "forum.rarib.ag": { "urlMain": "https://forum.rarib.ag", @@ -24230,7 +24241,8 @@ ], "usernameClaimed": "hacker", "usernameUnclaimed": "noonewouldeverusethis7", - "alexaRank": 989313 + "alexaRank": 989313, + "disabled": true }, "0-3.RU": { "tags": [ @@ -26642,7 +26654,8 @@ "urlMain": "https://www.hairmaniac.ru/", "url": "https://www.hairmaniac.ru/profile/{username}/", "usernameClaimed": "irina", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "noonewouldeverusethis7", + "disabled": true }, "Handgunforum": { "disabled": true, @@ -29025,7 +29038,8 @@ "urlMain": "https://subeta.net/", "url": "https://subeta.net/users/{username}", "usernameClaimed": "green", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "noonewouldeverusethis7", + "disabled": true }, "Suzuri.jp": { "checkType": "message",