mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-06 22:19:01 +00:00
Fix site checks: 5 fixed, 4 disabled; fix UA leak bug (#2569)
This commit is contained in:
+7
-1
@@ -247,9 +247,15 @@ class CurlCffiChecker(CheckerBase):
|
||||
async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
try:
|
||||
async with CurlCffiAsyncSession() as session:
|
||||
# Strip the User-Agent so curl_cffi can use the impersonated browser's
|
||||
# matching UA. Mixing a random UA with a Chrome TLS fingerprint trips
|
||||
# composite bot scoring (e.g. Cloudflare returns a JS challenge for
|
||||
# "Chrome 91 UA + Chrome 131 TLS"). Keep any site-specific custom headers.
|
||||
headers = {k: v for k, v in (self.headers or {}).items()
|
||||
if k.lower() not in ('user-agent', 'connection')}
|
||||
kwargs = {
|
||||
'url': self.url,
|
||||
'headers': self.headers,
|
||||
'headers': headers or None,
|
||||
'allow_redirects': self.allow_redirects,
|
||||
'timeout': self.timeout if self.timeout else 10,
|
||||
'impersonate': self.browser_emulate,
|
||||
|
||||
@@ -5974,6 +5974,10 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
"Muckrack": {
|
||||
"disabled": true,
|
||||
"protection": [
|
||||
"cf_js_challenge"
|
||||
],
|
||||
"absenceStrs": [
|
||||
"(404) Page Not Found"
|
||||
],
|
||||
@@ -6088,6 +6092,9 @@
|
||||
"tags": [
|
||||
"freelance"
|
||||
],
|
||||
"protection": [
|
||||
"tls_fingerprint"
|
||||
],
|
||||
"checkType": "message",
|
||||
"absenceStrs": [
|
||||
"\"users\":{}"
|
||||
@@ -6712,6 +6719,10 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis777"
|
||||
},
|
||||
"MyFitnessPal": {
|
||||
"disabled": true,
|
||||
"protection": [
|
||||
"custom_bot_protection"
|
||||
],
|
||||
"tags": [
|
||||
"sport"
|
||||
],
|
||||
@@ -6911,6 +6922,9 @@
|
||||
"tags": [
|
||||
"music"
|
||||
],
|
||||
"protection": [
|
||||
"tls_fingerprint"
|
||||
],
|
||||
"checkType": "message",
|
||||
"presenseStrs": [
|
||||
"Points:"
|
||||
@@ -7415,6 +7429,10 @@
|
||||
"tags": [
|
||||
"gaming"
|
||||
],
|
||||
"protection": [
|
||||
"tls_fingerprint"
|
||||
],
|
||||
"ignore403": true,
|
||||
"checkType": "status_code",
|
||||
"alexaRank": 5699,
|
||||
"urlMain": "https://www.moddb.com/",
|
||||
@@ -7757,6 +7775,10 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis77777"
|
||||
},
|
||||
"Morguefile": {
|
||||
"disabled": true,
|
||||
"protection": [
|
||||
"cf_js_challenge"
|
||||
],
|
||||
"absenceStrs": [
|
||||
"free photographs for commercial use"
|
||||
],
|
||||
@@ -8371,6 +8393,9 @@
|
||||
"Muse Score": {
|
||||
"url": "https://musescore.com/{username}",
|
||||
"urlMain": "https://musescore.com/",
|
||||
"protection": [
|
||||
"tls_fingerprint"
|
||||
],
|
||||
"checkType": "status_code",
|
||||
"usernameClaimed": "arrangeme",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
@@ -14042,10 +14067,13 @@
|
||||
"gb",
|
||||
"hk"
|
||||
],
|
||||
"protection": [
|
||||
"tls_fingerprint"
|
||||
],
|
||||
"checkType": "status_code",
|
||||
"alexaRank": 49143,
|
||||
"urlMain": "https://www.mybuilder.com",
|
||||
"url": "https://www.mybuilder.com/profile/view/{username}",
|
||||
"url": "https://www.mybuilder.com/profile/{username}",
|
||||
"usernameClaimed": "adam",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
@@ -16008,6 +16036,10 @@
|
||||
"usernameClaimed": "admin"
|
||||
},
|
||||
"Movieforums": {
|
||||
"disabled": true,
|
||||
"protection": [
|
||||
"cf_js_challenge"
|
||||
],
|
||||
"tags": [
|
||||
"forum",
|
||||
"la"
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
{
|
||||
"version": 1,
|
||||
"updated_at": "2026-04-26T10:28:07Z",
|
||||
"updated_at": "2026-04-26T12:48:48Z",
|
||||
"sites_count": 3139,
|
||||
"min_maigret_version": "0.6.0",
|
||||
"data_sha256": "7a6e2e5d2d970d85a3c0b4ecc6eda34a927b6067bd75f2df0301d8603722428e",
|
||||
"data_sha256": "9aad4f5a0b89c126e033393a79307a7e66ef17e18a3f2fc550bd771aa06a2c56",
|
||||
"data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
|
||||
}
|
||||
@@ -229,7 +229,7 @@ Rank data fetched from Majestic Million by domains.
|
||||
1.  [beacons.ai (https://beacons.ai)](https://beacons.ai)*: top 5K, links*
|
||||
1.  [Artsy (https://www.artsy.net)](https://www.artsy.net)*: top 5K, art*, search is disabled
|
||||
1.  [IFTTT (https://www.ifttt.com/)](https://www.ifttt.com/)*: top 5K, tech*
|
||||
1.  [Muckrack (https://muckrack.com)](https://muckrack.com)*: top 5K, news*
|
||||
1.  [Muckrack (https://muckrack.com)](https://muckrack.com)*: top 5K, news*, search is disabled
|
||||
1.  [Crunchyroll (https://www.crunchyroll.com/)](https://www.crunchyroll.com/)*: top 5K, forum, movies*, search is disabled
|
||||
1.  [Odysee (https://odysee.com/)](https://odysee.com/)*: top 5K, video*
|
||||
1.  [Replit (https://replit.com/)](https://replit.com/)*: top 5K, coding*
|
||||
@@ -258,7 +258,7 @@ Rank data fetched from Majestic Million by domains.
|
||||
1.  [Ultimate-Guitar (https://ultimate-guitar.com/)](https://ultimate-guitar.com/)*: top 5K, music*
|
||||
1.  [ChaturBate (https://chaturbate.com)](https://chaturbate.com)*: top 5K, porn, webcam*
|
||||
1.  [HackerOne (https://hackerone.com/)](https://hackerone.com/)*: top 5K, coding, hacking*
|
||||
1.  [MyFitnessPal (https://www.myfitnesspal.com/)](https://www.myfitnesspal.com/)*: top 5K, sport*
|
||||
1.  [MyFitnessPal (https://www.myfitnesspal.com/)](https://www.myfitnesspal.com/)*: top 5K, sport*, search is disabled
|
||||
1.  [Plurk (https://gab.com/)](https://gab.com/)*: top 5K, social, tw, us*
|
||||
1.  [Contently (https://contently.com/)](https://contently.com/)*: top 5K, freelance*
|
||||
1.  [MyMiniFactory (https://www.myminifactory.com/)](https://www.myminifactory.com/)*: top 5K, 3d, shopping*
|
||||
@@ -310,7 +310,7 @@ Rank data fetched from Majestic Million by domains.
|
||||
1.  [MercadoLivre (https://www.mercadolivre.com.br)](https://www.mercadolivre.com.br)*: top 10K, br*
|
||||
1.  [Tinder (https://tinder.com/)](https://tinder.com/)*: top 10K, dating*
|
||||
1.  [Anobii (https://www.anobii.com)](https://www.anobii.com)*: top 10K, books*
|
||||
1.  [Morguefile (https://morguefile.com)](https://morguefile.com)*: top 10K, photo*
|
||||
1.  [Morguefile (https://morguefile.com)](https://morguefile.com)*: top 10K, photo*, search is disabled
|
||||
1.  [Velog (https://velog.io/)](https://velog.io/)*: top 10K, blog, coding, kr*
|
||||
1.  [Kick (https://kick.com/)](https://kick.com/)*: top 10K, streaming*
|
||||
1.  [domestika.org (https://www.domestika.org)](https://www.domestika.org)*: top 10K, education*
|
||||
@@ -710,7 +710,7 @@ Rank data fetched from Majestic Million by domains.
|
||||
1.  [fotostrana.ru (https://fotostrana.ru)](https://fotostrana.ru)*: top 100K, ru*
|
||||
1.  [bigfooty.com (https://www.bigfooty.com/forum/)](https://www.bigfooty.com/forum/)*: top 100K, au, forum*
|
||||
1.  [Tl (https://tl.net)](https://tl.net)*: top 10M, de, dk*
|
||||
1.  [Movieforums (https://www.movieforums.com)](https://www.movieforums.com)*: top 10M, forum, la*
|
||||
1.  [Movieforums (https://www.movieforums.com)](https://www.movieforums.com)*: top 10M, forum, la*, search is disabled
|
||||
1.  [Crevado (https://crevado.com/)](https://crevado.com/)*: top 10M, design*
|
||||
1.  [Monkeytype (https://monkeytype.com/)](https://monkeytype.com/)*: top 10M, gaming*
|
||||
1.  [Mylot (https://www.mylot.com/)](https://www.mylot.com/)*: top 10M, pl*
|
||||
@@ -3146,13 +3146,13 @@ Rank data fetched from Majestic Million by domains.
|
||||
The list was updated at (2026-04-26)
|
||||
## Statistics
|
||||
|
||||
Enabled/total sites: 2509/3139 = 79.93%
|
||||
Enabled/total sites: 2505/3139 = 79.8%
|
||||
|
||||
Incomplete message checks: 315/2509 = 12.55% (false positive risks)
|
||||
Incomplete message checks: 314/2505 = 12.53% (false positive risks)
|
||||
|
||||
Status code checks: 624/2509 = 24.87% (false positive risks)
|
||||
Status code checks: 624/2505 = 24.91% (false positive risks)
|
||||
|
||||
False positive risk (total): 37.42%
|
||||
False positive risk (total): 37.44%
|
||||
|
||||
Sites with probing: 500px, Armchairgm, BinarySearch (disabled), BleachFandom, Bluesky, BongaCams, Boosty, BuyMeACoffee, Calendly, Cent, Chess, Code Sandbox, Code Snippet Wiki, DailyMotion, Discord, Diskusjon.no, Disqus, Docker Hub, Duolingo, FandomCommunityCentral, GitHub, GitLab, Google Plus (archived), Gravatar, HackTheBox, Hackerrank, Hashnode, Holopin, Imgur, Issuu, Keybase, Kick, Kvinneguiden, LeetCode, Lesswrong, Livejasmin, LocalCryptos (disabled), Medium, MicrosoftLearn, MixCloud, Monkeytype, NPM, Niftygateway, Omg.lol, OnlyFans, Paragraph, Picsart, Plurk, Polarsteps, Rarible, Reddit (disabled), Reddit Search (Pushshift) (disabled), Revolut.me, RoyalCams, Scratch, Soop, SportsTracker, Spotify, StackOverflow, Substack, TAP'D, Topcoder, Trello, Twitch, Twitter, Twitter Shadowban (disabled), UnstoppableDomains, Vimeo, Vivino, Warframe Market, Warpcast, Weibo, Wikipedia, Yapisal (disabled), YouNow, en.brickimedia.org, nightbot, notabug.org, qiwi.me (disabled)
|
||||
|
||||
@@ -3163,7 +3163,7 @@ Top 20 profile URLs:
|
||||
- (312) `/{username}`
|
||||
- (223) `{urlMain}{urlSubpath}/members/?username={username} (XenForo)`
|
||||
- (170) `/user/{username}`
|
||||
- (138) `/profile/{username}`
|
||||
- (139) `/profile/{username}`
|
||||
- (127) `{urlMain}{urlSubpath}/search.php?author={username} (phpBB/Search)`
|
||||
- (120) `{urlMain}{urlSubpath}/member.php?username={username} (vBulletin)`
|
||||
- (116) `/u/{username}`
|
||||
|
||||
@@ -307,3 +307,161 @@ def test_process_site_result_with_error_is_unknown():
|
||||
out = process_site_result(resp, Mock(), Mock(), info, site)
|
||||
assert out["status"].status == MaigretCheckStatus.UNKNOWN
|
||||
assert out["status"].error is not None
|
||||
|
||||
|
||||
# ---- CurlCffiChecker: TLS impersonation header sanitisation ----
|
||||
|
||||
|
||||
class _FakeCurlResponse:
|
||||
def __init__(self, text="ok", status_code=200):
|
||||
self.text = text
|
||||
self.status_code = status_code
|
||||
|
||||
|
||||
class _FakeCurlSession:
|
||||
"""Captures the kwargs of the last .get/.post/.head call for assertions."""
|
||||
|
||||
last_method = None
|
||||
last_kwargs = None
|
||||
|
||||
async def __aenter__(self):
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc, tb):
|
||||
return False
|
||||
|
||||
async def get(self, **kwargs):
|
||||
type(self).last_method = 'get'
|
||||
type(self).last_kwargs = kwargs
|
||||
return _FakeCurlResponse()
|
||||
|
||||
async def post(self, **kwargs):
|
||||
type(self).last_method = 'post'
|
||||
type(self).last_kwargs = kwargs
|
||||
return _FakeCurlResponse()
|
||||
|
||||
async def head(self, **kwargs):
|
||||
type(self).last_method = 'head'
|
||||
type(self).last_kwargs = kwargs
|
||||
return _FakeCurlResponse()
|
||||
|
||||
|
||||
@pytest.fixture
def fake_curl_cffi(monkeypatch):
    """Swap ``checking.CurlCffiAsyncSession`` for the recording fake.

    The captured method and kwargs are cleared first so state from an earlier
    test cannot leak into this one. Returns the fake class so the test can
    inspect ``last_method`` / ``last_kwargs``.
    """
    from maigret import checking

    recorder = _FakeCurlSession
    recorder.last_method = recorder.last_kwargs = None
    monkeypatch.setattr(checking, 'CurlCffiAsyncSession', recorder)
    return recorder
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_curl_cffi_strips_random_user_agent_to_let_impersonation_drive_ua(fake_curl_cffi):
    """Regression: maigret used to forward `get_random_user_agent()` (often Chrome 91)
    to curl_cffi alongside `impersonate="chrome"` (Chrome 131 TLS). Cloudflare composite
    bot scoring rejects the resulting "Chrome 91 UA + Chrome 131 TLS" combo with a JS
    challenge. The fix strips User-Agent and Connection from the headers passed to
    curl_cffi so the impersonation default UA wins.

    Header names are compared lower-cased, because HTTP header names are
    case-insensitive: a leaked ``user-agent`` or ``connection`` must fail the
    test just like the canonical spelling.
    """
    from maigret.checking import CurlCffiChecker

    checker = CurlCffiChecker(logger=Mock(), browser_emulate='chrome')
    checker.prepare(
        url='https://example.com/u/test',
        headers={
            "User-Agent": "Mozilla/5.0 ... Chrome/91.0.4472.124 ...",  # maigret default
            "Connection": "close",  # maigret default
        },
        allow_redirects=True,
        timeout=10,
        method='get',
    )
    await checker.check()

    sent = fake_curl_cffi.last_kwargs
    assert fake_curl_cffi.last_method == 'get'
    assert sent['impersonate'] == 'chrome'

    # ``headers`` may legitimately be None once everything has been stripped.
    sent_names = {name.lower() for name in (sent['headers'] or {})}
    # The whole point of the fix: random UA must not leak through.
    assert 'user-agent' not in sent_names
    # Connection: close also stripped (interferes with impersonation defaults).
    assert 'connection' not in sent_names
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_curl_cffi_preserves_site_specific_headers(fake_curl_cffi):
    """Only User-Agent and Connection may be dropped; every other header
    (e.g. Content-Type for POST APIs, auth tokens, cookies) has to reach
    curl_cffi unchanged.
    """
    from maigret.checking import CurlCffiChecker

    request_headers = {
        "User-Agent": "Mozilla/5.0 random",
        "Connection": "close",
        "Content-Type": "application/json",
        "X-Csrf-Token": "abc123",
    }
    checker = CurlCffiChecker(logger=Mock(), browser_emulate='chrome')
    checker.prepare(
        url='https://example.com/api',
        headers=request_headers,
        allow_redirects=True,
        timeout=10,
        method='get',
    )
    await checker.check()

    sent_headers = fake_curl_cffi.last_kwargs['headers']
    assert sent_headers is not None
    # Site-specific headers survive with their original values.
    for name, value in (("Content-Type", "application/json"), ("X-Csrf-Token", "abc123")):
        assert sent_headers.get(name) == value
    # Sanity: the stripped pair is gone.
    for stripped in ("User-Agent", "Connection"):
        assert stripped not in sent_headers
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_curl_cffi_handles_empty_headers(fake_curl_cffi):
    """With no headers supplied, the ``headers`` kwarg must be None rather than an
    empty dict (which could confuse curl_cffi's impersonation header injection).
    """
    from maigret.checking import CurlCffiChecker

    prepare_args = dict(
        url='https://example.com/u/test',
        headers=None,
        allow_redirects=True,
        timeout=10,
        method='get',
    )
    checker = CurlCffiChecker(logger=Mock(), browser_emulate='chrome')
    checker.prepare(**prepare_args)
    await checker.check()

    sent = fake_curl_cffi.last_kwargs
    assert sent['headers'] is None
    assert sent['impersonate'] == 'chrome'
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_curl_cffi_strips_ua_for_post_too(fake_curl_cffi):
    """POST requests get the same User-Agent strip as GET (e.g. Discord-style POST
    username probes with `tls_fingerprint`), while the payload and the remaining
    headers are passed through.
    """
    from maigret.checking import CurlCffiChecker

    body = {"username": "test"}
    request_headers = {
        "User-Agent": "Mozilla/5.0 random",
        "Content-Type": "application/json",
    }
    checker = CurlCffiChecker(logger=Mock(), browser_emulate='chrome')
    checker.prepare(
        url='https://example.com/api/check',
        headers=request_headers,
        allow_redirects=True,
        timeout=10,
        method='post',
        payload=body,
    )
    await checker.check()

    assert fake_curl_cffi.last_method == 'post'
    sent = fake_curl_cffi.last_kwargs
    assert sent['json'] == {"username": "test"}
    forwarded = sent['headers']
    assert "User-Agent" not in forwarded
    assert forwarded.get("Content-Type") == "application/json"
|
||||
|
||||
Reference in New Issue
Block a user