Add site protection tracking system, fix broken site checks (Instagra… (#2452)

* Add site protection tracking system, fix broken site checks (Instagram, StackOverflow, LeetCode, Boosty, LiveLib), preserve unicode in data.json

* Update poetry.lock by running poetry lock

Agent-Logs-Url: https://github.com/soxoj/maigret/sessions/14333f41-67d5-4e28-a782-9730b31fc667

Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
This commit is contained in:
Soxoj
2026-04-02 20:28:20 +02:00
committed by Soxoj
parent c8a183683a
commit 99847ad3e7
8 changed files with 892 additions and 705 deletions
+1
View File
@@ -43,3 +43,4 @@ settings.json
# other
*.egg-info
build
LLM
+24
View File
@@ -137,6 +137,30 @@ There are few options for sites data.json helpful in various cases:
- ``regexCheck`` - a regex to check if the username is valid, in case of frequent false-positives
- ``requestMethod`` - set the HTTP method to use (e.g., ``POST``). By default, Maigret uses GET or HEAD.
- ``requestPayload`` - a dictionary with the JSON payload to send for POST requests (e.g., ``{"username": "{username}"}``), extremely useful for parsing GraphQL or modern JSON APIs.
- ``protection`` - a list of protection types detected on the site (see below).
``protection`` (site protection tracking)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The ``protection`` field records what kind of anti-bot protection a site uses. Maigret reads this field and automatically applies the appropriate bypass mechanism.
Supported values:
- ``tls_fingerprint`` — the site fingerprints the TLS handshake (JA3/JA4) and blocks non-browser clients. Maigret automatically uses ``curl_cffi`` with Chrome browser emulation to bypass this. Requires the ``curl_cffi`` package (included as a dependency). Examples: Instagram, NPM, Codepen, Kickstarter, Letterboxd.
- ``ip_reputation`` — the site blocks requests from datacenter/cloud IPs regardless of headers or TLS. Cannot be bypassed automatically; run Maigret from a regular internet connection (not a datacenter) or use a proxy (``--proxy``). Examples: Reddit, Patreon, Figma.
- ``js_challenge`` — the site serves a JavaScript challenge page (e.g. "Just a moment...") that cannot be solved without a browser. Maigret detects challenge signatures and returns UNKNOWN instead of a false positive.
Example:
.. code-block:: json
"Instagram": {
"url": "https://www.instagram.com/{username}/",
"checkType": "message",
"presenseStrs": ["\"routePath\":\"\\/"],
"absenceStrs": ["\"routePath\":null"],
"protection": ["tls_fingerprint"]
}
``urlProbe`` (optional profile probe URL)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+82 -2
View File
@@ -213,6 +213,76 @@ class AiodnsDomainResolver(CheckerBase):
return text, status, error
try:
from curl_cffi.requests import AsyncSession as CurlCffiAsyncSession
CURL_CFFI_AVAILABLE = True
except ImportError:
CURL_CFFI_AVAILABLE = False
class CurlCffiChecker(CheckerBase):
    """Checker that sends requests through curl_cffi with browser impersonation.

    The request is configured with ``prepare()`` and executed with ``check()``.
    Each ``check()`` call opens its own curl_cffi async session and passes
    ``browser_emulate`` as the ``impersonate`` argument, so the request
    emulates a browser TLS fingerprint to bypass WAF.
    """

    def __init__(self, *args, **kwargs):
        """Create the checker.

        Positional arguments are accepted but not used. Recognised keyword
        arguments are ``logger`` (defaults to a ``Mock()``) and
        ``browser_emulate`` (defaults to ``'chrome'``).
        """
        self.logger = kwargs.get('logger', Mock())
        self.browser_emulate = kwargs.get('browser_emulate', 'chrome')

        # Request parameters; real values are supplied later via prepare().
        self.url = None
        self.headers = None
        self.payload = None
        self.method = 'get'
        self.allow_redirects = True
        self.timeout = 0

    def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
        """Store the parameters of the request that check() will send.

        Always returns ``None``.
        """
        self.url, self.headers = url, headers
        self.method, self.payload = method, payload
        self.allow_redirects = allow_redirects
        self.timeout = timeout
        return None

    async def close(self):
        """Do nothing: no session is kept open between checks."""
        pass

    async def check(self) -> Tuple[str, int, Optional[CheckError]]:
        """Send the prepared request and return ``(text, status_code, error)``.

        On success ``error`` is ``None`` unless the status code is 0, which is
        reported as "Connection lost". On any failure the result is
        ``(None, 0, CheckError(...))`` describing the failure.
        """
        try:
            async with CurlCffiAsyncSession() as session:
                request_args = dict(
                    url=self.url,
                    headers=self.headers,
                    allow_redirects=self.allow_redirects,
                    # A falsy timeout (0/None) falls back to 10.
                    timeout=self.timeout or 10,
                    impersonate=self.browser_emulate,
                )

                # The JSON payload is attached for POST requests only.
                if self.payload and self.method.lower() == 'post':
                    request_args['json'] = self.payload

                # Any method other than POST/HEAD is sent as GET.
                verb = self.method.lower()
                if verb == 'post':
                    send = session.post
                elif verb == 'head':
                    send = session.head
                else:
                    send = session.get
                response = await send(**request_args)

                status_code = response.status_code
                decoded_content = response.text
                self.logger.debug(decoded_content)

                error = None
                if status_code == 0:
                    error = CheckError("Connection lost")
                return decoded_content, status_code, error

        except asyncio.TimeoutError as exc:
            return None, 0, CheckError("Request timeout", str(exc))
        except KeyboardInterrupt:
            return None, 0, CheckError("Interrupted")
        except Exception as exc:
            # Log the traceback at debug level and report a generic error.
            self.logger.debug(exc, exc_info=True)
            return None, 0, CheckError("Unexpected", str(exc))
class CheckerMock:
def __init__(self, *args, **kwargs):
pass
@@ -469,8 +539,18 @@ def make_site_result(
# workaround to prevent slash errors
url = re.sub("(?<!:)/+", "/", url)
# always clearweb_checker for now
checker = options["checkers"][site.protocol]
# Select checker: use curl_cffi for sites requiring TLS impersonation
needs_impersonation = 'tls_fingerprint' in site.protection
if needs_impersonation and CURL_CFFI_AVAILABLE:
checker = CurlCffiChecker(logger=logger, browser_emulate='chrome')
elif needs_impersonation and not CURL_CFFI_AVAILABLE:
logger.warning(
f"Site {site.name} requires TLS impersonation (curl_cffi) but it's not installed. "
"Install with: pip install curl_cffi"
)
checker = options["checkers"][site.protocol]
else:
checker = options["checkers"][site.protocol]
# site check is disabled
if site.disabled and not options['forced']:
+90 -43
View File
@@ -68,23 +68,29 @@
"usernameUnclaimed": "noonewouldeverusethis777"
},
"Instagram": {
"disabled": true,
"tags": [
"photo",
"social"
],
"errors": {
"Login • Instagram": "Login required"
},
"checkType": "message",
"presenseStrs": [
"<div id=\"splash-screen\">"
"\"routePath\":\"\\/{username}"
],
"absenceStrs": [
"\"routePath\":null"
],
"errors": {
"Login • Instagram": "Login required",
"Just a moment": "Cloudflare challenge"
},
"alexaRank": 4,
"urlMain": "https://www.instagram.com/",
"url": "https://www.instagram.com/{username}",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
"url": "https://www.instagram.com/{username}/",
"usernameClaimed": "cristiano",
"usernameUnclaimed": "noonewouldeverusethis77777",
"protection": [
"tls_fingerprint"
]
},
"Twitter": {
"tags": [
@@ -95,7 +101,7 @@
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"x-guest-token": "2037668354144538994"
"x-guest-token": "2039637579922866279"
},
"errors": {
"Bad guest token": "x-guest-token update required"
@@ -288,7 +294,7 @@
"method": "vimeo"
},
"headers": {
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzQ2NTM5MDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZDM3N2QyY2EtYTEyMC00NWRlLThkYjAtMGUzYWZlNWQ0NGRlIn0.5ZfqU66p6wQtFNg5-7Syrmu3mXTOzQ4Tju97eaw3Nbo"
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzUxMjM0MDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZDY4YjViMGMtYTE3OC00ZDdhLWIyM2QtMDg5Y2MwZjAwOGEyIn0.0bGwlqckn4J07em2-nEX10OfW1JAmi54QCrPtm8Qn6A"
},
"urlProbe": "https://api.vimeo.com/users/{username}?fields=name%2Cgender%2Cbio%2Curi%2Clink%2Cbackground_video%2Clocation_details%2Cpictures%2Cverified%2Cmetadata.public_videos.total%2Cavailable_for_hire%2Ccan_work_remotely%2Cmetadata.connections.videos.total%2Cmetadata.connections.albums.total%2Cmetadata.connections.followers.total%2Cmetadata.connections.following.total%2Cmetadata.public_videos.total%2Cmetadata.connections.vimeo_experts.is_enrolled%2Ctotal_collection_count%2Ccreated_time%2Cprofile_preferences%2Cmembership%2Cclients%2Cskills%2Cproject_types%2Crates%2Ccategories%2Cis_expert%2Cprofile_discovery%2Cwebsites%2Ccontact_emails&fetch_user_profile=1",
"checkType": "status_code",
@@ -1079,18 +1085,18 @@
},
"StackOverflow": {
"similarSearch": true,
"absenceStrs": [
"no-search-results"
],
"presenseStrs": [
"user-info",
" user-details"
],
"url": "https://stackoverflow.com/users/filter?search={username}",
"urlProbe": "https://api.stackexchange.com/2.3/users?order=desc&sort=name&inname={username}&site=stackoverflow",
"urlMain": "https://stackoverflow.com",
"checkType": "message",
"presenseStrs": [
"\"items\":[{"
],
"absenceStrs": [
"\"items\":[]"
],
"usernameClaimed": "maigret",
"usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "message",
"tags": [
"coding"
],
@@ -1651,7 +1657,10 @@
"urlMain": "https://www.kickstarter.com",
"url": "https://www.kickstarter.com/profile/{username}",
"usernameClaimed": "zhovner",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
"protection": [
"tls_fingerprint"
]
},
"Change.org": {
"tags": [
@@ -2128,7 +2137,10 @@
"urlMain": "https://www.npmjs.com/",
"url": "https://www.npmjs.com/~{username}",
"usernameClaimed": "kennethsweezy",
"usernameUnclaimed": "noonewould"
"usernameUnclaimed": "noonewould",
"protection": [
"tls_fingerprint"
]
},
"NPM-Package": {
"tags": [
@@ -2139,7 +2151,10 @@
"urlMain": "https://www.npmjs.com/",
"url": "https://www.npmjs.com/package/{username}",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
"protection": [
"tls_fingerprint"
]
},
"Allods": {
"urlSubpath": "/forums",
@@ -2883,6 +2898,9 @@
"alexaRank": 932,
"tags": [
"design"
],
"protection": [
"ip_reputation"
]
},
"forum.pkp.sfu.ca": {
@@ -3013,7 +3031,10 @@
"urlMain": "https://codepen.io/",
"url": "https://codepen.io/{username}",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
"protection": [
"tls_fingerprint"
]
},
"Rottentomatoes": {
"tags": [
@@ -5353,7 +5374,10 @@
"urlMain": "https://letterboxd.com/",
"url": "https://letterboxd.com/{username}",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
"protection": [
"tls_fingerprint"
]
},
"MyAnimeList": {
"tags": [
@@ -6057,16 +6081,29 @@
"disabled": true
},
"LeetCode": {
"url": "https://leetcode.com/u/{username}/",
"urlProbe": "https://leetcode.com/graphql/",
"urlMain": "https://leetcode.com/",
"checkType": "message",
"requestMethod": "POST",
"requestPayload": {
"query": "{{ matchedUser(username: \"{username}\") {{ username }} }}"
},
"headers": {
"Content-Type": "application/json"
},
"presenseStrs": [
"\"username\":"
],
"absenceStrs": [
"\"matchedUser\":null"
],
"usernameClaimed": "soxoj",
"usernameUnclaimed": "noonewouldeverusethis7",
"tags": [
"coding"
],
"disabled": true,
"checkType": "status_code",
"alexaRank": 3061,
"urlMain": "https://leetcode.com/",
"url": "https://leetcode.com/{username}",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
"alexaRank": 3061
},
"Teletype": {
"tags": [
@@ -7045,21 +7082,22 @@
"usernameUnclaimed": "noonewouldeverusethis7"
},
"Boosty": {
"url": "https://boosty.to/{username}",
"urlProbe": "https://api.boosty.to/v1/blog/{username}",
"urlMain": "https://boosty.to",
"checkType": "message",
"presenseStrs": [
"\"id\":"
],
"absenceStrs": [
"blog_not_found"
],
"usernameClaimed": "soxoj",
"usernameUnclaimed": "noonewouldeverusethis7",
"tags": [
"ru"
],
"checkType": "message",
"absenceStrs": [
"<title></title>"
],
"presenseStrs": [
"Boosty </title>"
],
"alexaRank": 5155,
"urlMain": "https://boosty.to",
"url": "https://boosty.to/{username}",
"usernameClaimed": "adam",
"usernameUnclaimed": "noonewouldeverusethis7"
"alexaRank": 5155
},
"Soup": {
"tags": [
@@ -12334,7 +12372,10 @@
"url": "https://www.picuki.com/profile/{username}",
"source": "Instagram",
"usernameClaimed": "adam",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
"protection": [
"js_challenge"
]
},
"1x": {
"tags": [
@@ -15431,6 +15472,9 @@
"source": "Instagram",
"tags": [
"photo"
],
"protection": [
"ip_reputation"
]
},
"forum.spyderco.com": {
@@ -22069,6 +22113,9 @@
"source": "Instagram",
"tags": [
"photo"
],
"protection": [
"ip_reputation"
]
},
"crown6.org": {
+4 -2
View File
@@ -96,6 +96,8 @@ class MaigretSite:
# URL protocol (http/https)
protocol = ''
# Protection types detected on this site (e.g. ["tls_fingerprint", "ip_reputation"])
protection: List[str] = []
def __init__(self, name, information):
self.name = name
@@ -462,9 +464,9 @@ class MaigretDatabase:
"tags": self._tags,
}
json_data = json.dumps(db_data, indent=4)
json_data = json.dumps(db_data, indent=4, ensure_ascii=False)
with open(filename, "w") as f:
with open(filename, "w", encoding="utf-8") as f:
f.write(json_data)
return self
Generated
+35 -2
View File
@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand.
# This file is automatically @generated by Poetry 2.3.3 and should not be changed by hand.
[[package]]
name = "about-time"
@@ -724,6 +724,7 @@ files = [
{file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"},
{file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"},
]
markers = {dev = "platform_system == \"Windows\" or sys_platform == \"win32\""}
[[package]]
name = "coverage"
@@ -940,6 +941,38 @@ webencodings = "*"
doc = ["sphinx", "sphinx_rtd_theme"]
test = ["flake8", "isort", "pytest"]
[[package]]
name = "curl-cffi"
version = "0.14.0"
description = "libcurl ffi bindings for Python, with impersonation support."
optional = false
python-versions = ">=3.10"
groups = ["main"]
files = [
{file = "curl_cffi-0.14.0-cp39-abi3-macosx_14_0_arm64.whl", hash = "sha256:e35e89c6a69872f9749d6d5fda642ed4fc159619329e99d577d0104c9aad5893"},
{file = "curl_cffi-0.14.0-cp39-abi3-macosx_15_0_x86_64.whl", hash = "sha256:5945478cd28ad7dfb5c54473bcfb6743ee1d66554d57951fdf8fc0e7d8cf4e45"},
{file = "curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c42e8fa3c667db9ccd2e696ee47adcd3cd5b0838d7282f3fc45f6c0ef3cfdfa7"},
{file = "curl_cffi-0.14.0-cp39-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:060fe2c99c41d3cb7f894de318ddf4b0301b08dca70453d769bd4e74b36b8483"},
{file = "curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b158c41a25388690dd0d40b5bc38d1e0f512135f17fdb8029868cbc1993d2e5b"},
{file = "curl_cffi-0.14.0-cp39-abi3-manylinux_2_28_i686.whl", hash = "sha256:1439fbef3500fb723333c826adf0efb0e2e5065a703fb5eccce637a2250db34a"},
{file = "curl_cffi-0.14.0-cp39-abi3-manylinux_2_34_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e7176f2c2d22b542e3cf261072a81deb018cfa7688930f95dddef215caddb469"},
{file = "curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:03f21ade2d72978c2bb8670e9b6de5260e2755092b02d94b70b906813662998d"},
{file = "curl_cffi-0.14.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:58ebf02de64ee5c95613209ddacb014c2d2f86298d7080c0a1c12ed876ee0690"},
{file = "curl_cffi-0.14.0-cp39-abi3-win_amd64.whl", hash = "sha256:6e503f9a103f6ae7acfb3890c843b53ec030785a22ae7682a22cc43afb94123e"},
{file = "curl_cffi-0.14.0-cp39-abi3-win_arm64.whl", hash = "sha256:2eed50a969201605c863c4c31269dfc3e0da52916086ac54553cfa353022425c"},
{file = "curl_cffi-0.14.0.tar.gz", hash = "sha256:5ffbc82e59f05008ec08ea432f0e535418823cda44178ee518906a54f27a5f0f"},
]
[package.dependencies]
certifi = ">=2024.2.2"
cffi = ">=1.12.0"
[package.extras]
build = ["cibuildwheel", "wheel"]
dev = ["charset_normalizer (>=3.3.2,<4.0)", "coverage (>=6.4.1,<7.0)", "cryptography (>=42.0.5,<43.0)", "httpx (==0.23.1)", "mypy (>=1.9.0,<2.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "ruff (>=0.3.5,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=14.0)"]
extra = ["lxml_html_clean", "markdownify (>=1.1.0)", "readability-lxml (>=0.8.1)"]
test = ["charset_normalizer (>=3.3.2,<4.0)", "cryptography (>=42.0.5,<43.0)", "fastapi (>=0.110.0,<1.0)", "httpx (==0.23.1)", "proxy.py (>=2.4.3,<3.0)", "pytest (>=8.1.1,<9.0)", "pytest-asyncio (>=0.23.6,<1.0)", "pytest-trio (>=0.8.0,<1.0)", "python-multipart (>=0.0.9,<1.0)", "trio (>=0.25.0,<1.0)", "trustme (>=1.1.0,<2.0)", "typing_extensions", "uvicorn (>=0.29.0,<1.0)", "websockets (>=14.0)"]
[[package]]
name = "decorator"
version = "5.1.1"
@@ -3695,4 +3728,4 @@ propcache = ">=0.2.1"
[metadata]
lock-version = "2.1"
python-versions = "^3.10"
content-hash = "242a27f15596139e173e679ad98ef816e2c8ffe49566f74a953996affc0c9579"
content-hash = "edc8e2596a73519ad93c4a1e8c235f95d8070c5ecde2b2d7aba16f58be9e6e0a"
+1
View File
@@ -74,6 +74,7 @@ cloudscraper = "^1.2.71"
flask = {extras = ["async"], version = "^3.1.1"}
asgiref = "^3.9.1"
platformdirs = "^4.3.8"
curl-cffi = ">=0.14,<1.0"
[tool.poetry.group.dev.dependencies]
+655 -656
View File
File diff suppressed because it is too large Load Diff