Add site protection tracking system, fix broken site checks (Instagram, StackOverflow, LeetCode, Boosty, LiveLib), preserve unicode in data.json (#2452)

* Add site protection tracking system, fix broken site checks (Instagram, StackOverflow, LeetCode, Boosty, LiveLib), preserve unicode in data.json

* Update poetry.lock by running poetry lock

Agent-Logs-Url: https://github.com/soxoj/maigret/sessions/14333f41-67d5-4e28-a782-9730b31fc667

Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
This commit is contained in:
Soxoj
2026-04-02 20:28:20 +02:00
committed by GitHub
parent 9e8a701c54
commit 5d502eaef6
8 changed files with 892 additions and 705 deletions
+82 -2
View File
@@ -213,6 +213,76 @@ class AiodnsDomainResolver(CheckerBase):
return text, status, error
try:
from curl_cffi.requests import AsyncSession as CurlCffiAsyncSession
CURL_CFFI_AVAILABLE = True
except ImportError:
CURL_CFFI_AVAILABLE = False
class CurlCffiChecker(CheckerBase):
    """Checker using curl_cffi to emulate browser TLS fingerprint and bypass WAF.

    Usage mirrors the flow visible in this class: construct it, call
    ``prepare()`` with the request parameters, then await ``check()``.
    """

    # libcurl's CURLE_OPERATION_TIMEDOUT. curl_cffi reports transfer
    # timeouts through its own request errors (carrying this ``code``)
    # rather than through ``asyncio.TimeoutError``.
    _CURL_TIMEOUT_CODE = 28

    def __init__(self, *args, **kwargs):
        """Store the logger and the browser profile to impersonate.

        Keyword Args:
            logger: object with a ``debug`` method; a ``Mock`` is used when
                the key is absent.
            browser_emulate: value passed to curl_cffi as ``impersonate``
                (defaults to ``'chrome'``).
        """
        self.logger = kwargs.get('logger', Mock())
        self.browser_emulate = kwargs.get('browser_emulate', 'chrome')
        # Request parameters; populated by prepare().
        self.url = None
        self.headers = None
        self.allow_redirects = True
        self.timeout = 0
        self.method = 'get'
        self.payload = None

    def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
        """Remember the parameters of the request that check() will send.

        A falsy ``timeout`` makes check() fall back to 10. ``payload`` is
        only sent for POST requests, as a JSON body.
        """
        self.url = url
        self.headers = headers
        self.allow_redirects = allow_redirects
        self.timeout = timeout
        self.method = method
        self.payload = payload
        return None

    async def close(self):
        """Nothing to release: check() opens and closes its own session."""
        pass

    async def check(self) -> Tuple[str, int, Optional[CheckError]]:
        """Send the prepared request and return ``(text, status, error)``.

        Returns:
            On a completed request: the response text, its status code and
            ``None`` (or a "Connection lost" error when the status is 0).
            On failure: ``(None, 0, CheckError)`` where the error is
            "Request timeout", "Interrupted" or "Unexpected".
        """
        # Normalise once; a missing method is treated as GET, like any
        # method other than POST/HEAD.
        method = (self.method or 'get').lower()

        try:
            async with CurlCffiAsyncSession() as session:
                request_kwargs = {
                    'url': self.url,
                    'headers': self.headers,
                    'allow_redirects': self.allow_redirects,
                    # 0/None means "not configured": use a 10 fallback.
                    'timeout': self.timeout if self.timeout else 10,
                    'impersonate': self.browser_emulate,
                }

                if method == 'post':
                    if self.payload:
                        request_kwargs['json'] = self.payload
                    send = session.post
                elif method == 'head':
                    send = session.head
                else:
                    send = session.get

                response = await send(**request_kwargs)

                status_code = response.status_code
                decoded_content = response.text
                self.logger.debug(decoded_content)

                error = CheckError("Connection lost") if status_code == 0 else None
                return decoded_content, status_code, error
        except asyncio.TimeoutError as e:
            return None, 0, CheckError("Request timeout", str(e))
        except KeyboardInterrupt:
            return None, 0, CheckError("Interrupted")
        except Exception as e:
            self.logger.debug(e, exc_info=True)
            # curl-level timeouts arrive here as curl_cffi request errors;
            # classify them as timeouts instead of "Unexpected".
            if getattr(e, 'code', None) == self._CURL_TIMEOUT_CODE:
                return None, 0, CheckError("Request timeout", str(e))
            return None, 0, CheckError("Unexpected", str(e))
class CheckerMock:
def __init__(self, *args, **kwargs):
pass
@@ -469,8 +539,18 @@ def make_site_result(
# workaround to prevent slash errors
url = re.sub("(?<!:)/+", "/", url)
# always clearweb_checker for now
checker = options["checkers"][site.protocol]
# Select checker: use curl_cffi for sites requiring TLS impersonation
needs_impersonation = 'tls_fingerprint' in site.protection
if needs_impersonation and CURL_CFFI_AVAILABLE:
checker = CurlCffiChecker(logger=logger, browser_emulate='chrome')
elif needs_impersonation and not CURL_CFFI_AVAILABLE:
logger.warning(
f"Site {site.name} requires TLS impersonation (curl_cffi) but it's not installed. "
"Install with: pip install curl_cffi"
)
checker = options["checkers"][site.protocol]
else:
checker = options["checkers"][site.protocol]
# site check is disabled
if site.disabled and not options['forced']:
+90 -43
View File
@@ -68,23 +68,29 @@
"usernameUnclaimed": "noonewouldeverusethis777"
},
"Instagram": {
"disabled": true,
"tags": [
"photo",
"social"
],
"errors": {
"Login • Instagram": "Login required"
},
"checkType": "message",
"presenseStrs": [
"<div id=\"splash-screen\">"
"\"routePath\":\"\\/{username}"
],
"absenceStrs": [
"\"routePath\":null"
],
"errors": {
"Login • Instagram": "Login required",
"Just a moment": "Cloudflare challenge"
},
"alexaRank": 4,
"urlMain": "https://www.instagram.com/",
"url": "https://www.instagram.com/{username}",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
"url": "https://www.instagram.com/{username}/",
"usernameClaimed": "cristiano",
"usernameUnclaimed": "noonewouldeverusethis77777",
"protection": [
"tls_fingerprint"
]
},
"Twitter": {
"tags": [
@@ -95,7 +101,7 @@
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"x-guest-token": "2037668354144538994"
"x-guest-token": "2039637579922866279"
},
"errors": {
"Bad guest token": "x-guest-token update required"
@@ -288,7 +294,7 @@
"method": "vimeo"
},
"headers": {
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzQ2NTM5MDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZDM3N2QyY2EtYTEyMC00NWRlLThkYjAtMGUzYWZlNWQ0NGRlIn0.5ZfqU66p6wQtFNg5-7Syrmu3mXTOzQ4Tju97eaw3Nbo"
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzUxMjM0MDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZDY4YjViMGMtYTE3OC00ZDdhLWIyM2QtMDg5Y2MwZjAwOGEyIn0.0bGwlqckn4J07em2-nEX10OfW1JAmi54QCrPtm8Qn6A"
},
"urlProbe": "https://api.vimeo.com/users/{username}?fields=name%2Cgender%2Cbio%2Curi%2Clink%2Cbackground_video%2Clocation_details%2Cpictures%2Cverified%2Cmetadata.public_videos.total%2Cavailable_for_hire%2Ccan_work_remotely%2Cmetadata.connections.videos.total%2Cmetadata.connections.albums.total%2Cmetadata.connections.followers.total%2Cmetadata.connections.following.total%2Cmetadata.public_videos.total%2Cmetadata.connections.vimeo_experts.is_enrolled%2Ctotal_collection_count%2Ccreated_time%2Cprofile_preferences%2Cmembership%2Cclients%2Cskills%2Cproject_types%2Crates%2Ccategories%2Cis_expert%2Cprofile_discovery%2Cwebsites%2Ccontact_emails&fetch_user_profile=1",
"checkType": "status_code",
@@ -1079,18 +1085,18 @@
},
"StackOverflow": {
"similarSearch": true,
"absenceStrs": [
"no-search-results"
],
"presenseStrs": [
"user-info",
" user-details"
],
"url": "https://stackoverflow.com/users/filter?search={username}",
"urlProbe": "https://api.stackexchange.com/2.3/users?order=desc&sort=name&inname={username}&site=stackoverflow",
"urlMain": "https://stackoverflow.com",
"checkType": "message",
"presenseStrs": [
"\"items\":[{"
],
"absenceStrs": [
"\"items\":[]"
],
"usernameClaimed": "maigret",
"usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "message",
"tags": [
"coding"
],
@@ -1651,7 +1657,10 @@
"urlMain": "https://www.kickstarter.com",
"url": "https://www.kickstarter.com/profile/{username}",
"usernameClaimed": "zhovner",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
"protection": [
"tls_fingerprint"
]
},
"Change.org": {
"tags": [
@@ -2128,7 +2137,10 @@
"urlMain": "https://www.npmjs.com/",
"url": "https://www.npmjs.com/~{username}",
"usernameClaimed": "kennethsweezy",
"usernameUnclaimed": "noonewould"
"usernameUnclaimed": "noonewould",
"protection": [
"tls_fingerprint"
]
},
"NPM-Package": {
"tags": [
@@ -2139,7 +2151,10 @@
"urlMain": "https://www.npmjs.com/",
"url": "https://www.npmjs.com/package/{username}",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
"protection": [
"tls_fingerprint"
]
},
"Allods": {
"urlSubpath": "/forums",
@@ -2883,6 +2898,9 @@
"alexaRank": 932,
"tags": [
"design"
],
"protection": [
"ip_reputation"
]
},
"forum.pkp.sfu.ca": {
@@ -3013,7 +3031,10 @@
"urlMain": "https://codepen.io/",
"url": "https://codepen.io/{username}",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
"protection": [
"tls_fingerprint"
]
},
"Rottentomatoes": {
"tags": [
@@ -5353,7 +5374,10 @@
"urlMain": "https://letterboxd.com/",
"url": "https://letterboxd.com/{username}",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
"protection": [
"tls_fingerprint"
]
},
"MyAnimeList": {
"tags": [
@@ -6057,16 +6081,29 @@
"disabled": true
},
"LeetCode": {
"url": "https://leetcode.com/u/{username}/",
"urlProbe": "https://leetcode.com/graphql/",
"urlMain": "https://leetcode.com/",
"checkType": "message",
"requestMethod": "POST",
"requestPayload": {
"query": "{{ matchedUser(username: \"{username}\") {{ username }} }}"
},
"headers": {
"Content-Type": "application/json"
},
"presenseStrs": [
"\"username\":"
],
"absenceStrs": [
"\"matchedUser\":null"
],
"usernameClaimed": "soxoj",
"usernameUnclaimed": "noonewouldeverusethis7",
"tags": [
"coding"
],
"disabled": true,
"checkType": "status_code",
"alexaRank": 3061,
"urlMain": "https://leetcode.com/",
"url": "https://leetcode.com/{username}",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
"alexaRank": 3061
},
"Teletype": {
"tags": [
@@ -7045,21 +7082,22 @@
"usernameUnclaimed": "noonewouldeverusethis7"
},
"Boosty": {
"url": "https://boosty.to/{username}",
"urlProbe": "https://api.boosty.to/v1/blog/{username}",
"urlMain": "https://boosty.to",
"checkType": "message",
"presenseStrs": [
"\"id\":"
],
"absenceStrs": [
"blog_not_found"
],
"usernameClaimed": "soxoj",
"usernameUnclaimed": "noonewouldeverusethis7",
"tags": [
"ru"
],
"checkType": "message",
"absenceStrs": [
"<title></title>"
],
"presenseStrs": [
"Boosty </title>"
],
"alexaRank": 5155,
"urlMain": "https://boosty.to",
"url": "https://boosty.to/{username}",
"usernameClaimed": "adam",
"usernameUnclaimed": "noonewouldeverusethis7"
"alexaRank": 5155
},
"Soup": {
"tags": [
@@ -12334,7 +12372,10 @@
"url": "https://www.picuki.com/profile/{username}",
"source": "Instagram",
"usernameClaimed": "adam",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
"protection": [
"js_challenge"
]
},
"1x": {
"tags": [
@@ -15431,6 +15472,9 @@
"source": "Instagram",
"tags": [
"photo"
],
"protection": [
"ip_reputation"
]
},
"forum.spyderco.com": {
@@ -22069,6 +22113,9 @@
"source": "Instagram",
"tags": [
"photo"
],
"protection": [
"ip_reputation"
]
},
"crown6.org": {
+4 -2
View File
@@ -96,6 +96,8 @@ class MaigretSite:
# URL protocol (http/https)
protocol = ''
# Protection types detected on this site (e.g. ["tls_fingerprint", "ddos_guard"])
protection: List[str] = []
def __init__(self, name, information):
self.name = name
@@ -462,9 +464,9 @@ class MaigretDatabase:
"tags": self._tags,
}
json_data = json.dumps(db_data, indent=4)
json_data = json.dumps(db_data, indent=4, ensure_ascii=False)
with open(filename, "w") as f:
with open(filename, "w", encoding="utf-8") as f:
f.write(json_data)
return self