mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-15 19:05:43 +00:00
Add site protection tracking system, fix broken site checks (Instagra… (#2452)
* Add site protection tracking system, fix broken site checks (Instagram, StackOverflow, LeetCode, Boosty, LiveLib), preserve unicode in data.json
* Update poetry.lock by running poetry lock

Agent-Logs-Url: https://github.com/soxoj/maigret/sessions/14333f41-67d5-4e28-a782-9730b31fc667
Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
This commit is contained in:
+82
-2
@@ -213,6 +213,76 @@ class AiodnsDomainResolver(CheckerBase):
|
||||
return text, status, error
|
||||
|
||||
|
||||
try:
|
||||
from curl_cffi.requests import AsyncSession as CurlCffiAsyncSession
|
||||
|
||||
CURL_CFFI_AVAILABLE = True
|
||||
except ImportError:
|
||||
CURL_CFFI_AVAILABLE = False
|
||||
|
||||
|
||||
class CurlCffiChecker(CheckerBase):
    """HTTP checker backed by curl_cffi.

    Requests are sent with a browser impersonation profile so that the TLS
    fingerprint resembles a real browser, which helps to get past WAFs.
    """

    def __init__(self, *args, **kwargs):
        """Remember the logger and impersonation target, reset request state.

        Only the ``logger`` (default: a ``Mock``) and ``browser_emulate``
        (default: ``'chrome'``) keyword arguments are read; positional
        arguments are accepted but not used.
        """
        self.logger = kwargs.get('logger', Mock())
        self.browser_emulate = kwargs.get('browser_emulate', 'chrome')

        # Per-request parameters; overwritten by prepare().
        self.url = None
        self.headers = None
        self.allow_redirects = True
        self.timeout = 0
        self.method = 'get'
        self.payload = None

    def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
        """Store the parameters of the request that check() will perform."""
        self.url = url
        self.headers = headers
        self.allow_redirects = allow_redirects
        self.timeout = timeout
        self.method = method
        self.payload = payload
        return None

    async def close(self):
        """Do nothing: this checker keeps no session between checks."""
        pass

    async def check(self) -> Tuple[str, int, Optional[CheckError]]:
        """Perform the prepared request and report its outcome.

        Returns a ``(text, status_code, error)`` tuple. On success ``text``
        is the response body and ``error`` is ``None`` unless the status
        code is 0. On any failure the tuple is ``(None, 0, CheckError)``.
        """
        try:
            async with CurlCffiAsyncSession() as session:
                request_args = dict(
                    url=self.url,
                    headers=self.headers,
                    allow_redirects=self.allow_redirects,
                    # A falsy timeout (0 / None) falls back to 10 seconds.
                    timeout=self.timeout or 10,
                    impersonate=self.browser_emulate,
                )

                verb = self.method.lower()
                if verb == 'post':
                    # The payload is only ever sent with POST, as a JSON body.
                    if self.payload:
                        request_args['json'] = self.payload
                    response = await session.post(**request_args)
                elif verb == 'head':
                    response = await session.head(**request_args)
                else:
                    # Every other method value is served by a plain GET.
                    response = await session.get(**request_args)

                status_code = response.status_code
                body = response.text

                self.logger.debug(body)

                if status_code == 0:
                    error = CheckError("Connection lost")
                else:
                    error = None
                return body, status_code, error

        except asyncio.TimeoutError as exc:
            return None, 0, CheckError("Request timeout", str(exc))
        except KeyboardInterrupt:
            return None, 0, CheckError("Interrupted")
        except Exception as exc:
            self.logger.debug(exc, exc_info=True)
            return None, 0, CheckError("Unexpected", str(exc))
|
||||
|
||||
|
||||
class CheckerMock:
|
||||
def __init__(self, *args, **kwargs):
|
||||
pass
|
||||
@@ -469,8 +539,18 @@ def make_site_result(
|
||||
# workaround to prevent slash errors
|
||||
url = re.sub("(?<!:)/+", "/", url)
|
||||
|
||||
# always clearweb_checker for now
|
||||
checker = options["checkers"][site.protocol]
|
||||
# Select checker: use curl_cffi for sites requiring TLS impersonation
|
||||
needs_impersonation = 'tls_fingerprint' in site.protection
|
||||
if needs_impersonation and CURL_CFFI_AVAILABLE:
|
||||
checker = CurlCffiChecker(logger=logger, browser_emulate='chrome')
|
||||
elif needs_impersonation and not CURL_CFFI_AVAILABLE:
|
||||
logger.warning(
|
||||
f"Site {site.name} requires TLS impersonation (curl_cffi) but it's not installed. "
|
||||
"Install with: pip install curl_cffi"
|
||||
)
|
||||
checker = options["checkers"][site.protocol]
|
||||
else:
|
||||
checker = options["checkers"][site.protocol]
|
||||
|
||||
# site check is disabled
|
||||
if site.disabled and not options['forced']:
|
||||
|
||||
+90
-43
@@ -68,23 +68,29 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis777"
|
||||
},
|
||||
"Instagram": {
|
||||
"disabled": true,
|
||||
"tags": [
|
||||
"photo",
|
||||
"social"
|
||||
],
|
||||
"errors": {
|
||||
"Login • Instagram": "Login required"
|
||||
},
|
||||
"checkType": "message",
|
||||
"presenseStrs": [
|
||||
"<div id=\"splash-screen\">"
|
||||
"\"routePath\":\"\\/{username}"
|
||||
],
|
||||
"absenceStrs": [
|
||||
"\"routePath\":null"
|
||||
],
|
||||
"errors": {
|
||||
"Login • Instagram": "Login required",
|
||||
"Just a moment": "Cloudflare challenge"
|
||||
},
|
||||
"alexaRank": 4,
|
||||
"urlMain": "https://www.instagram.com/",
|
||||
"url": "https://www.instagram.com/{username}",
|
||||
"usernameClaimed": "blue",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
"url": "https://www.instagram.com/{username}/",
|
||||
"usernameClaimed": "cristiano",
|
||||
"usernameUnclaimed": "noonewouldeverusethis77777",
|
||||
"protection": [
|
||||
"tls_fingerprint"
|
||||
]
|
||||
},
|
||||
"Twitter": {
|
||||
"tags": [
|
||||
@@ -95,7 +101,7 @@
|
||||
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
|
||||
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
|
||||
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
|
||||
"x-guest-token": "2037668354144538994"
|
||||
"x-guest-token": "2039637579922866279"
|
||||
},
|
||||
"errors": {
|
||||
"Bad guest token": "x-guest-token update required"
|
||||
@@ -288,7 +294,7 @@
|
||||
"method": "vimeo"
|
||||
},
|
||||
"headers": {
|
||||
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzQ2NTM5MDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZDM3N2QyY2EtYTEyMC00NWRlLThkYjAtMGUzYWZlNWQ0NGRlIn0.5ZfqU66p6wQtFNg5-7Syrmu3mXTOzQ4Tju97eaw3Nbo"
|
||||
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzUxMjM0MDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZDY4YjViMGMtYTE3OC00ZDdhLWIyM2QtMDg5Y2MwZjAwOGEyIn0.0bGwlqckn4J07em2-nEX10OfW1JAmi54QCrPtm8Qn6A"
|
||||
},
|
||||
"urlProbe": "https://api.vimeo.com/users/{username}?fields=name%2Cgender%2Cbio%2Curi%2Clink%2Cbackground_video%2Clocation_details%2Cpictures%2Cverified%2Cmetadata.public_videos.total%2Cavailable_for_hire%2Ccan_work_remotely%2Cmetadata.connections.videos.total%2Cmetadata.connections.albums.total%2Cmetadata.connections.followers.total%2Cmetadata.connections.following.total%2Cmetadata.public_videos.total%2Cmetadata.connections.vimeo_experts.is_enrolled%2Ctotal_collection_count%2Ccreated_time%2Cprofile_preferences%2Cmembership%2Cclients%2Cskills%2Cproject_types%2Crates%2Ccategories%2Cis_expert%2Cprofile_discovery%2Cwebsites%2Ccontact_emails&fetch_user_profile=1",
|
||||
"checkType": "status_code",
|
||||
@@ -1079,18 +1085,18 @@
|
||||
},
|
||||
"StackOverflow": {
|
||||
"similarSearch": true,
|
||||
"absenceStrs": [
|
||||
"no-search-results"
|
||||
],
|
||||
"presenseStrs": [
|
||||
"user-info",
|
||||
" user-details"
|
||||
],
|
||||
"url": "https://stackoverflow.com/users/filter?search={username}",
|
||||
"urlProbe": "https://api.stackexchange.com/2.3/users?order=desc&sort=name&inname={username}&site=stackoverflow",
|
||||
"urlMain": "https://stackoverflow.com",
|
||||
"checkType": "message",
|
||||
"presenseStrs": [
|
||||
"\"items\":[{"
|
||||
],
|
||||
"absenceStrs": [
|
||||
"\"items\":[]"
|
||||
],
|
||||
"usernameClaimed": "maigret",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"checkType": "message",
|
||||
"tags": [
|
||||
"coding"
|
||||
],
|
||||
@@ -1651,7 +1657,10 @@
|
||||
"urlMain": "https://www.kickstarter.com",
|
||||
"url": "https://www.kickstarter.com/profile/{username}",
|
||||
"usernameClaimed": "zhovner",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"protection": [
|
||||
"tls_fingerprint"
|
||||
]
|
||||
},
|
||||
"Change.org": {
|
||||
"tags": [
|
||||
@@ -2128,7 +2137,10 @@
|
||||
"urlMain": "https://www.npmjs.com/",
|
||||
"url": "https://www.npmjs.com/~{username}",
|
||||
"usernameClaimed": "kennethsweezy",
|
||||
"usernameUnclaimed": "noonewould"
|
||||
"usernameUnclaimed": "noonewould",
|
||||
"protection": [
|
||||
"tls_fingerprint"
|
||||
]
|
||||
},
|
||||
"NPM-Package": {
|
||||
"tags": [
|
||||
@@ -2139,7 +2151,10 @@
|
||||
"urlMain": "https://www.npmjs.com/",
|
||||
"url": "https://www.npmjs.com/package/{username}",
|
||||
"usernameClaimed": "blue",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"protection": [
|
||||
"tls_fingerprint"
|
||||
]
|
||||
},
|
||||
"Allods": {
|
||||
"urlSubpath": "/forums",
|
||||
@@ -2883,6 +2898,9 @@
|
||||
"alexaRank": 932,
|
||||
"tags": [
|
||||
"design"
|
||||
],
|
||||
"protection": [
|
||||
"ip_reputation"
|
||||
]
|
||||
},
|
||||
"forum.pkp.sfu.ca": {
|
||||
@@ -3013,7 +3031,10 @@
|
||||
"urlMain": "https://codepen.io/",
|
||||
"url": "https://codepen.io/{username}",
|
||||
"usernameClaimed": "blue",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"protection": [
|
||||
"tls_fingerprint"
|
||||
]
|
||||
},
|
||||
"Rottentomatoes": {
|
||||
"tags": [
|
||||
@@ -5353,7 +5374,10 @@
|
||||
"urlMain": "https://letterboxd.com/",
|
||||
"url": "https://letterboxd.com/{username}",
|
||||
"usernameClaimed": "blue",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"protection": [
|
||||
"tls_fingerprint"
|
||||
]
|
||||
},
|
||||
"MyAnimeList": {
|
||||
"tags": [
|
||||
@@ -6057,16 +6081,29 @@
|
||||
"disabled": true
|
||||
},
|
||||
"LeetCode": {
|
||||
"url": "https://leetcode.com/u/{username}/",
|
||||
"urlProbe": "https://leetcode.com/graphql/",
|
||||
"urlMain": "https://leetcode.com/",
|
||||
"checkType": "message",
|
||||
"requestMethod": "POST",
|
||||
"requestPayload": {
|
||||
"query": "{{ matchedUser(username: \"{username}\") {{ username }} }}"
|
||||
},
|
||||
"headers": {
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
"presenseStrs": [
|
||||
"\"username\":"
|
||||
],
|
||||
"absenceStrs": [
|
||||
"\"matchedUser\":null"
|
||||
],
|
||||
"usernameClaimed": "soxoj",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"tags": [
|
||||
"coding"
|
||||
],
|
||||
"disabled": true,
|
||||
"checkType": "status_code",
|
||||
"alexaRank": 3061,
|
||||
"urlMain": "https://leetcode.com/",
|
||||
"url": "https://leetcode.com/{username}",
|
||||
"usernameClaimed": "blue",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
"alexaRank": 3061
|
||||
},
|
||||
"Teletype": {
|
||||
"tags": [
|
||||
@@ -7045,21 +7082,22 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
"Boosty": {
|
||||
"url": "https://boosty.to/{username}",
|
||||
"urlProbe": "https://api.boosty.to/v1/blog/{username}",
|
||||
"urlMain": "https://boosty.to",
|
||||
"checkType": "message",
|
||||
"presenseStrs": [
|
||||
"\"id\":"
|
||||
],
|
||||
"absenceStrs": [
|
||||
"blog_not_found"
|
||||
],
|
||||
"usernameClaimed": "soxoj",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"tags": [
|
||||
"ru"
|
||||
],
|
||||
"checkType": "message",
|
||||
"absenceStrs": [
|
||||
"<title></title>"
|
||||
],
|
||||
"presenseStrs": [
|
||||
"Boosty </title>"
|
||||
],
|
||||
"alexaRank": 5155,
|
||||
"urlMain": "https://boosty.to",
|
||||
"url": "https://boosty.to/{username}",
|
||||
"usernameClaimed": "adam",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
"alexaRank": 5155
|
||||
},
|
||||
"Soup": {
|
||||
"tags": [
|
||||
@@ -12334,7 +12372,10 @@
|
||||
"url": "https://www.picuki.com/profile/{username}",
|
||||
"source": "Instagram",
|
||||
"usernameClaimed": "adam",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
"protection": [
|
||||
"js_challenge"
|
||||
]
|
||||
},
|
||||
"1x": {
|
||||
"tags": [
|
||||
@@ -15431,6 +15472,9 @@
|
||||
"source": "Instagram",
|
||||
"tags": [
|
||||
"photo"
|
||||
],
|
||||
"protection": [
|
||||
"ip_reputation"
|
||||
]
|
||||
},
|
||||
"forum.spyderco.com": {
|
||||
@@ -22069,6 +22113,9 @@
|
||||
"source": "Instagram",
|
||||
"tags": [
|
||||
"photo"
|
||||
],
|
||||
"protection": [
|
||||
"ip_reputation"
|
||||
]
|
||||
},
|
||||
"crown6.org": {
|
||||
|
||||
+4
-2
@@ -96,6 +96,8 @@ class MaigretSite:
|
||||
|
||||
# URL protocol (http/https)
|
||||
protocol = ''
|
||||
# Protection types detected on this site (e.g. ["tls_fingerprint", "ddos_guard"])
|
||||
protection: List[str] = []
|
||||
|
||||
def __init__(self, name, information):
|
||||
self.name = name
|
||||
@@ -462,9 +464,9 @@ class MaigretDatabase:
|
||||
"tags": self._tags,
|
||||
}
|
||||
|
||||
json_data = json.dumps(db_data, indent=4)
|
||||
json_data = json.dumps(db_data, indent=4, ensure_ascii=False)
|
||||
|
||||
with open(filename, "w") as f:
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
f.write(json_data)
|
||||
|
||||
return self
|
||||
|
||||
Reference in New Issue
Block a user