mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-07 06:24:35 +00:00
Improve site-check quality: fix broken site configs, add diagnostic utilities, and make self-check report-only by default with opt-in auto-disable. (#2301)
- Fix VK and TradingView checkType; add Reddit and Microsoft Learn API-style probes where appropriate; adjust or disable entries that are unreliable under anti-bot protection.
- Self-check: stop aggressive auto-disable; default to reporting issues only; add --auto-disable and --diagnose for optional fixes and deeper output.
- Tooling: add utils/site_check.py and utils/check_top_n.py (and related helpers) to inspect and rank site behavior against the top-N list.
- Scope: aligns with fixing top-traffic / high-impact sites and making diagnostics repeatable without silently flipping disabled flags.
This commit is contained in:
+102
-13
@@ -826,9 +826,21 @@ async def site_self_check(
|
||||
i2p_proxy=None,
|
||||
skip_errors=False,
|
||||
cookies=None,
|
||||
auto_disable=False,
|
||||
diagnose=False,
|
||||
):
|
||||
"""
|
||||
Self-check a site configuration.
|
||||
|
||||
Args:
|
||||
auto_disable: If True, automatically disable sites that fail checks.
|
||||
If False (default), only report issues without disabling.
|
||||
diagnose: If True, print detailed diagnosis information.
|
||||
"""
|
||||
changes = {
|
||||
"disabled": False,
|
||||
"issues": [],
|
||||
"recommendations": [],
|
||||
}
|
||||
|
||||
check_data = [
|
||||
@@ -838,6 +850,8 @@ async def site_self_check(
|
||||
|
||||
logger.info(f"Checking {site.name}...")
|
||||
|
||||
results_cache = {}
|
||||
|
||||
for username, status in check_data:
|
||||
async with semaphore:
|
||||
results_dict = await maigret(
|
||||
@@ -859,15 +873,20 @@ async def site_self_check(
|
||||
# TODO: make normal checking
|
||||
if site.name not in results_dict:
|
||||
logger.info(results_dict)
|
||||
changes["disabled"] = True
|
||||
changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)")
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
continue
|
||||
|
||||
logger.debug(results_dict)
|
||||
|
||||
result = results_dict[site.name]["status"]
|
||||
results_cache[username] = results_dict[site.name]
|
||||
|
||||
if result.error and 'Cannot connect to host' in result.error.desc:
|
||||
changes["disabled"] = True
|
||||
changes["issues"].append(f"Cannot connect to host")
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
|
||||
site_status = result.status
|
||||
|
||||
@@ -875,6 +894,8 @@ async def site_self_check(
|
||||
if site_status == MaigretCheckStatus.UNKNOWN:
|
||||
msgs = site.absence_strs
|
||||
etype = site.check_type
|
||||
error_msg = f"Error checking {username}: {result.context}"
|
||||
changes["issues"].append(error_msg)
|
||||
logger.warning(
|
||||
f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}"
|
||||
)
|
||||
@@ -884,28 +905,62 @@ async def site_self_check(
|
||||
if skip_errors:
|
||||
pass
|
||||
# don't disable in case of available username
|
||||
elif status == MaigretCheckStatus.CLAIMED:
|
||||
elif status == MaigretCheckStatus.CLAIMED and auto_disable:
|
||||
changes["disabled"] = True
|
||||
elif status == MaigretCheckStatus.CLAIMED:
|
||||
changes["issues"].append(f"Claimed user '{username}' not detected as claimed")
|
||||
logger.warning(
|
||||
f"Not found `{username}` in {site.name}, must be claimed"
|
||||
)
|
||||
logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
else:
|
||||
changes["issues"].append(f"Unclaimed user '{username}' detected as claimed")
|
||||
logger.warning(f"Found `{username}` in {site.name}, must be available")
|
||||
logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
|
||||
logger.info(f"Site {site.name} checking is finished")
|
||||
|
||||
if changes["disabled"] != site.disabled:
|
||||
# Generate recommendations based on issues
|
||||
if changes["issues"] and len(results_cache) == 2:
|
||||
claimed_result = results_cache.get(site.username_claimed, {})
|
||||
unclaimed_result = results_cache.get(site.username_unclaimed, {})
|
||||
|
||||
claimed_http = claimed_result.get("http_status")
|
||||
unclaimed_http = unclaimed_result.get("http_status")
|
||||
|
||||
if claimed_http and unclaimed_http:
|
||||
if claimed_http != unclaimed_http and site.check_type != "status_code":
|
||||
changes["recommendations"].append(
|
||||
f"Consider checkType: status_code (HTTP {claimed_http} vs {unclaimed_http})"
|
||||
)
|
||||
|
||||
# Print diagnosis if requested
|
||||
if diagnose and changes["issues"]:
|
||||
print(f"\n--- {site.name} DIAGNOSIS ---")
|
||||
print(f" Check type: {site.check_type}")
|
||||
print(f" Issues:")
|
||||
for issue in changes["issues"]:
|
||||
print(f" - {issue}")
|
||||
if changes["recommendations"]:
|
||||
print(f" Recommendations:")
|
||||
for rec in changes["recommendations"]:
|
||||
print(f" -> {rec}")
|
||||
|
||||
# Only modify site if auto_disable is enabled
|
||||
if auto_disable and changes["disabled"] != site.disabled:
|
||||
site.disabled = changes["disabled"]
|
||||
logger.info(f"Switching property 'disabled' for {site.name} to {site.disabled}")
|
||||
db.update_site(site)
|
||||
if not silent:
|
||||
action = "Disabled" if site.disabled else "Enabled"
|
||||
print(f"{action} site {site.name}...")
|
||||
elif changes["issues"] and not silent and not diagnose:
|
||||
# Report issues without disabling
|
||||
print(f"Issues found in {site.name}: {len(changes['issues'])} (not auto-disabled)")
|
||||
|
||||
# remove service tag "unchecked"
|
||||
if "unchecked" in site.tags:
|
||||
@@ -924,10 +979,24 @@ async def self_check(
|
||||
proxy=None,
|
||||
tor_proxy=None,
|
||||
i2p_proxy=None,
|
||||
) -> bool:
|
||||
auto_disable=False,
|
||||
diagnose=False,
|
||||
) -> dict:
|
||||
"""
|
||||
Run self-check on sites.
|
||||
|
||||
Args:
|
||||
auto_disable: If True, automatically disable sites that fail checks.
|
||||
If False (default), only report issues without disabling.
|
||||
diagnose: If True, print detailed diagnosis for each failing site.
|
||||
|
||||
Returns:
|
||||
dict with 'needs_update' bool and 'results' list of check results
|
||||
"""
|
||||
sem = asyncio.Semaphore(max_connections)
|
||||
tasks = []
|
||||
all_sites = site_data
|
||||
all_results = []
|
||||
|
||||
def disabled_count(lst):
|
||||
return len(list(filter(lambda x: x.disabled, lst)))
|
||||
@@ -939,15 +1008,18 @@ async def self_check(
|
||||
|
||||
for _, site in all_sites.items():
|
||||
check_coro = site_self_check(
|
||||
site, logger, sem, db, silent, proxy, tor_proxy, i2p_proxy, skip_errors=True
|
||||
site, logger, sem, db, silent, proxy, tor_proxy, i2p_proxy,
|
||||
skip_errors=True, auto_disable=auto_disable, diagnose=diagnose
|
||||
)
|
||||
future = asyncio.ensure_future(check_coro)
|
||||
tasks.append(future)
|
||||
tasks.append((site.name, future))
|
||||
|
||||
if tasks:
|
||||
with alive_bar(len(tasks), title='Self-checking', force_tty=True) as progress:
|
||||
for f in asyncio.as_completed(tasks):
|
||||
await f
|
||||
for site_name, f in tasks:
|
||||
result = await f
|
||||
result['site_name'] = site_name
|
||||
all_results.append(result)
|
||||
progress() # Update the progress bar
|
||||
|
||||
unchecked_new_count = len(
|
||||
@@ -956,7 +1028,10 @@ async def self_check(
|
||||
disabled_new_count = disabled_count(all_sites.values())
|
||||
total_disabled = disabled_new_count - disabled_old_count
|
||||
|
||||
if total_disabled:
|
||||
# Count issues
|
||||
total_issues = sum(1 for r in all_results if r.get('issues'))
|
||||
|
||||
if auto_disable and total_disabled:
|
||||
if total_disabled >= 0:
|
||||
message = "Disabled"
|
||||
else:
|
||||
@@ -968,11 +1043,25 @@ async def self_check(
|
||||
f"{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. "
|
||||
"Run with `--info` flag to get more information"
|
||||
)
|
||||
elif total_issues and not silent:
|
||||
print(f"\nFound issues in {total_issues} sites (auto-disable is OFF)")
|
||||
print("Use --auto-disable to automatically disable failing sites")
|
||||
print("Use --diagnose to see detailed diagnosis for each site")
|
||||
|
||||
if unchecked_new_count != unchecked_old_count:
|
||||
print(f"Unchecked sites verified: {unchecked_old_count - unchecked_new_count}")
|
||||
|
||||
return total_disabled != 0 or unchecked_new_count != unchecked_old_count
|
||||
needs_update = total_disabled != 0 or unchecked_new_count != unchecked_old_count
|
||||
|
||||
# For backwards compatibility, return bool if auto_disable is True
|
||||
if auto_disable:
|
||||
return needs_update
|
||||
|
||||
return {
|
||||
'needs_update': needs_update,
|
||||
'results': all_results,
|
||||
'total_issues': total_issues,
|
||||
}
|
||||
|
||||
|
||||
def extract_ids_data(html_text, logger, site) -> Dict:
|
||||
|
||||
+23
-2
@@ -316,7 +316,19 @@ def setup_arguments_parser(settings: Settings):
|
||||
"--self-check",
|
||||
action="store_true",
|
||||
default=settings.self_check_enabled,
|
||||
help="Do self check for sites and database and disable non-working ones.",
|
||||
help="Do self check for sites and database. Use --auto-disable to disable failing sites.",
|
||||
)
|
||||
modes_group.add_argument(
|
||||
"--auto-disable",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="With --self-check: automatically disable sites that fail checks.",
|
||||
)
|
||||
modes_group.add_argument(
|
||||
"--diagnose",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="With --self-check: print detailed diagnosis for each failing site.",
|
||||
)
|
||||
modes_group.add_argument(
|
||||
"--stats",
|
||||
@@ -566,7 +578,7 @@ async def main():
|
||||
query_notify.success(
|
||||
f'Maigret sites database self-check started for {len(site_data)} sites...'
|
||||
)
|
||||
is_need_update = await self_check(
|
||||
check_result = await self_check(
|
||||
db,
|
||||
site_data,
|
||||
logger,
|
||||
@@ -574,7 +586,16 @@ async def main():
|
||||
max_connections=args.connections,
|
||||
tor_proxy=args.tor_proxy,
|
||||
i2p_proxy=args.i2p_proxy,
|
||||
auto_disable=args.auto_disable,
|
||||
diagnose=args.diagnose,
|
||||
)
|
||||
|
||||
# Handle both old (bool) and new (dict) return types
|
||||
if isinstance(check_result, dict):
|
||||
is_need_update = check_result.get('needs_update', False)
|
||||
else:
|
||||
is_need_update = check_result
|
||||
|
||||
if is_need_update:
|
||||
if input('Do you want to save changes permanently? [Yn]\n').lower() in (
|
||||
'y',
|
||||
|
||||
+51
-23
@@ -3214,18 +3214,17 @@
|
||||
" <h1>404 Page not found</h1>",
|
||||
"_404-header",
|
||||
"_404-inner-container",
|
||||
" no-nav "
|
||||
" no-nav ",
|
||||
"not found."
|
||||
],
|
||||
"presenseStrs": [
|
||||
"profile-top",
|
||||
"og:title",
|
||||
" style=",
|
||||
"view-profile",
|
||||
" data-username="
|
||||
"\"player_id\":",
|
||||
"\"@id\":\"https://api.chess.com/pub/player/"
|
||||
],
|
||||
"alexaRank": 211,
|
||||
"urlMain": "https://www.chess.com",
|
||||
"url": "https://www.chess.com/member/{username}",
|
||||
"urlProbe": "https://api.chess.com/pub/player/{username}",
|
||||
"usernameClaimed": "sexytwerker69",
|
||||
"usernameUnclaimed": "aublurbrxm",
|
||||
"headers": {
|
||||
@@ -4929,6 +4928,7 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
"Etsy": {
|
||||
"disabled": true,
|
||||
"tags": [
|
||||
"shopping",
|
||||
"us"
|
||||
@@ -7385,11 +7385,18 @@
|
||||
"tags": [
|
||||
"in"
|
||||
],
|
||||
"checkType": "response_url",
|
||||
"checkType": "message",
|
||||
"presenseStrs": [
|
||||
"id=\"profileApp\""
|
||||
],
|
||||
"absenceStrs": [
|
||||
"Guru.com - Page Not Found",
|
||||
"Guru.com - Content Deleted"
|
||||
],
|
||||
"alexaRank": 4420,
|
||||
"urlMain": "https://www.guru.com",
|
||||
"url": "https://www.guru.com/freelancers/{username}",
|
||||
"usernameClaimed": "adam",
|
||||
"usernameClaimed": "longhui-zhao",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
"GuruShots": {
|
||||
@@ -10294,6 +10301,19 @@
|
||||
"usernameClaimed": "blue",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
"MicrosoftLearn": {
|
||||
"tags": [
|
||||
"tech",
|
||||
"us"
|
||||
],
|
||||
"checkType": "status_code",
|
||||
"alexaRank": 21,
|
||||
"urlMain": "https://learn.microsoft.com",
|
||||
"url": "https://learn.microsoft.com/en-us/users/{username}",
|
||||
"urlProbe": "https://learn.microsoft.com/api/profiles/{username}",
|
||||
"usernameClaimed": "blue",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
"Minecraft-statistic": {
|
||||
"tags": [
|
||||
"ru",
|
||||
@@ -12345,7 +12365,8 @@
|
||||
],
|
||||
"alexaRank": 8904,
|
||||
"urlMain": "https://picsart.com/",
|
||||
"url": "https://api.picsart.com/users/show/{username}.json",
|
||||
"url": "https://picsart.com/u/{username}",
|
||||
"urlProbe": "https://api.picsart.com/users/show/{username}.json",
|
||||
"usernameClaimed": "adam",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
@@ -12806,6 +12827,7 @@
|
||||
"tags": [
|
||||
"porn"
|
||||
],
|
||||
"disabled": true,
|
||||
"checkType": "message",
|
||||
"presenseStrs": [
|
||||
"profileInformation"
|
||||
@@ -12817,7 +12839,7 @@
|
||||
"alexaRank": 74,
|
||||
"urlMain": "https://pornhub.com/",
|
||||
"url": "https://pornhub.com/users/{username}",
|
||||
"usernameClaimed": "blue",
|
||||
"usernameClaimed": "verified",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
"PornhubPornstars": {
|
||||
@@ -13640,14 +13662,18 @@
|
||||
],
|
||||
"checkType": "message",
|
||||
"absenceStrs": [
|
||||
"Sorry, nobody on Reddit goes by that name."
|
||||
"Not Found"
|
||||
],
|
||||
"presenseStrs": [
|
||||
"Post karma"
|
||||
"\"name\":"
|
||||
],
|
||||
"headers": {
|
||||
"User-Agent": "maigret/0.4"
|
||||
},
|
||||
"alexaRank": 19,
|
||||
"urlMain": "https://www.reddit.com/",
|
||||
"url": "https://www.reddit.com/user/{username}",
|
||||
"urlProbe": "https://api.reddit.com/user/{username}/about",
|
||||
"usernameClaimed": "blue",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
@@ -16690,13 +16716,7 @@
|
||||
"trading",
|
||||
"us"
|
||||
],
|
||||
"checkType": "message",
|
||||
"presenseStrs": [
|
||||
"tv-profile"
|
||||
],
|
||||
"absenceStrs": [
|
||||
"<title>Page not found \u2014 TradingView</title>"
|
||||
],
|
||||
"checkType": "status_code",
|
||||
"alexaRank": 61,
|
||||
"urlMain": "https://www.tradingview.com/",
|
||||
"url": "https://www.tradingview.com/u/{username}",
|
||||
@@ -17185,6 +17205,7 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
"Udemy": {
|
||||
"disabled": true,
|
||||
"tags": [
|
||||
"in"
|
||||
],
|
||||
@@ -17357,7 +17378,7 @@
|
||||
"tags": [
|
||||
"ru"
|
||||
],
|
||||
"checkType": "response_url",
|
||||
"checkType": "status_code",
|
||||
"regexCheck": "^(?!id\\d)\\w*$",
|
||||
"alexaRank": 27,
|
||||
"urlMain": "https://vk.com/",
|
||||
@@ -17584,7 +17605,7 @@
|
||||
"method": "vimeo"
|
||||
},
|
||||
"headers": {
|
||||
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3MzQxMTc1NDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiNDc4Y2ZhZGUtZjI0Yy00MDVkLTliYWItN2RlNGEzNGM4MzI5In0.guN7Fg8dqq7EYdckrJ-6Rdkj_5MOl6FaC4YUSOceDpU"
|
||||
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzQxOTIxNDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiYzdmMWJkYjAtMGZiMi00M2JiLTg0N2YtMGY5ZGViYTdkOGY0In0._ork2l2kSy1Xn4Pj8WmYvUfAezmXJeXxOZCoHAs5Q2M"
|
||||
},
|
||||
"urlProbe": "https://api.vimeo.com/users/{username}?fields=name%2Cgender%2Cbio%2Curi%2Clink%2Cbackground_video%2Clocation_details%2Cpictures%2Cverified%2Cmetadata.public_videos.total%2Cavailable_for_hire%2Ccan_work_remotely%2Cmetadata.connections.videos.total%2Cmetadata.connections.albums.total%2Cmetadata.connections.followers.total%2Cmetadata.connections.following.total%2Cmetadata.public_videos.total%2Cmetadata.connections.vimeo_experts.is_enrolled%2Ctotal_collection_count%2Ccreated_time%2Cprofile_preferences%2Cmembership%2Cclients%2Cskills%2Cproject_types%2Crates%2Ccategories%2Cis_expert%2Cprofile_discovery%2Cwebsites%2Ccontact_emails&fetch_user_profile=1",
|
||||
"checkType": "status_code",
|
||||
@@ -18189,6 +18210,7 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis77777"
|
||||
},
|
||||
"Wikipedia": {
|
||||
"disabled": true,
|
||||
"tags": [
|
||||
"wiki"
|
||||
],
|
||||
@@ -18198,8 +18220,8 @@
|
||||
"Wikipedia does not have a"
|
||||
],
|
||||
"alexaRank": 12,
|
||||
"urlMain": "https://www.wikipedia.org/",
|
||||
"url": "https://www.wikipedia.org/wiki/User:{username}",
|
||||
"urlMain": "https://en.wikipedia.org/",
|
||||
"url": "https://en.wikipedia.org/wiki/User:{username}",
|
||||
"usernameClaimed": "Hoadlck",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
@@ -18743,6 +18765,7 @@
|
||||
"usernameUnclaimed": "noonewouldeverusethis77777"
|
||||
},
|
||||
"YandexMusic": {
|
||||
"disabled": true,
|
||||
"tags": [
|
||||
"music",
|
||||
"ru"
|
||||
@@ -31073,6 +31096,7 @@
|
||||
"alexaRank": 1513399
|
||||
},
|
||||
"Baidu": {
|
||||
"disabled": true,
|
||||
"absenceStrs": [
|
||||
"error_404_iframe"
|
||||
],
|
||||
@@ -31868,6 +31892,7 @@
|
||||
]
|
||||
},
|
||||
"rblx.trade": {
|
||||
"disabled": true,
|
||||
"absenceStrs": [
|
||||
"isRblxTradeException"
|
||||
],
|
||||
@@ -31960,6 +31985,7 @@
|
||||
]
|
||||
},
|
||||
"giters.com": {
|
||||
"disabled": true,
|
||||
"absenceStrs": [
|
||||
"This page could not be found"
|
||||
],
|
||||
@@ -31978,6 +32004,7 @@
|
||||
]
|
||||
},
|
||||
"githubplus.com": {
|
||||
"disabled": true,
|
||||
"absenceStrs": [
|
||||
"preconnect"
|
||||
],
|
||||
@@ -32166,6 +32193,7 @@
|
||||
]
|
||||
},
|
||||
"Aparat": {
|
||||
"disabled": true,
|
||||
"absenceStrs": [
|
||||
"404 - Page Not Found"
|
||||
],
|
||||
|
||||
Reference in New Issue
Block a user