From f83b73bf89c83f0923af207d707028f03767cbe3 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:44:09 +0200 Subject: [PATCH] Fix crash on `-a --self-check` by adding exception handling to site check coroutines (#2466) * Initial plan * Fix crash on -a --self-check by adding exception handling in site_self_check and self_check Wrap the body of site_self_check in try/except to catch unexpected errors and always return a valid changes dict. Also add a safety-net try/except in self_check around awaiting individual site check futures so that a single site failure doesn't crash the entire self-check process. Agent-Logs-Url: https://github.com/soxoj/maigret/sessions/5e27d620-5cbb-43d2-a9f9-ecb53a29904d Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com> * Restore @pytest.mark.slow on test_maigret_results Agent-Logs-Url: https://github.com/soxoj/maigret/sessions/5e27d620-5cbb-43d2-a9f9-ecb53a29904d Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com> * Document --self-check error resilience, --auto-disable, and --diagnose in docs/ Update command-line-options.rst with expanded --self-check description and new --auto-disable and --diagnose entries. Add a "Database self-check" section to features.rst explaining error-resilient behaviour and usage examples. Update usage-examples.rst to reference --auto-disable. 
Agent-Logs-Url: https://github.com/soxoj/maigret/sessions/af1f0f09-9112-4902-8475-e81d235ff3ed Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com> --- docs/source/command-line-options.rst | 25 ++- docs/source/features.rst | 29 ++++ docs/source/usage-examples.rst | 2 +- maigret/checking.py | 247 +++++++++++++++------------ tests/test_maigret.py | 38 ++++- 5 files changed, 222 insertions(+), 119 deletions(-) diff --git a/docs/source/command-line-options.rst b/docs/source/command-line-options.rst index 619cfe3..e7fe06f 100644 --- a/docs/source/command-line-options.rst +++ b/docs/source/command-line-options.rst @@ -133,12 +133,25 @@ Other operations modes ``--version`` - Display version information and dependencies. -``--self-check`` - Do self-checking for sites and database and disable -non-working ones **for current search session** by default. It’s useful -for testing new internet connection (it depends on provider/hosting on -which sites there will be censorship stub or captcha display). After -checking Maigret asks if you want to save updates, answering y/Y will -rewrite the local database. +``--self-check`` - Do self-checking for sites and database. Each site is +tested by looking up its known-claimed and known-unclaimed usernames and +verifying that the results match expectations. Individual site failures +(network errors, unexpected exceptions, etc.) are caught and logged +without stopping the overall process, so the check always runs to +completion. After checking, Maigret reports a summary of issues found. +If any sites were disabled (see ``--auto-disable``), Maigret asks if you +want to save updates; answering y/Y will rewrite the local database. 
+ +``--auto-disable`` - Used with ``--self-check``: automatically disable +sites that fail checks (incorrect detection of claimed/unclaimed +usernames, connection errors, or unexpected exceptions). Without this +flag, ``--self-check`` only **reports** issues without modifying the +database. + +``--diagnose`` - Used with ``--self-check``: print detailed diagnosis +information for each failing site, including the check type, the list +of issues found, and recommendations (e.g. suggesting a different +``checkType``). ``--submit URL`` - Do an automatic analysis of the given account URL or site main page URL to determine the site engine and methods to check diff --git a/docs/source/features.rst b/docs/source/features.rst index f06f86e..00e3c45 100644 --- a/docs/source/features.rst +++ b/docs/source/features.rst @@ -170,6 +170,35 @@ Maigret will do retries of the requests with temporary errors got (connection fa One attempt by default, can be changed with option ``--retries N``. +Database self-check +------------------- + +Maigret includes a self-check mode (``--self-check``) that validates every site +in the database by looking up its known-claimed and known-unclaimed usernames +and verifying that the detection results match expectations. + +The self-check is **error-resilient**: if an individual site check raises an +unexpected exception (e.g. a network error or a parsing failure), the error is +caught, logged, and recorded as an issue — the remaining sites continue to be +checked without interruption. This means the process always runs to completion, +even when checking hundreds of sites with ``-a --self-check``. + +Use ``--auto-disable`` together with ``--self-check`` to automatically disable +sites that fail checks. Without it, issues are only reported. Use ``--diagnose`` +to print detailed per-site diagnosis including the check type, specific issues, +and recommendations. + +.. 
code-block:: console + + # Report-only mode (no changes to the database) + maigret --self-check + + # Automatically disable failing sites and save updates + maigret -a --self-check --auto-disable + + # Show detailed diagnosis for each failing site + maigret -a --self-check --diagnose + Archives and mirrors checking ----------------------------- diff --git a/docs/source/usage-examples.rst b/docs/source/usage-examples.rst index 45ea4ec..d56c291 100644 --- a/docs/source/usage-examples.rst +++ b/docs/source/usage-examples.rst @@ -33,7 +33,7 @@ Use Cases If you experience many false positives, you can do the following: - Install the last development version of Maigret from GitHub - - Run Maigret with ``--self-check`` flag and agree on disabling of problematic sites + - Run Maigret with ``--self-check --auto-disable`` flags and agree on disabling of problematic sites 3. Search for accounts with username ``machine42`` and generate HTML and PDF reports. diff --git a/maigret/checking.py b/maigret/checking.py index f469af5..c1e6994 100644 --- a/maigret/checking.py +++ b/maigret/checking.py @@ -967,129 +967,143 @@ async def site_self_check( "recommendations": [], } - check_data = [ - (site.username_claimed, MaigretCheckStatus.CLAIMED), - (site.username_unclaimed, MaigretCheckStatus.AVAILABLE), - ] + try: + check_data = [ + (site.username_claimed, MaigretCheckStatus.CLAIMED), + (site.username_unclaimed, MaigretCheckStatus.AVAILABLE), + ] - logger.info(f"Checking {site.name}...") + logger.info(f"Checking {site.name}...") - results_cache = {} + results_cache = {} - for username, status in check_data: - async with semaphore: - results_dict = await maigret( - username=username, - site_dict={site.name: site}, - logger=logger, - timeout=30, - id_type=site.type, - forced=True, - no_progressbar=True, - retries=1, - proxy=proxy, - tor_proxy=tor_proxy, - i2p_proxy=i2p_proxy, - cookies=cookies, - ) - - # don't disable entries with other ids types - # TODO: make normal checking - if 
site.name not in results_dict: - logger.info(results_dict) - changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)") - if auto_disable: - changes["disabled"] = True - continue - - logger.debug(results_dict) - - result = results_dict[site.name]["status"] - results_cache[username] = results_dict[site.name] - - if result.error and 'Cannot connect to host' in result.error.desc: - changes["issues"].append("Cannot connect to host") - if auto_disable: - changes["disabled"] = True - - site_status = result.status - - if site_status != status: - if site_status == MaigretCheckStatus.UNKNOWN: - msgs = site.absence_strs - etype = site.check_type - error_msg = f"Error checking {username}: {result.context}" - changes["issues"].append(error_msg) - logger.warning( - f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}" + for username, status in check_data: + async with semaphore: + results_dict = await maigret( + username=username, + site_dict={site.name: site}, + logger=logger, + timeout=30, + id_type=site.type, + forced=True, + no_progressbar=True, + retries=1, + proxy=proxy, + tor_proxy=tor_proxy, + i2p_proxy=i2p_proxy, + cookies=cookies, ) - # don't disable sites after the error - # meaning that the site could be available, but returned error for the check - # e.g. 
many sites protected by cloudflare and available in general - if skip_errors: - pass - # don't disable in case of available username - elif status == MaigretCheckStatus.CLAIMED and auto_disable: - changes["disabled"] = True - elif status == MaigretCheckStatus.CLAIMED: - changes["issues"].append(f"Claimed user '{username}' not detected as claimed") - logger.warning( - f"Not found `{username}` in {site.name}, must be claimed" - ) - logger.info(results_dict[site.name]) - if auto_disable: - changes["disabled"] = True - else: - changes["issues"].append(f"Unclaimed user '{username}' detected as claimed") - logger.warning(f"Found `{username}` in {site.name}, must be available") - logger.info(results_dict[site.name]) + + # don't disable entries with other ids types + # TODO: make normal checking + if site.name not in results_dict: + logger.info(results_dict) + changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)") + if auto_disable: + changes["disabled"] = True + continue + + logger.debug(results_dict) + + result = results_dict[site.name]["status"] + results_cache[username] = results_dict[site.name] + + if result.error and 'Cannot connect to host' in result.error.desc: + changes["issues"].append("Cannot connect to host") if auto_disable: changes["disabled"] = True - logger.info(f"Site {site.name} checking is finished") + site_status = result.status - # Generate recommendations based on issues - if changes["issues"] and len(results_cache) == 2: - claimed_result = results_cache.get(site.username_claimed, {}) - unclaimed_result = results_cache.get(site.username_unclaimed, {}) + if site_status != status: + if site_status == MaigretCheckStatus.UNKNOWN: + msgs = site.absence_strs + etype = site.check_type + error_msg = f"Error checking {username}: {result.context}" + changes["issues"].append(error_msg) + logger.warning( + f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}" + ) + # don't disable sites after the error + 
# meaning that the site could be available, but returned error for the check + # e.g. many sites protected by cloudflare and available in general + if skip_errors: + pass + # don't disable in case of available username + elif status == MaigretCheckStatus.CLAIMED and auto_disable: + changes["disabled"] = True + elif status == MaigretCheckStatus.CLAIMED: + changes["issues"].append(f"Claimed user '{username}' not detected as claimed") + logger.warning( + f"Not found `{username}` in {site.name}, must be claimed" + ) + logger.info(results_dict[site.name]) + if auto_disable: + changes["disabled"] = True + else: + changes["issues"].append(f"Unclaimed user '{username}' detected as claimed") + logger.warning(f"Found `{username}` in {site.name}, must be available") + logger.info(results_dict[site.name]) + if auto_disable: + changes["disabled"] = True - claimed_http = claimed_result.get("http_status") - unclaimed_http = unclaimed_result.get("http_status") + logger.info(f"Site {site.name} checking is finished") - if claimed_http and unclaimed_http: - if claimed_http != unclaimed_http and site.check_type != "status_code": - changes["recommendations"].append( - f"Consider checkType: status_code (HTTP {claimed_http} vs {unclaimed_http})" - ) + # Generate recommendations based on issues + if changes["issues"] and len(results_cache) == 2: + claimed_result = results_cache.get(site.username_claimed, {}) + unclaimed_result = results_cache.get(site.username_unclaimed, {}) - # Print diagnosis if requested - if diagnose and changes["issues"]: - print(f"\n--- {site.name} DIAGNOSIS ---") - print(f" Check type: {site.check_type}") - print(" Issues:") - for issue in changes["issues"]: - print(f" - {issue}") - if changes["recommendations"]: - print(" Recommendations:") - for rec in changes["recommendations"]: - print(f" -> {rec}") + claimed_http = claimed_result.get("http_status") + unclaimed_http = unclaimed_result.get("http_status") - # Only modify site if auto_disable is enabled - if 
auto_disable and changes["disabled"] != site.disabled: - site.disabled = changes["disabled"] - logger.info(f"Switching property 'disabled' for {site.name} to {site.disabled}") - db.update_site(site) - if not silent: - action = "Disabled" if site.disabled else "Enabled" - print(f"{action} site {site.name}...") - elif changes["issues"] and not silent and not diagnose: - # Report issues without disabling - print(f"Issues found in {site.name}: {len(changes['issues'])} (not auto-disabled)") + if claimed_http and unclaimed_http: + if claimed_http != unclaimed_http and site.check_type != "status_code": + changes["recommendations"].append( + f"Consider checkType: status_code (HTTP {claimed_http} vs {unclaimed_http})" + ) - # remove service tag "unchecked" - if "unchecked" in site.tags: - site.tags.remove("unchecked") - db.update_site(site) + # Print diagnosis if requested + if diagnose and changes["issues"]: + print(f"\n--- {site.name} DIAGNOSIS ---") + print(f" Check type: {site.check_type}") + print(" Issues:") + for issue in changes["issues"]: + print(f" - {issue}") + if changes["recommendations"]: + print(" Recommendations:") + for rec in changes["recommendations"]: + print(f" -> {rec}") + + # Only modify site if auto_disable is enabled + if auto_disable and changes["disabled"] != site.disabled: + site.disabled = changes["disabled"] + logger.info(f"Switching property 'disabled' for {site.name} to {site.disabled}") + db.update_site(site) + if not silent: + action = "Disabled" if site.disabled else "Enabled" + print(f"{action} site {site.name}...") + elif changes["issues"] and not silent and not diagnose: + # Report issues without disabling + print(f"Issues found in {site.name}: {len(changes['issues'])} (not auto-disabled)") + + # remove service tag "unchecked" + if "unchecked" in site.tags: + site.tags.remove("unchecked") + db.update_site(site) + + except Exception as e: + logger.warning( + f"Self-check of {site.name} failed with unexpected error: {e}", + exc_info=True, 
+ ) + changes["issues"].append(f"Unexpected error: {e}") + if auto_disable and not site.disabled: + changes["disabled"] = True + site.disabled = True + db.update_site(site) + if not silent: + print(f"Disabled site {site.name} (unexpected error)...") return changes @@ -1142,7 +1156,18 @@ async def self_check( if tasks: with alive_bar(len(tasks), title='Self-checking', force_tty=True, disable=no_progressbar) as progress: for site_name, f in tasks: - result = await f + try: + result = await f + except Exception as e: + logger.warning( + f"Self-check task for {site_name} raised unexpected error: {e}", + exc_info=True, + ) + result = { + "disabled": False, + "issues": [f"Unexpected error: {e}"], + "recommendations": [], + } result['site_name'] = site_name all_results.append(result) progress() # Update the progress bar diff --git a/tests/test_maigret.py b/tests/test_maigret.py index 87a3c46..4fd5719 100644 --- a/tests/test_maigret.py +++ b/tests/test_maigret.py @@ -12,7 +12,8 @@ from maigret.maigret import ( extract_ids_from_page, extract_ids_from_results, ) -from maigret.sites import MaigretSite +from maigret.checking import site_self_check +from maigret.sites import MaigretSite, MaigretDatabase from maigret.result import MaigretCheckResult, MaigretCheckStatus from tests.conftest import RESULTS_EXAMPLE @@ -83,6 +84,41 @@ async def test_self_check_progressbar_enabled_by_default(test_db): assert kwargs.get('disable') is False +@pytest.mark.asyncio +async def test_site_self_check_handles_exception(test_db): + """Verify that site_self_check catches unexpected exceptions and returns a valid result.""" + logger = Mock() + sem = asyncio.Semaphore(1) + site = test_db.sites_dict['ValidActive'] + + with patch('maigret.checking.maigret', side_effect=RuntimeError("test crash")): + result = await site_self_check(site, logger, sem, test_db) + + assert isinstance(result, dict) + assert "issues" in result + assert len(result["issues"]) > 0 + assert any("Unexpected error" in issue for 
issue in result["issues"]) + + +@pytest.mark.asyncio +async def test_self_check_handles_task_exception(test_db): + """Verify that self_check continues when individual site checks raise exceptions.""" + logger = Mock() + + with patch('maigret.checking.maigret', side_effect=RuntimeError("test crash")): + result = await self_check( + test_db, test_db.sites_dict, logger, silent=True, + no_progressbar=True, + ) + + assert isinstance(result, dict) + assert 'results' in result + assert len(result['results']) == len(test_db.sites_dict) + for r in result['results']: + assert 'site_name' in r + assert 'issues' in r + + @pytest.mark.slow @pytest.mark.skip(reason="broken, fixme") def test_maigret_results(test_db):