diff --git a/maigret/checking.py b/maigret/checking.py index 1088349..8ad0554 100644 --- a/maigret/checking.py +++ b/maigret/checking.py @@ -31,7 +31,7 @@ from .executors import AsyncioQueueGeneratorExecutor from .result import MaigretCheckResult, MaigretCheckStatus from .sites import MaigretDatabase, MaigretSite from .types import QueryOptions, QueryResultWrapper -from .utils import ascii_data_display, get_random_user_agent +from .utils import ascii_data_display, get_random_user_agent, is_plausible_username SUPPORTED_IDS = ( @@ -639,7 +639,6 @@ def process_site_result( html_text, status_code, check_error = response - # TODO: add elapsed request time counting response_time = None if logger.level == logging.DEBUG: @@ -673,7 +672,6 @@ def process_site_result( f"Failed activation {method} for site {site.name}: {str(e)}", exc_info=True, ) - # TODO: temporary check error site_name = site.pretty_name # presense flags @@ -1296,7 +1294,6 @@ async def site_self_check( ) # don't disable entries with other ids types - # TODO: make normal checking if site.name not in results_dict: logger.info(results_dict) changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)") @@ -1525,13 +1522,23 @@ def parse_usernames(extracted_ids_data, logger) -> Dict: new_usernames = {} for k, v in extracted_ids_data.items(): if "username" in k and not "usernames" in k: - new_usernames[v] = "username" + if is_plausible_username(v): + new_usernames[v] = "username" + else: + logger.debug( + f"Rejected non-username value extracted under key {k!r}: {v!r}" + ) elif "usernames" in k: try: tree = ast.literal_eval(v) if isinstance(tree, list): for n in tree: - new_usernames[n] = "username" + if is_plausible_username(n): + new_usernames[n] = "username" + else: + logger.debug( + f"Rejected non-username item from list under key {k!r}: {n!r}" + ) except Exception as e: logger.warning(e) if k in SUPPORTED_IDS: diff --git a/maigret/errors.py b/maigret/errors.py index 986a59e..5d55ff5 100644 --- a/maigret/errors.py +++ b/maigret/errors.py @@ -77,7 +77,6 @@ ERRORS_TYPES = { 'Connecting failure': 'Try to decrease number of parallel connections (e.g. -n 10)', } -# TODO: checking for reason ERRORS_REASONS = { 'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)', } diff --git a/maigret/maigret.py b/maigret/maigret.py index eeb4b81..1c75cc2 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -55,7 +55,7 @@ from .report import ( from .sites import MaigretDatabase from .submit import Submitter from .types import QueryResultWrapper -from .utils import get_dict_ascii_tree +from .utils import get_dict_ascii_tree, is_plausible_username from .settings import Settings from .permutator import Permute @@ -85,13 +85,23 @@ def extract_ids_from_page(url, logger, timeout=5) -> dict: for k, v in info.items(): # TODO: merge with the same functionality in checking module if 'username' in k and not 'usernames' in k: - results[v] = 'username' + if is_plausible_username(v): + results[v] = 'username' + else: + logger.debug( + f"Rejected non-username value extracted under key {k!r}: {v!r}" + ) elif 'usernames' in k: try: tree = ast.literal_eval(v) if isinstance(tree, list): for n in tree: - results[n] = 'username' + if is_plausible_username(n): + results[n] = 'username' + else: + logger.debug( + f"Rejected non-username item from list under key {k!r}: {n!r}" + ) except Exception as e: logger.warning(e) if k in SUPPORTED_IDS: diff --git a/maigret/report.py b/maigret/report.py index c79eef5..fd2f656 100644 --- a/maigret/report.py +++ b/maigret/report.py @@ -516,7 +516,6 @@ def generate_report_context(username_results: list): tag = pycountry.countries.search_fuzzy(v)[ 0 ].alpha_2.lower() # type: ignore[attr-defined] - # TODO: move countries to another struct tags[tag] = tags.get(tag, 0) + 1 except Exception as e: logging.debug( @@ -568,7 +567,6 @@ def generate_report_context(username_results: list): return { "username": first_username, - # TODO: return brief list "brief": brief, "results": username_results, "first_seen": first_seen, diff --git a/maigret/resources/db_meta.json b/maigret/resources/db_meta.json index e479cfb..08dbad9 100644 --- a/maigret/resources/db_meta.json +++ b/maigret/resources/db_meta.json @@ -1,6 +1,6 @@ { "version": 1, - "updated_at": "2026-05-16T10:45:38Z", + "updated_at": "2026-05-16T15:49:11Z", "sites_count": 3155, "min_maigret_version": "0.6.1", "data_sha256": "df2ab3dbc96bdcdc8aa4e9da485df75ce6c3274814080f00a35e89f7f43783e1", diff --git a/maigret/utils.py b/maigret/utils.py index c2a2160..6e5d311 100644 --- a/maigret/utils.py +++ b/maigret/utils.py @@ -127,3 +127,29 @@ def get_match_ratio(base_strs: list): def generate_random_username(): return ''.join(random.choices(string.ascii_lowercase, k=10)) + + +def is_plausible_username(value: Any) -> bool: + """Reject obviously non-username strings extracted from sites' identity data. + + Extractor schemes occasionally populate fields named like ``*_username`` + with URLs (e.g. ``instagram_username`` -> ``https://instagram.com/X``) or + emails (e.g. ``your_username`` -> ``user@example.com``). Feeding such a + value back into a site URL template produces broken requests on every + subsequent site, which manifests as a cascade of false errors and the + "wrong username" symptom in #1403. + """ + if not isinstance(value, str): + return False + s = value.strip() + if not s: + return False + if "://" in s or s.startswith(("http://", "https://", "www.", "//")): + return False + if "/" in s: + return False + if any(c.isspace() for c in s): + return False + if "@" in s and "." in s: + return False + return True diff --git a/tests/test_checking.py b/tests/test_checking.py index dcbc650..9db3ef7 100644 --- a/tests/test_checking.py +++ b/tests/test_checking.py @@ -146,6 +146,33 @@ def test_parse_usernames_malformed_list(): assert logger.warning.called +def test_parse_usernames_rejects_url_value(): + """Regression for #1403: extractors sometimes return a URL under a *_username + key; that URL must not be fed back as a candidate username.""" + logger = Mock() + result = parse_usernames( + {"instagram_username": "https://instagram.com/zuck"}, logger + ) + assert result == {} + + +def test_parse_usernames_rejects_email_value(): + """Regression for #1403: e.g. socid_extractor's 'your_username' returns an + email under a key matching the username heuristic.""" + logger = Mock() + result = parse_usernames({"your_username": "alice@example.com"}, logger) + assert result == {} + + +def test_parse_usernames_filters_urls_inside_list(): + logger = Mock() + result = parse_usernames( + {"other_usernames": "['alice', 'https://example.com/bob']"}, logger + ) + # 'alice' should survive; the URL should be dropped. + assert result == {"alice": "username"} + + def test_parse_usernames_supported_id(): logger = Mock() # "telegram" is in SUPPORTED_IDS per socid_extractor diff --git a/tests/test_utils.py b/tests/test_utils.py index d9219b9..cae36f5 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -10,6 +10,7 @@ from maigret.utils import ( URLMatcher, get_dict_ascii_tree, get_match_ratio, + is_plausible_username, ) @@ -144,3 +145,52 @@ def test_get_match_ratio(): fun = get_match_ratio(["test", "maigret", "username"]) assert fun("test") == 1 + + +# Regression tests for #1403 — Gravatar URL leaking into next-iteration username. +# Extractor schemes occasionally store URLs/emails under '*_username' keys; without +# validation these were fed back into the search loop and produced cascades of false +# errors. See maigret/utils.py::is_plausible_username. +def test_is_plausible_username_accepts_bare_usernames(): + assert is_plausible_username("alice") + assert is_plausible_username("alice.bob") + assert is_plausible_username("alice_bob-42") + assert is_plausible_username("Алиса") + + +def test_is_plausible_username_rejects_urls(): + assert not is_plausible_username("https://gravatar.com/alice") + assert not is_plausible_username("http://example.com/user/alice") + assert not is_plausible_username("//example.com/alice") + assert not is_plausible_username("www.facebook.com/zuck") + + +def test_is_plausible_username_accepts_http_prefixed_handles(): + """Don't over-match: bare names that just happen to start with 'http' or 'www' + are legitimate (e.g. the httpie CLI maintainer's handle).""" + assert is_plausible_username("httpie") + assert is_plausible_username("http_user") + assert is_plausible_username("wwwsuperstar") + + +def test_is_plausible_username_rejects_path_like(): + assert not is_plausible_username("user/alice") + assert not is_plausible_username("alice/") + + +def test_is_plausible_username_rejects_emails(): + assert not is_plausible_username("alice@example.com") + assert not is_plausible_username("user@maigret.io") + + +def test_is_plausible_username_rejects_whitespace_and_empty(): + assert not is_plausible_username("") + assert not is_plausible_username(" ") + assert not is_plausible_username("alice bob") + assert not is_plausible_username("alice\nbob") + + +def test_is_plausible_username_rejects_non_strings(): + assert not is_plausible_username(None) + assert not is_plausible_username(42) + assert not is_plausible_username(["alice"]) diff --git a/utils/import_sites.py b/utils/import_sites.py index 6b047b3..c67074b 100755 --- a/utils/import_sites.py +++ b/utils/import_sites.py @@ -165,7 +165,6 @@ if __name__ == '__main__': sites = {site.name: site for site in sites_subset} engines = db.engines - # TODO: usernames extractors ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john'] if args.username: ok_usernames = [args.username] + ok_usernames