fix(checking): reject URLs and emails extracted as usernames (#2673)

Closes #1403
2026-05-17 03:45:36 +00:00 · 2026-05-16 17:52:00 +02:00
parent 3e77c13743
commit 51a5169987
9 changed files with 130 additions and 14 deletions
@@ -31,7 +31,7 @@ from .executors import AsyncioQueueGeneratorExecutor
 from .result import MaigretCheckResult, MaigretCheckStatus
 from .sites import MaigretDatabase, MaigretSite
 from .types import QueryOptions, QueryResultWrapper
-from .utils import ascii_data_display, get_random_user_agent
+from .utils import ascii_data_display, get_random_user_agent, is_plausible_username
 SUPPORTED_IDS = (
@@ -639,7 +639,6 @@ def process_site_result(
    html_text, status_code, check_error = response
    # TODO: add elapsed request time counting
    response_time = None
    if logger.level == logging.DEBUG:
@@ -673,7 +672,6 @@ def process_site_result(
                f"Failed activation {method} for site {site.name}: {str(e)}",
                exc_info=True,
            )
        # TODO: temporary check error
    site_name = site.pretty_name
    # presence flags
@@ -1296,7 +1294,6 @@ async def site_self_check(
                )
                # don't disable entries with other ids types
                # TODO: make normal checking
                if site.name not in results_dict:
                    logger.info(results_dict)
                    changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)")
@@ -1525,13 +1522,23 @@ def parse_usernames(extracted_ids_data, logger) -> Dict:
    new_usernames = {}
    for k, v in extracted_ids_data.items():
        if "username" in k and not "usernames" in k:
            if is_plausible_username(v):
                new_usernames[v] = "username"
            else:
                logger.debug(
                    f"Rejected non-username value extracted under key {k!r}: {v!r}"
                )
        elif "usernames" in k:
            try:
                tree = ast.literal_eval(v)
                if isinstance(tree, list):
                    for n in tree:
                        if is_plausible_username(n):
                            new_usernames[n] = "username"
                        else:
                            logger.debug(
                                f"Rejected non-username item from list under key {k!r}: {n!r}"
                            )
            except Exception as e:
                logger.warning(e)
        if k in SUPPORTED_IDS:
@@ -77,7 +77,6 @@ ERRORS_TYPES = {
    'Connecting failure': 'Try to decrease number of parallel connections (e.g. -n 10)',
 }
 # TODO: checking for reason
 ERRORS_REASONS = {
    'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)',
 }
@@ -55,7 +55,7 @@ from .report import (
 from .sites import MaigretDatabase
 from .submit import Submitter
 from .types import QueryResultWrapper
-from .utils import get_dict_ascii_tree
+from .utils import get_dict_ascii_tree, is_plausible_username
 from .settings import Settings
 from .permutator import Permute
@@ -85,13 +85,23 @@ def extract_ids_from_page(url, logger, timeout=5) -> dict:
        for k, v in info.items():
            # TODO: merge with the same functionality in checking module
            if 'username' in k and not 'usernames' in k:
                if is_plausible_username(v):
                    results[v] = 'username'
                else:
                    logger.debug(
                        f"Rejected non-username value extracted under key {k!r}: {v!r}"
                    )
            elif 'usernames' in k:
                try:
                    tree = ast.literal_eval(v)
                    if isinstance(tree, list):
                        for n in tree:
                            if is_plausible_username(n):
                                results[n] = 'username'
                            else:
                                logger.debug(
                                    f"Rejected non-username item from list under key {k!r}: {n!r}"
                                )
                except Exception as e:
                    logger.warning(e)
            if k in SUPPORTED_IDS:
@@ -516,7 +516,6 @@ def generate_report_context(username_results: list):
                                tag = pycountry.countries.search_fuzzy(v)[
                                    0
                                ].alpha_2.lower()  # type: ignore[attr-defined]
                            # TODO: move countries to another struct
                            tags[tag] = tags.get(tag, 0) + 1
                        except Exception as e:
                            logging.debug(
@@ -568,7 +567,6 @@ def generate_report_context(username_results: list):
    return {
        "username": first_username,
        # TODO: return brief list
        "brief": brief,
        "results": username_results,
        "first_seen": first_seen,
@@ -1,6 +1,6 @@
 {
    "version": 1,
-    "updated_at": "2026-05-16T10:45:38Z",
+    "updated_at": "2026-05-16T15:49:11Z",
    "sites_count": 3155,
    "min_maigret_version": "0.6.1",
    "data_sha256": "df2ab3dbc96bdcdc8aa4e9da485df75ce6c3274814080f00a35e89f7f43783e1",
@@ -127,3 +127,29 @@ def get_match_ratio(base_strs: list):
 def generate_random_username():
    """Return a throwaway username made of ten random lowercase ASCII letters."""
    letters = random.choices(string.ascii_lowercase, k=10)
    return ''.join(letters)
 def is_plausible_username(value: Any) -> bool:
    """Reject obviously non-username strings extracted from sites' identity data.
    Extractor schemes occasionally populate fields named like ``*_username``
    with URLs (e.g. ``instagram_username`` -> ``https://instagram.com/X``) or
    emails (e.g. ``your_username`` -> ``user@example.com``). Feeding such a
    value back into a site URL template produces broken requests on every
    subsequent site, which manifests as a cascade of false errors and the
    "wrong username" symptom in #1403.
    The check runs on the value with surrounding whitespace stripped and
    returns ``False`` for: non-strings, empty strings, anything containing
    ``/`` (URLs with a scheme, protocol-relative URLs, paths), values that
    start with ``www.`` in any letter case, values containing whitespace, and
    values containing both ``@`` and ``.`` (email-like). Everything else
    returns ``True``.
    """
    if not isinstance(value, str):
        return False
    s = value.strip()
    if not s:
        return False
    # A single "/" test covers "scheme://host", "//host" and path-like values,
    # since each of those necessarily contains a slash.
    if "/" in s:
        return False
    # Scheme-less web addresses. Compared case-insensitively so that
    # "WWW.example.com" is rejected just like "www.example.com"; the trailing
    # dot keeps bare handles such as "wwwsuperstar" valid.
    if s.lower().startswith("www."):
        return False
    if any(c.isspace() for c in s):
        return False
    # Email-like: requires both "@" and "." so a value with only one of them
    # (e.g. "alice.bob") is still accepted.
    if "@" in s and "." in s:
        return False
    return True
@@ -146,6 +146,33 @@ def test_parse_usernames_malformed_list():
    assert logger.warning.called
 def test_parse_usernames_rejects_url_value():
    """Regression for #1403: a URL returned by an extractor under a *_username
    key must not come back as a candidate username."""
    extracted = {"instagram_username": "https://instagram.com/zuck"}
    assert parse_usernames(extracted, Mock()) == {}
 def test_parse_usernames_rejects_email_value():
    """Regression for #1403: an email stored under a key that matches the
    username heuristic (e.g. socid_extractor's 'your_username') is dropped."""
    extracted = {"your_username": "alice@example.com"}
    assert parse_usernames(extracted, Mock()) == {}
 def test_parse_usernames_filters_urls_inside_list():
    """A list-valued '*usernames' entry keeps bare handles and drops URLs."""
    extracted = {"other_usernames": "['alice', 'https://example.com/bob']"}
    parsed = parse_usernames(extracted, Mock())
    # Only the bare handle is expected to remain; the URL item is discarded.
    assert parsed == {"alice": "username"}
 def test_parse_usernames_supported_id():
    logger = Mock()
    # "telegram" is in SUPPORTED_IDS per socid_extractor
@@ -10,6 +10,7 @@ from maigret.utils import (
    URLMatcher,
    get_dict_ascii_tree,
    get_match_ratio,
    is_plausible_username,
 )
@@ -144,3 +145,52 @@ def test_get_match_ratio():
    fun = get_match_ratio(["test", "maigret", "username"])
    assert fun("test") == 1
 # Regression tests for #1403 — Gravatar URL leaking into next-iteration username.
 # Extractor schemes occasionally store URLs/emails under '*_username' keys; without
 # validation these were fed back into the search loop and produced cascades of false
 # errors. See maigret/utils.py::is_plausible_username.
 def test_is_plausible_username_accepts_bare_usernames():
    """Plain handles, including dotted, punctuated and non-ASCII ones, pass."""
    for handle in ("alice", "alice.bob", "alice_bob-42", "Алиса"):
        assert is_plausible_username(handle)
 def test_is_plausible_username_rejects_urls():
    """Scheme, protocol-relative and 'www.'-prefixed URLs are all refused."""
    url_like_values = (
        "https://gravatar.com/alice",
        "http://example.com/user/alice",
        "//example.com/alice",
        "www.facebook.com/zuck",
    )
    for candidate in url_like_values:
        assert not is_plausible_username(candidate)
 def test_is_plausible_username_accepts_http_prefixed_handles():
    """Guard against over-matching: a bare name that merely begins with 'http'
    or 'www' is a legitimate handle (e.g. the httpie CLI maintainer's)."""
    for handle in ("httpie", "http_user", "wwwsuperstar"):
        assert is_plausible_username(handle)
 def test_is_plausible_username_rejects_path_like():
    """Values containing a slash are treated as paths, not usernames."""
    for candidate in ("user/alice", "alice/"):
        assert not is_plausible_username(candidate)
 def test_is_plausible_username_rejects_emails():
    """Email addresses are refused."""
    for address in ("alice@example.com", "user@maigret.io"):
        assert not is_plausible_username(address)
 def test_is_plausible_username_rejects_whitespace_and_empty():
    """Empty, blank and whitespace-containing strings are refused."""
    for candidate in ("", "   ", "alice bob", "alice\nbob"):
        assert not is_plausible_username(candidate)
 def test_is_plausible_username_rejects_non_strings():
    """Anything that is not a ``str`` is refused, whatever it contains."""
    for non_string in (None, 42, ["alice"]):
        assert not is_plausible_username(non_string)
@@ -165,7 +165,6 @@ if __name__ == '__main__':
    sites = {site.name: site for site in sites_subset}
    engines = db.engines
    # TODO: usernames extractors
    ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john']
    if args.username:
        ok_usernames = [args.username] + ok_usernames