fix(checking): reject URLs and emails extracted as usernames (#2673)

Closes #1403
This commit is contained in:
Soxoj
2026-05-16 17:52:00 +02:00
committed by GitHub
parent 3e77c13743
commit 51a5169987
9 changed files with 130 additions and 14 deletions
+11 -4
View File
@@ -31,7 +31,7 @@ from .executors import AsyncioQueueGeneratorExecutor
from .result import MaigretCheckResult, MaigretCheckStatus
from .sites import MaigretDatabase, MaigretSite
from .types import QueryOptions, QueryResultWrapper
from .utils import ascii_data_display, get_random_user_agent
from .utils import ascii_data_display, get_random_user_agent, is_plausible_username
SUPPORTED_IDS = (
@@ -639,7 +639,6 @@ def process_site_result(
html_text, status_code, check_error = response
# TODO: add elapsed request time counting
response_time = None
if logger.level == logging.DEBUG:
@@ -673,7 +672,6 @@ def process_site_result(
f"Failed activation {method} for site {site.name}: {str(e)}",
exc_info=True,
)
# TODO: temporary check error
site_name = site.pretty_name
# presence flags
@@ -1296,7 +1294,6 @@ async def site_self_check(
)
# don't disable entries with other ids types
# TODO: make normal checking
if site.name not in results_dict:
logger.info(results_dict)
changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)")
@@ -1525,13 +1522,23 @@ def parse_usernames(extracted_ids_data, logger) -> Dict:
new_usernames = {}
for k, v in extracted_ids_data.items():
if "username" in k and not "usernames" in k:
if is_plausible_username(v):
new_usernames[v] = "username"
else:
logger.debug(
f"Rejected non-username value extracted under key {k!r}: {v!r}"
)
elif "usernames" in k:
try:
tree = ast.literal_eval(v)
if isinstance(tree, list):
for n in tree:
if is_plausible_username(n):
new_usernames[n] = "username"
else:
logger.debug(
f"Rejected non-username item from list under key {k!r}: {n!r}"
)
except Exception as e:
logger.warning(e)
if k in SUPPORTED_IDS:
-1
View File
@@ -77,7 +77,6 @@ ERRORS_TYPES = {
'Connecting failure': 'Try to decrease number of parallel connections (e.g. -n 10)',
}
# TODO: checking for reason
ERRORS_REASONS = {
'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)',
}
+11 -1
View File
@@ -55,7 +55,7 @@ from .report import (
from .sites import MaigretDatabase
from .submit import Submitter
from .types import QueryResultWrapper
from .utils import get_dict_ascii_tree
from .utils import get_dict_ascii_tree, is_plausible_username
from .settings import Settings
from .permutator import Permute
@@ -85,13 +85,23 @@ def extract_ids_from_page(url, logger, timeout=5) -> dict:
for k, v in info.items():
# TODO: merge with the same functionality in checking module
if 'username' in k and not 'usernames' in k:
if is_plausible_username(v):
results[v] = 'username'
else:
logger.debug(
f"Rejected non-username value extracted under key {k!r}: {v!r}"
)
elif 'usernames' in k:
try:
tree = ast.literal_eval(v)
if isinstance(tree, list):
for n in tree:
if is_plausible_username(n):
results[n] = 'username'
else:
logger.debug(
f"Rejected non-username item from list under key {k!r}: {n!r}"
)
except Exception as e:
logger.warning(e)
if k in SUPPORTED_IDS:
-2
View File
@@ -516,7 +516,6 @@ def generate_report_context(username_results: list):
tag = pycountry.countries.search_fuzzy(v)[
0
].alpha_2.lower() # type: ignore[attr-defined]
# TODO: move countries to another struct
tags[tag] = tags.get(tag, 0) + 1
except Exception as e:
logging.debug(
@@ -568,7 +567,6 @@ def generate_report_context(username_results: list):
return {
"username": first_username,
# TODO: return brief list
"brief": brief,
"results": username_results,
"first_seen": first_seen,
+1 -1
View File
@@ -1,6 +1,6 @@
{
"version": 1,
"updated_at": "2026-05-16T10:45:38Z",
"updated_at": "2026-05-16T15:49:11Z",
"sites_count": 3155,
"min_maigret_version": "0.6.1",
"data_sha256": "df2ab3dbc96bdcdc8aa4e9da485df75ce6c3274814080f00a35e89f7f43783e1",
+26
View File
@@ -127,3 +127,29 @@ def get_match_ratio(base_strs: list):
def generate_random_username():
    """Return a random throwaway username of 10 lowercase ASCII letters."""
    letters = random.choices(string.ascii_lowercase, k=10)
    return ''.join(letters)
def is_plausible_username(value: Any) -> bool:
    """Heuristic filter for values extracted under ``*_username`` keys.

    Extractor schemes occasionally populate such fields with full URLs
    (e.g. ``instagram_username`` -> ``https://instagram.com/X``) or email
    addresses (``your_username`` -> ``user@example.com``).  Re-using such a
    value in site URL templates breaks every subsequent request — the
    cascade of false errors / "wrong username" symptom of #1403 — so
    anything URL-, path-, or email-shaped is rejected here.

    Returns True only for non-empty strings that plausibly are handles.
    """
    if not isinstance(value, str):
        return False
    candidate = value.strip()
    if not candidate:
        return False
    # URL-shaped: explicit scheme, protocol-relative, or a bare www. prefix.
    if "://" in candidate or candidate.startswith(
        ("http://", "https://", "www.", "//")
    ):
        return False
    # Path fragments and multi-word strings are not handles either.
    if "/" in candidate or any(ch.isspace() for ch in candidate):
        return False
    # Crude email heuristic: both '@' and a dot present.
    return not ("@" in candidate and "." in candidate)
+27
View File
@@ -146,6 +146,33 @@ def test_parse_usernames_malformed_list():
assert logger.warning.called
def test_parse_usernames_rejects_url_value():
    """Regression for #1403: a URL stored under a ``*_username`` key must
    never be fed back as a candidate username."""
    mock_logger = Mock()
    parsed = parse_usernames(
        {"instagram_username": "https://instagram.com/zuck"}, mock_logger
    )
    assert parsed == {}
def test_parse_usernames_rejects_email_value():
    """Regression for #1403: extractor keys like socid_extractor's
    'your_username' can hold an email that matches the username heuristic."""
    mock_logger = Mock()
    parsed = parse_usernames({"your_username": "alice@example.com"}, mock_logger)
    assert parsed == {}
def test_parse_usernames_filters_urls_inside_list():
    """Only plausible entries of a '*_usernames' list survive filtering."""
    mock_logger = Mock()
    parsed = parse_usernames(
        {"other_usernames": "['alice', 'https://example.com/bob']"}, mock_logger
    )
    # The plain handle is kept; the URL entry is dropped.
    assert parsed == {"alice": "username"}
def test_parse_usernames_supported_id():
logger = Mock()
# "telegram" is in SUPPORTED_IDS per socid_extractor
+50
View File
@@ -10,6 +10,7 @@ from maigret.utils import (
URLMatcher,
get_dict_ascii_tree,
get_match_ratio,
is_plausible_username,
)
@@ -144,3 +145,52 @@ def test_get_match_ratio():
fun = get_match_ratio(["test", "maigret", "username"])
assert fun("test") == 1
# Regression tests for #1403 — Gravatar URL leaking into next-iteration username.
# Extractor schemes occasionally store URLs/emails under '*_username' keys; without
# validation these were fed back into the search loop and produced cascades of false
# errors. See maigret/utils.py::is_plausible_username.
def test_is_plausible_username_accepts_bare_usernames():
    """Ordinary handles — including non-ASCII ones — pass the filter."""
    for handle in ("alice", "alice.bob", "alice_bob-42", "Алиса"):
        assert is_plausible_username(handle)
def test_is_plausible_username_rejects_urls():
    """Any URL-shaped value (scheme, protocol-relative, www.) is rejected."""
    for url in (
        "https://gravatar.com/alice",
        "http://example.com/user/alice",
        "//example.com/alice",
        "www.facebook.com/zuck",
    ):
        assert not is_plausible_username(url)
def test_is_plausible_username_accepts_http_prefixed_handles():
    """Don't over-match: bare names merely starting with 'http' or 'www'
    are legitimate (e.g. the httpie CLI maintainer's handle)."""
    for handle in ("httpie", "http_user", "wwwsuperstar"):
        assert is_plausible_username(handle)
def test_is_plausible_username_rejects_path_like():
    """A slash anywhere marks the value as a path fragment, not a handle."""
    for candidate in ("user/alice", "alice/"):
        assert not is_plausible_username(candidate)
def test_is_plausible_username_rejects_emails():
    """Email-shaped values must not be treated as usernames."""
    for address in ("alice@example.com", "user@maigret.io"):
        assert not is_plausible_username(address)
def test_is_plausible_username_rejects_whitespace_and_empty():
    """Empty, blank, and multi-word strings are all implausible handles."""
    for candidate in ("", " ", "alice bob", "alice\nbob"):
        assert not is_plausible_username(candidate)
def test_is_plausible_username_rejects_non_strings():
    """Non-string inputs are rejected outright instead of raising."""
    for candidate in (None, 42, ["alice"]):
        assert not is_plausible_username(candidate)
-1
View File
@@ -165,7 +165,6 @@ if __name__ == '__main__':
sites = {site.name: site for site in sites_subset}
engines = db.engines
# TODO: usernames extractors
ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john']
if args.username:
ok_usernames = [args.username] + ok_usernames