fix(checking): reject URLs and emails extracted as usernames (#2673)

Closes #1403
This commit is contained in:
Soxoj
2026-05-16 17:52:00 +02:00
committed by GitHub
parent 3e77c13743
commit 51a5169987
9 changed files with 130 additions and 14 deletions
+13 -6
View File
@@ -31,7 +31,7 @@ from .executors import AsyncioQueueGeneratorExecutor
from .result import MaigretCheckResult, MaigretCheckStatus from .result import MaigretCheckResult, MaigretCheckStatus
from .sites import MaigretDatabase, MaigretSite from .sites import MaigretDatabase, MaigretSite
from .types import QueryOptions, QueryResultWrapper from .types import QueryOptions, QueryResultWrapper
from .utils import ascii_data_display, get_random_user_agent from .utils import ascii_data_display, get_random_user_agent, is_plausible_username
SUPPORTED_IDS = ( SUPPORTED_IDS = (
@@ -639,7 +639,6 @@ def process_site_result(
html_text, status_code, check_error = response html_text, status_code, check_error = response
# TODO: add elapsed request time counting
response_time = None response_time = None
if logger.level == logging.DEBUG: if logger.level == logging.DEBUG:
@@ -673,7 +672,6 @@ def process_site_result(
f"Failed activation {method} for site {site.name}: {str(e)}", f"Failed activation {method} for site {site.name}: {str(e)}",
exc_info=True, exc_info=True,
) )
# TODO: temporary check error
site_name = site.pretty_name site_name = site.pretty_name
# presense flags # presense flags
@@ -1296,7 +1294,6 @@ async def site_self_check(
) )
# don't disable entries with other ids types # don't disable entries with other ids types
# TODO: make normal checking
if site.name not in results_dict: if site.name not in results_dict:
logger.info(results_dict) logger.info(results_dict)
changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)") changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)")
@@ -1525,13 +1522,23 @@ def parse_usernames(extracted_ids_data, logger) -> Dict:
new_usernames = {} new_usernames = {}
for k, v in extracted_ids_data.items(): for k, v in extracted_ids_data.items():
if "username" in k and not "usernames" in k: if "username" in k and not "usernames" in k:
new_usernames[v] = "username" if is_plausible_username(v):
new_usernames[v] = "username"
else:
logger.debug(
f"Rejected non-username value extracted under key {k!r}: {v!r}"
)
elif "usernames" in k: elif "usernames" in k:
try: try:
tree = ast.literal_eval(v) tree = ast.literal_eval(v)
if isinstance(tree, list): if isinstance(tree, list):
for n in tree: for n in tree:
new_usernames[n] = "username" if is_plausible_username(n):
new_usernames[n] = "username"
else:
logger.debug(
f"Rejected non-username item from list under key {k!r}: {n!r}"
)
except Exception as e: except Exception as e:
logger.warning(e) logger.warning(e)
if k in SUPPORTED_IDS: if k in SUPPORTED_IDS:
-1
View File
@@ -77,7 +77,6 @@ ERRORS_TYPES = {
'Connecting failure': 'Try to decrease number of parallel connections (e.g. -n 10)', 'Connecting failure': 'Try to decrease number of parallel connections (e.g. -n 10)',
} }
# TODO: checking for reason
ERRORS_REASONS = { ERRORS_REASONS = {
'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)', 'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)',
} }
+13 -3
View File
@@ -55,7 +55,7 @@ from .report import (
from .sites import MaigretDatabase from .sites import MaigretDatabase
from .submit import Submitter from .submit import Submitter
from .types import QueryResultWrapper from .types import QueryResultWrapper
from .utils import get_dict_ascii_tree from .utils import get_dict_ascii_tree, is_plausible_username
from .settings import Settings from .settings import Settings
from .permutator import Permute from .permutator import Permute
@@ -85,13 +85,23 @@ def extract_ids_from_page(url, logger, timeout=5) -> dict:
for k, v in info.items(): for k, v in info.items():
# TODO: merge with the same functionality in checking module # TODO: merge with the same functionality in checking module
if 'username' in k and not 'usernames' in k: if 'username' in k and not 'usernames' in k:
results[v] = 'username' if is_plausible_username(v):
results[v] = 'username'
else:
logger.debug(
f"Rejected non-username value extracted under key {k!r}: {v!r}"
)
elif 'usernames' in k: elif 'usernames' in k:
try: try:
tree = ast.literal_eval(v) tree = ast.literal_eval(v)
if isinstance(tree, list): if isinstance(tree, list):
for n in tree: for n in tree:
results[n] = 'username' if is_plausible_username(n):
results[n] = 'username'
else:
logger.debug(
f"Rejected non-username item from list under key {k!r}: {n!r}"
)
except Exception as e: except Exception as e:
logger.warning(e) logger.warning(e)
if k in SUPPORTED_IDS: if k in SUPPORTED_IDS:
-2
View File
@@ -516,7 +516,6 @@ def generate_report_context(username_results: list):
tag = pycountry.countries.search_fuzzy(v)[ tag = pycountry.countries.search_fuzzy(v)[
0 0
].alpha_2.lower() # type: ignore[attr-defined] ].alpha_2.lower() # type: ignore[attr-defined]
# TODO: move countries to another struct
tags[tag] = tags.get(tag, 0) + 1 tags[tag] = tags.get(tag, 0) + 1
except Exception as e: except Exception as e:
logging.debug( logging.debug(
@@ -568,7 +567,6 @@ def generate_report_context(username_results: list):
return { return {
"username": first_username, "username": first_username,
# TODO: return brief list
"brief": brief, "brief": brief,
"results": username_results, "results": username_results,
"first_seen": first_seen, "first_seen": first_seen,
+1 -1
View File
@@ -1,6 +1,6 @@
{ {
"version": 1, "version": 1,
"updated_at": "2026-05-16T10:45:38Z", "updated_at": "2026-05-16T15:49:11Z",
"sites_count": 3155, "sites_count": 3155,
"min_maigret_version": "0.6.1", "min_maigret_version": "0.6.1",
"data_sha256": "df2ab3dbc96bdcdc8aa4e9da485df75ce6c3274814080f00a35e89f7f43783e1", "data_sha256": "df2ab3dbc96bdcdc8aa4e9da485df75ce6c3274814080f00a35e89f7f43783e1",
+26
View File
@@ -127,3 +127,29 @@ def get_match_ratio(base_strs: list):
def generate_random_username(): def generate_random_username():
return ''.join(random.choices(string.ascii_lowercase, k=10)) return ''.join(random.choices(string.ascii_lowercase, k=10))
def is_plausible_username(value: Any) -> bool:
    """Heuristic filter for values extracted under ``*_username`` keys.

    Extractor schemes occasionally populate such fields with full URLs
    (e.g. ``instagram_username`` -> ``https://instagram.com/X``) or email
    addresses (e.g. ``your_username`` -> ``user@example.com``). Re-using a
    value like that in a site URL template breaks every subsequent request,
    producing the cascade of false errors reported in #1403 — so anything
    that looks like a URL, a path, an email, or contains whitespace is
    rejected.

    Returns True only for strings that plausibly are bare usernames.
    """
    # Non-strings (None, ints, lists, ...) can never be usernames.
    if not isinstance(value, str):
        return False

    candidate = value.strip()
    if not candidate:
        return False

    # URL-ish: explicit scheme, common scheme-less prefixes, or any slash
    # (which also covers path fragments like "user/alice").
    if (
        "://" in candidate
        or candidate.startswith(("http://", "https://", "www.", "//"))
        or "/" in candidate
    ):
        return False

    # Usernames never contain whitespace of any kind.
    for ch in candidate:
        if ch.isspace():
            return False

    # Email-like: "@" together with a dot (e.g. alice@example.com).
    if "@" in candidate and "." in candidate:
        return False

    return True
+27
View File
@@ -146,6 +146,33 @@ def test_parse_usernames_malformed_list():
assert logger.warning.called assert logger.warning.called
def test_parse_usernames_rejects_url_value():
    """Regression for #1403: extractors sometimes return a URL under a *_username
    key; that URL must not be fed back as a candidate username."""
    fake_logger = Mock()
    extracted = {"instagram_username": "https://instagram.com/zuck"}
    parsed = parse_usernames(extracted, fake_logger)
    assert parsed == {}
def test_parse_usernames_rejects_email_value():
    """Regression for #1403: e.g. socid_extractor's 'your_username' returns an
    email under a key matching the username heuristic."""
    fake_logger = Mock()
    extracted = {"your_username": "alice@example.com"}
    assert parse_usernames(extracted, fake_logger) == {}
def test_parse_usernames_filters_urls_inside_list():
    """A stringified list under a *usernames key keeps plausible handles
    and drops URL-shaped entries."""
    fake_logger = Mock()
    payload = {"other_usernames": "['alice', 'https://example.com/bob']"}
    parsed = parse_usernames(payload, fake_logger)
    # 'alice' should survive; the URL should be dropped.
    assert parsed == {"alice": "username"}
def test_parse_usernames_supported_id(): def test_parse_usernames_supported_id():
logger = Mock() logger = Mock()
# "telegram" is in SUPPORTED_IDS per socid_extractor # "telegram" is in SUPPORTED_IDS per socid_extractor
+50
View File
@@ -10,6 +10,7 @@ from maigret.utils import (
URLMatcher, URLMatcher,
get_dict_ascii_tree, get_dict_ascii_tree,
get_match_ratio, get_match_ratio,
is_plausible_username,
) )
@@ -144,3 +145,52 @@ def test_get_match_ratio():
fun = get_match_ratio(["test", "maigret", "username"]) fun = get_match_ratio(["test", "maigret", "username"])
assert fun("test") == 1 assert fun("test") == 1
# Regression tests for #1403 — Gravatar URL leaking into next-iteration username.
# Extractor schemes occasionally store URLs/emails under '*_username' keys; without
# validation these were fed back into the search loop and produced cascades of false
# errors. See maigret/utils.py::is_plausible_username.


def test_is_plausible_username_accepts_bare_usernames():
    """Ordinary handles — including dotted, underscored and non-ASCII ones —
    must pass the plausibility check."""
    for handle in ("alice", "alice.bob", "alice_bob-42", "Алиса"):
        assert is_plausible_username(handle)
def test_is_plausible_username_rejects_urls():
    """URL-shaped values (with or without a scheme) are never usernames."""
    url_like_values = (
        "https://gravatar.com/alice",
        "http://example.com/user/alice",
        "//example.com/alice",
        "www.facebook.com/zuck",
    )
    for url in url_like_values:
        assert not is_plausible_username(url)
def test_is_plausible_username_accepts_http_prefixed_handles():
    """Don't over-match: bare names that just happen to start with 'http' or 'www'
    are legitimate (e.g. the httpie CLI maintainer's handle)."""
    for handle in ("httpie", "http_user", "wwwsuperstar"):
        assert is_plausible_username(handle)
def test_is_plausible_username_rejects_path_like():
    """Any slash makes the value path-like and therefore implausible."""
    for path_like in ("user/alice", "alice/"):
        assert not is_plausible_username(path_like)
def test_is_plausible_username_rejects_emails():
    """Email addresses ('@' plus a dot) must not be treated as usernames."""
    for email in ("alice@example.com", "user@maigret.io"):
        assert not is_plausible_username(email)
def test_is_plausible_username_rejects_whitespace_and_empty():
    """Empty, blank, and whitespace-containing values are all implausible."""
    for bad_value in ("", " ", "alice bob", "alice\nbob"):
        assert not is_plausible_username(bad_value)
def test_is_plausible_username_rejects_non_strings():
    """Non-string values (None, numbers, lists) can never be usernames."""
    for non_string in (None, 42, ["alice"]):
        assert not is_plausible_username(non_string)
-1
View File
@@ -165,7 +165,6 @@ if __name__ == '__main__':
sites = {site.name: site for site in sites_subset} sites = {site.name: site for site in sites_subset}
engines = db.engines engines = db.engines
# TODO: usernames extractors
ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john'] ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john']
if args.username: if args.username:
ok_usernames = [args.username] + ok_usernames ok_usernames = [args.username] + ok_usernames