mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-17 03:45:36 +00:00
fix(checking): reject URLs and emails extracted as usernames (#2673)
Closes #1403
This commit is contained in:
+11
-4
@@ -31,7 +31,7 @@ from .executors import AsyncioQueueGeneratorExecutor
|
|||||||
from .result import MaigretCheckResult, MaigretCheckStatus
|
from .result import MaigretCheckResult, MaigretCheckStatus
|
||||||
from .sites import MaigretDatabase, MaigretSite
|
from .sites import MaigretDatabase, MaigretSite
|
||||||
from .types import QueryOptions, QueryResultWrapper
|
from .types import QueryOptions, QueryResultWrapper
|
||||||
from .utils import ascii_data_display, get_random_user_agent
|
from .utils import ascii_data_display, get_random_user_agent, is_plausible_username
|
||||||
|
|
||||||
|
|
||||||
SUPPORTED_IDS = (
|
SUPPORTED_IDS = (
|
||||||
@@ -639,7 +639,6 @@ def process_site_result(
|
|||||||
|
|
||||||
html_text, status_code, check_error = response
|
html_text, status_code, check_error = response
|
||||||
|
|
||||||
# TODO: add elapsed request time counting
|
|
||||||
response_time = None
|
response_time = None
|
||||||
|
|
||||||
if logger.level == logging.DEBUG:
|
if logger.level == logging.DEBUG:
|
||||||
@@ -673,7 +672,6 @@ def process_site_result(
|
|||||||
f"Failed activation {method} for site {site.name}: {str(e)}",
|
f"Failed activation {method} for site {site.name}: {str(e)}",
|
||||||
exc_info=True,
|
exc_info=True,
|
||||||
)
|
)
|
||||||
# TODO: temporary check error
|
|
||||||
|
|
||||||
site_name = site.pretty_name
|
site_name = site.pretty_name
|
||||||
# presence flags
|
# presence flags
|
||||||
@@ -1296,7 +1294,6 @@ async def site_self_check(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# don't disable entries with other ids types
|
# don't disable entries with other ids types
|
||||||
# TODO: make normal checking
|
|
||||||
if site.name not in results_dict:
|
if site.name not in results_dict:
|
||||||
logger.info(results_dict)
|
logger.info(results_dict)
|
||||||
changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)")
|
changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)")
|
||||||
@@ -1525,13 +1522,23 @@ def parse_usernames(extracted_ids_data, logger) -> Dict:
|
|||||||
new_usernames = {}
|
new_usernames = {}
|
||||||
for k, v in extracted_ids_data.items():
|
for k, v in extracted_ids_data.items():
|
||||||
if "username" in k and not "usernames" in k:
|
if "username" in k and not "usernames" in k:
|
||||||
|
if is_plausible_username(v):
|
||||||
new_usernames[v] = "username"
|
new_usernames[v] = "username"
|
||||||
|
else:
|
||||||
|
logger.debug(
|
||||||
|
f"Rejected non-username value extracted under key {k!r}: {v!r}"
|
||||||
|
)
|
||||||
elif "usernames" in k:
|
elif "usernames" in k:
|
||||||
try:
|
try:
|
||||||
tree = ast.literal_eval(v)
|
tree = ast.literal_eval(v)
|
||||||
if isinstance(tree, list):
|
if isinstance(tree, list):
|
||||||
for n in tree:
|
for n in tree:
|
||||||
|
if is_plausible_username(n):
|
||||||
new_usernames[n] = "username"
|
new_usernames[n] = "username"
|
||||||
|
else:
|
||||||
|
logger.debug(
|
||||||
|
f"Rejected non-username item from list under key {k!r}: {n!r}"
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(e)
|
logger.warning(e)
|
||||||
if k in SUPPORTED_IDS:
|
if k in SUPPORTED_IDS:
|
||||||
|
|||||||
@@ -77,7 +77,6 @@ ERRORS_TYPES = {
|
|||||||
'Connecting failure': 'Try to decrease number of parallel connections (e.g. -n 10)',
|
'Connecting failure': 'Try to decrease number of parallel connections (e.g. -n 10)',
|
||||||
}
|
}
|
||||||
|
|
||||||
# TODO: checking for reason
|
|
||||||
ERRORS_REASONS = {
|
ERRORS_REASONS = {
|
||||||
'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)',
|
'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)',
|
||||||
}
|
}
|
||||||
|
|||||||
+11
-1
@@ -55,7 +55,7 @@ from .report import (
|
|||||||
from .sites import MaigretDatabase
|
from .sites import MaigretDatabase
|
||||||
from .submit import Submitter
|
from .submit import Submitter
|
||||||
from .types import QueryResultWrapper
|
from .types import QueryResultWrapper
|
||||||
from .utils import get_dict_ascii_tree
|
from .utils import get_dict_ascii_tree, is_plausible_username
|
||||||
from .settings import Settings
|
from .settings import Settings
|
||||||
from .permutator import Permute
|
from .permutator import Permute
|
||||||
|
|
||||||
@@ -85,13 +85,23 @@ def extract_ids_from_page(url, logger, timeout=5) -> dict:
|
|||||||
for k, v in info.items():
|
for k, v in info.items():
|
||||||
# TODO: merge with the same functionality in checking module
|
# TODO: merge with the same functionality in checking module
|
||||||
if 'username' in k and not 'usernames' in k:
|
if 'username' in k and not 'usernames' in k:
|
||||||
|
if is_plausible_username(v):
|
||||||
results[v] = 'username'
|
results[v] = 'username'
|
||||||
|
else:
|
||||||
|
logger.debug(
|
||||||
|
f"Rejected non-username value extracted under key {k!r}: {v!r}"
|
||||||
|
)
|
||||||
elif 'usernames' in k:
|
elif 'usernames' in k:
|
||||||
try:
|
try:
|
||||||
tree = ast.literal_eval(v)
|
tree = ast.literal_eval(v)
|
||||||
if isinstance(tree, list):
|
if isinstance(tree, list):
|
||||||
for n in tree:
|
for n in tree:
|
||||||
|
if is_plausible_username(n):
|
||||||
results[n] = 'username'
|
results[n] = 'username'
|
||||||
|
else:
|
||||||
|
logger.debug(
|
||||||
|
f"Rejected non-username item from list under key {k!r}: {n!r}"
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(e)
|
logger.warning(e)
|
||||||
if k in SUPPORTED_IDS:
|
if k in SUPPORTED_IDS:
|
||||||
|
|||||||
@@ -516,7 +516,6 @@ def generate_report_context(username_results: list):
|
|||||||
tag = pycountry.countries.search_fuzzy(v)[
|
tag = pycountry.countries.search_fuzzy(v)[
|
||||||
0
|
0
|
||||||
].alpha_2.lower() # type: ignore[attr-defined]
|
].alpha_2.lower() # type: ignore[attr-defined]
|
||||||
# TODO: move countries to another struct
|
|
||||||
tags[tag] = tags.get(tag, 0) + 1
|
tags[tag] = tags.get(tag, 0) + 1
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.debug(
|
logging.debug(
|
||||||
@@ -568,7 +567,6 @@ def generate_report_context(username_results: list):
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
"username": first_username,
|
"username": first_username,
|
||||||
# TODO: return brief list
|
|
||||||
"brief": brief,
|
"brief": brief,
|
||||||
"results": username_results,
|
"results": username_results,
|
||||||
"first_seen": first_seen,
|
"first_seen": first_seen,
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"version": 1,
|
"version": 1,
|
||||||
"updated_at": "2026-05-16T10:45:38Z",
|
"updated_at": "2026-05-16T15:49:11Z",
|
||||||
"sites_count": 3155,
|
"sites_count": 3155,
|
||||||
"min_maigret_version": "0.6.1",
|
"min_maigret_version": "0.6.1",
|
||||||
"data_sha256": "df2ab3dbc96bdcdc8aa4e9da485df75ce6c3274814080f00a35e89f7f43783e1",
|
"data_sha256": "df2ab3dbc96bdcdc8aa4e9da485df75ce6c3274814080f00a35e89f7f43783e1",
|
||||||
|
|||||||
@@ -127,3 +127,29 @@ def get_match_ratio(base_strs: list):
|
|||||||
|
|
||||||
def generate_random_username():
|
def generate_random_username():
|
||||||
return ''.join(random.choices(string.ascii_lowercase, k=10))
|
return ''.join(random.choices(string.ascii_lowercase, k=10))
|
||||||
|
|
||||||
|
|
||||||
|
def is_plausible_username(value: Any) -> bool:
    """Return True if *value* looks like a bare username, False otherwise.

    Extractor schemes occasionally populate fields named like ``*_username``
    with URLs (e.g. ``instagram_username`` -> ``https://instagram.com/X``) or
    emails (e.g. ``your_username`` -> ``user@example.com``). Feeding such a
    value back into a site URL template produces broken requests on every
    subsequent site, which manifests as a cascade of false errors and the
    "wrong username" symptom in #1403.

    The checks run against the value with surrounding whitespace stripped;
    the value itself is not modified or returned.

    :param value: candidate extracted from a site's identity data; any type
        is accepted, but only ``str`` can be plausible.
    :return: ``False`` for non-strings, empty/blank strings, anything
        containing ``/`` or whitespace, anything starting with ``www.``
        (case-insensitively), and anything containing both ``@`` and ``.``;
        ``True`` otherwise.
    """
    if not isinstance(value, str):
        return False

    s = value.strip()
    if not s:
        return False

    # A single slash test covers absolute URLs ("https://host/x"),
    # protocol-relative URLs ("//host/x") and path fragments ("user/alice").
    if "/" in s:
        return False

    # Host names are case-insensitive, so "WWW.Example.com" is as much a URL
    # as "www.example.com". The trailing dot is required, which keeps bare
    # handles such as "wwwsuperstar" acceptable.
    if s.lower().startswith("www."):
        return False

    # Interior whitespace (spaces, newlines, tabs) cannot appear in a handle.
    if any(c.isspace() for c in s):
        return False

    # Email heuristic: an "@" together with a "." (user@example.com).
    if "@" in s and "." in s:
        return False

    return True
|
||||||
|
|||||||
@@ -146,6 +146,33 @@ def test_parse_usernames_malformed_list():
|
|||||||
assert logger.warning.called
|
assert logger.warning.called
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_usernames_rejects_url_value():
    """Regression for #1403: a URL stored under a ``*_username`` key must not
    come back from ``parse_usernames`` as a candidate username."""
    extracted = {"instagram_username": "https://instagram.com/zuck"}
    assert parse_usernames(extracted, Mock()) == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_usernames_rejects_email_value():
    """Regression for #1403: an email stored under a key matching the username
    heuristic (e.g. socid_extractor's 'your_username') must be dropped."""
    extracted = {"your_username": "alice@example.com"}
    assert parse_usernames(extracted, Mock()) == {}
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_usernames_filters_urls_inside_list():
    """Within a ``*_usernames`` list only the URL entry is discarded."""
    extracted = {"other_usernames": "['alice', 'https://example.com/bob']"}
    found = parse_usernames(extracted, Mock())
    # The bare handle is kept; the URL item must not appear in the result.
    assert found == {"alice": "username"}
|
||||||
|
|
||||||
|
|
||||||
def test_parse_usernames_supported_id():
|
def test_parse_usernames_supported_id():
|
||||||
logger = Mock()
|
logger = Mock()
|
||||||
# "telegram" is in SUPPORTED_IDS per socid_extractor
|
# "telegram" is in SUPPORTED_IDS per socid_extractor
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from maigret.utils import (
|
|||||||
URLMatcher,
|
URLMatcher,
|
||||||
get_dict_ascii_tree,
|
get_dict_ascii_tree,
|
||||||
get_match_ratio,
|
get_match_ratio,
|
||||||
|
is_plausible_username,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -144,3 +145,52 @@ def test_get_match_ratio():
|
|||||||
fun = get_match_ratio(["test", "maigret", "username"])
|
fun = get_match_ratio(["test", "maigret", "username"])
|
||||||
|
|
||||||
assert fun("test") == 1
|
assert fun("test") == 1
|
||||||
|
|
||||||
|
|
||||||
|
# Regression tests for #1403 — Gravatar URL leaking into next-iteration username.
|
||||||
|
# Extractor schemes occasionally store URLs/emails under '*_username' keys; without
|
||||||
|
# validation these were fed back into the search loop and produced cascades of false
|
||||||
|
# errors. See maigret/utils.py::is_plausible_username.
|
||||||
|
def test_is_plausible_username_accepts_bare_usernames():
    """Plain handles, including dotted, punctuated and non-ASCII ones, pass."""
    for handle in ("alice", "alice.bob", "alice_bob-42", "Алиса"):
        assert is_plausible_username(handle)
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_plausible_username_rejects_urls():
    """Absolute, protocol-relative and ``www.``-prefixed URLs are rejected."""
    url_like = (
        "https://gravatar.com/alice",
        "http://example.com/user/alice",
        "//example.com/alice",
        "www.facebook.com/zuck",
    )
    for candidate in url_like:
        assert not is_plausible_username(candidate)
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_plausible_username_accepts_http_prefixed_handles():
    """Guard against over-matching: bare names that merely begin with 'http'
    or 'www' are legitimate handles (e.g. the httpie CLI maintainer's)."""
    for handle in ("httpie", "http_user", "wwwsuperstar"):
        assert is_plausible_username(handle)
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_plausible_username_rejects_path_like():
    """Anything containing a slash is treated as a path, not a username."""
    for candidate in ("user/alice", "alice/"):
        assert not is_plausible_username(candidate)
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_plausible_username_rejects_emails():
    """Email addresses are not usernames."""
    for candidate in ("alice@example.com", "user@maigret.io"):
        assert not is_plausible_username(candidate)
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_plausible_username_rejects_whitespace_and_empty():
    """Empty, blank and whitespace-containing strings are rejected."""
    for candidate in ("", " ", "alice bob", "alice\nbob"):
        assert not is_plausible_username(candidate)
|
||||||
|
|
||||||
|
|
||||||
|
def test_is_plausible_username_rejects_non_strings():
    """Only ``str`` values can be plausible usernames."""
    for candidate in (None, 42, ["alice"]):
        assert not is_plausible_username(candidate)
|
||||||
|
|||||||
@@ -165,7 +165,6 @@ if __name__ == '__main__':
|
|||||||
sites = {site.name: site for site in sites_subset}
|
sites = {site.name: site for site in sites_subset}
|
||||||
engines = db.engines
|
engines = db.engines
|
||||||
|
|
||||||
# TODO: usernames extractors
|
|
||||||
ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john']
|
ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john']
|
||||||
if args.username:
|
if args.username:
|
||||||
ok_usernames = [args.username] + ok_usernames
|
ok_usernames = [args.username] + ok_usernames
|
||||||
|
|||||||
Reference in New Issue
Block a user