mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-17 03:45:36 +00:00
fix(checking): reject URLs and emails extracted as usernames (#2673)
Closes #1403
This commit is contained in:
@@ -146,6 +146,33 @@ def test_parse_usernames_malformed_list():
|
||||
assert logger.warning.called
|
||||
|
||||
|
||||
def test_parse_usernames_rejects_url_value():
    """Regression for #1403: a URL stored under a *_username key is discarded.

    Extractors sometimes return a full URL where a username is expected; that
    value must not be handed back as a candidate username.
    """
    extracted = {"instagram_username": "https://instagram.com/zuck"}
    mock_logger = Mock()

    assert parse_usernames(extracted, mock_logger) == {}
|
||||
|
||||
|
||||
def test_parse_usernames_rejects_email_value():
    """Regression for #1403: an email under a username-like key is discarded.

    For example, socid_extractor's 'your_username' field returns an email even
    though its key matches the username heuristic.
    """
    extracted = {"your_username": "alice@example.com"}
    mock_logger = Mock()

    assert parse_usernames(extracted, mock_logger) == {}
|
||||
|
||||
|
||||
def test_parse_usernames_filters_urls_inside_list():
    """A stringified list keeps its bare usernames and loses its URL entries."""
    extracted = {"other_usernames": "['alice', 'https://example.com/bob']"}
    mock_logger = Mock()

    parsed = parse_usernames(extracted, mock_logger)

    # Only 'alice' is a usable handle; the URL entry must be filtered out.
    assert parsed == {"alice": "username"}
|
||||
|
||||
|
||||
def test_parse_usernames_supported_id():
|
||||
logger = Mock()
|
||||
# "telegram" is in SUPPORTED_IDS per socid_extractor
|
||||
|
||||
@@ -10,6 +10,7 @@ from maigret.utils import (
|
||||
URLMatcher,
|
||||
get_dict_ascii_tree,
|
||||
get_match_ratio,
|
||||
is_plausible_username,
|
||||
)
|
||||
|
||||
|
||||
@@ -144,3 +145,52 @@ def test_get_match_ratio():
|
||||
fun = get_match_ratio(["test", "maigret", "username"])
|
||||
|
||||
assert fun("test") == 1
|
||||
|
||||
|
||||
# Regression tests for #1403 — Gravatar URL leaking into next-iteration username.
|
||||
# Extractor schemes occasionally store URLs/emails under '*_username' keys; without
|
||||
# validation these were fed back into the search loop and produced cascades of false
|
||||
# errors. See maigret/utils.py::is_plausible_username.
|
||||
def test_is_plausible_username_accepts_bare_usernames():
    """Plain handles (ASCII, dotted, mixed punctuation, Cyrillic) are accepted."""
    bare_handles = ("alice", "alice.bob", "alice_bob-42", "Алиса")
    for handle in bare_handles:
        assert is_plausible_username(handle)
|
||||
|
||||
|
||||
def test_is_plausible_username_rejects_urls():
    """URLs are rejected: with a scheme, protocol-relative, or 'www.'-prefixed."""
    url_like_values = (
        "https://gravatar.com/alice",
        "http://example.com/user/alice",
        "//example.com/alice",
        "www.facebook.com/zuck",
    )
    for value in url_like_values:
        assert not is_plausible_username(value)
|
||||
|
||||
|
||||
def test_is_plausible_username_accepts_http_prefixed_handles():
    """Guard against over-matching on 'http' / 'www' prefixes.

    A bare name that merely begins with 'http' or 'www' is a legitimate handle
    (e.g. the httpie CLI maintainer's handle) and must still be accepted.
    """
    prefixed_handles = ("httpie", "http_user", "wwwsuperstar")
    for handle in prefixed_handles:
        assert is_plausible_username(handle)
|
||||
|
||||
|
||||
def test_is_plausible_username_rejects_path_like():
    """Values containing a path separator are rejected."""
    path_like_values = ("user/alice", "alice/")
    for value in path_like_values:
        assert not is_plausible_username(value)
|
||||
|
||||
|
||||
def test_is_plausible_username_rejects_emails():
    """Email addresses are rejected."""
    email_values = ("alice@example.com", "user@maigret.io")
    for value in email_values:
        assert not is_plausible_username(value)
|
||||
|
||||
|
||||
def test_is_plausible_username_rejects_whitespace_and_empty():
    """Empty, blank, and whitespace-containing strings are rejected."""
    blank_or_spaced = ("", " ", "alice bob", "alice\nbob")
    for value in blank_or_spaced:
        assert not is_plausible_username(value)
|
||||
|
||||
|
||||
def test_is_plausible_username_rejects_non_strings():
    """Non-string inputs (None, an int, a list) are rejected."""
    non_string_values = (None, 42, ["alice"])
    for value in non_string_values:
        assert not is_plausible_username(value)
|
||||
|
||||
Reference in New Issue
Block a user