mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-17 03:45:36 +00:00
51a5169987
Closes #1403
197 lines
6.4 KiB
Python
197 lines
6.4 KiB
Python
"""Maigret utils test functions"""
|
|
|
|
import itertools
|
|
import re
|
|
|
|
from maigret.utils import (
|
|
CaseConverter,
|
|
is_country_tag,
|
|
enrich_link_str,
|
|
URLMatcher,
|
|
get_dict_ascii_tree,
|
|
get_match_ratio,
|
|
is_plausible_username,
|
|
)
|
|
|
|
|
|
def test_case_convert_camel_to_snake():
|
|
a = 'SnakeCasedString'
|
|
b = CaseConverter.camel_to_snake(a)
|
|
|
|
assert b == 'snake_cased_string'
|
|
|
|
|
|
def test_case_convert_snake_to_camel():
|
|
a = 'camel_cased_string'
|
|
b = CaseConverter.snake_to_camel(a)
|
|
|
|
assert b == 'camelCasedString'
|
|
|
|
|
|
def test_case_convert_snake_to_title():
|
|
a = 'camel_cased_string'
|
|
b = CaseConverter.snake_to_title(a)
|
|
|
|
assert b == 'Camel cased string'
|
|
|
|
|
|
def test_case_convert_camel_with_digits_to_snake():
|
|
a = 'ignore403'
|
|
b = CaseConverter.camel_to_snake(a)
|
|
|
|
assert b == 'ignore403'
|
|
|
|
|
|
def test_is_country_tag():
|
|
assert is_country_tag('ru') is True
|
|
assert is_country_tag('FR') is True
|
|
|
|
assert is_country_tag('a1') is False
|
|
assert is_country_tag('dating') is False
|
|
|
|
assert is_country_tag('global') is True
|
|
|
|
|
|
def test_enrich_link_str():
|
|
assert enrich_link_str('test') == 'test'
|
|
assert (
|
|
enrich_link_str(' www.flickr.com/photos/alexaimephotography/')
|
|
== '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
|
|
)
|
|
|
|
|
|
def test_url_extract_main_part_negative():
|
|
url_main_part = 'None'
|
|
assert URLMatcher.extract_main_part(url_main_part) == ''
|
|
|
|
|
|
def test_url_extract_main_part():
|
|
url_main_part = 'flickr.com/photos/alexaimephotography'
|
|
|
|
parts = [
|
|
['http://', 'https://'],
|
|
['www.', ''],
|
|
[url_main_part],
|
|
['/', ''],
|
|
]
|
|
|
|
url_regexp = re.compile(r'^https?://(www\.)?flickr.com/photos/(.+?)$')
|
|
# combine parts variations
|
|
for url_parts in itertools.product(*parts):
|
|
url = ''.join(url_parts)
|
|
# ensure all combinations give valid main part
|
|
assert URLMatcher.extract_main_part(url) == url_main_part
|
|
assert not url_regexp.match(url) is None
|
|
|
|
|
|
def test_url_make_profile_url_regexp():
|
|
url_main_part = 'flickr.com/photos/{username}'
|
|
|
|
parts = [
|
|
['http://', 'https://'],
|
|
['www.', ''],
|
|
[url_main_part],
|
|
['/', ''],
|
|
]
|
|
|
|
# combine parts variations
|
|
for url_parts in itertools.product(*parts):
|
|
url = ''.join(url_parts)
|
|
# ensure all combinations match pattern
|
|
assert (
|
|
URLMatcher.make_profile_url_regexp(url).pattern
|
|
== r'^https?://(www.|m.)?flickr\.com/photos/(.+?)$'
|
|
)
|
|
|
|
|
|
def test_get_dict_ascii_tree():
|
|
data = {
|
|
'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==',
|
|
'legacy_id': '26403415',
|
|
'username': 'alexaimephotographycars',
|
|
'name': 'Alex Aimé',
|
|
'links': "['www.instagram.com/street.reality.photography/']",
|
|
'created_at': '2018-05-04T10:17:01.000+0000',
|
|
'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b',
|
|
'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201',
|
|
'website': 'www.instagram.com/street.reality.photography/',
|
|
'facebook_link': ' www.instagram.com/street.reality.photography/',
|
|
'instagram_username': 'Street.Reality.Photography',
|
|
'twitter_username': 'Alexaimephotogr',
|
|
}
|
|
|
|
ascii_tree = get_dict_ascii_tree(data.items(), prepend=" ")
|
|
|
|
assert (
|
|
ascii_tree
|
|
== """
|
|
├─uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==
|
|
├─legacy_id: 26403415
|
|
├─username: alexaimephotographycars
|
|
├─name: Alex Aimé
|
|
├─links:
|
|
│ └─ www.instagram.com/street.reality.photography/
|
|
├─created_at: 2018-05-04T10:17:01.000+0000
|
|
├─image: https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b
|
|
├─image_bg: https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201
|
|
├─website: www.instagram.com/street.reality.photography/
|
|
├─facebook_link: www.instagram.com/street.reality.photography/
|
|
├─instagram_username: Street.Reality.Photography
|
|
└─twitter_username: Alexaimephotogr"""
|
|
)
|
|
|
|
|
|
def test_get_match_ratio():
|
|
fun = get_match_ratio(["test", "maigret", "username"])
|
|
|
|
assert fun("test") == 1
|
|
|
|
|
|
# Regression tests for #1403 — Gravatar URL leaking into next-iteration username.
|
|
# Extractor schemes occasionally store URLs/emails under '*_username' keys; without
|
|
# validation these were fed back into the search loop and produced cascades of false
|
|
# errors. See maigret/utils.py::is_plausible_username.
|
|
def test_is_plausible_username_accepts_bare_usernames():
|
|
assert is_plausible_username("alice")
|
|
assert is_plausible_username("alice.bob")
|
|
assert is_plausible_username("alice_bob-42")
|
|
assert is_plausible_username("Алиса")
|
|
|
|
|
|
def test_is_plausible_username_rejects_urls():
|
|
assert not is_plausible_username("https://gravatar.com/alice")
|
|
assert not is_plausible_username("http://example.com/user/alice")
|
|
assert not is_plausible_username("//example.com/alice")
|
|
assert not is_plausible_username("www.facebook.com/zuck")
|
|
|
|
|
|
def test_is_plausible_username_accepts_http_prefixed_handles():
|
|
"""Don't over-match: bare names that just happen to start with 'http' or 'www'
|
|
are legitimate (e.g. the httpie CLI maintainer's handle)."""
|
|
assert is_plausible_username("httpie")
|
|
assert is_plausible_username("http_user")
|
|
assert is_plausible_username("wwwsuperstar")
|
|
|
|
|
|
def test_is_plausible_username_rejects_path_like():
|
|
assert not is_plausible_username("user/alice")
|
|
assert not is_plausible_username("alice/")
|
|
|
|
|
|
def test_is_plausible_username_rejects_emails():
|
|
assert not is_plausible_username("alice@example.com")
|
|
assert not is_plausible_username("user@maigret.io")
|
|
|
|
|
|
def test_is_plausible_username_rejects_whitespace_and_empty():
|
|
assert not is_plausible_username("")
|
|
assert not is_plausible_username(" ")
|
|
assert not is_plausible_username("alice bob")
|
|
assert not is_plausible_username("alice\nbob")
|
|
|
|
|
|
def test_is_plausible_username_rejects_non_strings():
|
|
assert not is_plausible_username(None)
|
|
assert not is_plausible_username(42)
|
|
assert not is_plausible_username(["alice"])
|