From 0e9655c46a9fc1e6cdc199232d0a1518c4b3a4b0 Mon Sep 17 00:00:00 2001 From: Soxoj Date: Thu, 6 May 2021 22:35:44 +0300 Subject: [PATCH] Improve extracting ids from URLs, tests --- .gitignore | 5 +--- maigret/maigret.py | 42 +++++++++++++++++++++---------- maigret/notify.py | 2 ++ maigret/resources/data.json | 2 ++ maigret/sites.py | 15 +++++++++++- maigret/utils.py | 6 +++-- tests/test_cli.py | 13 +++++++++- tests/test_maigret.py | 17 +++++++++---- tests/test_notify.py | 49 +++++++++++++++++++++++++++++++++++++ tests/test_utils.py | 4 +++ 10 files changed, 129 insertions(+), 26 deletions(-) create mode 100644 tests/test_notify.py diff --git a/.gitignore b/.gitignore index eb3759b..3be27da 100644 --- a/.gitignore +++ b/.gitignore @@ -22,9 +22,6 @@ src/ # Comma-Separated Values (CSV) Reports *.csv -# Excluded sites list -tests/.excluded_sites - # MacOS Folder Metadata File .DS_Store /reports/ @@ -33,4 +30,4 @@ tests/.excluded_sites .coverage dist/ htmlcov/ -test_* \ No newline at end of file +/test_* \ No newline at end of file diff --git a/maigret/maigret.py b/maigret/maigret.py index d807052..5eae2bc 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -60,6 +60,17 @@ def notify_about_errors(search_results: QueryResultWrapper, query_notify): ) +def extract_ids_from_url(url: str, db: MaigretDatabase) -> dict: + results = {} + for s in db.sites: + result = s.extract_id_from_url(url) + if not result: + continue + _id, _type = result + results[_id] = _type + return results + + def extract_ids_from_page(url, logger, timeout=5) -> dict: results = {} # url, headers @@ -105,10 +116,8 @@ def extract_ids_from_results(results: QueryResultWrapper, db: MaigretDatabase) - ids_results[u] = utype for url in dictionary.get('ids_links', []): - for s in db.sites: - u = s.detect_username(url) - if u: - ids_results[u] = 'username' + ids_results.update(extract_ids_from_url(url, db)) + return ids_results @@ -129,10 +138,9 @@ def setup_arguments_parser(): ) parser.add_argument( 
"username", - nargs='?', + nargs='*', metavar="USERNAMES", - action="append", - help="One or more usernames to check with social networks.", + help="One or more usernames to search by.", ) parser.add_argument( "--version", @@ -231,7 +239,9 @@ def setup_arguments_parser(): help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080", ) - filter_group = parser.add_argument_group('Site filtering', 'Options to set site search scope') + filter_group = parser.add_argument_group( + 'Site filtering', 'Options to set site search scope' + ) filter_group.add_argument( "-a", "--all-sites", @@ -269,7 +279,7 @@ def setup_arguments_parser(): modes_group = parser.add_argument_group( 'Operating modes', 'Various functions except the default search by a username. ' - 'Modes are executed sequentially in the order of declaration.' + 'Modes are executed sequentially in the order of declaration.', ) modes_group.add_argument( "--parse", @@ -296,10 +306,12 @@ def setup_arguments_parser(): "--stats", action="store_true", default=False, - help="Show database statistics (most frequent sites engines and tags)." 
+ help="Show database statistics (most frequent sites engines and tags).", ) - output_group = parser.add_argument_group('Output options', 'Options to change verbosity and view of the console output') + output_group = parser.add_argument_group( + 'Output options', 'Options to change verbosity and view of the console output' + ) output_group.add_argument( "--print-not-found", action="store_true", @@ -354,7 +366,9 @@ def setup_arguments_parser(): help="Don't show progressbar.", ) - report_group = parser.add_argument_group('Report formats', 'Supported formats of report files') + report_group = parser.add_argument_group( + 'Report formats', 'Supported formats of report files' + ) report_group.add_argument( "-T", "--txt", @@ -446,7 +460,9 @@ async def main(): print("Using the proxy: " + args.proxy) if args.parse_url: - extracted_ids = extract_ids_from_page(args.parse_url, logger, timeout=args.timeout) + extracted_ids = extract_ids_from_page( + args.parse_url, logger, timeout=args.timeout + ) usernames.update(extracted_ids) if args.tags: diff --git a/maigret/notify.py b/maigret/notify.py index 03d1049..a6d8292 100644 --- a/maigret/notify.py +++ b/maigret/notify.py @@ -282,6 +282,8 @@ class QueryNotifyPrint(QueryNotify): sys.stdout.write("\x1b[1K\r") print(notify) + return notify + def __str__(self): """Convert Object To String. 
diff --git a/maigret/resources/data.json b/maigret/resources/data.json index f2185a6..2f676c7 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -14365,6 +14365,7 @@ "ru" ], "checkType": "response_url", + "regexCheck": "^(?!id\\d)\\w*$", "alexaRank": 27, "urlMain": "https://vk.com/", "url": "https://vk.com/{username}", @@ -14379,6 +14380,7 @@ "checkType": "response_url", "alexaRank": 27, "urlMain": "https://vk.com/", + "regexCheck": "^\\d+$", "url": "https://vk.com/id{username}", "source": "VK", "usernameClaimed": "270433952", diff --git a/maigret/sites.py b/maigret/sites.py index ac05372..07d1833 100644 --- a/maigret/sites.py +++ b/maigret/sites.py @@ -3,7 +3,7 @@ import copy import json import sys -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Tuple import requests @@ -146,6 +146,19 @@ class MaigretSite: return None + def extract_id_from_url(self, url: str) -> Optional[Tuple[str, str]]: + if not self.url_regexp: + return None + + match_groups = self.url_regexp.match(url) + if not match_groups: + return None + + _id = match_groups.groups()[-1].rstrip("/") + _type = self.type + + return _id, _type + @property def pretty_name(self): if self.source: diff --git a/maigret/utils.py b/maigret/utils.py index 3de46f3..383ee17 100644 --- a/maigret/utils.py +++ b/maigret/utils.py @@ -55,9 +55,11 @@ class URLMatcher: url_main_part = self.extract_main_part(url) for c in self.UNSAFE_SYMBOLS: url_main_part = url_main_part.replace(c, f"\\{c}") - username_regexp = username_regexp or ".+?" 
+ prepared_username_regexp = (username_regexp or ".+?").lstrip('^').rstrip('$') - url_regexp = url_main_part.replace("{username}", f"({username_regexp})") + url_regexp = url_main_part.replace( + "{username}", f"({prepared_username_regexp})" + ) regexp_str = self._HTTP_URL_RE_STR.replace("(.+)", url_regexp) return re.compile(regexp_str) diff --git a/tests/test_cli.py b/tests/test_cli.py index 9e2cce3..d7984a8 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -51,6 +51,17 @@ def test_args_search_mode(argparser): assert args == Namespace(**want_args) +def test_args_search_mode_several_usernames(argparser): + args = argparser.parse_args('username1 username2'.split()) + + assert args.username == ['username1', 'username2'] + + want_args = dict(DEFAULT_ARGS) + want_args.update({'username': ['username1', 'username2']}) + + assert args == Namespace(**want_args) + + def test_args_self_check_mode(argparser): args = argparser.parse_args('--self-check --site GitHub'.split()) @@ -59,7 +70,7 @@ def test_args_self_check_mode(argparser): { 'self_check': True, 'site_list': ['GitHub'], - 'username': [None], + 'username': [], } ) diff --git a/tests/test_maigret.py b/tests/test_maigret.py index 0f3fef5..8f17467 100644 --- a/tests/test_maigret.py +++ b/tests/test_maigret.py @@ -5,7 +5,8 @@ import copy import pytest from mock import Mock -from maigret.maigret import self_check, maigret, extract_ids_from_page, extract_ids_from_results +from maigret.maigret import self_check, maigret +from maigret.maigret import extract_ids_from_page, extract_ids_from_results, extract_ids_from_url from maigret.sites import MaigretSite from maigret.result import QueryResult, QueryStatus @@ -137,11 +138,18 @@ def test_maigret_results(test_db): assert results == RESULTS_EXAMPLE +def test_extract_ids_from_url(default_db): + assert extract_ids_from_url('https://www.reddit.com/user/test', default_db) == {'test': 'username'} + assert extract_ids_from_url('https://vk.com/id123', default_db) == {'123': 
'vk_id'}
+    assert extract_ids_from_url('https://vk.com/ida123', default_db) == {'ida123': 'username'}
+    assert extract_ids_from_url('https://my.mail.ru/yandex.ru/dipres8904/', default_db) == {'dipres8904': 'username'}
+    assert extract_ids_from_url('https://reviews.yandex.ru/user/adbced123', default_db) == {'adbced123': 'yandex_public_id'}
+
+
 @pytest.mark.slow
 def test_extract_ids_from_page(test_db):
     logger = Mock()
-    found_ids = extract_ids_from_page('https://www.reddit.com/user/test', logger)
-    assert found_ids == {'test': 'username'}
+    assert extract_ids_from_page('https://www.reddit.com/user/test', logger) == {'test': 'username'}
 
 
 def test_extract_ids_from_results(test_db):
@@ -149,5 +157,4 @@
     TEST_EXAMPLE['Reddit']['ids_usernames'] = {'test1': 'yandex_public_id'}
     TEST_EXAMPLE['Reddit']['ids_links'] = ['https://www.reddit.com/user/test2']
 
-    found_ids = extract_ids_from_results(TEST_EXAMPLE, test_db)
-    assert found_ids == {'test1': 'yandex_public_id', 'test2': 'username'}
+    assert extract_ids_from_results(TEST_EXAMPLE, test_db) == {'test1': 'yandex_public_id', 'test2': 'username'}
diff --git a/tests/test_notify.py b/tests/test_notify.py
new file mode 100644
index 0000000..838ff2e
--- /dev/null
+++ b/tests/test_notify.py
@@ -0,0 +1,49 @@
+from maigret.errors import CheckError
+from maigret.notify import QueryNotifyPrint
+from maigret.result import QueryStatus, QueryResult
+
+
+def test_notify_illegal():
+    n = QueryNotifyPrint(color=False)
+
+    assert n.update(QueryResult(
+        username="test",
+        status=QueryStatus.ILLEGAL,
+        site_name="TEST_SITE",
+        site_url_user="http://example.com/test"
+    )) == "[-] TEST_SITE: Illegal Username Format For This Site!"
+ + +def test_notify_claimed(): + n = QueryNotifyPrint(color=False) + + assert n.update(QueryResult( + username="test", + status=QueryStatus.CLAIMED, + site_name="TEST_SITE", + site_url_user="http://example.com/test" + )) == "[+] TEST_SITE: http://example.com/test" + + +def test_notify_available(): + n = QueryNotifyPrint(color=False) + + assert n.update(QueryResult( + username="test", + status=QueryStatus.AVAILABLE, + site_name="TEST_SITE", + site_url_user="http://example.com/test" + )) == "[-] TEST_SITE: Not found!" + + +def test_notify_unknown(): + n = QueryNotifyPrint(color=False) + result = QueryResult( + username="test", + status=QueryStatus.UNKNOWN, + site_name="TEST_SITE", + site_url_user="http://example.com/test" + ) + result.error = CheckError('Type', 'Reason') + + assert n.update(result) == "[?] TEST_SITE: Type error: Reason" diff --git a/tests/test_utils.py b/tests/test_utils.py index 4954bf0..9382653 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -68,8 +68,10 @@ def test_url_extract_main_part(): ] url_regexp = re.compile('^https?://(www.)?flickr.com/photos/(.+?)$') + # combine parts variations for url_parts in itertools.product(*parts): url = ''.join(url_parts) + # ensure all combinations give valid main part assert URLMatcher.extract_main_part(url) == url_main_part assert not url_regexp.match(url) is None @@ -84,8 +86,10 @@ def test_url_make_profile_url_regexp(): ['/', ''], ] + # combine parts variations for url_parts in itertools.product(*parts): url = ''.join(url_parts) + # ensure all combinations match pattern assert ( URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'