From a2ddb15f09ebb8e8eedbe2a4aa2402aa7c3fb18a Mon Sep 17 00:00:00 2001 From: Soxoj Date: Sun, 21 Mar 2021 18:34:57 +0300 Subject: [PATCH] Improving "parse" mode for extracting usernames and other info for a further search --- maigret/maigret.py | 33 ++++++++++++++++++++++++--------- maigret/notify.py | 19 ++----------------- maigret/resources/data.json | 5 ++++- maigret/utils.py | 21 +++++++++++++++++++++ requirements.txt | 2 +- tests/test_utils.py | 21 ++++++++++++++++++++- 6 files changed, 72 insertions(+), 29 deletions(-) diff --git a/maigret/maigret.py b/maigret/maigret.py index b9578ad..9e4e343 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -19,6 +19,7 @@ from .report import save_csv_report, save_xmind_report, save_html_report, save_p save_json_report from .sites import MaigretDatabase from .submit import submit_dialog +from .utils import get_dict_ascii_tree __version__ = '0.1.15' @@ -218,15 +219,29 @@ async def main(): print("Using the proxy: " + args.proxy) if args.parse_url: - page, _ = parse(args.parse_url, cookies_str='') - info = extract(page) - text = 'Extracted ID data from webpage: ' + ', '.join([f'{a}: {b}' for a, b in info.items()]) - print(text) - for k, v in info.items(): - if 'username' in k: - usernames[v] = 'username' - if k in supported_recursive_search_ids: - usernames[v] = k + # url, headers + reqs = [(args.parse_url, set())] + try: + # temporary workaround for URL mutations MVP + from socid_extractor import mutate_url + reqs += list(mutate_url(args.parse_url)) + except: + pass + + for req in reqs: + url, headers = req + print(f'Scanning webpage by URL {url}...') + page, _ = parse(url, cookies_str='', headers=headers) + info = extract(page) + if not info: + print('Nothing extracted') + else: + print(get_dict_ascii_tree(info.items(), new_line=False), ' ') + for k, v in info.items(): + if 'username' in k: + usernames[v] = 'username' + if k in supported_recursive_search_ids: + usernames[v] = k if args.tags: args.tags = 
list(set(str(args.tags).split(','))) diff --git a/maigret/notify.py b/maigret/notify.py index ea3186d..86dabda 100644 --- a/maigret/notify.py +++ b/maigret/notify.py @@ -8,6 +8,7 @@ import sys from colorama import Fore, Style, init from .result import QueryStatus +from .utils import get_dict_ascii_tree class QueryNotify(): @@ -176,22 +177,6 @@ class QueryNotifyPrint(QueryNotify): else: print(msg) - def get_additional_data_text(self, items, prepend=''): - text = '' - for num, item in enumerate(items): - box_symbol = '┣╸' if num != len(items) - 1 else '┗╸' - - if type(item) == tuple: - field_name, field_value = item - if field_value.startswith('[\''): - is_last_item = num == len(items) - 1 - prepend_symbols = ' ' * 3 if is_last_item else ' ┃ ' - field_value = self.get_additional_data_text(eval(field_value), prepend_symbols) - text += f'\n{prepend}{box_symbol}{field_name}: {field_value}' - else: - text += f'\n{prepend}{box_symbol} {item}' - - return text def update(self, result, is_similar=False): """Notify Update. 
@@ -211,7 +196,7 @@ class QueryNotifyPrint(QueryNotify): if not self.result.ids_data: ids_data_text = "" else: - ids_data_text = self.get_additional_data_text(self.result.ids_data.items(), ' ') + ids_data_text = get_dict_ascii_tree(self.result.ids_data.items(), ' ') def make_colored_terminal_notify(status, text, status_color, text_color, appendix): text = [ diff --git a/maigret/resources/data.json b/maigret/resources/data.json index c9ade31..a197370 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -22761,8 +22761,11 @@ }, "codeforces.com": { "tags": [ - "in" + "coding" ], + "errors": { + "The page is temporarily blocked by administrator.": "IP ban" + }, "engine": "engineRedirect", "alexaRank": 8156, "url": "http://codeforces.com/profile/{username}", diff --git a/maigret/utils.py b/maigret/utils.py index ce11b59..86da659 100644 --- a/maigret/utils.py +++ b/maigret/utils.py @@ -55,3 +55,24 @@ class URLMatcher: regexp_str = self._HTTP_URL_RE_STR.replace('(.+)', url_regexp) return re.compile(regexp_str) + + +def get_dict_ascii_tree(items, prepend='', new_line=True): + text = '' + for num, item in enumerate(items): + box_symbol = '┣╸' if num != len(items) - 1 else '┗╸' + + if type(item) == tuple: + field_name, field_value = item + if field_value.startswith('[\''): + is_last_item = num == len(items) - 1 + prepend_symbols = ' ' * 3 if is_last_item else ' ┃ ' + field_value = get_dict_ascii_tree(eval(field_value), prepend_symbols) + text += f'\n{prepend}{box_symbol}{field_name}: {field_value}' + else: + text += f'\n{prepend}{box_symbol} {item}' + + if not new_line: + text = text[1:] + + return text diff --git a/requirements.txt b/requirements.txt index 98bf068..b6cd8c0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,7 +28,7 @@ reportlab==3.5.59 requests>=2.24.0 requests-futures==1.0.0 six==1.15.0 -socid-extractor>=0.0.13 +socid-extractor>=0.0.15 soupsieve==2.1 stem==1.8.0 torrequest==0.1.0 diff --git a/tests/test_utils.py 
b/tests/test_utils.py index fee4cb3..98f63f8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,7 +2,7 @@ import itertools import re -from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher +from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher, get_dict_ascii_tree def test_case_convert_camel_to_snake(): @@ -72,3 +72,22 @@ def test_url_make_profile_url_regexp(): for url_parts in itertools.product(*parts): url = ''.join(url_parts) assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$' + + +def test_get_dict_ascii_tree(): + data = {'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==', 'legacy_id': '26403415', 'username': 'alexaimephotographycars', 'name': 'Alex Aimé', 'created_at': '2018-05-04T10:17:01.000+0000', 'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b', 'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201', 'website': 'www.instagram.com/street.reality.photography/', 'facebook_link': ' www.instagram.com/street.reality.photography/', 'instagram_username': 'Street.Reality.Photography', 'twitter_username': 'Alexaimephotogr'} + + ascii_tree = get_dict_ascii_tree(data.items()) + + assert ascii_tree == """ +┣╸uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ== +┣╸legacy_id: 26403415 +┣╸username: alexaimephotographycars +┣╸name: Alex Aimé +┣╸created_at: 2018-05-04T10:17:01.000+0000 +┣╸image: https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b +┣╸image_bg: https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201 +┣╸website: 
www.instagram.com/street.reality.photography/ +┣╸facebook_link: www.instagram.com/street.reality.photography/ +┣╸instagram_username: Street.Reality.Photography +┗╸twitter_username: Alexaimephotogr""" \ No newline at end of file