mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-07 14:34:33 +00:00
Merge pull request #88 from soxoj/parsing-mode-improve
Improving "parse" mode for extracting usernames and other info for a …
This commit is contained in:
+18
-3
@@ -19,6 +19,7 @@ from .report import save_csv_report, save_xmind_report, save_html_report, save_p
|
|||||||
save_json_report
|
save_json_report
|
||||||
from .sites import MaigretDatabase
|
from .sites import MaigretDatabase
|
||||||
from .submit import submit_dialog
|
from .submit import submit_dialog
|
||||||
|
from .utils import get_dict_ascii_tree
|
||||||
|
|
||||||
__version__ = '0.1.15'
|
__version__ = '0.1.15'
|
||||||
|
|
||||||
@@ -218,10 +219,24 @@ async def main():
|
|||||||
print("Using the proxy: " + args.proxy)
|
print("Using the proxy: " + args.proxy)
|
||||||
|
|
||||||
if args.parse_url:
|
if args.parse_url:
|
||||||
page, _ = parse(args.parse_url, cookies_str='')
|
# url, headers
|
||||||
|
reqs = [(args.parse_url, set())]
|
||||||
|
try:
|
||||||
|
# temporary workaround for URL mutations MVP
|
||||||
|
from socid_extractor import mutate_url
|
||||||
|
reqs += list(mutate_url(args.parse_url))
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
for req in reqs:
|
||||||
|
url, headers = req
|
||||||
|
print(f'Scanning webpage by URL {url}...')
|
||||||
|
page, _ = parse(url, cookies_str='', headers=headers)
|
||||||
info = extract(page)
|
info = extract(page)
|
||||||
text = 'Extracted ID data from webpage: ' + ', '.join([f'{a}: {b}' for a, b in info.items()])
|
if not info:
|
||||||
print(text)
|
print('Nothing extracted')
|
||||||
|
else:
|
||||||
|
print(get_dict_ascii_tree(info.items(), new_line=False), ' ')
|
||||||
for k, v in info.items():
|
for k, v in info.items():
|
||||||
if 'username' in k:
|
if 'username' in k:
|
||||||
usernames[v] = 'username'
|
usernames[v] = 'username'
|
||||||
|
|||||||
+2
-17
@@ -8,6 +8,7 @@ import sys
|
|||||||
from colorama import Fore, Style, init
|
from colorama import Fore, Style, init
|
||||||
|
|
||||||
from .result import QueryStatus
|
from .result import QueryStatus
|
||||||
|
from .utils import get_dict_ascii_tree
|
||||||
|
|
||||||
|
|
||||||
class QueryNotify():
|
class QueryNotify():
|
||||||
@@ -176,22 +177,6 @@ class QueryNotifyPrint(QueryNotify):
|
|||||||
else:
|
else:
|
||||||
print(msg)
|
print(msg)
|
||||||
|
|
||||||
def get_additional_data_text(self, items, prepend=''):
|
|
||||||
text = ''
|
|
||||||
for num, item in enumerate(items):
|
|
||||||
box_symbol = '┣╸' if num != len(items) - 1 else '┗╸'
|
|
||||||
|
|
||||||
if type(item) == tuple:
|
|
||||||
field_name, field_value = item
|
|
||||||
if field_value.startswith('[\''):
|
|
||||||
is_last_item = num == len(items) - 1
|
|
||||||
prepend_symbols = ' ' * 3 if is_last_item else ' ┃ '
|
|
||||||
field_value = self.get_additional_data_text(eval(field_value), prepend_symbols)
|
|
||||||
text += f'\n{prepend}{box_symbol}{field_name}: {field_value}'
|
|
||||||
else:
|
|
||||||
text += f'\n{prepend}{box_symbol} {item}'
|
|
||||||
|
|
||||||
return text
|
|
||||||
|
|
||||||
def update(self, result, is_similar=False):
|
def update(self, result, is_similar=False):
|
||||||
"""Notify Update.
|
"""Notify Update.
|
||||||
@@ -211,7 +196,7 @@ class QueryNotifyPrint(QueryNotify):
|
|||||||
if not self.result.ids_data:
|
if not self.result.ids_data:
|
||||||
ids_data_text = ""
|
ids_data_text = ""
|
||||||
else:
|
else:
|
||||||
ids_data_text = self.get_additional_data_text(self.result.ids_data.items(), ' ')
|
ids_data_text = get_dict_ascii_tree(self.result.ids_data.items(), ' ')
|
||||||
|
|
||||||
def make_colored_terminal_notify(status, text, status_color, text_color, appendix):
|
def make_colored_terminal_notify(status, text, status_color, text_color, appendix):
|
||||||
text = [
|
text = [
|
||||||
|
|||||||
@@ -22761,8 +22761,11 @@
|
|||||||
},
|
},
|
||||||
"codeforces.com": {
|
"codeforces.com": {
|
||||||
"tags": [
|
"tags": [
|
||||||
"in"
|
"coding"
|
||||||
],
|
],
|
||||||
|
"errors": {
|
||||||
|
"The page is temporarily blocked by administrator.": "IP ban"
|
||||||
|
},
|
||||||
"engine": "engineRedirect",
|
"engine": "engineRedirect",
|
||||||
"alexaRank": 8156,
|
"alexaRank": 8156,
|
||||||
"url": "http://codeforces.com/profile/{username}",
|
"url": "http://codeforces.com/profile/{username}",
|
||||||
|
|||||||
@@ -55,3 +55,24 @@ class URLMatcher:
|
|||||||
regexp_str = self._HTTP_URL_RE_STR.replace('(.+)', url_regexp)
|
regexp_str = self._HTTP_URL_RE_STR.replace('(.+)', url_regexp)
|
||||||
|
|
||||||
return re.compile(regexp_str)
|
return re.compile(regexp_str)
|
||||||
|
|
||||||
|
|
||||||
|
def get_dict_ascii_tree(items, prepend='', new_line=True):
    """Render ``items`` as an ASCII tree drawn with box characters.

    Each element of ``items`` is either a ``(field_name, field_value)``
    tuple (for example the result of ``dict.items()``) or a bare value.
    A string value that looks like a serialized list (it starts with
    ``['``) is parsed and rendered as a nested sub-tree.

    :param items: iterable of ``(name, value)`` tuples and/or plain values.
    :param prepend: text put in front of every line of this tree level.
    :param new_line: when False, the leading newline of the result is dropped.
    :return: the tree as a single string; empty string for empty ``items``.
    """
    # Local import keeps the block self-contained without touching the
    # module's import section.
    import ast

    # Materialize once: supports generators and avoids recomputing len().
    items = list(items)
    last_index = len(items) - 1

    lines = []
    for num, item in enumerate(items):
        is_last_item = num == last_index
        box_symbol = '┗╸' if is_last_item else '┣╸'

        if isinstance(item, tuple):
            field_name, field_value = item
            if isinstance(field_value, str) and field_value.startswith('[\''):
                # Keep the vertical guide only while siblings follow.
                prepend_symbols = ' ' * 3 if is_last_item else ' ┃ '
                # NOTE(security): the value comes from parsed web pages, so
                # it is decoded with literal_eval (literals only) rather than
                # eval(), which would execute arbitrary expressions.
                try:
                    nested_items = ast.literal_eval(field_value)
                except (ValueError, SyntaxError):
                    # Not a valid list literal: show the raw text as is.
                    pass
                else:
                    # The sub-tree is indented by the guide symbols only,
                    # as in the original implementation.
                    field_value = get_dict_ascii_tree(nested_items, prepend_symbols)
            lines.append(f'\n{prepend}{box_symbol}{field_name}: {field_value}')
        else:
            lines.append(f'\n{prepend}{box_symbol} {item}')

    text = ''.join(lines)

    if not new_line:
        # Every rendered line starts with '\n'; strip the very first one.
        text = text[1:]

    return text
|
||||||
|
|||||||
+1
-1
@@ -28,7 +28,7 @@ reportlab==3.5.59
|
|||||||
requests>=2.24.0
|
requests>=2.24.0
|
||||||
requests-futures==1.0.0
|
requests-futures==1.0.0
|
||||||
six==1.15.0
|
six==1.15.0
|
||||||
socid-extractor>=0.0.13
|
socid-extractor>=0.0.15
|
||||||
soupsieve==2.1
|
soupsieve==2.1
|
||||||
stem==1.8.0
|
stem==1.8.0
|
||||||
torrequest==0.1.0
|
torrequest==0.1.0
|
||||||
|
|||||||
+20
-1
@@ -2,7 +2,7 @@
|
|||||||
import itertools
|
import itertools
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher
|
from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher, get_dict_ascii_tree
|
||||||
|
|
||||||
|
|
||||||
def test_case_convert_camel_to_snake():
|
def test_case_convert_camel_to_snake():
|
||||||
@@ -72,3 +72,22 @@ def test_url_make_profile_url_regexp():
|
|||||||
for url_parts in itertools.product(*parts):
|
for url_parts in itertools.product(*parts):
|
||||||
url = ''.join(url_parts)
|
url = ''.join(url_parts)
|
||||||
assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
|
assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_dict_ascii_tree():
    """Check the flat (non-nested) rendering of a dict as an ASCII tree."""
    data = {
        'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==',
        'legacy_id': '26403415',
        'username': 'alexaimephotographycars',
        'name': 'Alex Aimé',
        'created_at': '2018-05-04T10:17:01.000+0000',
        'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b',
        'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201',
        'website': 'www.instagram.com/street.reality.photography/',
        'facebook_link': ' www.instagram.com/street.reality.photography/',
        'instagram_username': 'Street.Reality.Photography',
        'twitter_username': 'Alexaimephotogr',
    }

    ascii_tree = get_dict_ascii_tree(data.items())

    # Built line by line so significant whitespace stays visible.
    expected_lines = [
        '',  # the tree starts with a newline (new_line defaults to True)
        '┣╸uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==',
        '┣╸legacy_id: 26403415',
        '┣╸username: alexaimephotographycars',
        '┣╸name: Alex Aimé',
        '┣╸created_at: 2018-05-04T10:17:01.000+0000',
        '┣╸image: https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b',
        '┣╸image_bg: https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201',
        '┣╸website: www.instagram.com/street.reality.photography/',
        # Two spaces after the colon: the separator plus the value's own
        # leading space.
        '┣╸facebook_link: ' + ' www.instagram.com/street.reality.photography/',
        '┣╸instagram_username: Street.Reality.Photography',
        '┗╸twitter_username: Alexaimephotogr',
    ]

    assert ascii_tree == '\n'.join(expected_lines)
|
||||||
Reference in New Issue
Block a user