Tags and custom checks bugfixes

Soxoj committed 2021-01-16 14:25:13 +03:00
parent 5bffa83061
commit d1f7343832
6 changed files with 1929 additions and 1679 deletions
+2
@@ -1,6 +1,8 @@
 # Changelog
 
 ## [Unreleased]
+* tags bugfix
+* custom data checks bugfix
 
 ## [0.1.10] - 2021-01-13
 * added report static resources into package
+35 -22
@@ -116,7 +116,6 @@ async def update_site_dict_from_response(sitename, site_dict, results_info, semaphore,
     site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)
 
-
 # TODO: move into separate module
 def detect_error_page(html_text, status_code, fail_flags, ignore_403):
     # Detect service restrictions such as a country restriction
@@ -197,8 +196,18 @@ def process_site_result(response, query_notify, logger, results_info, site: MaigretSite):
     # presense flags
     # True by default
     presense_flags = site.presense_strs
-    is_presense_detected = html_text and all(
-        [(presense_flag in html_text) for presense_flag in presense_flags]) or not presense_flags
+    if html_text:
+        is_presense_detected = False
+        if not presense_flags:
+            is_presense_detected = True
+            site.stats['presense_flag'] = None
+        else:
+            for presense_flag in presense_flags:
+                if presense_flag in html_text:
+                    is_presense_detected = True
+                    site.stats['presense_flag'] = presense_flag
+                    logger.info(presense_flag)
+                    break
 
     if error_text is not None:
         logger.debug(error_text)
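
The rewritten branch above records which presence flag actually matched in site.stats['presense_flag']; the new MaigretDatabase.get_stats added later in this commit aggregates these values. A minimal standalone sketch of the same logic, with hypothetical inputs, logging omitted, and is_presense_detected bound up front so it stays defined even for an empty response:

    # Sketch of the new presence detection; html_text and the flags are hypothetical.
    stats = {}
    html_text = '<div class="profile-card">alexaimephotography</div>'
    presense_flags = ['profile-card', 'user-avatar']

    is_presense_detected = False
    if html_text:
        if not presense_flags:
            # no flags configured: any non-empty page counts as present
            is_presense_detected = True
            stats['presense_flag'] = None
        else:
            for presense_flag in presense_flags:
                if presense_flag in html_text:
                    is_presense_detected = True
                    stats['presense_flag'] = presense_flag  # remember what matched
                    break

    assert is_presense_detected and stats['presense_flag'] == 'profile-card'
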
@@ -300,7 +309,7 @@ def process_site_result(response, query_notify, logger, results_info, site: MaigretSite):
 async def maigret(username, site_dict, query_notify, logger,
                   proxy=None, timeout=None, recursive_search=False,
-                  id_type='username', tags=None, debug=False, forced=False,
+                  id_type='username', debug=False, forced=False,
                   max_connections=100, no_progressbar=False):
     """Main search func
@@ -333,8 +342,6 @@ async def maigret(username, site_dict, query_notify, logger,
     """
     # Notify caller that we are starting the query.
-    if tags is None:
-        tags = set()
     query_notify.start(username, id_type)
 
     # TODO: connector
@@ -358,17 +365,11 @@
     # First create futures for all requests. This allows for the requests to run in parallel
     for site_name, site in site_dict.items():
-        fulltags = site.tags
 
         if site.type != id_type:
             continue
 
-        site_tags = set(fulltags)
-        if tags:
-            if not set(tags).intersection(site_tags):
-                continue
-
         if site.disabled and not forced:
             logger.debug(f'Site {site.name} is disabled, skipping...')
             continue
 
         # Results from analysis of this specific site
@@ -579,13 +580,13 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
         site.disabled = changes['disabled']
         db.update_site(site)
         if not silent:
-            action = 'Disabled' if not site.disabled else 'Enabled'
+            action = 'Disabled' if site.disabled else 'Enabled'
             print(f'{action} site {site.name}...')
 
     return changes
 
 
-async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False):
+async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False) -> bool:
     sem = asyncio.Semaphore(10)
     tasks = []
     all_sites = site_data
@@ -613,7 +614,9 @@ async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False):
         total_disabled *= -1
 
     if not silent:
-        print(f'{message} {total_disabled} checked sites. Run with `--info` flag to get more information')
+        print(f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. Run with `--info` flag to get more information')
+
+    return total_disabled != 0
 
 
 async def main():
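
The summary line now includes the before/after counts of disabled sites, and self_check reports through its return value whether anything changed. A sketch of the message shape with hypothetical counts; the actual `message` wording is assigned earlier in self_check and is an assumption here:

    # Hypothetical counts; the 'message' text is assumed, not taken from this diff.
    disabled_old_count, disabled_new_count = 10, 12
    total_disabled = disabled_new_count - disabled_old_count
    message = 'Disabled'
    if total_disabled < 0:
        message = 'Enabled'
        total_disabled *= -1
    print(f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites.')
    # -> Disabled 2 (10 => 12) checked sites.
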
@@ -664,9 +667,18 @@ async def main():
                         "A longer timeout will be more likely to get results from slow sites."
                         "On the other hand, this may cause a long delay to gather all results."
                         )
+    parser.add_argument("-n", "--max-connections",
+                        action="store", type=int,
+                        dest="connections", default=100,
+                        help="Allowed number of concurrent connections."
+                        )
+    parser.add_argument("-a", "--all-sites",
+                        action="store_true", dest="all_sites", default=False,
+                        help="Use all sites for scan."
+                        )
     parser.add_argument("--top-sites",
                         action="store", default=500, type=int,
-                        help="Count of sites for checking ranked by Alexa Top (default: 500)."
+                        help="Count of sites for scan ranked by Alexa Top (default: 500)."
                         )
     parser.add_argument("--print-not-found",
                         action="store_true", dest="print_not_found", default=False,
@@ -789,7 +801,7 @@ async def main():
         "resources/data.json"
     )
 
-    if args.top_sites == 0:
+    if args.top_sites == 0 or args.all_sites:
         args.top_sites = sys.maxsize
 
     # Create object with all information about sites we are aware of.
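
Together with the new -a/--all-sites flag, this is the whole mechanism: scanning all sites simply lifts the Alexa-rank cutoff to sys.maxsize. A self-contained sketch of the interaction, with the argument definitions copied from the hunks above and a hypothetical invocation:

    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--all-sites", action="store_true",
                        dest="all_sites", default=False,
                        help="Use all sites for scan.")
    parser.add_argument("--top-sites", action="store", default=500, type=int,
                        help="Count of sites for scan ranked by Alexa Top (default: 500).")

    args = parser.parse_args(["--all-sites"])  # hypothetical CLI input
    if args.top_sites == 0 or args.all_sites:
        args.top_sites = sys.maxsize

    assert args.top_sites == sys.maxsize
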
@@ -803,12 +815,14 @@ async def main():
     # Database self-checking
     if args.self_check:
         print('Maigret sites database self-checking...')
-        await self_check(db, site_data, logger)
+        is_need_update = await self_check(db, site_data, logger)
+        if is_need_update:
+            if input('Do you want to save changes permanently? [yYnN]\n').lower() == 'y':
+                db.save_to_file(args.json_file)
+                print('Database was successfully updated.')
+            else:
+                print('Updates will be applied only for current search session.')
+    print(db.get_stats(site_data))
 
     # Create reports folder if it does not exist
     os.makedirs(args.folderoutput, exist_ok=True)
@@ -865,10 +879,10 @@ async def main():
                                      timeout=args.timeout,
                                      recursive_search=recursive_search_enabled,
                                      id_type=id_type,
-                                     tags=args.tags,
                                      debug=args.verbose,
                                      logger=logger,
                                      forced=args.use_disabled_sites,
+                                     max_connections=args.connections,
                                      )
 
             username_result = (username, id_type, results)
@@ -902,6 +916,7 @@ async def main():
             print(f'TXT report for {username} saved in {filename}')
 
     # reporting for all the result
+    if general_results:
         report_context = generate_report_context(general_results)
         # determine main username
         username = report_context['username']
@@ -915,8 +930,6 @@ async def main():
         filename = report_filepath_tpl.format(username=username, postfix='.pdf')
         save_pdf_report(filename, report_context)
         print(f'PDF report on all usernames saved in {filename}')
 
-    # update database
-    db.save_to_file(args.json_file)
+1777 -1625
File diff suppressed because it is too large
+29 -9
@@ -41,6 +41,7 @@ class MaigretSite:
         self.presense_strs = []
         self.absence_strs = []
+        self.stats = {}
 
         self.engine = None
         self.engine_data = {}
@@ -68,7 +69,7 @@
             # strip empty elements
             if v in (False, '', [], {}, None, sys.maxsize, 'username'):
                 continue
-            if field in ['name', 'engineData', 'requestFuture', 'detectedEngine', 'engineObj']:
+            if field in ['name', 'engineData', 'requestFuture', 'detectedEngine', 'engineObj', 'stats']:
                 continue
             result[field] = v
@@ -87,6 +88,8 @@
             # TODO: assertion of intersecting keys
             # update dicts like errors
             if isinstance(v, dict):
                 self.__dict__.get(field, {}).update(v)
+            elif isinstance(v, list):
+                self.__dict__[field] = self.__dict__.get(field, []) + v
             else:
                 self.__dict__[field] = v
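
The new elif merges list-valued fields by concatenation instead of overwriting them, so values already present on the site survive an update; this is the behavior the new test at the end of this commit relies on for absenceStrs. A sketch of the three merge rules on a bare dict, outside the class:

    # Sketch of MaigretSite.update's merge rules; 'state' stands in for __dict__.
    state = {'errors': {'404': 'not found'}, 'absence_strs': ['Page not found']}

    def merge(state, field, v):
        if isinstance(v, dict):
            # dicts: update in place (a no-op if the field is absent, as in the source)
            state.get(field, {}).update(v)
        elif isinstance(v, list):
            # lists: concatenate, preserving existing items
            state[field] = state.get(field, []) + v
        else:
            # scalars: overwrite
            state[field] = v

    merge(state, 'absence_strs', ['test'])
    assert state['absence_strs'] == ['Page not found', 'test']
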
@@ -101,16 +104,23 @@ class MaigretSite:
         self.request_future = None
 
         self_copy = copy.deepcopy(self)
         engine_data = self_copy.engine_obj.site
-        for field in engine_data.keys():
-            if isinstance(engine_data[field], dict):
-                for k in engine_data[field].keys():
-                    del self_copy.__dict__[field][k]
-                continue
-            if field in list(self_copy.__dict__.keys()):
-                del self_copy.__dict__[field]
-            if CaseConverter.camel_to_snake(field) in list(self_copy.__dict__.keys()):
-                del self_copy.__dict__[CaseConverter.camel_to_snake(field)]
+        site_data_keys = list(self_copy.__dict__.keys())
+
+        for k in engine_data.keys():
+            field = CaseConverter.camel_to_snake(k)
+            is_exists = field in site_data_keys
+            # remove dict keys
+            if isinstance(engine_data[k], dict) and is_exists:
+                for f in engine_data[k].keys():
+                    del self_copy.__dict__[field][f]
+                continue
+            # remove list items
+            if isinstance(engine_data[k], list) and is_exists:
+                for f in engine_data[k]:
+                    self_copy.__dict__[field].remove(f)
+                continue
+            if is_exists:
+                del self_copy.__dict__[field]
 
         return self_copy
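
The key change: the loop now iterates over the engine's camelCase keys and converts each to its snake_case attribute name once, up front, instead of attempting deletion under two spellings at the end. CaseConverter.camel_to_snake lives in maigret's utilities; its implementation is not shown in this diff, so this regex stand-in is an assumption for illustration only:

    import re

    def camel_to_snake(name: str) -> str:
        # Stand-in for CaseConverter.camel_to_snake (assumed behavior).
        return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()

    assert camel_to_snake('absenceStrs') == 'absence_strs'
    assert camel_to_snake('engineData') == 'engine_data'
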
@@ -255,3 +265,13 @@
         )
 
         return self.load_from_json(data)
+
+    def get_stats(self, sites_dict):
+        sites = sites_dict or self.sites_dict
+
+        found_flags = {}
+        for _, s in sites.items():
+            if 'presense_flag' in s.stats:
+                flag = s.stats['presense_flag']
+                found_flags[flag] = found_flags.get(flag, 0) + 1
+
+        return found_flags
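
get_stats tallies how many sites matched each presence flag, which is what main() now prints after a database self-check. A tiny usage sketch with hypothetical stand-ins for the MaigretSite entries:

    # FakeSite is a hypothetical stand-in; real entries are MaigretSite objects.
    class FakeSite:
        def __init__(self, stats):
            self.stats = stats

    sites = {
        '500px': FakeSite({'presense_flag': 'profile-card'}),
        'Reddit': FakeSite({'presense_flag': 'profile-card'}),
        'Twitter': FakeSite({}),  # check never matched, nothing recorded
    }

    found_flags = {}
    for _, s in sites.items():
        if 'presense_flag' in s.stats:
            flag = s.stats['presense_flag']
            found_flags[flag] = found_flags.get(flag, 0) + 1

    assert found_flags == {'profile-card': 2}
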
+58 -7
@@ -27,28 +27,79 @@ EXAMPLE_RESULTS = {
     }
 }
 
 GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
 BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
 
 GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
 GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
-GOOD_500PX_RESULT.ids_data = {"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==", "legacy_id": "26403415", "username": "alexaimephotographycars", "name": "Alex Aim\u00e9", "website": "www.flickr.com/photos/alexaimephotography/", "facebook_link": " www.instagram.com/street.reality.photography/", "instagram_username": "alexaimephotography", "twitter_username": "Alexaimephotogr"}
+GOOD_500PX_RESULT.ids_data = {"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==", "legacy_id": "26403415",
+                              "username": "alexaimephotographycars", "name": "Alex Aim\u00e9",
+                              "website": "www.flickr.com/photos/alexaimephotography/",
+                              "facebook_link": " www.instagram.com/street.reality.photography/",
+                              "instagram_username": "alexaimephotography", "twitter_username": "Alexaimephotogr"}
 GOOD_REDDIT_RESULT = copy.deepcopy(GOOD_RESULT)
 GOOD_REDDIT_RESULT.tags = ['news', 'us']
-GOOD_REDDIT_RESULT.ids_data = {"reddit_id": "t5_1nytpy", "reddit_username": "alexaimephotography", "fullname": "alexaimephotography", "image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e", "is_employee": "False", "is_nsfw": "False", "is_mod": "True", "is_following": "True", "has_user_profile": "True", "hide_from_robots": "False", "created_at": "2019-07-10 12:20:03", "total_karma": "53959", "post_karma": "52738"}
+GOOD_REDDIT_RESULT.ids_data = {"reddit_id": "t5_1nytpy", "reddit_username": "alexaimephotography",
+                               "fullname": "alexaimephotography",
+                               "image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e",
+                               "is_employee": "False", "is_nsfw": "False", "is_mod": "True", "is_following": "True",
+                               "has_user_profile": "True", "hide_from_robots": "False",
+                               "created_at": "2019-07-10 12:20:03", "total_karma": "53959", "post_karma": "52738"}
 GOOD_IG_RESULT = copy.deepcopy(GOOD_RESULT)
 GOOD_IG_RESULT.tags = ['photo', 'global']
-GOOD_IG_RESULT.ids_data = {"instagram_username": "alexaimephotography", "fullname": "Alexaimephotography", "id": "6828488620", "image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F", "bio": "Photographer \nChild of fine street arts", "external_url": "https://www.flickr.com/photos/alexaimephotography2020/"}
+GOOD_IG_RESULT.ids_data = {"instagram_username": "alexaimephotography", "fullname": "Alexaimephotography",
+                           "id": "6828488620",
+                           "image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F",
+                           "bio": "Photographer \nChild of fine street arts",
+                           "external_url": "https://www.flickr.com/photos/alexaimephotography2020/"}
 GOOD_TWITTER_RESULT = copy.deepcopy(GOOD_RESULT)
 GOOD_TWITTER_RESULT.tags = ['social', 'us']
-TEST = [('alexaimephotographycars', 'username', {'500px': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/alexaimephotographycars', 'ids_usernames': {'alexaimephotographycars': 'username', 'alexaimephotography': 'username', 'Alexaimephotogr': 'username'}, 'status': GOOD_500PX_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 400, 'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 29}}), ('alexaimephotography', 'username', {'500px': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/alexaimephotography', 'status': BAD_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/alexaimephotography', 'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_REDDIT_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/alexaimephotography', 'status': BAD_RESULT, 'http_status': 400, 'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/alexaimephotography', 'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_IG_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 29}}), ('Alexaimephotogr', 'username', {'500px': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/Alexaimephotogr', 'status': GOOD_TWITTER_RESULT, 'http_status': 400, 'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/Alexaimephotogr', 'status':BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 29}})]
+TEST = [('alexaimephotographycars', 'username', {
+    '500px': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
+              'url_user': 'https://500px.com/p/alexaimephotographycars',
+              'ids_usernames': {'alexaimephotographycars': 'username', 'alexaimephotography': 'username',
+                                'Alexaimephotogr': 'username'}, 'status': GOOD_500PX_RESULT, 'http_status': 200,
+              'is_similar': False, 'rank': 2981},
+    'Reddit': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
+               'url_user': 'https://www.reddit.com/user/alexaimephotographycars', 'status': BAD_RESULT,
+               'http_status': 404, 'is_similar': False, 'rank': 17},
+    'Twitter': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
+                'url_user': 'https://twitter.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 400,
+                'is_similar': False, 'rank': 55},
+    'Instagram': {'username': 'alexaimephotographycars', 'parsing_enabled': True,
+                  'url_main': 'https://www.instagram.com/',
+                  'url_user': 'https://www.instagram.com/alexaimephotographycars', 'status': BAD_RESULT,
+                  'http_status': 404, 'is_similar': False, 'rank': 29}}), ('alexaimephotography', 'username', {
+    '500px': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
+              'url_user': 'https://500px.com/p/alexaimephotography', 'status': BAD_RESULT, 'http_status': 200,
+              'is_similar': False, 'rank': 2981},
+    'Reddit': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
+               'url_user': 'https://www.reddit.com/user/alexaimephotography',
+               'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_REDDIT_RESULT, 'http_status': 200,
+               'is_similar': False, 'rank': 17},
+    'Twitter': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
+                'url_user': 'https://twitter.com/alexaimephotography', 'status': BAD_RESULT, 'http_status': 400,
+                'is_similar': False, 'rank': 55},
+    'Instagram': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
+                  'url_user': 'https://www.instagram.com/alexaimephotography',
+                  'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_IG_RESULT, 'http_status': 200,
+                  'is_similar': False, 'rank': 29}}), ('Alexaimephotogr', 'username', {
+    '500px': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
+              'url_user': 'https://500px.com/p/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 200,
+              'is_similar': False, 'rank': 2981},
+    'Reddit': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
+               'url_user': 'https://www.reddit.com/user/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
+               'is_similar': False, 'rank': 17},
+    'Twitter': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
+                'url_user': 'https://twitter.com/Alexaimephotogr', 'status': GOOD_TWITTER_RESULT, 'http_status': 400,
+                'is_similar': False, 'rank': 55},
+    'Instagram': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
+                  'url_user': 'https://www.instagram.com/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
+                  'is_similar': False, 'rank': 29}})]
 
 SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""
+12
@@ -86,6 +86,18 @@ def test_site_strip_engine_data():
     assert amperka_stripped.json == EXAMPLE_DB['sites']['Amperka']
 
 
+def test_site_strip_engine_data_with_site_prior_updates():
+    db = MaigretDatabase()
+
+    UPDATED_EXAMPLE_DB = dict(EXAMPLE_DB)
+    UPDATED_EXAMPLE_DB['sites']['Amperka']['absenceStrs'] = ["test"]
+
+    db.load_from_json(UPDATED_EXAMPLE_DB)
+    amperka = db.sites[0]
+    amperka_stripped = amperka.strip_engine_data()
+
+    assert amperka_stripped.json == UPDATED_EXAMPLE_DB['sites']['Amperka']
+
+
 def test_saving_site_error():
     db = MaigretDatabase()