Tags and custom checks bugfixes

Soxoj committed 2021-01-16 14:25:13 +03:00
parent 5bffa83061
commit d1f7343832
6 changed files with 1929 additions and 1679 deletions
+2
@@ -1,6 +1,8 @@
 # Changelog
 
 ## [Unreleased]
+* tags bugfix
+* custom data checks bugfix
 
 ## [0.1.10] - 2021-01-13
 * added report static resources into package
+35 -22
@@ -116,7 +116,6 @@ async def update_site_dict_from_response(sitename, site_dict, results_info, semaphore,
     site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)
 
-
 # TODO: move into separate module
 def detect_error_page(html_text, status_code, fail_flags, ignore_403):
     # Detect service restrictions such as a country restriction
@@ -197,8 +196,18 @@ def process_site_result(response, query_notify, logger, results_info, site: MaigretSite):
     # presense flags
     # True by default
     presense_flags = site.presense_strs
-    is_presense_detected = html_text and all(
-        [(presense_flag in html_text) for presense_flag in presense_flags]) or not presense_flags
+    if html_text:
+        is_presense_detected = False
+        if not presense_flags:
+            is_presense_detected = True
+            site.stats['presense_flag'] = None
+        else:
+            for presense_flag in presense_flags:
+                if presense_flag in html_text:
+                    is_presense_detected = True
+                    site.stats['presense_flag'] = presense_flag
+                    logger.info(presense_flag)
+                    break
 
     if error_text is not None:
         logger.debug(error_text)
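
The rewritten branch above records which presence flag actually matched in site.stats['presense_flag']; the new MaigretDatabase.get_stats added later in this commit aggregates these values. A minimal standalone sketch of the same logic, with hypothetical inputs, logging omitted, and is_presense_detected bound up front so it stays defined even for an empty response:

    # Sketch of the new presence detection; html_text and the flags are hypothetical.
    stats = {}
    html_text = '<div class="profile-card">alexaimephotography</div>'
    presense_flags = ['profile-card', 'user-avatar']

    is_presense_detected = False
    if html_text:
        if not presense_flags:
            # no flags configured: any non-empty page counts as present
            is_presense_detected = True
            stats['presense_flag'] = None
        else:
            for presense_flag in presense_flags:
                if presense_flag in html_text:
                    is_presense_detected = True
                    stats['presense_flag'] = presense_flag  # remember what matched
                    break

    assert is_presense_detected and stats['presense_flag'] == 'profile-card'
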
@@ -300,7 +309,7 @@ def process_site_result(response, query_notify, logger, results_info, site: MaigretSite):
 async def maigret(username, site_dict, query_notify, logger,
                   proxy=None, timeout=None, recursive_search=False,
-                  id_type='username', tags=None, debug=False, forced=False,
+                  id_type='username', debug=False, forced=False,
                   max_connections=100, no_progressbar=False):
     """Main search func
@@ -333,8 +342,6 @@ async def maigret(username, site_dict, query_notify, logger,
     """
     # Notify caller that we are starting the query.
-    if tags is None:
-        tags = set()
     query_notify.start(username, id_type)
 
     # TODO: connector
@@ -358,17 +365,11 @@
     # First create futures for all requests. This allows for the requests to run in parallel
     for site_name, site in site_dict.items():
-        fulltags = site.tags
 
         if site.type != id_type:
             continue
 
-        site_tags = set(fulltags)
-        if tags:
-            if not set(tags).intersection(site_tags):
-                continue
-
         if site.disabled and not forced:
             logger.debug(f'Site {site.name} is disabled, skipping...')
             continue
 
         # Results from analysis of this specific site
@@ -579,13 +580,13 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
         site.disabled = changes['disabled']
         db.update_site(site)
         if not silent:
-            action = 'Disabled' if not site.disabled else 'Enabled'
+            action = 'Disabled' if site.disabled else 'Enabled'
             print(f'{action} site {site.name}...')
 
     return changes
 
 
-async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False):
+async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False) -> bool:
     sem = asyncio.Semaphore(10)
     tasks = []
     all_sites = site_data
@@ -613,7 +614,9 @@ async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False):
         total_disabled *= -1
 
     if not silent:
-        print(f'{message} {total_disabled} checked sites. Run with `--info` flag to get more information')
+        print(f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. Run with `--info` flag to get more information')
+
+    return total_disabled != 0
 
 
 async def main():
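
The summary line now includes the before/after counts of disabled sites, and self_check reports through its return value whether anything changed. A sketch of the message shape with hypothetical counts; the actual `message` wording is assigned earlier in self_check and is an assumption here:

    # Hypothetical counts; the 'message' text is assumed, not taken from this diff.
    disabled_old_count, disabled_new_count = 10, 12
    total_disabled = disabled_new_count - disabled_old_count
    message = 'Disabled'
    if total_disabled < 0:
        message = 'Enabled'
        total_disabled *= -1
    print(f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites.')
    # -> Disabled 2 (10 => 12) checked sites.
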
@@ -664,9 +667,18 @@ async def main():
                         "A longer timeout will be more likely to get results from slow sites."
                         "On the other hand, this may cause a long delay to gather all results."
                         )
+    parser.add_argument("-n", "--max-connections",
+                        action="store", type=int,
+                        dest="connections", default=100,
+                        help="Allowed number of concurrent connections."
+                        )
+    parser.add_argument("-a", "--all-sites",
+                        action="store_true", dest="all_sites", default=False,
+                        help="Use all sites for scan."
+                        )
     parser.add_argument("--top-sites",
                         action="store", default=500, type=int,
-                        help="Count of sites for checking ranked by Alexa Top (default: 500)."
+                        help="Count of sites for scan ranked by Alexa Top (default: 500)."
                         )
     parser.add_argument("--print-not-found",
                         action="store_true", dest="print_not_found", default=False,
@@ -789,7 +801,7 @@ async def main():
         "resources/data.json"
     )
 
-    if args.top_sites == 0:
+    if args.top_sites == 0 or args.all_sites:
         args.top_sites = sys.maxsize
 
     # Create object with all information about sites we are aware of.
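
Together with the new -a/--all-sites flag, this is the whole mechanism: scanning all sites simply lifts the Alexa-rank cutoff to sys.maxsize. A self-contained sketch of the interaction, with the argument definitions copied from the hunks above and a hypothetical invocation:

    import argparse
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument("-a", "--all-sites", action="store_true",
                        dest="all_sites", default=False,
                        help="Use all sites for scan.")
    parser.add_argument("--top-sites", action="store", default=500, type=int,
                        help="Count of sites for scan ranked by Alexa Top (default: 500).")

    args = parser.parse_args(["--all-sites"])  # hypothetical CLI input
    if args.top_sites == 0 or args.all_sites:
        args.top_sites = sys.maxsize

    assert args.top_sites == sys.maxsize
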
@@ -803,12 +815,14 @@ async def main():
     # Database self-checking
     if args.self_check:
         print('Maigret sites database self-checking...')
-        await self_check(db, site_data, logger)
+        is_need_update = await self_check(db, site_data, logger)
+        if is_need_update:
+            if input('Do you want to save changes permanently? [yYnN]\n').lower() == 'y':
+                db.save_to_file(args.json_file)
+                print('Database was successfully updated.')
+            else:
+                print('Updates will be applied only for current search session.')
+    print(db.get_stats(site_data))
 
     # Create reports folder if it does not exist
     os.makedirs(args.folderoutput, exist_ok=True)
@@ -865,10 +879,10 @@ async def main():
                                      timeout=args.timeout,
                                      recursive_search=recursive_search_enabled,
                                      id_type=id_type,
-                                     tags=args.tags,
                                      debug=args.verbose,
                                      logger=logger,
                                      forced=args.use_disabled_sites,
+                                     max_connections=args.connections,
                                      )
 
             username_result = (username, id_type, results)
@@ -902,6 +916,7 @@ async def main():
             print(f'TXT report for {username} saved in {filename}')
 
     # reporting for all the result
+    if general_results:
         report_context = generate_report_context(general_results)
         # determine main username
         username = report_context['username']
@@ -915,8 +930,6 @@ async def main():
         filename = report_filepath_tpl.format(username=username, postfix='.pdf')
         save_pdf_report(filename, report_context)
         print(f'PDF report on all usernames saved in {filename}')
 
-    # update database
-    db.save_to_file(args.json_file)
+1777 -1625
File diff suppressed because it is too large
+29 -9
@@ -41,6 +41,7 @@ class MaigretSite:
         self.presense_strs = []
         self.absence_strs = []
+        self.stats = {}
 
         self.engine = None
         self.engine_data = {}
@@ -68,7 +69,7 @@
             # strip empty elements
             if v in (False, '', [], {}, None, sys.maxsize, 'username'):
                 continue
-            if field in ['name', 'engineData', 'requestFuture', 'detectedEngine', 'engineObj']:
+            if field in ['name', 'engineData', 'requestFuture', 'detectedEngine', 'engineObj', 'stats']:
                 continue
             result[field] = v
@@ -87,6 +88,8 @@
             # TODO: assertion of intersecting keys
             # update dicts like errors
             if isinstance(v, dict):
                 self.__dict__.get(field, {}).update(v)
+            elif isinstance(v, list):
+                self.__dict__[field] = self.__dict__.get(field, []) + v
             else:
                 self.__dict__[field] = v
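
The new elif merges list-valued fields by concatenation instead of overwriting them, so values already present on the site survive an update; this is the behavior the new test at the end of this commit relies on for absenceStrs. A sketch of the three merge rules on a bare dict, outside the class:

    # Sketch of MaigretSite.update's merge rules; 'state' stands in for __dict__.
    state = {'errors': {'404': 'not found'}, 'absence_strs': ['Page not found']}

    def merge(state, field, v):
        if isinstance(v, dict):
            # dicts: update in place (a no-op if the field is absent, as in the source)
            state.get(field, {}).update(v)
        elif isinstance(v, list):
            # lists: concatenate, preserving existing items
            state[field] = state.get(field, []) + v
        else:
            # scalars: overwrite
            state[field] = v

    merge(state, 'absence_strs', ['test'])
    assert state['absence_strs'] == ['Page not found', 'test']
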
@@ -101,16 +104,23 @@ class MaigretSite:
         self.request_future = None
 
         self_copy = copy.deepcopy(self)
         engine_data = self_copy.engine_obj.site
-        for field in engine_data.keys():
-            if isinstance(engine_data[field], dict):
-                for k in engine_data[field].keys():
-                    del self_copy.__dict__[field][k]
-                continue
-            if field in list(self_copy.__dict__.keys()):
-                del self_copy.__dict__[field]
-            if CaseConverter.camel_to_snake(field) in list(self_copy.__dict__.keys()):
-                del self_copy.__dict__[CaseConverter.camel_to_snake(field)]
+        site_data_keys = list(self_copy.__dict__.keys())
+
+        for k in engine_data.keys():
+            field = CaseConverter.camel_to_snake(k)
+            is_exists = field in site_data_keys
+            # remove dict keys
+            if isinstance(engine_data[k], dict) and is_exists:
+                for f in engine_data[k].keys():
+                    del self_copy.__dict__[field][f]
+                continue
+            # remove list items
+            if isinstance(engine_data[k], list) and is_exists:
+                for f in engine_data[k]:
+                    self_copy.__dict__[field].remove(f)
+                continue
+            if is_exists:
+                del self_copy.__dict__[field]
 
         return self_copy
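
The key change: the loop now iterates over the engine's camelCase keys and converts each to its snake_case attribute name once, up front, instead of attempting deletion under two spellings at the end. CaseConverter.camel_to_snake lives in maigret's utilities; its implementation is not shown in this diff, so this regex stand-in is an assumption for illustration only:

    import re

    def camel_to_snake(name: str) -> str:
        # Stand-in for CaseConverter.camel_to_snake (assumed behavior).
        return re.sub(r'(?<!^)(?=[A-Z])', '_', name).lower()

    assert camel_to_snake('absenceStrs') == 'absence_strs'
    assert camel_to_snake('engineData') == 'engine_data'
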
@@ -255,3 +265,13 @@
         )
 
         return self.load_from_json(data)
+
+    def get_stats(self, sites_dict):
+        sites = sites_dict or self.sites_dict
+
+        found_flags = {}
+        for _, s in sites.items():
+            if 'presense_flag' in s.stats:
+                flag = s.stats['presense_flag']
+                found_flags[flag] = found_flags.get(flag, 0) + 1
+
+        return found_flags
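
get_stats tallies how many sites matched each presence flag, which is what main() now prints after a database self-check. A tiny usage sketch with hypothetical stand-ins for the MaigretSite entries:

    # FakeSite is a hypothetical stand-in; real entries are MaigretSite objects.
    class FakeSite:
        def __init__(self, stats):
            self.stats = stats

    sites = {
        '500px': FakeSite({'presense_flag': 'profile-card'}),
        'Reddit': FakeSite({'presense_flag': 'profile-card'}),
        'Twitter': FakeSite({}),  # check never matched, nothing recorded
    }

    found_flags = {}
    for _, s in sites.items():
        if 'presense_flag' in s.stats:
            flag = s.stats['presense_flag']
            found_flags[flag] = found_flags.get(flag, 0) + 1

    assert found_flags == {'profile-card': 2}
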
+58 -7
@@ -27,28 +27,79 @@ EXAMPLE_RESULTS = {
     }
 }
 
 GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
 BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
 
 GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
 GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
-GOOD_500PX_RESULT.ids_data = {"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==", "legacy_id": "26403415", "username": "alexaimephotographycars", "name": "Alex Aim\u00e9", "website": "www.flickr.com/photos/alexaimephotography/", "facebook_link": " www.instagram.com/street.reality.photography/", "instagram_username": "alexaimephotography", "twitter_username": "Alexaimephotogr"}
+GOOD_500PX_RESULT.ids_data = {"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==", "legacy_id": "26403415",
+                              "username": "alexaimephotographycars", "name": "Alex Aim\u00e9",
+                              "website": "www.flickr.com/photos/alexaimephotography/",
+                              "facebook_link": " www.instagram.com/street.reality.photography/",
+                              "instagram_username": "alexaimephotography", "twitter_username": "Alexaimephotogr"}
 GOOD_REDDIT_RESULT = copy.deepcopy(GOOD_RESULT)
 GOOD_REDDIT_RESULT.tags = ['news', 'us']
-GOOD_REDDIT_RESULT.ids_data = {"reddit_id": "t5_1nytpy", "reddit_username": "alexaimephotography", "fullname": "alexaimephotography", "image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e", "is_employee": "False", "is_nsfw": "False", "is_mod": "True", "is_following": "True", "has_user_profile": "True", "hide_from_robots": "False", "created_at": "2019-07-10 12:20:03", "total_karma": "53959", "post_karma": "52738"}
+GOOD_REDDIT_RESULT.ids_data = {"reddit_id": "t5_1nytpy", "reddit_username": "alexaimephotography",
+                               "fullname": "alexaimephotography",
+                               "image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e",
+                               "is_employee": "False", "is_nsfw": "False", "is_mod": "True", "is_following": "True",
+                               "has_user_profile": "True", "hide_from_robots": "False",
+                               "created_at": "2019-07-10 12:20:03", "total_karma": "53959", "post_karma": "52738"}
 GOOD_IG_RESULT = copy.deepcopy(GOOD_RESULT)
 GOOD_IG_RESULT.tags = ['photo', 'global']
-GOOD_IG_RESULT.ids_data = {"instagram_username": "alexaimephotography", "fullname": "Alexaimephotography", "id": "6828488620", "image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F", "bio": "Photographer \nChild of fine street arts", "external_url": "https://www.flickr.com/photos/alexaimephotography2020/"}
+GOOD_IG_RESULT.ids_data = {"instagram_username": "alexaimephotography", "fullname": "Alexaimephotography",
+                           "id": "6828488620",
+                           "image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F",
+                           "bio": "Photographer \nChild of fine street arts",
+                           "external_url": "https://www.flickr.com/photos/alexaimephotography2020/"}
 GOOD_TWITTER_RESULT = copy.deepcopy(GOOD_RESULT)
 GOOD_TWITTER_RESULT.tags = ['social', 'us']
-TEST = [('alexaimephotographycars', 'username', {'500px': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/alexaimephotographycars', 'ids_usernames': {'alexaimephotographycars': 'username', 'alexaimephotography': 'username', 'Alexaimephotogr': 'username'}, 'status': GOOD_500PX_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 400, 'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 29}}), ('alexaimephotography', 'username', {'500px': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/alexaimephotography', 'status': BAD_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/alexaimephotography', 'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_REDDIT_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/alexaimephotography', 'status': BAD_RESULT, 'http_status': 400, 'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/alexaimephotography', 'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_IG_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 29}}), ('Alexaimephotogr', 'username', {'500px': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/Alexaimephotogr', 'status': GOOD_TWITTER_RESULT, 'http_status': 400, 'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/Alexaimephotogr', 'status':BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 29}})]
+TEST = [('alexaimephotographycars', 'username', {
+    '500px': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
+              'url_user': 'https://500px.com/p/alexaimephotographycars',
+              'ids_usernames': {'alexaimephotographycars': 'username', 'alexaimephotography': 'username',
+                                'Alexaimephotogr': 'username'}, 'status': GOOD_500PX_RESULT, 'http_status': 200,
+              'is_similar': False, 'rank': 2981},
+    'Reddit': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
+               'url_user': 'https://www.reddit.com/user/alexaimephotographycars', 'status': BAD_RESULT,
+               'http_status': 404, 'is_similar': False, 'rank': 17},
+    'Twitter': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
+                'url_user': 'https://twitter.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 400,
+                'is_similar': False, 'rank': 55},
+    'Instagram': {'username': 'alexaimephotographycars', 'parsing_enabled': True,
+                  'url_main': 'https://www.instagram.com/',
+                  'url_user': 'https://www.instagram.com/alexaimephotographycars', 'status': BAD_RESULT,
+                  'http_status': 404, 'is_similar': False, 'rank': 29}}), ('alexaimephotography', 'username', {
+    '500px': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
+              'url_user': 'https://500px.com/p/alexaimephotography', 'status': BAD_RESULT, 'http_status': 200,
+              'is_similar': False, 'rank': 2981},
+    'Reddit': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
+               'url_user': 'https://www.reddit.com/user/alexaimephotography',
+               'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_REDDIT_RESULT, 'http_status': 200,
+               'is_similar': False, 'rank': 17},
+    'Twitter': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
+                'url_user': 'https://twitter.com/alexaimephotography', 'status': BAD_RESULT, 'http_status': 400,
+                'is_similar': False, 'rank': 55},
+    'Instagram': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
+                  'url_user': 'https://www.instagram.com/alexaimephotography',
+                  'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_IG_RESULT, 'http_status': 200,
+                  'is_similar': False, 'rank': 29}}), ('Alexaimephotogr', 'username', {
+    '500px': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
+              'url_user': 'https://500px.com/p/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 200,
+              'is_similar': False, 'rank': 2981},
+    'Reddit': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
+               'url_user': 'https://www.reddit.com/user/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
+               'is_similar': False, 'rank': 17},
+    'Twitter': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
+                'url_user': 'https://twitter.com/Alexaimephotogr', 'status': GOOD_TWITTER_RESULT, 'http_status': 400,
+                'is_similar': False, 'rank': 55},
+    'Instagram': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
+                  'url_user': 'https://www.instagram.com/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
+                  'is_similar': False, 'rank': 29}})]
 
 SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""
+12
@@ -86,6 +86,18 @@ def test_site_strip_engine_data():
     assert amperka_stripped.json == EXAMPLE_DB['sites']['Amperka']
 
 
+def test_site_strip_engine_data_with_site_prior_updates():
+    db = MaigretDatabase()
+
+    UPDATED_EXAMPLE_DB = dict(EXAMPLE_DB)
+    UPDATED_EXAMPLE_DB['sites']['Amperka']['absenceStrs'] = ["test"]
+
+    db.load_from_json(UPDATED_EXAMPLE_DB)
+    amperka = db.sites[0]
+    amperka_stripped = amperka.strip_engine_data()
+
+    assert amperka_stripped.json == UPDATED_EXAMPLE_DB['sites']['Amperka']
+
+
 def test_saving_site_error():
     db = MaigretDatabase()