Refactoring: updated data & sites storage, tests added

commit d389ba9e76
parent 9d3e2d114c
Author: Soxoj
Date: 2021-01-03 23:48:33 +03:00
7 changed files with 18572 additions and 20732 deletions
+31 -22
@@ -154,8 +154,8 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
# We have already determined the user doesn't exist here
return results_info
# Get the expected error type
error_type = site.check_type
# Get the expected check type
check_type = site.check_type
# Get the failure messages and comments
failure_errors = site.errors
@@ -196,7 +196,7 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
QueryStatus.UNKNOWN,
query_time=response_time,
context=f'{error_text}: {site_error_text}', tags=fulltags)
elif error_type == "message":
elif check_type == "message":
absence_flags = site.absence_strs
is_absence_flags_list = isinstance(absence_flags, list)
absence_flags_set = set(absence_flags) if is_absence_flags_list else {absence_flags}
@@ -214,7 +214,7 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
url,
QueryStatus.AVAILABLE,
query_time=response_time, tags=fulltags)
elif error_type == "status_code":
elif check_type == "status_code":
# Checks if the status code of the response is 2XX
if (not status_code >= 300 or status_code < 200) and is_presense_detected:
result = QueryResult(username,
@@ -228,7 +228,7 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
url,
QueryStatus.AVAILABLE,
query_time=response_time, tags=fulltags)
elif error_type == "response_url":
elif check_type == "response_url":
# For this detection method, we have turned off the redirect.
# So, there is no need to check the response URL: it will always
# match the request. Instead, we will ensure that the response
@@ -248,8 +248,8 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
query_time=response_time, tags=fulltags)
else:
# It should be impossible to ever get here...
raise ValueError(f"Unknown Error Type '{error_type}' for "
f"site '{site_name}'")
raise ValueError(f"Unknown check type '{check_type}' for "
f"site '{site.name}'")
extracted_ids_data = {}
@@ -257,7 +257,7 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
try:
extracted_ids_data = extract(html_text)
except Exception as e:
logger.warning(f'Error while parsing {site_name}: {e}', exc_info=True)
logger.warning(f'Error while parsing {site.name}: {e}', exc_info=True)
if extracted_ids_data:
new_usernames = {}
@@ -280,14 +280,14 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
results_info['http_status'] = status_code
results_info['is_similar'] = site.similar_search
# results_site['response_text'] = html_text
results_info['rank'] = site.popularity_rank
results_info['rank'] = site.alexa_rank
return results_info
async def maigret(username, site_dict, query_notify, logger,
proxy=None, timeout=None, recursive_search=False,
id_type='username', tags=None, debug=False, forced=False,
max_connections=100):
max_connections=100, no_progressbar=False):
"""Main search func
Checks for existence of username on various social media sites.
@@ -372,14 +372,16 @@ async def maigret(username, site_dict, query_notify, logger,
headers.update(site.headers)
if not 'url' in site.__dict__:
logger.error('No URL for site %s', site.name)
# URL of user on site (if it exists)
url = site.url_username_format.format(
url = site.url.format(
urlMain=site.url_main,
urlSubpath=site.url_subpath,
username=username
)
# workaround to prevent slash errors
url = url.replace('///', '/')
url = re.sub('(?<!:)/+', '/', url)
# Don't make request if username is invalid for the site
if site.regex_check and re.search(site.regex_check, username) is None:
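Side note on the new slash cleanup: the regex collapses any run of slashes that is not preceded by a colon, so the "//" of the scheme survives while accidental doubles from joining urlMain and urlSubpath are squashed. A minimal sketch of that behavior (the helper name and URLs are illustrative, not from the codebase):

    import re

    def normalize_slashes(url: str) -> str:
        # Collapse repeated '/' characters; the negative lookbehind (?<!:)
        # keeps the '//' that follows 'http:' or 'https:' intact.
        return re.sub('(?<!:)/+', '/', url)

    # illustrative URLs, not taken from the sites database
    assert normalize_slashes('https://example.com//forum///members/soxoj') \
        == 'https://example.com/forum/members/soxoj'
    assert normalize_slashes('https://example.com/users/soxoj') \
        == 'https://example.com/users/soxoj'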
@@ -462,8 +464,11 @@ async def maigret(username, site_dict, query_notify, logger,
future = asyncio.ensure_future(update_site_coro)
tasks.append(future)
for f in tqdm.asyncio.tqdm.as_completed(tasks):
await f
if no_progressbar:
await asyncio.gather(*tasks)
else:
for f in tqdm.asyncio.tqdm.as_completed(tasks):
await f
await session.close()
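The new no_progressbar flag only changes how the already-created futures are awaited. A self-contained sketch of that pattern, where the check coroutine is a stand-in for the real per-site request:

    import asyncio
    import tqdm.asyncio

    async def check(site: str) -> str:
        await asyncio.sleep(0.1)  # stand-in for an HTTP check
        return site

    async def run_all(coros, no_progressbar: bool = False):
        tasks = [asyncio.ensure_future(c) for c in coros]
        if no_progressbar:
            # Quiet mode (used by self-checks and tests): no tqdm output.
            return await asyncio.gather(*tasks)
        # Interactive mode: await tasks as they finish so tqdm can draw a bar.
        return [await f for f in tqdm.asyncio.tqdm.as_completed(tasks)]

    print(asyncio.run(run_all([check(s) for s in ('a', 'b', 'c')],
                              no_progressbar=True)))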
@@ -498,15 +503,15 @@ def timeout_check(value):
return timeout
async def site_self_check(site_name, site_data, logger):
async def site_self_check(site_name, site_data, logger, no_progressbar=False):
query_notify = Mock()
changes = {
'disabled': False,
}
check_data = [
(site_data['username_claimed'], QueryStatus.CLAIMED),
(site_data['username_unclaimed'], QueryStatus.AVAILABLE),
(site_data.username_claimed, QueryStatus.CLAIMED),
(site_data.username_unclaimed, QueryStatus.AVAILABLE),
]
logger.info(f'Checking {site_name}...')
@@ -519,29 +524,33 @@ async def site_self_check(site_name, site_data, logger):
logger,
timeout=30,
forced=True,
no_progressbar=no_progressbar,
)
# don't disable entries with other ids types
if site_name not in results:
logger.info(results)
changes['disabled'] = True
continue
site_status = results[site_name]['status'].status
if site_status != status:
if site_status == QueryStatus.UNKNOWN:
msg = site_data.get('errorMsg')
etype = site_data.get('errorType')
logger.info(f'Error while searching {username} in {site_name}: {msg}, type {etype}')
msgs = site_data.absence_strs
etype = site_data.check_type
logger.info(f'Error while searching {username} in {site_name}: {msgs}, type {etype}')
# don't disable in case of available username
if status == QueryStatus.CLAIMED:
changes['disabled'] = True
elif status == QueryStatus.CLAIMED:
logger.info(f'Not found `{username}` in {site_name}, must be claimed')
logger.info(results[site_name])
changes['disabled'] = True
else:
logger.info(f'Found `{username}` in {site_name}, must be available')
logger.info(results[site_name])
changes['disabled'] = True
logger.info(f'Site {site_name} is okay')
logger.info(f'Site {site_name} checking is finished')
return changes
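With the switch to MaigretSite objects and the new no_progressbar argument, a self-check for one entry could be driven roughly as below. The 'GitHub' key, the database path, and the module layout are assumptions, not taken from this diff:

    import asyncio
    import logging
    from maigret.maigret import site_self_check
    from maigret.sites import MaigretDatabase

    logger = logging.getLogger('maigret')
    db = MaigretDatabase().load_from_file('maigret/resources/data.json')  # assumed path
    site = db.sites_dict['GitHub']  # assumed site name

    changes = asyncio.run(
        site_self_check('GitHub', site, logger, no_progressbar=True))
    if changes['disabled']:
        logger.warning('GitHub checks look broken; the entry should be disabled')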
@@ -756,7 +765,7 @@ async def main():
logging.basicConfig(
format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
datefmt='%H:%M:%S',
level=logging.ERROR
level=log_level
)
if args.debug:
+18307 -19000
File diff suppressed because it is too large.
+125 -94
@@ -1,96 +1,118 @@
"""Maigret Sites Information"""
from __future__ import annotations
import copy
import json
import operator
import sys
import requests
from maigret.utils import CaseConverter
class MaigretEngine:
def __init__(self, name, *args, **kwargs):
def __init__(self, name, data):
self.name = name
self.__dict__.update(kwargs)
self.__dict__.update(data)
@property
def json(self):
return self.__dict__
class MaigretSite:
def __init__(self, name, url_main, url_username_format, popularity_rank,
username_claimed, username_unclaimed,
information):
"""Create Site Information Object.
Contains information about a specific web site.
Keyword Arguments:
self -- This object.
name -- String which identifies site.
url_main -- String containing URL for home of site.
url_username_format -- String containing URL for Username format
on site.
NOTE: The string should contain the
token "{}" where the username should
be substituted. For example, a string
of "https://somesite.com/users/{}"
indicates that the individual
usernames would show up under the
"https://somesite.com/users/" area of
the web site.
popularity_rank -- Integer indicating popularity of site.
In general, smaller numbers mean more
popular ("0" or None means ranking
information not available).
username_claimed -- String containing username which is known
to be claimed on web site.
username_unclaimed -- String containing username which is known
to be unclaimed on web site.
information -- Dictionary containing all known information
about web site.
NOTE: Custom information about how to
actually detect the existence of the
username will be included in this
dictionary. This information will
be needed by the detection method,
but it is only recorded in this
object for future use.
Return Value:
Nothing.
"""
def __init__(self, name, information):
self.name = name
self.url_main = url_main
self.url_username_format = url_username_format
if (popularity_rank is None) or (popularity_rank == 0):
# We do not know the popularity, so make site go to bottom of list.
popularity_rank = sys.maxsize
self.popularity_rank = popularity_rank
self.disabled = False
self.similar_search = False
self.ignore_403 = False
self.tags = []
self.username_claimed = username_claimed
self.username_unclaimed = username_unclaimed
self.information = information
self.disabled = information.get('disabled', False)
self.similar_search = information.get('similarSearch', False)
self.ignore_403 = information.get('ignore_403', False)
self.tags = information.get('tags', [])
self.type = 'username'
self.headers = {}
self.errors = {}
self.url_subpath = ''
self.regex_check = None
self.url_probe = None
self.check_type = ''
self.request_head_only = ''
self.type = information.get('type', 'username')
self.headers = information.get('headers', {})
self.errors = information.get('errors', {})
self.url_subpath = information.get('urlSubpath', '')
self.regex_check = information.get('regexCheck', None)
self.url_probe = information.get('urlProbe', None)
self.check_type = information.get('errorType', '')
self.request_head_only = information.get('request_head_only', '')
self.presense_strs = []
self.absence_strs = []
self.presense_strs = information.get('presenseStrs', [])
self.absence_strs = information.get('errorMsg', [])
self.engine = None
self.engine_data = {}
self.engine_obj = None
self.request_future = None
self.alexa_rank = None
for k, v in information.items():
self.__dict__[CaseConverter.camel_to_snake(k)] = v
if (self.alexa_rank is None) or (self.alexa_rank == 0):
# We do not know the popularity, so make site go to bottom of list.
self.alexa_rank = sys.maxsize
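In the new constructor, the raw JSON record is both read through explicit get() calls and mirrored onto snake_case attributes via CaseConverter, with unknown popularity pushed to the bottom of the ranking. A rough illustration, where the 'Example' record is invented but shaped like the entries in data.json:

    import sys
    from maigret.sites import MaigretSite

    # invented entry, shaped like the records in the sites database
    site = MaigretSite('Example', {
        'urlMain': 'https://example.com',
        'url': '{urlMain}/users/{username}',
        'errorType': 'status_code',
        'alexaRank': 0,  # popularity unknown
    })

    assert site.url_main == 'https://example.com'
    assert site.check_type == 'status_code'  # read from the errorType key
    assert site.alexa_rank == sys.maxsize    # unknown rank sinks to the bottom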
def __str__(self):
return f"{self.name} ({self.url_main})"
@property
def json(self):
result = {}
for k, v in self.__dict__.items():
# convert to camelCase
field = CaseConverter.snake_to_camel(k)
# strip empty elements
if v in (False, '', [], {}, None, sys.maxsize, 'username'):
continue
if field in ['name', 'engineData', 'requestFuture', 'detectedEngine', 'engineObj']:
continue
result[field] = v
return result
def update(self, updates: dict) -> MaigretSite:
self.__dict__.update(updates)
return self
def update_from_engine(self, engine: MaigretEngine) -> MaigretSite:
engine_data = engine.site
for k, v in engine_data.items():
field = CaseConverter.camel_to_snake(k)
if isinstance(v, dict):
# TODO: assertion of intersecting keys
# update dicts like errors
self.__dict__.get(field, {}).update(v)
else:
self.__dict__[field] = v
self.engine_obj = engine
return self
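How engine inheritance plays out: scalar fields from the engine's site block are copied onto the site, while dict fields such as errors are merged into the existing values. A minimal sketch mirroring the XenForo/Amperka fixtures used in the tests further down:

    from maigret.sites import MaigretEngine, MaigretSite

    # cut-down data, mirroring the test fixtures rather than the full database
    engine = MaigretEngine('XenForo', {
        'site': {
            'checkType': 'message',
            'errors': {'You must be logged-in to do that.': 'Login required'},
        },
    })
    site = MaigretSite('Amperka', {
        'urlMain': 'http://forum.amperka.ru',
        'errors': {'error1': 'text1'},
    })

    site.update_from_engine(engine)
    assert site.check_type == 'message'  # scalar fields are copied over
    assert len(site.errors) == 2         # dict fields are merged, not replaced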
def strip_engine_data(self) -> MaigretSite:
if not self.engine_obj:
return self
self.request_future = None
self_copy = copy.deepcopy(self)
engine_data = self_copy.engine_obj.site
for field in engine_data.keys():
if isinstance(engine_data[field], dict):
for k in engine_data[field].keys():
del self_copy.__dict__[field][k]
continue
if field in list(self_copy.__dict__.keys()):
del self_copy.__dict__[field]
if CaseConverter.camel_to_snake(field) in list(self_copy.__dict__.keys()):
del self_copy.__dict__[CaseConverter.camel_to_snake(field)]
return self_copy
class MaigretDatabase:
def __init__(self):
@@ -98,20 +120,43 @@ class MaigretDatabase:
self._engines = []
@property
def sites(self: MaigretDatabase):
def sites(self):
return self._sites
@property
def sites_dict(self):
return {site.name: site for site in self._sites}
@property
def engines(self: MaigretDatabase):
def engines(self):
return self._engines
@property
def engines_dict(self):
return {engine.name: engine for engine in self._engines}
def load_from_json(self: MaigretDatabase, json_data: dict) -> MaigretDatabase:
def update_site(self, site: MaigretSite) -> MaigretDatabase:
for s in self._sites:
if s.name == site.name:
s = site
return self
def save_to_file(self, filename: str) -> MaigretDatabase:
json_data = {
'sites': {site.name: site.strip_engine_data().json for site in self._sites},
'engines': {engine.name: engine.json for engine in self._engines},
}
json_data = json.dumps(json_data, indent=4)
with open(filename, 'w') as f:
f.write(json_data)
return self
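The new save_to_file pairs with strip_engine_data, so engine-inherited fields are not written back into every site record. A round-trip sketch (both file paths are assumptions):

    from maigret.sites import MaigretDatabase

    db = MaigretDatabase().load_from_file('maigret/resources/data.json')  # assumed path
    # Writes {'sites': {...}, 'engines': {...}} with engine-derived fields
    # stripped from each site entry before serialization.
    db.save_to_file('/tmp/data.json')  # assumed output path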
def load_from_json(self, json_data: dict) -> MaigretDatabase:
# Add all of site information from the json file to internal site list.
site_data = json_data.get("sites")
engines_data = json_data.get("engines")
@@ -121,25 +166,11 @@ class MaigretDatabase:
for site_name in site_data:
try:
site = {}
site_user_info = site_data[site_name]
# If popularity unknown, make site be at bottom of list.
popularity_rank = site_user_info.get("rank", sys.maxsize)
maigret_site = MaigretSite(site_name, site_data[site_name])
if 'engine' in site_user_info:
engine_info = engines_data[site_user_info['engine']]['site']
site.update(engine_info)
site.update(site_user_info)
maigret_site = MaigretSite(site_name,
site["urlMain"],
site["url"],
popularity_rank,
site["username_claimed"],
site["username_unclaimed"],
site
)
engine = site_data[site_name].get('engine')
if engine:
maigret_site.update_from_engine(self.engines_dict[engine])
self._sites.append(maigret_site)
except KeyError as error:
@@ -150,7 +181,7 @@ class MaigretDatabase:
return self
def load_from_str(self: MaigretDatabase, db_str: str) -> MaigretDatabase:
def load_from_str(self, db_str: str) -> MaigretDatabase:
try:
data = json.loads(db_str)
except Exception as error:
@@ -161,7 +192,7 @@ class MaigretDatabase:
return self.load_from_json(data)
def load_from_url(self: MaigretDatabase, url: str) -> MaigretDatabase:
def load_from_url(self, url: str) -> MaigretDatabase:
is_url_valid = url.startswith('http://') or url.startswith('https://')
if not is_url_valid:
@@ -190,7 +221,7 @@ class MaigretDatabase:
return self.load_from_json(data)
def load_from_file(self: MaigretDatabase, filename: str) -> MaigretDatabase:
def load_from_file(self, filename: str) -> MaigretDatabase:
try:
with open(filename, 'r', encoding='utf-8') as file:
try:
@@ -207,7 +238,7 @@ class MaigretDatabase:
return self.load_from_json(data)
def site_name_list(self: MaigretDatabase, popularity_rank=False):
def site_name_list(self, popularity_rank=False):
"""Get Site Name List.
Keyword Arguments:
+13
@@ -0,0 +1,13 @@
import re
class CaseConverter:
@staticmethod
def camel_to_snake(camelcased_string: str):
return re.sub(r'(?<!^)(?=[A-Z])', '_', camelcased_string).lower()
@staticmethod
def snake_to_camel(snakecased_string: str):
formatted = ''.join(word.title() for word in snakecased_string.split('_'))
result = formatted[0].lower() + formatted[1:]
return result
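These helpers back the camelCase-JSON to snake_case-attribute mapping used in sites.py; a quick sanity check with field names from the database schema:

    from maigret.utils import CaseConverter

    assert CaseConverter.camel_to_snake('urlMain') == 'url_main'
    assert CaseConverter.camel_to_snake('presenseStrs') == 'presense_strs'
    assert CaseConverter.snake_to_camel('check_type') == 'checkType'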
-1587
File diff suppressed because it is too large.
+81 -29
@@ -2,6 +2,37 @@
from maigret.sites import MaigretDatabase
EXAMPLE_DB = {
'engines': {
"XenForo": {
"presenseStrs": ["XenForo"],
"site": {
"absenceStrs": [
"The specified member cannot be found. Please enter a member's entire name.",
],
"checkType": "message",
"errors": {
"You must be logged-in to do that.": "Login required"
},
"url": "{urlMain}{urlSubpath}/members/?username={username}"
}
},
},
'sites': {
"Amperka": {
"engine": "XenForo",
"rank": 121613,
"tags": [
"ru"
],
"urlMain": "http://forum.amperka.ru",
"usernameClaimed": "adam",
"usernameUnclaimed": "noonewouldeverusethis7"
},
}
}
def test_load_empty_db_from_str():
db = MaigretDatabase()
db.load_from_str('{"engines": {}, "sites": {}}')
@@ -12,38 +43,59 @@ def test_load_empty_db_from_str():
def test_load_valid_db():
db = MaigretDatabase()
db.load_from_json({
'engines': {
"XenForo": {
"presenseStrs": ["XenForo"],
"site": {
"errorMsg": [
"The specified member cannot be found. Please enter a member's entire name.",
],
"errorType": "message",
"errors": {
"You must be logged-in to do that.": "Login required"
},
"url": "{urlMain}{urlSubpath}/members/?username={username}"
}
},
},
'sites': {
"Amperka": {
"engine": "XenForo",
"rank": 121613,
"tags": [
"ru"
],
"urlMain": "http://forum.amperka.ru",
"username_claimed": "adam",
"username_unclaimed": "noonewouldeverusethis7"
},
}
})
db.load_from_json(EXAMPLE_DB)
assert len(db.sites) == 1
assert len(db.engines) == 1
assert db.sites[0].name == 'Amperka'
assert db.engines[0].name == 'XenForo'
def test_site_json_dump():
db = MaigretDatabase()
db.load_from_json(EXAMPLE_DB)
init_keys = EXAMPLE_DB['sites']['Amperka'].keys()
# contains engine data
obj_keys = db.sites[0].json.keys()
assert set(init_keys).issubset(set(obj_keys))
def test_site_correct_initialization():
db = MaigretDatabase()
db.load_from_json(EXAMPLE_DB)
xenforo = db.engines[0]
assert xenforo.name == 'XenForo'
assert xenforo.site['checkType'] == 'message'
amperka = db.sites[0]
assert amperka.name == 'Amperka'
assert amperka.check_type == 'message'
def test_site_strip_engine_data():
db = MaigretDatabase()
db.load_from_json(EXAMPLE_DB)
amperka = db.sites[0]
amperka_stripped = amperka.strip_engine_data()
assert amperka_stripped.json == EXAMPLE_DB['sites']['Amperka']
def test_saving_site_error():
db = MaigretDatabase()
DB = dict(EXAMPLE_DB)
DB['sites']['Amperka']['errors'] = {'error1': 'text1'}
db.load_from_json(DB)
amperka = db.sites[0]
assert len(amperka.errors) == 2
assert amperka.strip_engine_data().errors == {'error1': 'text1'}
assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
+15
@@ -0,0 +1,15 @@
"""Maigret utils test functions"""
from maigret.utils import CaseConverter
def test_case_convert_camel_to_snake():
a = 'SnakeCasedString'
b = CaseConverter.camel_to_snake(a)
assert b == 'snake_cased_string'
def test_case_convert_snake_to_camel():
a = 'camel_cased_string'
b = CaseConverter.snake_to_camel(a)
assert b == 'camelCasedString'