Merge pull request #117 from soxoj/retries-refactoring

Introduced the `--retries` flag and made a thorough refactoring
soxoj committed 2021-05-01 23:58:28 +03:00 (committed by GitHub)
18 changed files with 6182 additions and 4943 deletions
+2 -1
@@ -26,6 +26,7 @@ Currently supported more than 2000 sites ([full list](./sites.md)), by default s
* Search by tags (site categories, countries)
* Censorship and captcha detection
* Very few false positives
* Failed requests' restarts
## Installation
@@ -49,7 +50,7 @@ pip3 install .
git clone https://github.com/soxoj/maigret && cd maigret
```
You can use your a free virtual machine, the repo will be automatically cloned:
You can use a free virtual machine, the repo will be automatically cloned:
[![Open in Cloud Shell](https://user-images.githubusercontent.com/27065646/92304704-8d146d80-ef80-11ea-8c29-0deaabb1c702.png)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md) [![Run on Repl.it](https://user-images.githubusercontent.com/27065646/92304596-bf719b00-ef7f-11ea-987f-2c1f3c323088.png)](https://repl.it/github/soxoj/maigret)
<a href="https://colab.research.google.com/gist//soxoj/879b51bc3b2f8b695abb054090645000/maigret.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="40"></a>
+1 -1
@@ -1,5 +1,5 @@
#!/bin/sh
FILES="maigret wizard.py maigret.py"
FILES="maigret wizard.py maigret.py tests"
echo 'black'
black --skip-string-normalization $FILES
+2 -2
@@ -1,5 +1,5 @@
#!/bin/sh
FILES="maigret wizard.py maigret.py"
FILES="maigret wizard.py maigret.py tests"
echo 'syntax errors or undefined names'
flake8 --count --select=E9,F63,F7,F82 --show-source --statistics $FILES
@@ -8,4 +8,4 @@ echo 'warning'
flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503 $FILES
echo 'mypy'
mypy ./maigret
mypy ./maigret ./wizard.py ./tests
+251 -214
@@ -5,7 +5,7 @@ import re
import ssl
import sys
import tqdm
from typing import Tuple, Optional
from typing import Tuple, Optional, Dict, List
import aiohttp
import tqdm.asyncio
@@ -16,9 +16,14 @@ from socid_extractor import extract
from .activation import ParsingActivator, import_aiohttp_cookies
from . import errors
from .errors import CheckError
from .executors import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor
from .executors import (
AsyncExecutor,
AsyncioSimpleExecutor,
AsyncioProgressbarQueueExecutor,
)
from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite
from .types import QueryOptions, QueryResultWrapper
from .utils import get_random_user_agent
@@ -35,12 +40,10 @@ supported_recursive_search_ids = (
unsupported_characters = "#"
async def get_response(
request_future, site_name, logger
) -> Tuple[str, int, Optional[CheckError]]:
async def get_response(request_future, logger) -> Tuple[str, int, Optional[CheckError]]:
html_text = None
status_code = 0
error: Optional[CheckError] = CheckError("Error")
error: Optional[CheckError] = CheckError("Unknown")
try:
response = await request_future
@@ -76,32 +79,12 @@ async def get_response(
):
error = CheckError("SSL", str(e))
else:
logger.warning(f"Unhandled error while requesting {site_name}: {e}")
logger.debug(e, exc_info=True)
error = CheckError("Error", str(e))
error = CheckError("Unexpected", str(e))
# TODO: return only needed information
return str(html_text), status_code, error
async def update_site_dict_from_response(
sitename, site_dict, results_info, logger, query_notify
):
site_obj = site_dict[sitename]
future = site_obj.request_future
if not future:
# ignore: search by incompatible id type
return
response = await get_response(
request_future=future, site_name=sitename, logger=logger
)
return sitename, process_site_result(
response, query_notify, logger, results_info, site_obj
)
# TODO: move to separate class
def detect_error_page(
html_text, status_code, fail_flags, ignore_403
@@ -127,7 +110,7 @@ def detect_error_page(
def process_site_result(
response, query_notify, logger, results_info, site: MaigretSite
response, query_notify, logger, results_info: QueryResultWrapper, site: MaigretSite
):
if not response:
return results_info
@@ -205,6 +188,17 @@ def process_site_result(
logger.debug(presense_flag)
break
def build_result(status, **kwargs):
return QueryResult(
username,
site_name,
url,
status,
query_time=response_time,
tags=fulltags,
**kwargs,
)
if check_error:
logger.debug(check_error)
result = QueryResult(
@@ -218,53 +212,20 @@ def process_site_result(
tags=fulltags,
)
elif check_type == "message":
absence_flags = site.absence_strs
is_absence_flags_list = isinstance(absence_flags, list)
absence_flags_set = (
set(absence_flags) if is_absence_flags_list else {absence_flags}
)
# Checks if the error message is in the HTML
is_absence_detected = any(
[(absence_flag in html_text) for absence_flag in absence_flags_set]
[(absence_flag in html_text) for absence_flag in site.absence_strs]
)
if not is_absence_detected and is_presense_detected:
result = QueryResult(
username,
site_name,
url,
QueryStatus.CLAIMED,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.CLAIMED)
else:
result = QueryResult(
username,
site_name,
url,
QueryStatus.AVAILABLE,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.AVAILABLE)
elif check_type == "status_code":
# Checks if the status code of the response is 2XX
if (not status_code >= 300 or status_code < 200) and is_presense_detected:
result = QueryResult(
username,
site_name,
url,
QueryStatus.CLAIMED,
query_time=response_time,
tags=fulltags,
)
if is_presense_detected and (not status_code >= 300 or status_code < 200):
result = build_result(QueryStatus.CLAIMED)
else:
result = QueryResult(
username,
site_name,
url,
QueryStatus.AVAILABLE,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.AVAILABLE)
elif check_type == "response_url":
# For this detection method, we have turned off the redirect.
# So, there is no need to check the response URL: it will always
@@ -272,23 +233,9 @@ def process_site_result(
# code indicates that the request was successful (i.e. no 404, or
# forward to some odd redirect).
if 200 <= status_code < 300 and is_presense_detected:
result = QueryResult(
username,
site_name,
url,
QueryStatus.CLAIMED,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.CLAIMED)
else:
result = QueryResult(
username,
site_name,
url,
QueryStatus.AVAILABLE,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.AVAILABLE)
else:
# It should be impossible to ever get here...
raise ValueError(
@@ -329,109 +276,20 @@ def process_site_result(
return results_info
async def maigret(
username,
site_dict,
logger,
query_notify=None,
proxy=None,
timeout=None,
is_parsing_enabled=False,
id_type="username",
debug=False,
forced=False,
max_connections=100,
no_progressbar=False,
cookies=None,
):
"""Main search func
Checks for existence of username on certain sites.
Keyword Arguments:
username -- Username string will be used for search.
site_dict -- Dictionary containing sites data.
query_notify -- Object with base type of QueryNotify().
This will be used to notify the caller about
query results.
logger -- Standard Python logger object.
timeout -- Time in seconds to wait before timing out request.
Default is no timeout.
is_parsing_enabled -- Extract additional info from account pages.
id_type -- Type of username to search.
Default is 'username', see all supported here:
https://github.com/soxoj/maigret/wiki/Supported-identifier-types
max_connections -- Maximum number of concurrent connections allowed.
Default is 100.
no_progressbar -- Displaying of ASCII progressbar during scanner.
cookies -- Filename of a cookie jar file to use for each request.
Return Value:
Dictionary containing results from report. Key of dictionary is the name
of the social network site, and the value is another dictionary with
the following keys:
url_main: URL of main site.
url_user: URL of user on site (if account exists).
status: QueryResult() object indicating results of test for
account existence.
http_status: HTTP status code of query which checked for existence on
site.
response_text: Text that came back from request. May be None if
there was an HTTP error when checking for existence.
"""
# Notify caller that we are starting the query.
if not query_notify:
query_notify = Mock()
query_notify.start(username, id_type)
# TODO: connector
connector = (
ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
)
# connector = aiohttp.TCPConnector(ssl=False)
connector.verify_ssl = False
cookie_jar = None
if cookies:
logger.debug(f"Using cookies jar file {cookies}")
cookie_jar = await import_aiohttp_cookies(cookies)
session = aiohttp.ClientSession(
connector=connector, trust_env=True, cookie_jar=cookie_jar
)
if logger.level == logging.DEBUG:
future = session.get(url="https://icanhazip.com")
ip, status, check_error = await get_response(future, None, logger)
if ip:
logger.debug(f"My IP is: {ip.strip()}")
else:
logger.debug(f"IP requesting {check_error[0]}: {check_error[1]}")
# Results from analysis of all sites
results_total = {}
# First create futures for all requests. This allows for the requests to run in parallel
for site_name, site in site_dict.items():
if site.type != id_type:
continue
if site.disabled and not forced:
logger.debug(f"Site {site.name} is disabled, skipping...")
continue
# Results from analysis of this specific site
results_site = {}
def make_site_result(
site: MaigretSite, username: str, options: QueryOptions, logger
) -> QueryResultWrapper:
results_site: QueryResultWrapper = {}
# Record URL of main site and username
results_site["site"] = site
results_site["username"] = username
results_site["parsing_enabled"] = is_parsing_enabled
results_site["parsing_enabled"] = options["parsing"]
results_site["url_main"] = site.url_main
results_site["cookies"] = (
cookie_jar and cookie_jar.filter_cookies(site.url_main) or None
options.get("cookie_jar")
and options["cookie_jar"].filter_cookies(site.url_main)
or None
)
headers = {
@@ -442,23 +300,51 @@ async def maigret(
if "url" not in site.__dict__:
logger.error("No URL for site %s", site.name)
# URL of user on site (if it exists)
url = site.url.format(
urlMain=site.url_main, urlSubpath=site.url_subpath, username=username
)
# workaround to prevent slash errors
url = re.sub("(?<!:)/+", "/", url)
# Don't make request if username is invalid for the site
if site.regex_check and re.search(site.regex_check, username) is None:
# No need to do the check at the site: this user name is not allowed.
session = options['session']
# site check is disabled
if site.disabled and not options['forced']:
logger.debug(f"Site {site.name} is disabled, skipping...")
results_site["status"] = QueryResult(
username, site_name, url, QueryStatus.ILLEGAL
username,
site.name,
url,
QueryStatus.ILLEGAL,
error=CheckError("Check is disabled"),
)
# current username type could not be applied
elif site.type != options["id_type"]:
results_site["status"] = QueryResult(
username,
site.name,
url,
QueryStatus.ILLEGAL,
error=CheckError('Unsupported identifier type', f'Want "{site.type}"'),
)
# username is not allowed.
elif site.regex_check and re.search(site.regex_check, username) is None:
results_site["status"] = QueryResult(
username,
site.name,
url,
QueryStatus.ILLEGAL,
error=CheckError(
'Unsupported username format', f'Want "{site.regex_check}"'
),
)
results_site["url_user"] = ""
results_site["http_status"] = ""
results_site["response_text"] = ""
query_notify.update(results_site["status"])
# query_notify.update(results_site["status"])
else:
# URL of user on site (if it exists)
results_site["url_user"] = url
@@ -503,26 +389,130 @@ async def maigret(
url=url_probe,
headers=headers,
allow_redirects=allow_redirects,
timeout=timeout,
timeout=options['timeout'],
)
# Store future in data for access later
# TODO: move to separate obj
site.request_future = future
# Store future request object in the results object
results_site["future"] = future
# Add this site's results into final dictionary with all of the other results.
results_total[site_name] = results_site
return results_site
coroutines = []
for sitename, result_obj in results_total.items():
coroutines.append(
(
update_site_dict_from_response,
[sitename, site_dict, result_obj, logger, query_notify],
{},
)
async def check_site_for_username(
site, username, options: QueryOptions, logger, query_notify, *args, **kwargs
) -> Tuple[str, QueryResultWrapper]:
default_result = make_site_result(site, username, options, logger)
future = default_result.get("future")
if not future:
return site.name, default_result
response = await get_response(request_future=future, logger=logger)
response_result = process_site_result(
response, query_notify, logger, default_result, site
)
return site.name, response_result
async def debug_ip_request(session, logger):
future = session.get(url="https://icanhazip.com")
ip, status, check_error = await get_response(future, logger)
if ip:
logger.debug(f"My IP is: {ip.strip()}")
else:
logger.debug(f"IP requesting {check_error.type}: {check_error.desc}")
def get_failed_sites(results: Dict[str, QueryResultWrapper]) -> List[str]:
sites = []
for sitename, r in results.items():
status = r.get('status', {})
if status and status.error:
if errors.is_permanent(status.error.type):
continue
sites.append(sitename)
return sites
async def maigret(
username: str,
site_dict: Dict[str, MaigretSite],
logger,
query_notify=None,
proxy=None,
timeout=None,
is_parsing_enabled=False,
id_type="username",
debug=False,
forced=False,
max_connections=100,
no_progressbar=False,
cookies=None,
retries=0,
) -> QueryResultWrapper:
"""Main search func
Checks for existence of username on certain sites.
Keyword Arguments:
username -- Username string will be used for search.
site_dict -- Dictionary containing sites data in MaigretSite objects.
query_notify -- Object with base type of QueryNotify().
This will be used to notify the caller about
query results.
logger -- Standard Python logger object.
timeout -- Time in seconds to wait before timing out request.
Default is no timeout.
is_parsing_enabled -- Extract additional info from account pages.
id_type -- Type of username to search.
Default is 'username', see all supported here:
https://github.com/soxoj/maigret/wiki/Supported-identifier-types
max_connections -- Maximum number of concurrent connections allowed.
Default is 100.
no_progressbar -- Displaying of ASCII progressbar during scanner.
cookies -- Filename of a cookie jar file to use for each request.
Return Value:
Dictionary containing results from report. Key of dictionary is the name
of the social network site, and the value is another dictionary with
the following keys:
url_main: URL of main site.
url_user: URL of user on site (if account exists).
status: QueryResult() object indicating results of test for
account existence.
http_status: HTTP status code of query which checked for existence on
site.
response_text: Text that came back from request. May be None if
there was an HTTP error when checking for existence.
"""
# notify caller that we are starting the query.
if not query_notify:
query_notify = Mock()
query_notify.start(username, id_type)
# make http client session
connector = (
ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
)
connector.verify_ssl = False
cookie_jar = None
if cookies:
logger.debug(f"Using cookies jar file {cookies}")
cookie_jar = await import_aiohttp_cookies(cookies)
session = aiohttp.ClientSession(
connector=connector, trust_env=True, cookie_jar=cookie_jar
)
if logger.level == logging.DEBUG:
await debug_ip_request(session, logger)
# setup parallel executor
executor: Optional[AsyncExecutor] = None
if no_progressbar:
executor = AsyncioSimpleExecutor(logger=logger)
else:
@@ -530,24 +520,68 @@ async def maigret(
logger=logger, in_parallel=max_connections, timeout=timeout + 0.5
)
results = await executor.run(coroutines)
# make options objects for all the requests
options: QueryOptions = {}
options["cookies"] = cookie_jar
options["session"] = session
options["parsing"] = is_parsing_enabled
options["timeout"] = timeout
options["id_type"] = id_type
options["forced"] = forced
# results from analysis of all sites
all_results: Dict[str, QueryResultWrapper] = {}
sites = list(site_dict.keys())
attempts = retries + 1
while attempts:
tasks_dict = {}
for sitename, site in site_dict.items():
if sitename not in sites:
continue
default_result: QueryResultWrapper = {
'site': site,
'status': QueryResult(
username,
sitename,
'',
QueryStatus.UNKNOWN,
error=CheckError('Request failed'),
),
}
tasks_dict[sitename] = (
check_site_for_username,
[site, username, options, logger, query_notify],
{'default': (sitename, default_result)},
)
cur_results = await executor.run(tasks_dict.values())
# wait for executor timeout errors
await asyncio.sleep(1)
all_results.update(cur_results)
sites = get_failed_sites(dict(cur_results))
attempts -= 1
if not sites:
break
if attempts:
query_notify.warning(
f'Restarting checks for {len(sites)} sites... ({attempts} attempts left)'
)
# closing http client session
await session.close()
# Notify caller that all queries are finished.
# notify caller that all queries are finished
query_notify.finish()
data = {}
for result in results:
# TODO: still can be empty
if result:
try:
data[result[0]] = result[1]
except Exception as e:
logger.error(e, exc_info=True)
logger.info(result)
return data
return all_results
def timeout_check(value):
@@ -575,7 +609,9 @@ def timeout_check(value):
return timeout
async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
async def site_self_check(
site: MaigretSite, logger, semaphore, db: MaigretDatabase, silent=False
):
changes = {
"disabled": False,
}
@@ -602,6 +638,7 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
id_type=site.type,
forced=True,
no_progressbar=True,
retries=1,
)
# don't disable entries with other ids types
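For orientation, here is a minimal sketch of calling the refactored `maigret()` coroutine with the new `retries` argument. The import path `maigret.checking` and the inline single-site record are assumptions made for illustration (the record mimics the fixture style used in `tests/test_db.py`); a real run would load the full bundled site database.

```python
import asyncio
import logging

from maigret.checking import maigret  # assumed module path for the code above
from maigret.sites import MaigretDatabase

# Invented one-site database in the same shape as the test fixtures
SITES = {
    'engines': {},
    'sites': {
        'GooglePlayStore': {
            'checkType': 'status_code',
            'url': 'https://play.google.com/store/apps/developer?id={username}',
            'urlMain': 'https://play.google.com/store',
            'usernameClaimed': 'Facebook_nosuchname',
            'usernameUnclaimed': 'noonewouldeverusethis7',
        },
    },
}

async def run():
    logger = logging.getLogger('maigret')
    db = MaigretDatabase()
    db.load_from_json(SITES)
    site_dict = {site.name: site for site in db.sites}
    # retries=2: checks that end with a temporary error are re-run up to two more times
    results = await maigret(
        'soxoj',
        site_dict,
        logger,
        timeout=10,
        retries=2,
        no_progressbar=True,
    )
    for sitename, wrapper in results.items():
        print(sitename, wrapper['status'])

asyncio.run(run())
```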
+13 -2
@@ -57,6 +57,17 @@ ERRORS_TYPES = {
'Request timeout': 'Try to increase timeout or to switch to another internet service provider',
}
TEMPORARY_ERRORS_TYPES = [
'Request timeout',
'Unknown',
'Request failed',
'Connecting failure',
'HTTP',
'Proxy',
'Interrupted',
'Connection lost',
]
THRESHOLD = 3 # percent
@@ -64,8 +75,8 @@ def is_important(err_data):
return err_data['perc'] >= THRESHOLD
def is_not_permanent(err_data):
return True
def is_permanent(err_type):
return err_type not in TEMPORARY_ERRORS_TYPES
def detect(text):
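To make the retry filter concrete: a failed check is only re-queued when its error type is not permanent, i.e. when it appears in `TEMPORARY_ERRORS_TYPES`. A tiny sketch with hand-made stand-ins — the `FakeError`/`FakeStatus` classes and the 'Some blocking error' label are invented; real result wrappers carry `QueryResult`/`CheckError` objects.

```python
from maigret import errors

class FakeError:
    def __init__(self, type_):
        self.type = type_

class FakeStatus:
    def __init__(self, error=None):
        self.error = error

results = {
    'SiteA': {'status': FakeStatus(FakeError('Request timeout'))},      # temporary -> retried
    'SiteB': {'status': FakeStatus(FakeError('Some blocking error'))},  # treated as permanent
    'SiteC': {'status': FakeStatus()},                                  # no error -> done
}

# Same selection rule as get_failed_sites() in the checking module above
to_retry = [
    name
    for name, r in results.items()
    if r['status'].error and not errors.is_permanent(r['status'].error.type)
]
print(to_retry)  # ['SiteA']
```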
+1 -1
@@ -93,7 +93,7 @@ class AsyncioProgressbarQueueExecutor(AsyncExecutor):
try:
result = await asyncio.wait_for(query_task, timeout=self.timeout)
except asyncio.TimeoutError:
result = None
result = kwargs.get('default')
self.results.append(result)
self.progress.update(1)
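The executors consume task tuples in the `(coroutine_function, args, kwargs)` shape declared in `maigret/types.py` below, and with this change a task's `kwargs` can carry a `default` that is returned when the task exceeds the executor timeout. A hedged sketch of that convention — the toy `slow_check` coroutine is invented, and it accepts `**kwargs` because the executor appears to forward the task kwargs (including `default`) to the coroutine:

```python
import asyncio
import logging

from maigret.executors import AsyncioProgressbarQueueExecutor

logger = logging.getLogger(__name__)

async def slow_check(name, delay, **kwargs):
    # Toy stand-in for check_site_for_username(); extra kwargs such as
    # 'default' are forwarded by the executor, so swallow them here.
    await asyncio.sleep(delay)
    return name, 'ok'

tasks = [
    (slow_check, ['fast-site', 0.1], {'default': ('fast-site', 'failed')}),
    (slow_check, ['slow-site', 5.0], {'default': ('slow-site', 'failed')}),
]

async def run():
    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=2, timeout=1)
    return await executor.run(tasks)

print(asyncio.run(run()))
# expected: fast-site finishes normally, slow-site falls back to its default
```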
+15 -2
@@ -59,7 +59,7 @@ def notify_about_errors(search_results, query_notify):
)
async def main():
def setup_arguments_parser():
version_string = '\n'.join(
[
f'%(prog)s {__version__}',
@@ -148,6 +148,14 @@ async def main():
"A longer timeout will be more likely to get results from slow sites. "
"On the other hand, this may cause a long delay to gather all results. ",
)
parser.add_argument(
"--retries",
action="store",
type=int,
metavar='RETRIES',
default=1,
help="Attempts to restart temporary failed requests.",
)
parser.add_argument(
"-n",
"--max-connections",
@@ -334,8 +342,12 @@ async def main():
help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
" (one report per username).",
)
return parser
args = parser.parse_args()
async def main():
arg_parser = setup_arguments_parser()
args = arg_parser.parse_args()
# Logging
log_level = logging.ERROR
@@ -528,6 +540,7 @@ async def main():
forced=args.use_disabled_sites,
max_connections=args.connections,
no_progressbar=args.no_progressbar,
retries=args.retries,
)
notify_about_errors(results, query_notify)
+4407 -3417
File diff suppressed because it is too large.
+35 -29
@@ -3,7 +3,7 @@
import copy
import json
import sys
from typing import Optional
from typing import Optional, List, Dict, Any
import requests
@@ -57,9 +57,10 @@ SUPPORTED_TAGS = [
class MaigretEngine:
site: Dict[str, Any] = {}
def __init__(self, name, data):
self.name = name
self.site = {}
self.__dict__.update(data)
@property
@@ -78,35 +79,40 @@ class MaigretSite:
"urlRegexp",
]
username_claimed = ""
username_unclaimed = ""
url_subpath = ""
url_main = ""
url = ""
disabled = False
similar_search = False
ignore403 = False
tags: List[str] = []
type = "username"
headers: Dict[str, str] = {}
errors: Dict[str, str] = {}
activation: Dict[str, Any] = {}
regex_check = None
url_probe = None
check_type = ""
request_head_only = ""
get_params: Dict[str, Any] = {}
presense_strs: List[str] = []
absence_strs: List[str] = []
stats: Dict[str, Any] = {}
engine = None
engine_data: Dict[str, Any] = {}
engine_obj: Optional["MaigretEngine"] = None
request_future = None
alexa_rank = None
source = None
def __init__(self, name, information):
self.name = name
self.disabled = False
self.similar_search = False
self.ignore403 = False
self.tags = []
self.type = "username"
self.headers = {}
self.errors = {}
self.activation = {}
self.url_subpath = ""
self.regex_check = None
self.url_probe = None
self.check_type = ""
self.request_head_only = ""
self.get_params = {}
self.presense_strs = []
self.absence_strs = []
self.stats = {}
self.engine = None
self.engine_data = {}
self.engine_obj = None
self.request_future = None
self.alexa_rank = None
self.source = None
for k, v in information.items():
self.__dict__[CaseConverter.camel_to_snake(k)] = v
@@ -193,7 +199,7 @@ class MaigretSite:
self.url_regexp = None
self_copy = copy.deepcopy(self)
engine_data = self_copy.engine_obj.site
engine_data = self_copy.engine_obj and self_copy.engine_obj.site or {}
site_data_keys = list(self_copy.__dict__.keys())
for k in engine_data.keys():
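The `MaigretSite` refactor above moves per-instance default assignments to class-level, type-annotated defaults, while JSON fields still land on the instance through the camelCase-to-snake_case loop at the end of `__init__`. A small sketch of that mapping (the `ExampleForum` record is invented):

```python
from maigret.sites import MaigretSite

site = MaigretSite(
    "ExampleForum",
    {
        "urlMain": "https://forum.example.com",
        "url": "{urlMain}/members/?username={username}",
        "checkType": "message",
        "absenceStrs": ["The specified member cannot be found"],
    },
)

print(site.url_main)    # https://forum.example.com -- camelCase key mapped to snake_case attribute
print(site.check_type)  # message
print(site.disabled)    # False -- class-level default, never set for this instance
print(site.tags)        # []    -- likewise a class-level default
```

Since defaults such as `tags` and `headers` are now mutable class attributes, they are shared between instances until a given site record shadows them, which the per-instance `__dict__` writes in `__init__` rely on.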
+8 -2
@@ -1,5 +1,11 @@
from typing import Callable, Any, Tuple
from typing import Callable, List, Dict, Tuple, Any
# search query
QueryDraft = Tuple[Callable, Any, Any]
QueryDraft = Tuple[Callable, List, Dict]
# options dict
QueryOptions = Dict[str, Any]
# TODO: throw out
QueryResultWrapper = Dict[str, Any]
+1168 -1154
File diff suppressed because it is too large.
+2 -1
@@ -26,7 +26,8 @@ def get_test_reports_filenames():
def remove_test_reports():
reports_list = get_test_reports_filenames()
for f in reports_list: os.remove(f)
for f in reports_list:
os.remove(f)
logging.error(f'Removed test reports {reports_list}')
+3 -2
@@ -44,8 +44,9 @@ async def test_import_aiohttp_cookies():
url = 'https://httpbin.org/cookies'
connector = aiohttp.TCPConnector(ssl=False)
session = aiohttp.ClientSession(connector=connector, trust_env=True,
cookie_jar=cookie_jar)
session = aiohttp.ClientSession(
connector=connector, trust_env=True, cookie_jar=cookie_jar
)
response = await session.get(url=url)
result = json.loads(await response.content.read())
+8 -2
@@ -2,11 +2,16 @@
import pytest
import asyncio
import logging
from maigret.executors import AsyncioSimpleExecutor, AsyncioProgressbarExecutor, \
AsyncioProgressbarSemaphoreExecutor, AsyncioProgressbarQueueExecutor
from maigret.executors import (
AsyncioSimpleExecutor,
AsyncioProgressbarExecutor,
AsyncioProgressbarSemaphoreExecutor,
AsyncioProgressbarQueueExecutor,
)
logger = logging.getLogger(__name__)
async def func(n):
await asyncio.sleep(0.1 * (n % 3))
return n
@@ -20,6 +25,7 @@ async def test_simple_asyncio_executor():
assert executor.execution_time > 0.2
assert executor.execution_time < 0.3
@pytest.mark.asyncio
async def test_asyncio_progressbar_executor():
tasks = [(func, [n], {}) for n in range(10)]
+7 -17
@@ -8,40 +8,30 @@ from maigret.maigret import self_check
from maigret.sites import MaigretDatabase
EXAMPLE_DB = {
'engines': {
},
'engines': {},
'sites': {
"GooglePlayStore": {
"tags": [
"global",
"us"
],
"tags": ["global", "us"],
"disabled": False,
"checkType": "status_code",
"alexaRank": 1,
"url": "https://play.google.com/store/apps/developer?id={username}",
"urlMain": "https://play.google.com/store",
"usernameClaimed": "Facebook_nosuchname",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
},
"Reddit": {
"tags": [
"news",
"social",
"us"
],
"tags": ["news", "social", "us"],
"checkType": "status_code",
"presenseStrs": [
"totalKarma"
],
"presenseStrs": ["totalKarma"],
"disabled": True,
"alexaRank": 17,
"url": "https://www.reddit.com/user/{username}",
"urlMain": "https://www.reddit.com/",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
},
},
}
}
+188 -53
@@ -7,8 +7,16 @@ from io import StringIO
import xmind
from jinja2 import Template
from maigret.report import generate_csv_report, generate_txt_report, save_xmind_report, save_html_report, \
save_pdf_report, generate_report_template, generate_report_context, generate_json_report
from maigret.report import (
generate_csv_report,
generate_txt_report,
save_xmind_report,
save_html_report,
save_pdf_report,
generate_report_template,
generate_report_context,
generate_json_report,
)
from maigret.result import QueryResult, QueryStatus
EXAMPLE_RESULTS = {
@@ -17,14 +25,16 @@ EXAMPLE_RESULTS = {
'parsing_enabled': True,
'url_main': 'https://www.github.com/',
'url_user': 'https://www.github.com/test',
'status': QueryResult('test',
'status': QueryResult(
'test',
'GitHub',
'https://www.github.com/test',
QueryStatus.CLAIMED,
tags=['test_tag']),
tags=['test_tag'],
),
'http_status': 200,
'is_similar': False,
'rank': 78
'rank': 78,
}
}
@@ -33,74 +43,196 @@ BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
GOOD_500PX_RESULT.ids_data = {"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==", "legacy_id": "26403415",
"username": "alexaimephotographycars", "name": "Alex Aim\u00e9",
GOOD_500PX_RESULT.ids_data = {
"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==",
"legacy_id": "26403415",
"username": "alexaimephotographycars",
"name": "Alex Aim\u00e9",
"website": "www.flickr.com/photos/alexaimephotography/",
"facebook_link": " www.instagram.com/street.reality.photography/",
"instagram_username": "alexaimephotography", "twitter_username": "Alexaimephotogr"}
"instagram_username": "alexaimephotography",
"twitter_username": "Alexaimephotogr",
}
GOOD_REDDIT_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_REDDIT_RESULT.tags = ['news', 'us']
GOOD_REDDIT_RESULT.ids_data = {"reddit_id": "t5_1nytpy", "reddit_username": "alexaimephotography",
GOOD_REDDIT_RESULT.ids_data = {
"reddit_id": "t5_1nytpy",
"reddit_username": "alexaimephotography",
"fullname": "alexaimephotography",
"image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e",
"is_employee": "False", "is_nsfw": "False", "is_mod": "True", "is_following": "True",
"has_user_profile": "True", "hide_from_robots": "False",
"created_at": "2019-07-10 12:20:03", "total_karma": "53959", "post_karma": "52738"}
"is_employee": "False",
"is_nsfw": "False",
"is_mod": "True",
"is_following": "True",
"has_user_profile": "True",
"hide_from_robots": "False",
"created_at": "2019-07-10 12:20:03",
"total_karma": "53959",
"post_karma": "52738",
}
GOOD_IG_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_IG_RESULT.tags = ['photo', 'global']
GOOD_IG_RESULT.ids_data = {"instagram_username": "alexaimephotography", "fullname": "Alexaimephotography",
GOOD_IG_RESULT.ids_data = {
"instagram_username": "alexaimephotography",
"fullname": "Alexaimephotography",
"id": "6828488620",
"image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F",
"bio": "Photographer \nChild of fine street arts",
"external_url": "https://www.flickr.com/photos/alexaimephotography2020/"}
"external_url": "https://www.flickr.com/photos/alexaimephotography2020/",
}
GOOD_TWITTER_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_TWITTER_RESULT.tags = ['social', 'us']
TEST = [('alexaimephotographycars', 'username', {
'500px': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
TEST = [
(
'alexaimephotographycars',
'username',
{
'500px': {
'username': 'alexaimephotographycars',
'parsing_enabled': True,
'url_main': 'https://500px.com/',
'url_user': 'https://500px.com/p/alexaimephotographycars',
'ids_usernames': {'alexaimephotographycars': 'username', 'alexaimephotography': 'username',
'Alexaimephotogr': 'username'}, 'status': GOOD_500PX_RESULT, 'http_status': 200,
'is_similar': False, 'rank': 2981},
'Reddit': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
'url_user': 'https://www.reddit.com/user/alexaimephotographycars', 'status': BAD_RESULT,
'http_status': 404, 'is_similar': False, 'rank': 17},
'Twitter': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
'url_user': 'https://twitter.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 400,
'is_similar': False, 'rank': 55},
'Instagram': {'username': 'alexaimephotographycars', 'parsing_enabled': True,
'ids_usernames': {
'alexaimephotographycars': 'username',
'alexaimephotography': 'username',
'Alexaimephotogr': 'username',
},
'status': GOOD_500PX_RESULT,
'http_status': 200,
'is_similar': False,
'rank': 2981,
},
'Reddit': {
'username': 'alexaimephotographycars',
'parsing_enabled': True,
'url_main': 'https://www.reddit.com/',
'url_user': 'https://www.reddit.com/user/alexaimephotographycars',
'status': BAD_RESULT,
'http_status': 404,
'is_similar': False,
'rank': 17,
},
'Twitter': {
'username': 'alexaimephotographycars',
'parsing_enabled': True,
'url_main': 'https://www.twitter.com/',
'url_user': 'https://twitter.com/alexaimephotographycars',
'status': BAD_RESULT,
'http_status': 400,
'is_similar': False,
'rank': 55,
},
'Instagram': {
'username': 'alexaimephotographycars',
'parsing_enabled': True,
'url_main': 'https://www.instagram.com/',
'url_user': 'https://www.instagram.com/alexaimephotographycars', 'status': BAD_RESULT,
'http_status': 404, 'is_similar': False, 'rank': 29}}), ('alexaimephotography', 'username', {
'500px': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
'url_user': 'https://500px.com/p/alexaimephotography', 'status': BAD_RESULT, 'http_status': 200,
'is_similar': False, 'rank': 2981},
'Reddit': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
'url_user': 'https://www.instagram.com/alexaimephotographycars',
'status': BAD_RESULT,
'http_status': 404,
'is_similar': False,
'rank': 29,
},
},
),
(
'alexaimephotography',
'username',
{
'500px': {
'username': 'alexaimephotography',
'parsing_enabled': True,
'url_main': 'https://500px.com/',
'url_user': 'https://500px.com/p/alexaimephotography',
'status': BAD_RESULT,
'http_status': 200,
'is_similar': False,
'rank': 2981,
},
'Reddit': {
'username': 'alexaimephotography',
'parsing_enabled': True,
'url_main': 'https://www.reddit.com/',
'url_user': 'https://www.reddit.com/user/alexaimephotography',
'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_REDDIT_RESULT, 'http_status': 200,
'is_similar': False, 'rank': 17},
'Twitter': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
'url_user': 'https://twitter.com/alexaimephotography', 'status': BAD_RESULT, 'http_status': 400,
'is_similar': False, 'rank': 55},
'Instagram': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
'ids_usernames': {'alexaimephotography': 'username'},
'status': GOOD_REDDIT_RESULT,
'http_status': 200,
'is_similar': False,
'rank': 17,
},
'Twitter': {
'username': 'alexaimephotography',
'parsing_enabled': True,
'url_main': 'https://www.twitter.com/',
'url_user': 'https://twitter.com/alexaimephotography',
'status': BAD_RESULT,
'http_status': 400,
'is_similar': False,
'rank': 55,
},
'Instagram': {
'username': 'alexaimephotography',
'parsing_enabled': True,
'url_main': 'https://www.instagram.com/',
'url_user': 'https://www.instagram.com/alexaimephotography',
'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_IG_RESULT, 'http_status': 200,
'is_similar': False, 'rank': 29}}), ('Alexaimephotogr', 'username', {
'500px': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
'url_user': 'https://500px.com/p/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 200,
'is_similar': False, 'rank': 2981},
'Reddit': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
'url_user': 'https://www.reddit.com/user/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
'is_similar': False, 'rank': 17},
'Twitter': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
'url_user': 'https://twitter.com/Alexaimephotogr', 'status': GOOD_TWITTER_RESULT, 'http_status': 400,
'is_similar': False, 'rank': 55},
'Instagram': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
'url_user': 'https://www.instagram.com/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
'is_similar': False, 'rank': 29}})]
'ids_usernames': {'alexaimephotography': 'username'},
'status': GOOD_IG_RESULT,
'http_status': 200,
'is_similar': False,
'rank': 29,
},
},
),
(
'Alexaimephotogr',
'username',
{
'500px': {
'username': 'Alexaimephotogr',
'parsing_enabled': True,
'url_main': 'https://500px.com/',
'url_user': 'https://500px.com/p/Alexaimephotogr',
'status': BAD_RESULT,
'http_status': 200,
'is_similar': False,
'rank': 2981,
},
'Reddit': {
'username': 'Alexaimephotogr',
'parsing_enabled': True,
'url_main': 'https://www.reddit.com/',
'url_user': 'https://www.reddit.com/user/Alexaimephotogr',
'status': BAD_RESULT,
'http_status': 404,
'is_similar': False,
'rank': 17,
},
'Twitter': {
'username': 'Alexaimephotogr',
'parsing_enabled': True,
'url_main': 'https://www.twitter.com/',
'url_user': 'https://twitter.com/Alexaimephotogr',
'status': GOOD_TWITTER_RESULT,
'http_status': 400,
'is_similar': False,
'rank': 55,
},
'Instagram': {
'username': 'Alexaimephotogr',
'parsing_enabled': True,
'url_main': 'https://www.instagram.com/',
'url_user': 'https://www.instagram.com/Alexaimephotogr',
'status': BAD_RESULT,
'http_status': 404,
'is_similar': False,
'rank': 29,
},
},
),
]
SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""
@@ -187,7 +319,10 @@ def test_save_xmind_report():
assert data['topic']['topics'][0]['title'] == 'Undefined'
assert data['topic']['topics'][1]['title'] == 'test_tag'
assert len(data['topic']['topics'][1]['topics']) == 1
assert data['topic']['topics'][1]['topics'][0]['label'] == 'https://www.github.com/test'
assert (
data['topic']['topics'][1]['topics'][0]['label']
== 'https://www.github.com/test'
)
def test_html_report():
+13 -11
@@ -10,25 +10,21 @@ EXAMPLE_DB = {
"The specified member cannot be found. Please enter a member's entire name.",
],
"checkType": "message",
"errors": {
"You must be logged-in to do that.": "Login required"
"errors": {"You must be logged-in to do that.": "Login required"},
"url": "{urlMain}{urlSubpath}/members/?username={username}",
},
"url": "{urlMain}{urlSubpath}/members/?username={username}"
}
},
},
'sites': {
"Amperka": {
"engine": "XenForo",
"rank": 121613,
"tags": [
"ru"
],
"tags": ["ru"],
"urlMain": "http://forum.amperka.ru",
"usernameClaimed": "adam",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
},
},
}
}
@@ -116,8 +112,14 @@ def test_site_url_detector():
db = MaigretDatabase()
db.load_from_json(EXAMPLE_DB)
assert db.sites[0].url_regexp.pattern == r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
assert db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test') == 'test'
assert (
db.sites[0].url_regexp.pattern
== r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
)
assert (
db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test')
== 'test'
)
def test_ranked_sites_dict():
+32 -6
@@ -2,7 +2,13 @@
import itertools
import re
from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher, get_dict_ascii_tree
from maigret.utils import (
CaseConverter,
is_country_tag,
enrich_link_str,
URLMatcher,
get_dict_ascii_tree,
)
def test_case_convert_camel_to_snake():
@@ -45,8 +51,10 @@ def test_is_country_tag():
def test_enrich_link_str():
assert enrich_link_str('test') == 'test'
assert enrich_link_str(
' www.flickr.com/photos/alexaimephotography/') == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
assert (
enrich_link_str(' www.flickr.com/photos/alexaimephotography/')
== '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
)
def test_url_extract_main_part():
@@ -78,15 +86,32 @@ def test_url_make_profile_url_regexp():
for url_parts in itertools.product(*parts):
url = ''.join(url_parts)
assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
assert (
URLMatcher.make_profile_url_regexp(url).pattern
== r'^https?://(www.)?flickr\.com/photos/(.+?)$'
)
def test_get_dict_ascii_tree():
data = {'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==', 'legacy_id': '26403415', 'username': 'alexaimephotographycars', 'name': 'Alex Aimé', 'created_at': '2018-05-04T10:17:01.000+0000', 'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b', 'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201', 'website': 'www.instagram.com/street.reality.photography/', 'facebook_link': ' www.instagram.com/street.reality.photography/', 'instagram_username': 'Street.Reality.Photography', 'twitter_username': 'Alexaimephotogr'}
data = {
'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==',
'legacy_id': '26403415',
'username': 'alexaimephotographycars',
'name': 'Alex Aimé',
'created_at': '2018-05-04T10:17:01.000+0000',
'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b',
'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201',
'website': 'www.instagram.com/street.reality.photography/',
'facebook_link': ' www.instagram.com/street.reality.photography/',
'instagram_username': 'Street.Reality.Photography',
'twitter_username': 'Alexaimephotogr',
}
ascii_tree = get_dict_ascii_tree(data.items())
assert ascii_tree == """
assert (
ascii_tree
== """
uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==
legacy_id: 26403415
username: alexaimephotographycars
@@ -98,3 +123,4 @@ def test_get_dict_ascii_tree():
facebook_link: www.instagram.com/street.reality.photography/
instagram_username: Street.Reality.Photography
twitter_username: Alexaimephotogr"""
)