Introduced --retries flag, made a thorough refactoring

- updated sites list
- test scripts linting
Soxoj committed 2021-05-01 23:51:48 +03:00
parent 7fd4a2c516
commit 5ee91f6659
18 changed files with 6182 additions and 4943 deletions
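
The headline change is the new `--retries` flag (default: 1). Site checks that end with a temporary error — timeouts, lost connections, proxy failures, and the other types listed in `TEMPORARY_ERRORS_TYPES` below — are automatically re-run; e.g. `maigret username --retries 3` allows up to three extra rounds per temporarily failed site.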
maigret/checking.py +260 -223
@@ -5,7 +5,7 @@ import re
import ssl
import sys
import tqdm
from typing import Tuple, Optional
from typing import Tuple, Optional, Dict, List
import aiohttp
import tqdm.asyncio
@@ -16,9 +16,14 @@ from socid_extractor import extract
from .activation import ParsingActivator, import_aiohttp_cookies
from . import errors
from .errors import CheckError
from .executors import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor
from .executors import (
AsyncExecutor,
AsyncioSimpleExecutor,
AsyncioProgressbarQueueExecutor,
)
from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite
from .types import QueryOptions, QueryResultWrapper
from .utils import get_random_user_agent
@@ -35,12 +40,10 @@ supported_recursive_search_ids = (
unsupported_characters = "#"
async def get_response(
request_future, site_name, logger
) -> Tuple[str, int, Optional[CheckError]]:
async def get_response(request_future, logger) -> Tuple[str, int, Optional[CheckError]]:
html_text = None
status_code = 0
error: Optional[CheckError] = CheckError("Error")
error: Optional[CheckError] = CheckError("Unknown")
try:
response = await request_future
@@ -76,32 +79,12 @@ async def get_response(
):
error = CheckError("SSL", str(e))
else:
logger.warning(f"Unhandled error while requesting {site_name}: {e}")
logger.debug(e, exc_info=True)
error = CheckError("Error", str(e))
error = CheckError("Unexpected", str(e))
# TODO: return only needed information
return str(html_text), status_code, error
async def update_site_dict_from_response(
sitename, site_dict, results_info, logger, query_notify
):
site_obj = site_dict[sitename]
future = site_obj.request_future
if not future:
# ignore: search by incompatible id type
return
response = await get_response(
request_future=future, site_name=sitename, logger=logger
)
return sitename, process_site_result(
response, query_notify, logger, results_info, site_obj
)
# TODO: move to separate class
def detect_error_page(
html_text, status_code, fail_flags, ignore_403
@@ -127,7 +110,7 @@ def detect_error_page(
def process_site_result(
response, query_notify, logger, results_info, site: MaigretSite
response, query_notify, logger, results_info: QueryResultWrapper, site: MaigretSite
):
if not response:
return results_info
@@ -205,6 +188,17 @@ def process_site_result(
logger.debug(presense_flag)
break
def build_result(status, **kwargs):
return QueryResult(
username,
site_name,
url,
status,
query_time=response_time,
tags=fulltags,
**kwargs,
)
if check_error:
logger.debug(check_error)
result = QueryResult(
@@ -218,53 +212,20 @@ def process_site_result(
tags=fulltags,
)
elif check_type == "message":
absence_flags = site.absence_strs
is_absence_flags_list = isinstance(absence_flags, list)
absence_flags_set = (
set(absence_flags) if is_absence_flags_list else {absence_flags}
)
# Checks if the error message is in the HTML
is_absence_detected = any(
[(absence_flag in html_text) for absence_flag in absence_flags_set]
[(absence_flag in html_text) for absence_flag in site.absence_strs]
)
if not is_absence_detected and is_presense_detected:
result = QueryResult(
username,
site_name,
url,
QueryStatus.CLAIMED,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.CLAIMED)
else:
result = QueryResult(
username,
site_name,
url,
QueryStatus.AVAILABLE,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.AVAILABLE)
elif check_type == "status_code":
# Checks if the status code of the response is 2XX
if (not status_code >= 300 or status_code < 200) and is_presense_detected:
result = QueryResult(
username,
site_name,
url,
QueryStatus.CLAIMED,
query_time=response_time,
tags=fulltags,
)
if is_presense_detected and (not status_code >= 300 or status_code < 200):
result = build_result(QueryStatus.CLAIMED)
else:
result = QueryResult(
username,
site_name,
url,
QueryStatus.AVAILABLE,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.AVAILABLE)
elif check_type == "response_url":
# For this detection method, we have turned off the redirect.
# So, there is no need to check the response URL: it will always
@@ -272,23 +233,9 @@ def process_site_result(
# code indicates that the request was successful (i.e. no 404, or
# forward to some odd redirect).
if 200 <= status_code < 300 and is_presense_detected:
result = QueryResult(
username,
site_name,
url,
QueryStatus.CLAIMED,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.CLAIMED)
else:
result = QueryResult(
username,
site_name,
url,
QueryStatus.AVAILABLE,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.AVAILABLE)
else:
# It should be impossible to ever get here...
raise ValueError(
@@ -329,9 +276,168 @@ def process_site_result(
return results_info
def make_site_result(
site: MaigretSite, username: str, options: QueryOptions, logger
) -> QueryResultWrapper:
results_site: QueryResultWrapper = {}
# Record URL of main site and username
results_site["site"] = site
results_site["username"] = username
results_site["parsing_enabled"] = options["parsing"]
results_site["url_main"] = site.url_main
results_site["cookies"] = (
options.get("cookie_jar")
and options["cookie_jar"].filter_cookies(site.url_main)
or None
)
headers = {
"User-Agent": get_random_user_agent(),
}
headers.update(site.headers)
if "url" not in site.__dict__:
logger.error("No URL for site %s", site.name)
# URL of user on site (if it exists)
url = site.url.format(
urlMain=site.url_main, urlSubpath=site.url_subpath, username=username
)
# workaround to prevent slash errors
url = re.sub("(?<!:)/+", "/", url)
session = options['session']
# site check is disabled
if site.disabled and not options['forced']:
logger.debug(f"Site {site.name} is disabled, skipping...")
results_site["status"] = QueryResult(
username,
site.name,
url,
QueryStatus.ILLEGAL,
error=CheckError("Check is disabled"),
)
# current username type could not be applied
elif site.type != options["id_type"]:
results_site["status"] = QueryResult(
username,
site.name,
url,
QueryStatus.ILLEGAL,
error=CheckError('Unsupported identifier type', f'Want "{site.type}"'),
)
# username is not allowed.
elif site.regex_check and re.search(site.regex_check, username) is None:
results_site["status"] = QueryResult(
username,
site.name,
url,
QueryStatus.ILLEGAL,
error=CheckError(
'Unsupported username format', f'Want "{site.regex_check}"'
),
)
results_site["url_user"] = ""
results_site["http_status"] = ""
results_site["response_text"] = ""
# query_notify.update(results_site["status"])
else:
# URL of user on site (if it exists)
results_site["url_user"] = url
url_probe = site.url_probe
if url_probe is None:
# Probe URL is normal one seen by people out on the web.
url_probe = url
else:
# There is a special URL for probing existence separate
# from where the user profile normally can be found.
url_probe = url_probe.format(
urlMain=site.url_main,
urlSubpath=site.url_subpath,
username=username,
)
for k, v in site.get_params.items():
url_probe += f"&{k}={v}"
if site.check_type == "status_code" and site.request_head_only:
# In most cases when we are detecting by status code,
# it is not necessary to get the entire body: we can
# detect fine with just the HEAD response.
request_method = session.head
else:
# Either this detect method needs the content associated
# with the GET response, or this specific website will
# not respond properly unless we request the whole page.
request_method = session.get
if site.check_type == "response_url":
# Site forwards request to a different URL if username not
# found. Disallow the redirect so we can capture the
# http status from the original URL request.
allow_redirects = False
else:
# Allow whatever redirect that the site wants to do.
# The final result of the request will be what is available.
allow_redirects = True
future = request_method(
url=url_probe,
headers=headers,
allow_redirects=allow_redirects,
timeout=options['timeout'],
)
# Store future request object in the results object
results_site["future"] = future
return results_site
async def check_site_for_username(
site, username, options: QueryOptions, logger, query_notify, *args, **kwargs
) -> Tuple[str, QueryResultWrapper]:
default_result = make_site_result(site, username, options, logger)
future = default_result.get("future")
if not future:
return site.name, default_result
response = await get_response(request_future=future, logger=logger)
response_result = process_site_result(
response, query_notify, logger, default_result, site
)
return site.name, response_result
async def debug_ip_request(session, logger):
future = session.get(url="https://icanhazip.com")
ip, status, check_error = await get_response(future, logger)
if ip:
logger.debug(f"My IP is: {ip.strip()}")
else:
logger.debug(f"IP requesting {check_error.type}: {check_error.desc}")
def get_failed_sites(results: Dict[str, QueryResultWrapper]) -> List[str]:
sites = []
for sitename, r in results.items():
status = r.get('status', {})
if status and status.error:
if errors.is_permanent(status.error.type):
continue
sites.append(sitename)
return sites
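
Only error types classified as temporary survive this filter (see `errors.is_permanent` in errors.py below), so sites with permanent failures are never re-queued.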
async def maigret(
username,
site_dict,
username: str,
site_dict: Dict[str, MaigretSite],
logger,
query_notify=None,
proxy=None,
@@ -343,14 +449,15 @@ async def maigret(
max_connections=100,
no_progressbar=False,
cookies=None,
):
retries=0,
) -> QueryResultWrapper:
"""Main search func
Checks for existence of username on certain sites.
Keyword Arguments:
username -- Username string will be used for search.
site_dict -- Dictionary containing sites data.
site_dict -- Dictionary containing sites data in MaigretSite objects.
query_notify -- Object with base type of QueryNotify().
This will be used to notify the caller about
query results.
@@ -380,17 +487,16 @@ async def maigret(
there was an HTTP error when checking for existence.
"""
# Notify caller that we are starting the query.
# notify caller that we are starting the query.
if not query_notify:
query_notify = Mock()
query_notify.start(username, id_type)
# TODO: connector
# make http client session
connector = (
ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
)
# connector = aiohttp.TCPConnector(ssl=False)
connector.verify_ssl = False
cookie_jar = None
@@ -403,126 +509,10 @@ async def maigret(
)
if logger.level == logging.DEBUG:
future = session.get(url="https://icanhazip.com")
ip, status, check_error = await get_response(future, None, logger)
if ip:
logger.debug(f"My IP is: {ip.strip()}")
else:
logger.debug(f"IP requesting {check_error[0]}: {check_error[1]}")
# Results from analysis of all sites
results_total = {}
# First create futures for all requests. This allows for the requests to run in parallel
for site_name, site in site_dict.items():
if site.type != id_type:
continue
if site.disabled and not forced:
logger.debug(f"Site {site.name} is disabled, skipping...")
continue
# Results from analysis of this specific site
results_site = {}
# Record URL of main site and username
results_site["username"] = username
results_site["parsing_enabled"] = is_parsing_enabled
results_site["url_main"] = site.url_main
results_site["cookies"] = (
cookie_jar and cookie_jar.filter_cookies(site.url_main) or None
)
headers = {
"User-Agent": get_random_user_agent(),
}
headers.update(site.headers)
if "url" not in site.__dict__:
logger.error("No URL for site %s", site.name)
# URL of user on site (if it exists)
url = site.url.format(
urlMain=site.url_main, urlSubpath=site.url_subpath, username=username
)
# workaround to prevent slash errors
url = re.sub("(?<!:)/+", "/", url)
# Don't make request if username is invalid for the site
if site.regex_check and re.search(site.regex_check, username) is None:
# No need to do the check at the site: this user name is not allowed.
results_site["status"] = QueryResult(
username, site_name, url, QueryStatus.ILLEGAL
)
results_site["url_user"] = ""
results_site["http_status"] = ""
results_site["response_text"] = ""
query_notify.update(results_site["status"])
else:
# URL of user on site (if it exists)
results_site["url_user"] = url
url_probe = site.url_probe
if url_probe is None:
# Probe URL is normal one seen by people out on the web.
url_probe = url
else:
# There is a special URL for probing existence separate
# from where the user profile normally can be found.
url_probe = url_probe.format(
urlMain=site.url_main,
urlSubpath=site.url_subpath,
username=username,
)
for k, v in site.get_params.items():
url_probe += f"&{k}={v}"
if site.check_type == "status_code" and site.request_head_only:
# In most cases when we are detecting by status code,
# it is not necessary to get the entire body: we can
# detect fine with just the HEAD response.
request_method = session.head
else:
# Either this detect method needs the content associated
# with the GET response, or this specific website will
# not respond properly unless we request the whole page.
request_method = session.get
if site.check_type == "response_url":
# Site forwards request to a different URL if username not
# found. Disallow the redirect so we can capture the
# http status from the original URL request.
allow_redirects = False
else:
# Allow whatever redirect that the site wants to do.
# The final result of the request will be what is available.
allow_redirects = True
future = request_method(
url=url_probe,
headers=headers,
allow_redirects=allow_redirects,
timeout=timeout,
)
# Store future in data for access later
# TODO: move to separate obj
site.request_future = future
# Add this site's results into final dictionary with all of the other results.
results_total[site_name] = results_site
coroutines = []
for sitename, result_obj in results_total.items():
coroutines.append(
(
update_site_dict_from_response,
[sitename, site_dict, result_obj, logger, query_notify],
{},
)
)
await debug_ip_request(session, logger)
# setup parallel executor
executor: Optional[AsyncExecutor] = None
if no_progressbar:
executor = AsyncioSimpleExecutor(logger=logger)
else:
@@ -530,24 +520,68 @@ async def maigret(
logger=logger, in_parallel=max_connections, timeout=timeout + 0.5
)
results = await executor.run(coroutines)
# make options objects for all the requests
options: QueryOptions = {}
options["cookies"] = cookie_jar
options["session"] = session
options["parsing"] = is_parsing_enabled
options["timeout"] = timeout
options["id_type"] = id_type
options["forced"] = forced
# results from analysis of all sites
all_results: Dict[str, QueryResultWrapper] = {}
sites = list(site_dict.keys())
attempts = retries + 1
while attempts:
tasks_dict = {}
for sitename, site in site_dict.items():
if sitename not in sites:
continue
default_result: QueryResultWrapper = {
'site': site,
'status': QueryResult(
username,
sitename,
'',
QueryStatus.UNKNOWN,
error=CheckError('Request failed'),
),
}
tasks_dict[sitename] = (
check_site_for_username,
[site, username, options, logger, query_notify],
{'default': (sitename, default_result)},
)
cur_results = await executor.run(tasks_dict.values())
# wait for executor timeout errors
await asyncio.sleep(1)
all_results.update(cur_results)
sites = get_failed_sites(dict(cur_results))
attempts -= 1
if not sites:
break
if attempts:
query_notify.warning(
f'Restarting checks for {len(sites)} sites... ({attempts} attempts left)'
)
# closing http client session
await session.close()
# Notify caller that all queries are finished.
# notify caller that all queries are finished
query_notify.finish()
data = {}
for result in results:
# TODO: still can be empty
if result:
try:
data[result[0]] = result[1]
except Exception as e:
logger.error(e, exc_info=True)
logger.info(result)
return data
return all_results
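
The `while attempts` loop above is the core of the new `--retries` behaviour: run all checks, collect the temporarily failed sites, and re-run only those while the attempt budget lasts. A minimal standalone sketch of the same pattern — the `checks` callables and the `(ok, temporary_error)` result shape are hypothetical stand-ins, not Maigret's API:

```python
import asyncio
from typing import Awaitable, Callable, Dict, Tuple

async def run_with_retries(
    checks: Dict[str, Callable[[], Awaitable[Tuple[bool, bool]]]],
    retries: int = 1,
) -> Dict[str, bool]:
    results: Dict[str, bool] = {}
    pending = list(checks)   # site names still to (re)check
    attempts = retries + 1   # first pass + retries, as in the diff
    while attempts and pending:
        outcomes = await asyncio.gather(*(checks[name]() for name in pending))
        failed = []
        for name, (ok, temporary_error) in zip(pending, outcomes):
            results[name] = ok
            if not ok and temporary_error:
                failed.append(name)  # permanent failures are not re-queued
        pending = failed
        attempts -= 1
    return results
```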
def timeout_check(value):
@@ -575,7 +609,9 @@ def timeout_check(value):
return timeout
async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
async def site_self_check(
site: MaigretSite, logger, semaphore, db: MaigretDatabase, silent=False
):
changes = {
"disabled": False,
}
@@ -602,6 +638,7 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
id_type=site.type,
forced=True,
no_progressbar=True,
retries=1,
)
# don't disable entries with other ids types
maigret/errors.py +13 -2
@@ -57,6 +57,17 @@ ERRORS_TYPES = {
'Request timeout': 'Try to increase timeout or to switch to another internet service provider',
}
TEMPORARY_ERRORS_TYPES = [
'Request timeout',
'Unknown',
'Request failed',
'Connecting failure',
'HTTP',
'Proxy',
'Interrupted',
'Connection lost',
]
THRESHOLD = 3 # percent
@@ -64,8 +75,8 @@ def is_important(err_data):
return err_data['perc'] >= THRESHOLD
def is_not_permanent(err_data):
return True
def is_permanent(err_type):
return err_type not in TEMPORARY_ERRORS_TYPES
def detect(text):
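
A quick sketch of how the new taxonomy drives retry decisions — `is_permanent` is just a membership test, so any type outside `TEMPORARY_ERRORS_TYPES` counts as permanent ('Captcha' below is an arbitrary stand-in):

```python
from maigret import errors

for err_type in ("Request timeout", "Captcha"):
    if errors.is_permanent(err_type):
        print(err_type, "-> permanent, not retried")
    else:
        print(err_type, "-> temporary, re-queued by get_failed_sites()")
```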
maigret/executors.py +1 -1
@@ -93,7 +93,7 @@ class AsyncioProgressbarQueueExecutor(AsyncExecutor):
try:
result = await asyncio.wait_for(query_task, timeout=self.timeout)
except asyncio.TimeoutError:
result = None
result = kwargs.get('default')
self.results.append(result)
self.progress.update(1)
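
Every scheduled task can now carry a fallback result: checking.py passes `{'default': (sitename, default_result)}` as the task kwargs, and `check_site_for_username` takes `**kwargs`, so the extra key is harmless if forwarded. A simplified sketch of the timeout fallback (not the executor's real queueing logic):

```python
import asyncio

async def slow_check(site: str) -> tuple:
    await asyncio.sleep(10)  # never finishes within the timeout below
    return (site, {"status": "claimed"})

async def run_one(fn, args, kwargs, timeout=0.1):
    try:
        return await asyncio.wait_for(fn(*args), timeout)
    except asyncio.TimeoutError:
        return kwargs.get('default')  # fallback result instead of None

result = asyncio.run(run_one(
    slow_check, ["example.com"],
    {'default': ("example.com", {"status": "unknown"})},
))
assert result == ("example.com", {"status": "unknown"})
```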
maigret/maigret.py +15 -2
@@ -59,7 +59,7 @@ def notify_about_errors(search_results, query_notify):
)
async def main():
def setup_arguments_parser():
version_string = '\n'.join(
[
f'%(prog)s {__version__}',
@@ -148,6 +148,14 @@ async def main():
"A longer timeout will be more likely to get results from slow sites. "
"On the other hand, this may cause a long delay to gather all results. ",
)
parser.add_argument(
"--retries",
action="store",
type=int,
metavar='RETRIES',
default=1,
help="Attempts to restart temporary failed requests.",
)
parser.add_argument(
"-n",
"--max-connections",
@@ -334,8 +342,12 @@ async def main():
help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
" (one report per username).",
)
return parser
args = parser.parse_args()
async def main():
arg_parser = setup_arguments_parser()
args = arg_parser.parse_args()
# Logging
log_level = logging.ERROR
@@ -528,6 +540,7 @@ async def main():
forced=args.use_disabled_sites,
max_connections=args.connections,
no_progressbar=args.no_progressbar,
retries=args.retries,
)
notify_about_errors(results, query_notify)
maigret/resources/data.json +4407 -3417
File diff suppressed because it is too large
maigret/sites.py +35 -29
@@ -3,7 +3,7 @@
import copy
import json
import sys
from typing import Optional
from typing import Optional, List, Dict, Any
import requests
@@ -57,9 +57,10 @@ SUPPORTED_TAGS = [
class MaigretEngine:
site: Dict[str, Any] = {}
def __init__(self, name, data):
self.name = name
self.site = {}
self.__dict__.update(data)
@property
@@ -78,35 +79,40 @@ class MaigretSite:
"urlRegexp",
]
username_claimed = ""
username_unclaimed = ""
url_subpath = ""
url_main = ""
url = ""
disabled = False
similar_search = False
ignore403 = False
tags: List[str] = []
type = "username"
headers: Dict[str, str] = {}
errors: Dict[str, str] = {}
activation: Dict[str, Any] = {}
regex_check = None
url_probe = None
check_type = ""
request_head_only = ""
get_params: Dict[str, Any] = {}
presense_strs: List[str] = []
absence_strs: List[str] = []
stats: Dict[str, Any] = {}
engine = None
engine_data: Dict[str, Any] = {}
engine_obj: Optional["MaigretEngine"] = None
request_future = None
alexa_rank = None
source = None
def __init__(self, name, information):
self.name = name
self.disabled = False
self.similar_search = False
self.ignore403 = False
self.tags = []
self.type = "username"
self.headers = {}
self.errors = {}
self.activation = {}
self.url_subpath = ""
self.regex_check = None
self.url_probe = None
self.check_type = ""
self.request_head_only = ""
self.get_params = {}
self.presense_strs = []
self.absence_strs = []
self.stats = {}
self.engine = None
self.engine_data = {}
self.engine_obj = None
self.request_future = None
self.alexa_rank = None
self.source = None
for k, v in information.items():
self.__dict__[CaseConverter.camel_to_snake(k)] = v
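
The per-instance defaults previously assigned in `__init__` are now annotated class-level defaults, overridden per instance via `__dict__` assignment. One caveat of this pattern, shown in a minimal sketch (hypothetical `Site` class): mutable class defaults are shared between instances until an instance gets its own value, so they must never be mutated in place:

```python
from typing import Any, Dict, List

class Site:
    tags: List[str] = []            # shared class default
    headers: Dict[str, str] = {}

    def __init__(self, name: str, information: Dict[str, Any]):
        self.name = name
        self.__dict__.update(information)  # per-instance override

a = Site("a", {"tags": ["forum"]})
b = Site("b", {})
assert a.tags == ["forum"]
assert b.tags == [] and b.tags is Site.tags  # b still reads the class list
```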
@@ -193,7 +199,7 @@ class MaigretSite:
self.url_regexp = None
self_copy = copy.deepcopy(self)
engine_data = self_copy.engine_obj.site
engine_data = self_copy.engine_obj and self_copy.engine_obj.site or {}
site_data_keys = list(self_copy.__dict__.keys())
for k in engine_data.keys():
maigret/types.py +8 -2
@@ -1,5 +1,11 @@
from typing import Callable, Any, Tuple
from typing import Callable, List, Dict, Tuple, Any
# search query
QueryDraft = Tuple[Callable, Any, Any]
QueryDraft = Tuple[Callable, List, Dict]
# options dict
QueryOptions = Dict[str, Any]
# TODO: throw out
QueryResultWrapper = Dict[str, Any]
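
For illustration, a `QueryDraft` under the tightened signature — the `(function, args, kwargs)` shape the executors consume; the `check` coroutine here is a hypothetical stand-in:

```python
import asyncio
from typing import Callable, Dict, List, Tuple

QueryDraft = Tuple[Callable, List, Dict]

async def check(site: str, user: str, **kwargs) -> str:
    return f"{user}@{site}"

draft: QueryDraft = (check, ["example.com", "soxoj"], {"default": None})
fn, args, kwargs = draft
print(asyncio.run(fn(*args, **kwargs)))  # -> soxoj@example.com
```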