Refactoring and linting, added notifications about frequent search errors

Soxoj
2021-04-30 12:03:13 +03:00
parent bfaf276f6e
commit bfa6afac32
20 changed files with 1351 additions and 787 deletions
+380 -204
@@ -12,11 +12,26 @@ from argparse import ArgumentParser, RawDescriptionHelpFormatter
import requests
from socid_extractor import extract, parse, __version__ as socid_version
from .checking import timeout_check, supported_recursive_search_ids, self_check, unsupported_characters, maigret
from .checking import (
timeout_check,
supported_recursive_search_ids,
self_check,
unsupported_characters,
maigret,
)
from . import errors
from .notify import QueryNotifyPrint
from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \
generate_report_context, save_txt_report, SUPPORTED_JSON_REPORT_FORMATS, check_supported_json_format, \
save_json_report
from .report import (
save_csv_report,
save_xmind_report,
save_html_report,
save_pdf_report,
generate_report_context,
save_txt_report,
SUPPORTED_JSON_REPORT_FORMATS,
check_supported_json_format,
save_json_report,
)
from .sites import MaigretDatabase
from .submit import submit_dialog
from .utils import get_dict_ascii_tree
@@ -24,168 +39,301 @@ from .utils import get_dict_ascii_tree
__version__ = '0.1.19'
async def main():
version_string = '\n'.join([
f'%(prog)s {__version__}',
f'Socid-extractor: {socid_version}',
f'Aiohttp: {aiohttp.__version__}',
f'Requests: {requests.__version__}',
f'Python: {platform.python_version()}',
])
def notify_about_errors(search_results, query_notify):
errs = errors.extract_and_group(search_results.values())
was_errs_displayed = False
for e in errs:
if not errors.is_important(e):
continue
text = f'Too many errors of type "{e["err"]}" ({e["perc"]}%)'
solution = errors.solution_of(e['err'])
if solution:
text = '. '.join([text, solution])
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter,
description=f"Maigret v{__version__}"
)
parser.add_argument("--version",
action="version", version=version_string,
help="Display version information and dependencies."
)
parser.add_argument("--info", "-vv",
action="store_true", dest="info", default=False,
help="Display service information."
)
parser.add_argument("--verbose", "-v",
action="store_true", dest="verbose", default=False,
help="Display extra information and metrics."
)
parser.add_argument("-d", "--debug", "-vvv",
action="store_true", dest="debug", default=False,
help="Saving debugging information and sites responses in debug.txt."
)
parser.add_argument("--site",
action="append", metavar='SITE_NAME',
dest="site_list", default=[],
help="Limit analysis to just the listed sites (use several times to specify more than one)"
)
parser.add_argument("--proxy", "-p", metavar='PROXY_URL',
action="store", dest="proxy", default=None,
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080"
)
parser.add_argument("--db", metavar="DB_FILE",
dest="db_file", default=None,
help="Load Maigret database from a JSON file or an online, valid, JSON file.")
parser.add_argument("--cookies-jar-file", metavar="COOKIE_FILE",
dest="cookie_file", default=None,
help="File with cookies.")
parser.add_argument("--timeout",
action="store", metavar='TIMEOUT',
dest="timeout", type=timeout_check, default=10,
help="Time (in seconds) to wait for response to requests."
"Default timeout of 10.0s. "
"A longer timeout will be more likely to get results from slow sites."
"On the other hand, this may cause a long delay to gather all results."
)
parser.add_argument("-n", "--max-connections",
action="store", type=int,
dest="connections", default=100,
help="Allowed number of concurrent connections."
)
parser.add_argument("-a", "--all-sites",
action="store_true", dest="all_sites", default=False,
help="Use all sites for scan."
)
parser.add_argument("--top-sites",
action="store", default=500, type=int,
help="Count of sites for scan ranked by Alexa Top (default: 500)."
)
parser.add_argument("--print-not-found",
action="store_true", dest="print_not_found", default=False,
help="Print sites where the username was not found."
)
parser.add_argument("--print-errors",
action="store_true", dest="print_check_errors", default=False,
help="Print errors messages: connection, captcha, site country ban, etc."
)
parser.add_argument("--submit", metavar='EXISTING_USER_URL',
type=str, dest="new_site_to_submit", default=False,
help="URL of existing profile in new site to submit."
)
parser.add_argument("--no-color",
action="store_true", dest="no_color", default=False,
help="Don't color terminal output"
)
parser.add_argument("--no-progressbar",
action="store_true", dest="no_progressbar", default=False,
help="Don't show progressbar."
)
parser.add_argument("--browse", "-b",
action="store_true", dest="browse", default=False,
help="Browse to all results on default bowser."
)
parser.add_argument("--no-recursion",
action="store_true", dest="disable_recursive_search", default=False,
help="Disable recursive search by additional data extracted from pages."
)
parser.add_argument("--no-extracting",
action="store_true", dest="disable_extracting", default=False,
help="Disable parsing pages for additional data and other usernames."
)
parser.add_argument("--self-check",
action="store_true", default=False,
help="Do self check for sites and database and disable non-working ones."
)
parser.add_argument("--stats",
action="store_true", default=False,
help="Show database statistics."
)
parser.add_argument("--use-disabled-sites",
action="store_true", default=False,
help="Use disabled sites to search (may cause many false positives)."
)
parser.add_argument("--parse",
dest="parse_url", default='',
help="Parse page by URL and extract username and IDs to use for search."
)
parser.add_argument("--id-type",
dest="id_type", default='username',
help="Specify identifier(s) type (default: username)."
)
parser.add_argument("--ignore-ids",
action="append", metavar='IGNORED_IDS',
dest="ignore_ids_list", default=[],
help="Do not make search by the specified username or other ids."
)
parser.add_argument("username",
nargs='+', metavar='USERNAMES',
action="store",
help="One or more usernames to check with social networks."
)
parser.add_argument("--tags",
dest="tags", default='',
help="Specify tags of sites."
)
query_notify.warning(text, '!')
was_errs_displayed = True
if was_errs_displayed:
query_notify.warning(
'You can see detailed site check errors with the `--print-errors` flag'
)
async def main():
version_string = '\n'.join(
[
f'%(prog)s {__version__}',
f'Socid-extractor: {socid_version}',
f'Aiohttp: {aiohttp.__version__}',
f'Requests: {requests.__version__}',
f'Python: {platform.python_version()}',
]
)
parser = ArgumentParser(
formatter_class=RawDescriptionHelpFormatter,
description=f"Maigret v{__version__}",
)
parser.add_argument(
"--version",
action="version",
version=version_string,
help="Display version information and dependencies.",
)
parser.add_argument(
"--info",
"-vv",
action="store_true",
dest="info",
default=False,
help="Display service information.",
)
parser.add_argument(
"--verbose",
"-v",
action="store_true",
dest="verbose",
default=False,
help="Display extra information and metrics.",
)
parser.add_argument(
"-d",
"--debug",
"-vvv",
action="store_true",
dest="debug",
default=False,
help="Saving debugging information and sites responses in debug.txt.",
)
parser.add_argument(
"--site",
action="append",
metavar='SITE_NAME',
dest="site_list",
default=[],
help="Limit analysis to just the listed sites (use several times to specify more than one)",
)
parser.add_argument(
"--proxy",
"-p",
metavar='PROXY_URL',
action="store",
dest="proxy",
default=None,
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080",
)
parser.add_argument(
"--db",
metavar="DB_FILE",
dest="db_file",
default=None,
help="Load Maigret database from a JSON file or an online, valid, JSON file.",
)
parser.add_argument(
"--cookies-jar-file",
metavar="COOKIE_FILE",
dest="cookie_file",
default=None,
help="File with cookies.",
)
parser.add_argument(
"--timeout",
action="store",
metavar='TIMEOUT',
dest="timeout",
type=timeout_check,
default=30,
help="Time (in seconds) to wait for response to requests. "
"Default timeout of 30.0s. "
"A longer timeout will be more likely to get results from slow sites. "
"On the other hand, this may cause a long delay to gather all results. ",
)
parser.add_argument(
"-n",
"--max-connections",
action="store",
type=int,
dest="connections",
default=100,
help="Allowed number of concurrent connections.",
)
parser.add_argument(
"-a",
"--all-sites",
action="store_true",
dest="all_sites",
default=False,
help="Use all sites for scan.",
)
parser.add_argument(
"--top-sites",
action="store",
default=500,
type=int,
help="Count of sites for scan ranked by Alexa Top (default: 500).",
)
parser.add_argument(
"--print-not-found",
action="store_true",
dest="print_not_found",
default=False,
help="Print sites where the username was not found.",
)
parser.add_argument(
"--print-errors",
action="store_true",
dest="print_check_errors",
default=False,
help="Print errors messages: connection, captcha, site country ban, etc.",
)
parser.add_argument(
"--submit",
metavar='EXISTING_USER_URL',
type=str,
dest="new_site_to_submit",
default=False,
help="URL of existing profile in new site to submit.",
)
parser.add_argument(
"--no-color",
action="store_true",
dest="no_color",
default=False,
help="Don't color terminal output",
)
parser.add_argument(
"--no-progressbar",
action="store_true",
dest="no_progressbar",
default=False,
help="Don't show progressbar.",
)
parser.add_argument(
"--browse",
"-b",
action="store_true",
dest="browse",
default=False,
help="Browse to all results on default bowser.",
)
parser.add_argument(
"--no-recursion",
action="store_true",
dest="disable_recursive_search",
default=False,
help="Disable recursive search by additional data extracted from pages.",
)
parser.add_argument(
"--no-extracting",
action="store_true",
dest="disable_extracting",
default=False,
help="Disable parsing pages for additional data and other usernames.",
)
parser.add_argument(
"--self-check",
action="store_true",
default=False,
help="Do self check for sites and database and disable non-working ones.",
)
parser.add_argument(
"--stats", action="store_true", default=False, help="Show database statistics."
)
parser.add_argument(
"--use-disabled-sites",
action="store_true",
default=False,
help="Use disabled sites to search (may cause many false positives).",
)
parser.add_argument(
"--parse",
dest="parse_url",
default='',
help="Parse page by URL and extract username and IDs to use for search.",
)
parser.add_argument(
"--id-type",
dest="id_type",
default='username',
help="Specify identifier(s) type (default: username).",
)
parser.add_argument(
"--ignore-ids",
action="append",
metavar='IGNORED_IDS',
dest="ignore_ids_list",
default=[],
help="Do not make search by the specified username or other ids.",
)
parser.add_argument(
"username",
nargs='+',
metavar='USERNAMES',
action="store",
help="One or more usernames to check with social networks.",
)
parser.add_argument(
"--tags", dest="tags", default='', help="Specify tags of sites."
)
# reports options
parser.add_argument("--folderoutput", "-fo", dest="folderoutput", default="reports",
help="If using multiple usernames, the output of the results will be saved to this folder."
)
parser.add_argument("-T", "--txt",
action="store_true", dest="txt", default=False,
help="Create a TXT report (one report per username)."
)
parser.add_argument("-C", "--csv",
action="store_true", dest="csv", default=False,
help="Create a CSV report (one report per username)."
)
parser.add_argument("-H", "--html",
action="store_true", dest="html", default=False,
help="Create an HTML report file (general report on all usernames)."
)
parser.add_argument("-X", "--xmind",
action="store_true",
dest="xmind", default=False,
help="Generate an XMind 8 mindmap report (one report per username)."
)
parser.add_argument("-P", "--pdf",
action="store_true",
dest="pdf", default=False,
help="Generate a PDF report (general report on all usernames)."
)
parser.add_argument("-J", "--json",
action="store", metavar='REPORT_TYPE',
dest="json", default='', type=check_supported_json_format,
help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
" (one report per username)."
)
parser.add_argument(
"--folderoutput",
"-fo",
dest="folderoutput",
default="reports",
help="If using multiple usernames, the output of the results will be saved to this folder.",
)
parser.add_argument(
"-T",
"--txt",
action="store_true",
dest="txt",
default=False,
help="Create a TXT report (one report per username).",
)
parser.add_argument(
"-C",
"--csv",
action="store_true",
dest="csv",
default=False,
help="Create a CSV report (one report per username).",
)
parser.add_argument(
"-H",
"--html",
action="store_true",
dest="html",
default=False,
help="Create an HTML report file (general report on all usernames).",
)
parser.add_argument(
"-X",
"--xmind",
action="store_true",
dest="xmind",
default=False,
help="Generate an XMind 8 mindmap report (one report per username).",
)
parser.add_argument(
"-P",
"--pdf",
action="store_true",
dest="pdf",
default=False,
help="Generate a PDF report (general report on all usernames).",
)
parser.add_argument(
"-J",
"--json",
action="store",
metavar='REPORT_TYPE',
dest="json",
default='',
type=check_supported_json_format,
help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
" (one report per username).",
)
args = parser.parse_args()
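The new notify_about_errors helper depends on three functions imported above via `from . import errors` but not shown in this diff: extract_and_group, is_important, and solution_of. A minimal sketch of the interface implied by the call sites — the function names come from the diff, but the bodies, field names, and threshold below are assumptions, not the project's actual implementation:

# Illustrative sketch of the maigret.errors interface; the bodies are
# guesses reconstructed from how notify_about_errors calls them.
from collections import Counter
from typing import Iterable, List, Optional

def extract_and_group(results: Iterable[dict]) -> List[dict]:
    # Assumed result shape: each site result may carry an 'error' field.
    results = list(results)
    counter = Counter(r.get('error') for r in results if r.get('error'))
    total = max(len(results), 1)
    # Entries use the {'err': ..., 'perc': ...} keys read by the caller.
    return [
        {'err': err, 'perc': round(100 * count / total)}
        for err, count in counter.most_common()
    ]

def is_important(grouped: dict) -> bool:
    # Assumed threshold: only frequent error types trigger a notification.
    return grouped['perc'] >= 10

def solution_of(err_type: str) -> Optional[str]:
    # Assumed lookup of a human-readable hint for known error types.
    hints = {'CAPTCHA': 'Try sending requests through a proxy (--proxy).'}
    return hints.get(err_type)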
@@ -194,7 +342,7 @@ async def main():
logging.basicConfig(
format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
datefmt='%H:%M:%S',
level=log_level
level=log_level,
)
if args.debug:
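For reference, the configured format yields log lines of this shape (the file name, line number, and message are illustrative):

[maigret.py:412] WARNING 12:03:13 Starting a search on top 500 sites...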
@@ -211,8 +359,7 @@ async def main():
usernames = {
u: args.id_type
for u in args.username
if u not in ['-']
and u not in args.ignore_ids_list
if u not in ['-'] and u not in args.ignore_ids_list
}
parsing_enabled = not args.disable_extracting
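A quick worked example of the usernames filter above, under hypothetical inputs ('-' is the placeholder value the comprehension skips):

# Hypothetical inputs for illustration:
#   args.username = ['alice', '-', 'bob']
#   args.id_type = 'username'
#   args.ignore_ids_list = ['bob']
# The comprehension then yields:
#   usernames == {'alice': 'username'}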
@@ -228,8 +375,10 @@ async def main():
try:
# temporary workaround for URL mutations MVP
from socid_extractor import mutate_url
reqs += list(mutate_url(args.parse_url))
except:
except Exception as e:
logger.warning(e)
for req in reqs:
@@ -251,38 +400,47 @@ async def main():
args.tags = list(set(str(args.tags).split(',')))
if args.db_file is None:
args.db_file = \
os.path.join(os.path.dirname(os.path.realpath(__file__)),
"resources/data.json"
)
args.db_file = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "resources/data.json"
)
if args.top_sites == 0 or args.all_sites:
args.top_sites = sys.maxsize
# Create notify object for query results.
query_notify = QueryNotifyPrint(result=None,
verbose=args.verbose,
print_found_only=not args.print_not_found,
skip_check_errors=not args.print_check_errors,
color=not args.no_color)
query_notify = QueryNotifyPrint(
result=None,
verbose=args.verbose,
print_found_only=not args.print_not_found,
skip_check_errors=not args.print_check_errors,
color=not args.no_color,
)
# Create object with all information about sites we are aware of.
db = MaigretDatabase().load_from_file(args.db_file)
get_top_sites_for_id = lambda x: db.ranked_sites_dict(top=args.top_sites, tags=args.tags,
names=args.site_list,
disabled=False, id_type=x)
get_top_sites_for_id = lambda x: db.ranked_sites_dict(
top=args.top_sites,
tags=args.tags,
names=args.site_list,
disabled=False,
id_type=x,
)
site_data = get_top_sites_for_id(args.id_type)
if args.new_site_to_submit:
is_submitted = await submit_dialog(db, args.new_site_to_submit, args.cookie_file, logger)
is_submitted = await submit_dialog(
db, args.new_site_to_submit, args.cookie_file, logger
)
if is_submitted:
db.save_to_file(args.db_file)
# Database self-checking
if args.self_check:
print('Maigret sites database self-checking...')
is_need_update = await self_check(db, site_data, logger, max_connections=args.connections)
is_need_update = await self_check(
db, site_data, logger, max_connections=args.connections
)
if is_need_update:
if input('Do you want to save changes permanently? [Yn]\n').lower() == 'y':
db.save_to_file(args.db_file)
@@ -314,9 +472,13 @@ async def main():
query_notify.warning('No sites to check, exiting!')
sys.exit(2)
else:
query_notify.warning(f'Starting a search on top {len(site_data)} sites from the Maigret database...')
query_notify.warning(
f'Starting a search on top {len(site_data)} sites from the Maigret database...'
)
if not args.all_sites:
query_notify.warning(f'You can run a search over the full list of sites with the `-a` flag', '!')
query_notify.warning(
'You can run a search over the full list of sites with the `-a` flag', '!'
)
already_checked = set()
general_results = []
@@ -331,34 +493,44 @@ async def main():
already_checked.add(username.lower())
if username in args.ignore_ids_list:
query_notify.warning(f'Skipping the search by username {username} because it\'s marked as ignored.')
query_notify.warning(
f'Skipping the search by username {username} because it\'s marked as ignored.'
)
continue
# check for characters not supported by sites generally
found_unsupported_chars = set(unsupported_characters).intersection(set(username))
found_unsupported_chars = set(unsupported_characters).intersection(
set(username)
)
if found_unsupported_chars:
pretty_chars_str = ','.join(map(lambda s: f'"{s}"', found_unsupported_chars))
pretty_chars_str = ','.join(
map(lambda s: f'"{s}"', found_unsupported_chars)
)
query_notify.warning(
f'Found unsupported URL characters: {pretty_chars_str}, skipping the search by username "{username}"')
f'Found unsupported URL characters: {pretty_chars_str}, skipping the search by username "{username}"'
)
continue
sites_to_check = get_top_sites_for_id(id_type)
results = await maigret(username=username,
site_dict=dict(sites_to_check),
query_notify=query_notify,
proxy=args.proxy,
timeout=args.timeout,
is_parsing_enabled=parsing_enabled,
id_type=id_type,
debug=args.verbose,
logger=logger,
cookies=args.cookie_file,
forced=args.use_disabled_sites,
max_connections=args.connections,
no_progressbar=args.no_progressbar,
)
results = await maigret(
username=username,
site_dict=dict(sites_to_check),
query_notify=query_notify,
proxy=args.proxy,
timeout=args.timeout,
is_parsing_enabled=parsing_enabled,
id_type=id_type,
debug=args.verbose,
logger=logger,
cookies=args.cookie_file,
forced=args.use_disabled_sites,
max_connections=args.connections,
no_progressbar=args.no_progressbar,
)
notify_about_errors(results, query_notify)
general_results.append((username, id_type, results))
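A minimal driver showing the new notification step in action once a search finishes — the result shape and the notifier stub are assumptions consistent with the errors-module sketch above, not maigret's real classes:

# Hypothetical stub standing in for QueryNotifyPrint.
class StubNotify:
    def warning(self, text, symbol='-'):
        print(f'[{symbol}] {text}')

fake_results = {
    'SiteA': {'error': 'CAPTCHA'},
    'SiteB': {'error': 'CAPTCHA'},
    'SiteC': {'error': None},
}
notify_about_errors(fake_results, StubNotify())
# Expected output under the assumed errors module (2 of 3 checks failed):
# [!] Too many errors of type "CAPTCHA" (67%). Try sending requests through a proxy (--proxy).
# [-] You can see detailed site check errors with the `--print-errors` flag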
@@ -397,9 +569,13 @@ async def main():
query_notify.warning(f'TXT report for {username} saved in {filename}')
if args.json:
filename = report_filepath_tpl.format(username=username, postfix=f'_{args.json}.json')
filename = report_filepath_tpl.format(
username=username, postfix=f'_{args.json}.json'
)
save_json_report(filename, username, results, report_type=args.json)
query_notify.warning(f'JSON {args.json} report for {username} saved in {filename}')
query_notify.warning(
f'JSON {args.json} report for {username} saved in {filename}'
)
# reporting for all the results
if general_results: