Improved submit mode, several sites added

2026-05-07 06:24:35 +00:00 · 2021-03-21 13:59:59 +03:00
parent b586a4cd06
commit 2bb01f7019
6 changed files with 2746 additions and 2864 deletions
@@ -44,6 +44,7 @@ unsupported_characters = '#'
 # One query draft: a 3-tuple whose first element is a callable.
 # NOTE(review): the two `Any` slots are presumably the positional and keyword
 # arguments for that callable — confirm against the executor that consumes them.
 QueryDraft = Tuple[Callable, Any, Any]
 # Any iterable of query drafts.
 QueriesDraft = Iterable[QueryDraft]
 class AsyncExecutor:
    def __init__(self, *args, **kwargs):
        self.logger = kwargs['logger']
@@ -472,7 +473,7 @@ async def maigret(username, site_dict, query_notify, logger,
        headers.update(site.headers)
-        if not 'url' in site.__dict__:
+        if 'url' not in site.__dict__:
            logger.error('No URL for site %s', site.name)
        # URL of user on site (if it exists)
        url = site.url.format(
@@ -1,19 +1,23 @@
 """
 Maigret main module
 """
-
+import aiohttp
 import asyncio
 import logging
 import os
 import sys
 import platform
 from argparse import ArgumentParser, RawDescriptionHelpFormatter
 import requests
-from socid_extractor import parse, __version__ as socid_version
+from socid_extractor import extract, parse, __version__ as socid_version
-from .checking import *
+from .checking import timeout_check, supported_recursive_search_ids, self_check, unsupported_characters, maigret
 from .notify import QueryNotifyPrint
 from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \
    generate_report_context, save_txt_report, SUPPORTED_JSON_REPORT_FORMATS, check_supported_json_format, \
    save_json_report
 from .sites import MaigretDatabase
 from .submit import submit_dialog
 __version__ = '0.1.15'
@@ -273,7 +277,6 @@ async def main():
    # Make reports folder is not exists
    os.makedirs(args.folderoutput, exist_ok=True)
    report_path = args.folderoutput
    # Define one report filename template
    report_filepath_tpl = os.path.join(args.folderoutput, 'report_{username}{postfix}')
@@ -15,6 +15,7 @@ SUPPORTED_TAGS = [
    'discussion', 'sharing', 'writing', 'wiki', 'business', 'shopping', 'sport',
    'books', 'news', 'documents', 'travel', 'maps', 'hobby', 'apps', 'classified',
    'career', 'geosocial', 'streaming', 'education', 'networking', 'torrent',
    'science', 'medicine',
 ]
@@ -154,6 +155,7 @@ class MaigretSite:
            # remove dict keys
            if isinstance(engine_data[k], dict) and is_exists:
                for f in engine_data[k].keys():
                    if f in self_copy.__dict__[field]:
                        del self_copy.__dict__[field][f]
                continue
            # remove list items
@@ -1,13 +1,15 @@
 import difflib
 import requests
 from mock import Mock
 from .checking import *
 # Words (English and Russian) that tend to appear on profile / "not found" pages.
 # NOTE(review): presumably used to pick presence/absence features in the manual
 # check — the code that reads this list is outside this view, confirm there.
 DESIRED_STRINGS = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
                   "birthday", "репутация", "информация", "e-mail"]
 # Usernames tried one by one as `usernameClaimed` when a known engine is
 # detected (see detect_known_engine).
 SUPPOSED_USERNAMES = ['alex', 'god', 'admin', 'red', 'blue', 'john']
 # NOTE(review): looks like a similarity threshold for the match-ratio helper —
 # its use is not visible here, confirm.
 RATIO = 0.6
 # NOTE(review): presumably how many top features are offered to the user —
 # its use is not visible here, confirm.
 TOP_FEATURES = 5
 # Matches "http://" or "https://" followed by an optional "www."; submit_dialog
 # removes every match from the URL to get the bare domain.
 URL_RE = re.compile(r'https?://(www\.)?')
@@ -20,7 +22,7 @@ def get_match_ratio(x):
    ]), 2)
-def extract_domain(url):
def extract_mainpage_url(url):
    """
    Return the main-page URL (``scheme://host``) of the given URL.

    Everything after the host is dropped: the path, and also a query string
    or fragment glued directly to the host (``https://site.com?tab=1``),
    which a plain split on ``/`` would otherwise keep in the result.

    :param url: full URL, e.g. a link to an existing profile.
    :return: the first three ``/``-separated parts joined back together,
        with any ``?query`` / ``#fragment`` tail removed. A string without
        ``//`` goes through the same steps, so ``site.com/user`` is
        returned unchanged.
    """
    mainpage = '/'.join(url.split('/', 3)[:3])
    # '?' and '#' start the query / fragment, they are not part of
    # scheme://host, so cut the string at the first one of each
    for separator in ('?', '#'):
        mainpage = mainpage.partition(separator)[0]
    return mainpage


# The function used to be called extract_domain; keep the old name available
# so callers that were not renamed keep working.
extract_domain = extract_mainpage_url
@@ -38,7 +40,6 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
    logger.info(f'Checking {site.name}...')
    for username, status in check_data:
        async with semaphore:
        results_dict = await maigret(
            username,
            {site.name: site},
@@ -84,18 +85,45 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
    return changes
-async def submit_dialog(db, url_exists, cookie_file):
async def detect_known_engine(db, url_exists, url_mainpage, timeout=10):
    """
    Try to recognise the site engine by the content of the site's main page.

    The main page is downloaded and compared with the ``presenseStrs`` of
    every engine in ``db.engines``. For the first engine whose strings are
    all found in the page, candidate sites are built on top of that engine,
    one per username in ``SUPPOSED_USERNAMES``.

    :param db: Maigret database; ``db.engines`` and ``db.engines_dict`` are read.
    :param url_exists: URL of an existing profile (not used by this function).
    :param url_mainpage: URL of the site's main page to download.
    :param timeout: seconds to wait for the main page before giving up.
    :return: list of ``MaigretSite`` candidates, or ``None`` when the page
        could not be fetched or no engine matched.
    """
    try:
        # without a timeout requests may wait on a silent server indefinitely
        response = requests.get(url_mainpage, timeout=timeout)
    except Exception as error:
        # deliberate best effort: any failure just means "engine unknown"
        print(error)
        print('Some error while checking main page')
        return None

    # a requests Response is falsy for 4xx/5xx status codes
    if not response or not response.text:
        return None
    page_text = response.text

    # for a scheme://host main-page URL the last '/'-separated part is the host;
    # the same value is used for the constructor name and the 'name' field
    # (the field used to be split('//')[0], i.e. the scheme such as 'https:')
    site_name = url_mainpage.split('/')[-1]

    for engine in db.engines:
        strs_to_check = engine.__dict__.get('presenseStrs')
        if not strs_to_check:
            continue
        if not all(s in page_text for s in strs_to_check):
            continue

        engine_name = engine.__dict__.get('name')
        print(f'Detected engine {engine_name} for site {url_mainpage}')

        sites = []
        for username in SUPPOSED_USERNAMES:
            site_data = {
                'urlMain': url_mainpage,
                'name': site_name,
                'engine': engine_name,
                'usernameClaimed': username,
                'usernameUnclaimed': 'noonewouldeverusethis7',
            }
            maigret_site = MaigretSite(site_name, site_data)
            maigret_site.update_from_engine(db.engines_dict[engine_name])
            sites.append(maigret_site)
        return sites

    return None
 async def check_features_manually(db, url_exists, url_mainpage, cookie_file):
    url_parts = url_exists.split('/')
    supposed_username = url_parts[-1]
    new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
@@ -138,21 +166,40 @@ async def submit_dialog(db, url_exists, cookie_file):
    if features:
        absence_list = features.split(',')
    url_main = extract_domain(url_exists)
    site_data = {
        'absenceStrs': absence_list,
        'presenseStrs': presence_list,
        'url': url_user,
-        'urlMain': url_main,
+        'urlMain': url_mainpage,
        'usernameClaimed': supposed_username,
        'usernameUnclaimed': non_exist_username,
        'checkType': 'message',
    }
-    site = MaigretSite(url_main.split('/')[-1], site_data)
+    site = MaigretSite(url_mainpage.split('/')[-1], site_data)
    return site
-    print(site.__dict__)
+async def submit_dialog(db, url_exists, cookie_file):
    domain_raw = URL_RE.sub('', url_exists).strip().strip('/')
    domain_raw = domain_raw.split('/')[0]
    # check for existence
    matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
    if matched_sites:
        print(f'Sites with domain "{domain_raw}" already exists in the Maigret database!')
        status = lambda s: '(disabled)' if s.disabled else ''
        url_block = lambda s: f'\n\t{s.url_main}\n\t{s.url}'
        print('\n'.join([f'{site.name} {status(site)}{url_block(site)}' for site in matched_sites]))
        return False
    url_mainpage = extract_mainpage_url(url_exists)
    sites = await detect_known_engine(db, url_exists, url_mainpage)
    if not sites:
        print('Unable to detect site engine, lets generate checking features')
        sites = [await check_features_manually(db, url_exists, url_mainpage, cookie_file)]
    print(sites[0].__dict__)
    sem = asyncio.Semaphore(1)
    log_level = logging.INFO
@@ -164,14 +211,24 @@ async def submit_dialog(db, url_exists, cookie_file):
    logger = logging.getLogger('site-submit')
    logger.setLevel(log_level)
-    result = await site_self_check(site, logger, sem, db)
+    found = False
    chosen_site = None
    for s in sites:
        chosen_site = s
        result = await site_self_check(s, logger, sem, db)
        if not result['disabled']:
            found = True
            break
-    if result['disabled']:
+    if not found:
-        print(f'Sorry, we couldn\'t find params to detect account presence/absence in {site.name}.')
+        print(f'Sorry, we couldn\'t find params to detect account presence/absence in {chosen_site.name}.')
        print('Try to run this mode again and increase features count or choose others.')
    else:
-        if input(f'Site {site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] ').lower() in 'y':
+        if input(f'Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] ').lower() in 'y':
-            db.update_site(site)
+            print(chosen_site.json)
            site_data = chosen_site.strip_engine_data()
            print(site_data.json)
            db.update_site(site_data)
            return True
    return False