From 90135d467681d3892f06a42dffc826053a162190 Mon Sep 17 00:00:00 2001 From: Soxoj Date: Tue, 9 Feb 2021 00:43:59 +0300 Subject: [PATCH] Experimental site submit mode --- maigret/checking.py | 601 +++++++++++++++++++++++++++++++++++ maigret/maigret.py | 617 +----------------------------------- maigret/resources/data.json | 30 +- maigret/submit.py | 161 ++++++++++ 4 files changed, 807 insertions(+), 602 deletions(-) create mode 100644 maigret/checking.py create mode 100644 maigret/submit.py diff --git a/maigret/checking.py b/maigret/checking.py new file mode 100644 index 0000000..d0c5300 --- /dev/null +++ b/maigret/checking.py @@ -0,0 +1,601 @@ +import asyncio +import logging +import re +import ssl + +import aiohttp +import tqdm.asyncio +from aiohttp_socks import ProxyConnector +from mock import Mock +from python_socks import _errors as proxy_errors +from socid_extractor import extract + +from .activation import ParsingActivator, import_aiohttp_cookies +from .result import QueryResult, QueryStatus +from .sites import MaigretDatabase, MaigretSite + +supported_recursive_search_ids = ( + 'yandex_public_id', + 'gaia_id', + 'vk_id', + 'ok_id', + 'wikimapia_uid', +) + +common_errors = { + 'Attention Required! | Cloudflare': 'Cloudflare captcha', + 'Please stand by, while we are checking your browser': 'Cloudflare captcha', + 'Доступ ограничен': 'Rostelecom censorship', + 'document.getElementById(\'validate_form_submit\').disabled=true': 'Mail.ru captcha', + 'Verifying your browser, please wait...
DDoS Protection by Blazingfast.io': 'Blazingfast protection', + '404

Мы не нашли страницу': 'MegaFon 404 page', + 'Доступ к информационному ресурсу ограничен на основании Федерального закона': 'MGTS censorship', + 'Incapsula incident ID': 'Incapsula antibot protection', +} + +unsupported_characters = '#' + + +async def get_response(request_future, site_name, logger): + html_text = None + status_code = 0 + + error_text = "General Unknown Error" + expection_text = None + + try: + response = await request_future + + status_code = response.status + response_content = await response.content.read() + charset = response.charset or 'utf-8' + decoded_content = response_content.decode(charset, 'ignore') + html_text = decoded_content + + if status_code > 0: + error_text = None + + logger.debug(html_text) + + except asyncio.TimeoutError as errt: + error_text = "Timeout Error" + expection_text = str(errt) + except (ssl.SSLCertVerificationError, ssl.SSLError) as err: + error_text = "SSL Error" + expection_text = str(err) + except aiohttp.client_exceptions.ClientConnectorError as err: + error_text = "Error Connecting" + expection_text = str(err) + except aiohttp.http_exceptions.BadHttpMessage as err: + error_text = "HTTP Error" + expection_text = str(err) + except proxy_errors.ProxyError as err: + error_text = "Proxy Error" + expection_text = str(err) + except Exception as err: + logger.warning(f'Unhandled error while requesting {site_name}: {err}') + logger.debug(err, exc_info=True) + error_text = "Some Error" + expection_text = str(err) + + # TODO: return only needed information + return html_text, status_code, error_text, expection_text + + +async def update_site_dict_from_response(sitename, site_dict, results_info, semaphore, logger, query_notify): + async with semaphore: + site_obj = site_dict[sitename] + future = site_obj.request_future + if not future: + # ignore: search by incompatible id type + return + + response = await get_response(request_future=future, + site_name=sitename, + logger=logger) + + site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj) + + +# TODO: move info separate module +def detect_error_page(html_text, status_code, fail_flags, ignore_403): + # Detect service restrictions such as a country restriction + for flag, msg in fail_flags.items(): + if flag in html_text: + return 'Some site error', msg + + # Detect common restrictions such as provider censorship and bot protection + for flag, msg in common_errors.items(): + if flag in html_text: + return 'Error', msg + + # Detect common site errors + if status_code == 403 and not ignore_403: + return 'Access denied', 'Access denied, use proxy/vpn' + elif status_code >= 500: + return f'Error {status_code}', f'Site error {status_code}' + + return None, None + + +def process_site_result(response, query_notify, logger, results_info, site: MaigretSite): + if not response: + return results_info + + fulltags = site.tags + + # Retrieve other site information again + username = results_info['username'] + is_parsing_enabled = results_info['parsing_enabled'] + url = results_info.get("url_user") + logger.debug(url) + + status = results_info.get("status") + if status is not None: + # We have already determined the user doesn't exist here + return results_info + + # Get the expected check type + check_type = site.check_type + + # Get the failure messages and comments + failure_errors = site.errors + + # TODO: refactor + if not response: + logger.error(f'No response for {site.name}') + return results_info + + html_text, status_code, error_text, expection_text = response + site_error_text = '?' + + # TODO: add elapsed request time counting + response_time = None + + if logger.level == logging.DEBUG: + with open('debug.txt', 'a') as f: + status = status_code or 'No response' + f.write(f'url: {url}\nerror: {str(error_text)}\nr: {status}\n') + if html_text: + f.write(f'code: {status}\nresponse: {str(html_text)}\n') + + if status_code and not error_text: + error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors, + site.ignore_403) + + if site.activation and html_text: + is_need_activation = any([s for s in site.activation['marks'] if s in html_text]) + if is_need_activation: + method = site.activation['method'] + try: + activate_fun = getattr(ParsingActivator(), method) + # TODO: async call + activate_fun(site, logger) + except AttributeError: + logger.warning(f'Activation method {method} for site {site.name} not found!') + + # presense flags + # True by default + presense_flags = site.presense_strs + is_presense_detected = False + if html_text: + if not presense_flags: + is_presense_detected = True + site.stats['presense_flag'] = None + else: + for presense_flag in presense_flags: + if presense_flag in html_text: + is_presense_detected = True + site.stats['presense_flag'] = presense_flag + logger.info(presense_flag) + break + + if error_text is not None: + logger.debug(error_text) + result = QueryResult(username, + site.name, + url, + QueryStatus.UNKNOWN, + query_time=response_time, + context=f'{error_text}: {site_error_text}', tags=fulltags) + elif check_type == "message": + absence_flags = site.absence_strs + is_absence_flags_list = isinstance(absence_flags, list) + absence_flags_set = set(absence_flags) if is_absence_flags_list else {absence_flags} + # Checks if the error message is in the HTML + is_absence_detected = any([(absence_flag in html_text) for absence_flag in absence_flags_set]) + if not is_absence_detected and is_presense_detected: + result = QueryResult(username, + site.name, + url, + QueryStatus.CLAIMED, + query_time=response_time, tags=fulltags) + else: + result = QueryResult(username, + site.name, + url, + QueryStatus.AVAILABLE, + query_time=response_time, tags=fulltags) + elif check_type == "status_code": + # Checks if the status code of the response is 2XX + if (not status_code >= 300 or status_code < 200) and is_presense_detected: + result = QueryResult(username, + site.name, + url, + QueryStatus.CLAIMED, + query_time=response_time, tags=fulltags) + else: + result = QueryResult(username, + site.name, + url, + QueryStatus.AVAILABLE, + query_time=response_time, tags=fulltags) + elif check_type == "response_url": + # For this detection method, we have turned off the redirect. + # So, there is no need to check the response URL: it will always + # match the request. Instead, we will ensure that the response + # code indicates that the request was successful (i.e. no 404, or + # forward to some odd redirect). + if 200 <= status_code < 300 and is_presense_detected: + result = QueryResult(username, + site.name, + url, + QueryStatus.CLAIMED, + query_time=response_time, tags=fulltags) + else: + result = QueryResult(username, + site.name, + url, + QueryStatus.AVAILABLE, + query_time=response_time, tags=fulltags) + else: + # It should be impossible to ever get here... + raise ValueError(f"Unknown check type '{check_type}' for " + f"site '{site.name}'") + + extracted_ids_data = {} + + if is_parsing_enabled and result.status == QueryStatus.CLAIMED: + try: + extracted_ids_data = extract(html_text) + except Exception as e: + logger.warning(f'Error while parsing {site.name}: {e}', exc_info=True) + + if extracted_ids_data: + new_usernames = {} + for k, v in extracted_ids_data.items(): + if 'username' in k: + new_usernames[v] = 'username' + if k in supported_recursive_search_ids: + new_usernames[v] = k + + results_info['ids_usernames'] = new_usernames + result.ids_data = extracted_ids_data + + # Notify caller about results of query. + query_notify.update(result, site.similar_search) + + # Save status of request + results_info['status'] = result + + # Save results from request + results_info['http_status'] = status_code + results_info['is_similar'] = site.similar_search + # results_site['response_text'] = html_text + results_info['rank'] = site.alexa_rank + return results_info + + +async def maigret(username, site_dict, query_notify, logger, + proxy=None, timeout=None, recursive_search=False, + id_type='username', debug=False, forced=False, + max_connections=100, no_progressbar=False, + cookies=None): + """Main search func + + Checks for existence of username on various social media sites. + + Keyword Arguments: + username -- String indicating username that report + should be created against. + site_dict -- Dictionary containing all of the site data. + query_notify -- Object with base type of QueryNotify(). + This will be used to notify the caller about + query results. + proxy -- String indicating the proxy URL + timeout -- Time in seconds to wait before timing out request. + Default is no timeout. + recursive_search -- Search for other usernames in website pages & recursive search by them. + + Return Value: + Dictionary containing results from report. Key of dictionary is the name + of the social network site, and the value is another dictionary with + the following keys: + url_main: URL of main site. + url_user: URL of user on site (if account exists). + status: QueryResult() object indicating results of test for + account existence. + http_status: HTTP status code of query which checked for existence on + site. + response_text: Text that came back from request. May be None if + there was an HTTP error when checking for existence. + """ + + # Notify caller that we are starting the query. + query_notify.start(username, id_type) + + # TODO: connector + connector = ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False) + # connector = aiohttp.TCPConnector(ssl=False) + connector.verify_ssl = False + + cookie_jar = None + if cookies: + cookie_jar = await import_aiohttp_cookies(cookies) + + session = aiohttp.ClientSession(connector=connector, trust_env=True, cookie_jar=cookie_jar) + + if logger.level == logging.DEBUG: + future = session.get(url='https://icanhazip.com') + ip, status, error, expection = await get_response(future, None, logger) + if ip: + logger.debug(f'My IP is: {ip.strip()}') + else: + logger.debug(f'IP requesting {error}: {expection}') + + # Results from analysis of all sites + results_total = {} + + # First create futures for all requests. This allows for the requests to run in parallel + for site_name, site in site_dict.items(): + + if site.type != id_type: + continue + + if site.disabled and not forced: + logger.debug(f'Site {site.name} is disabled, skipping...') + continue + + # Results from analysis of this specific site + results_site = {} + + # Record URL of main site and username + results_site['username'] = username + results_site['parsing_enabled'] = recursive_search + results_site['url_main'] = site.url_main + results_site['cookies'] = cookie_jar and cookie_jar.filter_cookies(site.url_main) or None + + headers = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0', + } + + headers.update(site.headers) + + if not 'url' in site.__dict__: + logger.error('No URL for site %s', site.name) + # URL of user on site (if it exists) + url = site.url.format( + urlMain=site.url_main, + urlSubpath=site.url_subpath, + username=username + ) + # workaround to prevent slash errors + url = re.sub('(? bool: + sem = asyncio.Semaphore(max_connections) + tasks = [] + all_sites = site_data + + def disabled_count(lst): + return len(list(filter(lambda x: x.disabled, lst))) + + disabled_old_count = disabled_count(all_sites.values()) + + for _, site in all_sites.items(): + check_coro = site_self_check(site, logger, sem, db, silent) + future = asyncio.ensure_future(check_coro) + tasks.append(future) + + for f in tqdm.asyncio.tqdm.as_completed(tasks): + await f + + disabled_new_count = disabled_count(all_sites.values()) + total_disabled = disabled_new_count - disabled_old_count + + if total_disabled >= 0: + message = 'Disabled' + else: + message = 'Enabled' + total_disabled *= -1 + + if not silent: + print( + f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. Run with `--info` flag to get more information') + + return total_disabled != 0 diff --git a/maigret/maigret.py b/maigret/maigret.py index 4d97d13..4b13804 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -2,616 +2,22 @@ Maigret main module """ -import asyncio -import logging import os import platform -import re -import ssl import sys from argparse import ArgumentParser, RawDescriptionHelpFormatter -import aiohttp import requests -import tqdm.asyncio -from aiohttp_socks import ProxyConnector -from mock import Mock -from python_socks import _errors as proxy_errors -from socid_extractor import parse, extract, __version__ as socid_version +from socid_extractor import parse, __version__ as socid_version -from .activation import ParsingActivator, import_aiohttp_cookies +from .checking import * from .notify import QueryNotifyPrint from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \ generate_report_context, save_txt_report -from .result import QueryResult, QueryStatus -from .sites import MaigretDatabase, MaigretSite +from .submit import submit_dialog __version__ = '0.1.13' -supported_recursive_search_ids = ( - 'yandex_public_id', - 'gaia_id', - 'vk_id', - 'ok_id', - 'wikimapia_uid', -) - -common_errors = { - 'Attention Required! | Cloudflare': 'Cloudflare captcha', - 'Please stand by, while we are checking your browser': 'Cloudflare captcha', - 'Доступ ограничен': 'Rostelecom censorship', - 'document.getElementById(\'validate_form_submit\').disabled=true': 'Mail.ru captcha', - 'Verifying your browser, please wait...
DDoS Protection by Blazingfast.io': 'Blazingfast protection', - '404

Мы не нашли страницу': 'MegaFon 404 page', - 'Доступ к информационному ресурсу ограничен на основании Федерального закона': 'MGTS censorship', - 'Incapsula incident ID': 'Incapsula antibot protection', -} - -unsupported_characters = '#' - -async def get_response(request_future, site_name, logger): - html_text = None - status_code = 0 - - error_text = "General Unknown Error" - expection_text = None - - try: - response = await request_future - - status_code = response.status - response_content = await response.content.read() - charset = response.charset or 'utf-8' - decoded_content = response_content.decode(charset, 'ignore') - html_text = decoded_content - - if status_code > 0: - error_text = None - - logger.debug(html_text) - - except asyncio.TimeoutError as errt: - error_text = "Timeout Error" - expection_text = str(errt) - except (ssl.SSLCertVerificationError, ssl.SSLError) as err: - error_text = "SSL Error" - expection_text = str(err) - except aiohttp.client_exceptions.ClientConnectorError as err: - error_text = "Error Connecting" - expection_text = str(err) - except aiohttp.http_exceptions.BadHttpMessage as err: - error_text = "HTTP Error" - expection_text = str(err) - except proxy_errors.ProxyError as err: - error_text = "Proxy Error" - expection_text = str(err) - except Exception as err: - logger.warning(f'Unhandled error while requesting {site_name}: {err}') - logger.debug(err, exc_info=True) - error_text = "Some Error" - expection_text = str(err) - - # TODO: return only needed information - return html_text, status_code, error_text, expection_text - - -async def update_site_dict_from_response(sitename, site_dict, results_info, semaphore, logger, query_notify): - async with semaphore: - site_obj = site_dict[sitename] - future = site_obj.request_future - if not future: - # ignore: search by incompatible id type - return - - response = await get_response(request_future=future, - site_name=sitename, - logger=logger) - - site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj) - -# TODO: move info separate module -def detect_error_page(html_text, status_code, fail_flags, ignore_403): - # Detect service restrictions such as a country restriction - for flag, msg in fail_flags.items(): - if flag in html_text: - return 'Some site error', msg - - # Detect common restrictions such as provider censorship and bot protection - for flag, msg in common_errors.items(): - if flag in html_text: - return 'Error', msg - - # Detect common site errors - if status_code == 403 and not ignore_403: - return 'Access denied', 'Access denied, use proxy/vpn' - elif status_code >= 500: - return f'Error {status_code}', f'Site error {status_code}' - - return None, None - - -def process_site_result(response, query_notify, logger, results_info, site: MaigretSite): - if not response: - return results_info - - fulltags = site.tags - - # Retrieve other site information again - username = results_info['username'] - is_parsing_enabled = results_info['parsing_enabled'] - url = results_info.get("url_user") - logger.debug(url) - - status = results_info.get("status") - if status is not None: - # We have already determined the user doesn't exist here - return results_info - - # Get the expected check type - check_type = site.check_type - - # Get the failure messages and comments - failure_errors = site.errors - - # TODO: refactor - if not response: - logger.error(f'No response for {site.name}') - return results_info - - html_text, status_code, error_text, expection_text = response - site_error_text = '?' - - # TODO: add elapsed request time counting - response_time = None - - if logger.level == logging.DEBUG: - with open('debug.txt', 'a') as f: - status = status_code or 'No response' - f.write(f'url: {url}\nerror: {str(error_text)}\nr: {status}\n') - if html_text: - f.write(f'code: {status}\nresponse: {str(html_text)}\n') - - if status_code and not error_text: - error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors, - site.ignore_403) - - if site.activation and html_text: - is_need_activation = any([s for s in site.activation['marks'] if s in html_text]) - if is_need_activation: - method = site.activation['method'] - try: - activate_fun = getattr(ParsingActivator(), method) - # TODO: async call - activate_fun(site, logger) - except AttributeError: - logger.warning(f'Activation method {method} for site {site.name} not found!') - - # presense flags - # True by default - presense_flags = site.presense_strs - is_presense_detected = False - if html_text: - if not presense_flags: - is_presense_detected = True - site.stats['presense_flag'] = None - else: - for presense_flag in presense_flags: - if presense_flag in html_text: - is_presense_detected = True - site.stats['presense_flag'] = presense_flag - logger.info(presense_flag) - break - - if error_text is not None: - logger.debug(error_text) - result = QueryResult(username, - site.name, - url, - QueryStatus.UNKNOWN, - query_time=response_time, - context=f'{error_text}: {site_error_text}', tags=fulltags) - elif check_type == "message": - absence_flags = site.absence_strs - is_absence_flags_list = isinstance(absence_flags, list) - absence_flags_set = set(absence_flags) if is_absence_flags_list else {absence_flags} - # Checks if the error message is in the HTML - is_absence_detected = any([(absence_flag in html_text) for absence_flag in absence_flags_set]) - if not is_absence_detected and is_presense_detected: - result = QueryResult(username, - site.name, - url, - QueryStatus.CLAIMED, - query_time=response_time, tags=fulltags) - else: - result = QueryResult(username, - site.name, - url, - QueryStatus.AVAILABLE, - query_time=response_time, tags=fulltags) - elif check_type == "status_code": - # Checks if the status code of the response is 2XX - if (not status_code >= 300 or status_code < 200) and is_presense_detected: - result = QueryResult(username, - site.name, - url, - QueryStatus.CLAIMED, - query_time=response_time, tags=fulltags) - else: - result = QueryResult(username, - site.name, - url, - QueryStatus.AVAILABLE, - query_time=response_time, tags=fulltags) - elif check_type == "response_url": - # For this detection method, we have turned off the redirect. - # So, there is no need to check the response URL: it will always - # match the request. Instead, we will ensure that the response - # code indicates that the request was successful (i.e. no 404, or - # forward to some odd redirect). - if 200 <= status_code < 300 and is_presense_detected: - result = QueryResult(username, - site.name, - url, - QueryStatus.CLAIMED, - query_time=response_time, tags=fulltags) - else: - result = QueryResult(username, - site.name, - url, - QueryStatus.AVAILABLE, - query_time=response_time, tags=fulltags) - else: - # It should be impossible to ever get here... - raise ValueError(f"Unknown check type '{check_type}' for " - f"site '{site.name}'") - - extracted_ids_data = {} - - if is_parsing_enabled and result.status == QueryStatus.CLAIMED: - try: - extracted_ids_data = extract(html_text) - except Exception as e: - logger.warning(f'Error while parsing {site.name}: {e}', exc_info=True) - - if extracted_ids_data: - new_usernames = {} - for k, v in extracted_ids_data.items(): - if 'username' in k: - new_usernames[v] = 'username' - if k in supported_recursive_search_ids: - new_usernames[v] = k - - results_info['ids_usernames'] = new_usernames - result.ids_data = extracted_ids_data - - # Notify caller about results of query. - query_notify.update(result, site.similar_search) - - # Save status of request - results_info['status'] = result - - # Save results from request - results_info['http_status'] = status_code - results_info['is_similar'] = site.similar_search - # results_site['response_text'] = html_text - results_info['rank'] = site.alexa_rank - return results_info - - - - -async def maigret(username, site_dict, query_notify, logger, - proxy=None, timeout=None, recursive_search=False, - id_type='username', debug=False, forced=False, - max_connections=100, no_progressbar=False, - cookies=None): - """Main search func - - Checks for existence of username on various social media sites. - - Keyword Arguments: - username -- String indicating username that report - should be created against. - site_dict -- Dictionary containing all of the site data. - query_notify -- Object with base type of QueryNotify(). - This will be used to notify the caller about - query results. - proxy -- String indicating the proxy URL - timeout -- Time in seconds to wait before timing out request. - Default is no timeout. - recursive_search -- Search for other usernames in website pages & recursive search by them. - - Return Value: - Dictionary containing results from report. Key of dictionary is the name - of the social network site, and the value is another dictionary with - the following keys: - url_main: URL of main site. - url_user: URL of user on site (if account exists). - status: QueryResult() object indicating results of test for - account existence. - http_status: HTTP status code of query which checked for existence on - site. - response_text: Text that came back from request. May be None if - there was an HTTP error when checking for existence. - """ - - # Notify caller that we are starting the query. - query_notify.start(username, id_type) - - # TODO: connector - connector = ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False) - # connector = aiohttp.TCPConnector(ssl=False) - connector.verify_ssl=False - - cookie_jar = None - if cookies: - cookie_jar = await import_aiohttp_cookies(cookies) - - session = aiohttp.ClientSession(connector=connector, trust_env=True, cookie_jar=cookie_jar) - - if logger.level == logging.DEBUG: - future = session.get(url='https://icanhazip.com') - ip, status, error, expection = await get_response(future, None, logger) - if ip: - logger.debug(f'My IP is: {ip.strip()}') - else: - logger.debug(f'IP requesting {error}: {expection}') - - - # Results from analysis of all sites - results_total = {} - - # First create futures for all requests. This allows for the requests to run in parallel - for site_name, site in site_dict.items(): - - if site.type != id_type: - continue - - if site.disabled and not forced: - logger.debug(f'Site {site.name} is disabled, skipping...') - continue - - # Results from analysis of this specific site - results_site = {} - - # Record URL of main site and username - results_site['username'] = username - results_site['parsing_enabled'] = recursive_search - results_site['url_main'] = site.url_main - results_site['cookies'] = cookie_jar and cookie_jar.filter_cookies(site.url_main) or None - - headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0', - } - - headers.update(site.headers) - - if not 'url' in site.__dict__: - logger.error('No URL for site %s', site.name) - # URL of user on site (if it exists) - url = site.url.format( - urlMain=site.url_main, - urlSubpath=site.url_subpath, - username=username - ) - # workaround to prevent slash errors - url = re.sub('(? bool: - sem = asyncio.Semaphore(max_connections) - tasks = [] - all_sites = site_data - - def disabled_count(lst): - return len(list(filter(lambda x: x.disabled, lst))) - - disabled_old_count = disabled_count(all_sites.values()) - - for _, site in all_sites.items(): - check_coro = site_self_check(site, logger, sem, db, silent) - future = asyncio.ensure_future(check_coro) - tasks.append(future) - - for f in tqdm.asyncio.tqdm.as_completed(tasks): - await f - - disabled_new_count = disabled_count(all_sites.values()) - total_disabled = disabled_new_count - disabled_old_count - - if total_disabled >= 0: - message = 'Disabled' - else: - message = 'Enabled' - total_disabled *= -1 - - if not silent: - print(f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. Run with `--info` flag to get more information') - - return total_disabled != 0 - async def main(): version_string = '\n'.join([ @@ -685,6 +91,10 @@ async def main(): action="store_true", dest="print_check_errors", default=False, help="Print errors messages: connection, captcha, site country ban, etc." ) + parser.add_argument("--submit", + type=str, dest="new_site_to_submit", default=False, + help="URL of existing profile in new site to submit." + ) parser.add_argument("--no-color", action="store_true", dest="no_color", default=False, help="Don't color terminal output" @@ -738,7 +148,7 @@ async def main(): action="store_true", dest="html", default=False, help="Create an HTML report file (general report on all usernames)." ) - parser.add_argument("-X","--xmind", + parser.add_argument("-X", "--xmind", action="store_true", dest="xmind", default=False, help="Generate an XMind 8 mindmap report (one report per username)." @@ -820,6 +230,11 @@ async def main(): site_data = get_top_sites_for_id(args.id_type) + if args.new_site_to_submit: + is_submitted = await submit_dialog(db, args.new_site_to_submit) + if is_submitted: + db.save_to_file(args.json_file) + # Database self-checking if args.self_check: print('Maigret sites database self-checking...') @@ -874,7 +289,8 @@ async def main(): if found_unsupported_chars: pretty_chars_str = ','.join(map(lambda s: f'"{s}"', found_unsupported_chars)) - query_notify.warning(f'Found unsupported URL characters: {pretty_chars_str}, skip search by username "{username}"') + query_notify.warning( + f'Found unsupported URL characters: {pretty_chars_str}, skip search by username "{username}"') continue sites_to_check = get_top_sites_for_id(id_type) @@ -952,5 +368,6 @@ def run(): print('Maigret is interrupted.') sys.exit(1) + if __name__ == "__main__": - run() \ No newline at end of file + run() diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 1ac0083..b926cab 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -13590,7 +13590,7 @@ "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"", "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA", "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36", - "x-guest-token": "1358064134064140290" + "x-guest-token": "1358893858789208065" }, "errors": { "Bad guest token": "x-guest-token update required" @@ -13956,7 +13956,7 @@ "video" ], "headers": { - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTI2MjQ4NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.kgp8r380d1aDWcd-ROncr0Tqf8EdA-l35EeEY9is6TI" + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTI4MjE0MjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.TXUhqilVT25xN4lZeoki6hEmbtcOiy7FKxTm5PWOMVs" }, "activation": { "url": "https://vimeo.com/_rv/viewer", @@ -23070,6 +23070,32 @@ "urlMain": "https://protovary.style", "usernameClaimed": "alex", "usernameUnclaimed": "noonewouldeverusethis7" + }, + "beacons.ai": { + "checkType": "message", + "presenseStrs": [ + "https://cdn.beacons.ai/profile_pictures" + ], + "absenceStrs": [ + "https://beacons.ai/bw_logo_full.png" + ], + "url": "https://beacons.ai/{username}", + "urlMain": "https://beacons.ai", + "usernameClaimed": "pasteljellies", + "usernameUnclaimed": "noonewouldeverusethis7" + }, + "are.na": { + "checkType": "message", + "presenseStrs": [ + "Profile--view" + ], + "absenceStrs": [ + "Are.na home" + ], + "url": "https://www.are.na/{username}", + "urlMain": "https://www.are.na", + "usernameClaimed": "nate-cassel", + "usernameUnclaimed": "noonewouldeverusethis7" } }, "engines": { diff --git a/maigret/submit.py b/maigret/submit.py new file mode 100644 index 0000000..d45c263 --- /dev/null +++ b/maigret/submit.py @@ -0,0 +1,161 @@ +import difflib + +import requests +from mock import Mock + +from .checking import * + +DESIRED_STRINGS = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography", + "birthday", "репутация", "информация", "e-mail"] + +RATIO = 0.6 +TOP_FEATURES = 5 + + +def get_match_ratio(x): + return round(max([ + difflib.SequenceMatcher(a=x.lower(), b=y).ratio() + for y in DESIRED_STRINGS + ]), 2) + + +def extract_domain(url): + return '/'.join(url.split('/', 3)[:3]) + + +async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False): + query_notify = Mock() + changes = { + 'disabled': False, + } + + check_data = [ + (site.username_claimed, QueryStatus.CLAIMED), + (site.username_unclaimed, QueryStatus.AVAILABLE), + ] + + logger.info(f'Checking {site.name}...') + + for username, status in check_data: + async with semaphore: + results_dict = await maigret( + username, + {site.name: site}, + query_notify, + logger, + timeout=30, + id_type=site.type, + forced=True, + no_progressbar=True, + ) + + # don't disable entries with other ids types + # TODO: make normal checking + if site.name not in results_dict: + logger.info(results_dict) + changes['disabled'] = True + continue + + result = results_dict[site.name]['status'] + + site_status = result.status + + if site_status != status: + if site_status == QueryStatus.UNKNOWN: + msgs = site.absence_strs + etype = site.check_type + logger.warning( + f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}') + # don't disable in case of available username + if status == QueryStatus.CLAIMED: + changes['disabled'] = True + elif status == QueryStatus.CLAIMED: + logger.warning(f'Not found `{username}` in {site.name}, must be claimed') + logger.info(results_dict[site.name]) + changes['disabled'] = True + else: + logger.warning(f'Found `{username}` in {site.name}, must be available') + logger.info(results_dict[site.name]) + changes['disabled'] = True + + logger.info(f'Site {site.name} checking is finished') + + return changes + + +async def submit_dialog(db, url_exists): + url_parts = url_exists.split('/') + supposed_username = url_parts[-1] + new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ') + if new_name: + supposed_username = new_name + non_exist_username = 'noonewouldeverusethis7' + + url_user = url_exists.replace(supposed_username, '{username}') + url_not_exists = url_exists.replace(supposed_username, non_exist_username) + + a = requests.get(url_exists).text + b = requests.get(url_not_exists).text + + tokens_a = set(a.split('"')) + tokens_b = set(b.split('"')) + + a_minus_b = tokens_a.difference(tokens_b) + b_minus_a = tokens_b.difference(tokens_a) + + top_features_count = int(input(f'Specify count of features to extract [default {TOP_FEATURES}]: ')) + if not top_features_count: + top_features_count = TOP_FEATURES + + presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[:top_features_count] + + print('Detected text features of existing account: ' + ', '.join(presence_list)) + features = input('If features was not detected correctly, write it manually: ') + + if features: + presence_list = features.split(',') + + absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[:top_features_count] + print('Detected text features of non-existing account: ' + ', '.join(absence_list)) + features = input('If features was not detected correctly, write it manually: ') + + if features: + absence_list = features.split(',') + + url_main = extract_domain(url_exists) + + site_data = { + 'absenceStrs': absence_list, + 'presenseStrs': presence_list, + 'url': url_user, + 'urlMain': url_main, + 'usernameClaimed': supposed_username, + 'usernameUnclaimed': non_exist_username, + 'checkType': 'message', + } + + site = MaigretSite(url_main.split('/')[-1], site_data) + + print(site.__dict__) + + sem = asyncio.Semaphore(1) + log_level = logging.INFO + logging.basicConfig( + format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s', + datefmt='%H:%M:%S', + level=log_level + ) + logger = logging.getLogger('site-submit') + logger.setLevel(log_level) + + result = await site_self_check(site, logger, sem, db) + + if result['disabled']: + print(f'Sorry, we couldn\'t find params to detect account presence/absence in {site.name}.') + print('Try to run this mode again and increase features count or choose others.') + else: + if input(f'Site {site.name} successfully checked. Do you want to save it in the Maigret DB? [yY] ') in 'yY': + db.update_site(site) + return True + + return False