import asyncio import logging import re import ssl import sys import aiohttp import tqdm.asyncio from aiohttp_socks import ProxyConnector from mock import Mock from python_socks import _errors as proxy_errors from socid_extractor import extract from .activation import ParsingActivator, import_aiohttp_cookies from .result import QueryResult, QueryStatus from .sites import MaigretDatabase, MaigretSite supported_recursive_search_ids = ( 'yandex_public_id', 'gaia_id', 'vk_id', 'ok_id', 'wikimapia_uid', 'steam_id', ) common_errors = { 'Attention Required! | Cloudflare': 'Cloudflare captcha', 'Please stand by, while we are checking your browser': 'Cloudflare captcha', 'Доступ ограничен': 'Rostelecom censorship', 'document.getElementById(\'validate_form_submit\').disabled=true': 'Mail.ru captcha', 'Verifying your browser, please wait...
DDoS Protection by Blazingfast.io': 'Blazingfast protection', '404

async def get_response(request_future, site_name, logger):
    """Await a pending HTTP request and normalise its outcome into a tuple.

    Args:
        request_future: Awaitable resolving to a response object that exposes
            ``status``, ``charset`` and an awaitable ``content.read()``.
        site_name: Site name, used only in the log message for unexpected
            failures.
        logger: Logger receiving the debug/warning output.

    Returns:
        ``(html_text, status_code, error_text, exception_text)``:

        * ``html_text`` -- decoded body, or ``None`` if the request failed.
        * ``status_code`` -- HTTP status, ``0`` if no response was received.
        * ``error_text`` -- ``None`` on success, otherwise a short category
          such as ``"Timeout Error"`` or ``"SSL Error"``.
        * ``exception_text`` -- ``str()`` of the caught exception, or ``None``.
    """
    html_text = None
    status_code = 0

    error_text = "General Unknown Error"
    exception_text = None

    try:
        response = await request_future

        status_code = response.status
        raw_body = await response.content.read()
        charset = response.charset or 'utf-8'
        try:
            html_text = raw_body.decode(charset, 'ignore')
        except LookupError:
            # The server advertised a charset Python does not know; do not
            # throw away an otherwise good response because of it.
            html_text = raw_body.decode('utf-8', 'ignore')

        if status_code > 0:
            error_text = None

        logger.debug(html_text)

    except asyncio.TimeoutError as err:
        error_text = "Timeout Error"
        exception_text = str(err)
    except aiohttp.client_exceptions.ClientConnectorError as err:
        error_text = "Error Connecting"
        exception_text = str(err)
    except aiohttp.http_exceptions.BadHttpMessage as err:
        error_text = "HTTP Error"
        exception_text = str(err)
    except proxy_errors.ProxyError as err:
        error_text = "Proxy Error"
        exception_text = str(err)
    except ssl.SSLError as err:
        # ssl.SSLError is the base class of ssl.SSLCertVerificationError, so
        # no interpreter-version check is needed. Kept after the aiohttp and
        # proxy handlers so their classification stays unchanged.
        error_text = "SSL Error"
        exception_text = str(err)
    except Exception as err:
        # Anything else is unexpected: report it instead of silently
        # returning the generic placeholder error.
        logger.warning(f'Unhandled error while requesting {site_name}: {err}')
        logger.debug(err, exc_info=True)
        error_text = "Some Error"
        exception_text = str(err)

    # TODO: return only needed information
    return html_text, status_code, error_text, exception_text


async def update_site_dict_from_response(sitename, site_dict, results_info, semaphore, logger, query_notify):
    """Fetch the response for one site and store the processed result.

    Runs entirely under ``semaphore``. Reads ``site_dict[sitename]``, awaits
    its ``request_future`` through ``get_response`` and replaces the
    ``site_dict[sitename]`` entry with whatever ``process_site_result``
    returns. Does nothing when the site has no pending request.
    """
    async with semaphore:
        site_obj = site_dict[sitename]
        future = site_obj.request_future
        if not future:
            # ignore: search by incompatible id type
            return

        response = await get_response(request_future=future,
                                      site_name=sitename,
                                      logger=logger)

        site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)
# TODO: move to separate class
def detect_error_page(html_text, status_code, fail_flags, ignore_403):
    """Decide whether a fetched page is an error page rather than a profile.

    Checks run in this order and the first hit wins:

    1. site-specific markers from ``fail_flags`` (marker -> description);
    2. the module-wide ``common_errors`` markers (censorship, bot protection);
    3. the HTTP status: 403 (unless ``ignore_403``) or any 5xx and above.

    Returns:
        ``(error_label, description)``, or ``(None, None)`` when nothing
        indicates an error.
    """

    def first_reason(marker_table):
        # (found, description) for the first marker contained in the page.
        for marker, description in marker_table.items():
            if marker in html_text:
                return True, description
        return False, None

    # Service restrictions declared by the site itself, e.g. country blocks.
    found, description = first_reason(fail_flags)
    if found:
        return 'Some site error', description

    # Restrictions shared by many sites: provider censorship, antibot pages.
    found, description = first_reason(common_errors)
    if found:
        return 'Error', description

    # Fall back to what the status code alone tells us.
    if status_code == 403 and not ignore_403:
        return 'Access denied', 'Access denied, use proxy/vpn'
    if status_code >= 500:
        return f'Error {status_code}', f'Site error {status_code}'

    return None, None
def process_site_result(response, query_notify, logger, results_info, site: "MaigretSite"):
    """Convert the raw response of one site check into a ``QueryResult``.

    Args:
        response: ``(html_text, status_code, error_text, exception_text)``
            tuple as produced by ``get_response``. A falsy value makes the
            function return ``results_info`` untouched.
        query_notify: Object whose ``update(result, is_similar)`` is called
            once the result is known.
        logger: Logger for diagnostics. When its level is exactly
            ``logging.DEBUG`` the response is also appended to ``debug.txt``.
        results_info: Per-site dict. ``'username'``, ``'parsing_enabled'``,
            ``'url_user'`` and ``'status'`` are read; ``'status'``,
            ``'http_status'``, ``'is_similar'``, ``'rank'`` and, when ids
            were extracted, ``'ids_usernames'`` / ``'ids_links'`` are written.
        site: Site description providing ``tags``, ``check_type``,
            ``errors``, ``ignore_403``, ``activation``, ``presense_strs``,
            ``absence_strs``, ``stats``, ``similar_search``, ``alexa_rank``
            and ``name``.

    Returns:
        The same ``results_info`` dict, updated in place.

    Raises:
        ValueError: If ``site.check_type`` is not ``"message"``,
            ``"status_code"`` or ``"response_url"``.
    """
    import ast  # local import: only needed to parse extracted link lists

    if not response:
        return results_info

    fulltags = site.tags

    # Retrieve other site information again
    username = results_info['username']
    is_parsing_enabled = results_info['parsing_enabled']
    url = results_info.get("url_user")
    logger.debug(url)

    if results_info.get("status") is not None:
        # We have already determined the user doesn't exist here
        return results_info

    check_type = site.check_type
    failure_errors = site.errors

    html_text, status_code, error_text, _exception_text = response
    site_error_text = '?'

    # TODO: add elapsed request time counting
    response_time = None

    def make_result(query_status, **extra):
        # Every outcome shares the same identifying fields.
        return QueryResult(username, site.name, url, query_status,
                           query_time=response_time, tags=fulltags, **extra)

    if logger.level == logging.DEBUG:
        # Explicit encoding: pages routinely contain non-ASCII text and the
        # platform default codec may not be able to write it.
        with open('debug.txt', 'a', encoding='utf-8') as dump:
            shown_status = status_code or 'No response'
            dump.write(f'url: {url}\nerror: {str(error_text)}\nr: {shown_status}\n')
            if html_text:
                dump.write(f'code: {shown_status}\nresponse: {str(html_text)}\n')

    if status_code and not error_text:
        error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors,
                                                        site.ignore_403)

    if site.activation and html_text:
        is_need_activation = any(mark for mark in site.activation['marks'] if mark in html_text)
        if is_need_activation:
            method = site.activation['method']
            try:
                activate_fun = getattr(ParsingActivator(), method)
                # TODO: async call
                activate_fun(site, logger)
            except AttributeError:
                logger.warning(f'Activation method {method} for site {site.name} not found!')
            except Exception as e:
                logger.warning(f'Failed activation {method} for site {site.name}: {e}')

    # Presence flags: with no flags configured, any non-empty page counts.
    presense_flags = site.presense_strs
    is_presense_detected = False
    if html_text:
        if not presense_flags:
            is_presense_detected = True
            site.stats['presense_flag'] = None
        else:
            for presense_flag in presense_flags:
                if presense_flag in html_text:
                    is_presense_detected = True
                    site.stats['presense_flag'] = presense_flag
                    logger.info(presense_flag)
                    break

    if error_text is not None:
        logger.debug(error_text)
        result = make_result(QueryStatus.UNKNOWN, context=f'{error_text}: {site_error_text}')
    elif check_type == "message":
        absence_flags = site.absence_strs
        absence_flags_set = set(absence_flags) if isinstance(absence_flags, list) else {absence_flags}
        # Checks if the error message is in the HTML
        is_absence_detected = any(absence_flag in html_text for absence_flag in absence_flags_set)
        if not is_absence_detected and is_presense_detected:
            result = make_result(QueryStatus.CLAIMED)
        else:
            result = make_result(QueryStatus.AVAILABLE)
    elif check_type == "status_code":
        # Checks if the status code of the response is 2XX. The previous
        # condition reduced to `status_code < 300` and so also accepted
        # codes below 200.
        if 200 <= status_code < 300 and is_presense_detected:
            result = make_result(QueryStatus.CLAIMED)
        else:
            result = make_result(QueryStatus.AVAILABLE)
    elif check_type == "response_url":
        # For this detection method, we have turned off the redirect.
        # So, there is no need to check the response URL: it will always
        # match the request. Instead, we will ensure that the response
        # code indicates that the request was successful (i.e. no 404, or
        # forward to some odd redirect).
        if 200 <= status_code < 300 and is_presense_detected:
            result = make_result(QueryStatus.CLAIMED)
        else:
            result = make_result(QueryStatus.AVAILABLE)
    else:
        # It should be impossible to ever get here...
        raise ValueError(f"Unknown check type '{check_type}' for "
                         f"site '{site.name}'")

    extracted_ids_data = {}

    if is_parsing_enabled and result.status == QueryStatus.CLAIMED:
        try:
            extracted_ids_data = extract(html_text)
        except Exception as e:
            logger.warning(f'Error while parsing {site.name}: {e}', exc_info=True)

        if extracted_ids_data:
            new_usernames = {}
            for k, v in extracted_ids_data.items():
                if 'username' in k:
                    new_usernames[v] = 'username'
                if k in supported_recursive_search_ids:
                    new_usernames[v] = k

            results_info['ids_usernames'] = new_usernames

            # SECURITY: this string is derived from the body of a remote
            # page, so it must never reach eval(). literal_eval only accepts
            # Python literals and cannot execute code.
            raw_links = extracted_ids_data.get('links', '[]')
            try:
                results_info['ids_links'] = ast.literal_eval(raw_links)
            except (ValueError, SyntaxError, TypeError) as e:
                logger.warning(f'Cannot parse links extracted from {site.name}: {e}')
                results_info['ids_links'] = []

            result.ids_data = extracted_ids_data

    # Notify caller about results of query.
    query_notify.update(result, site.similar_search)

    # Save status of request
    results_info['status'] = result

    # Save results from request
    results_info['http_status'] = status_code
    results_info['is_similar'] = site.similar_search
    results_info['rank'] = site.alexa_rank
    return results_info
query_notify.start(username, id_type) # TODO: connector connector = ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False) # connector = aiohttp.TCPConnector(ssl=False) connector.verify_ssl = False cookie_jar = None if cookies: logger.debug(f'Using cookies jar file {cookies}') cookie_jar = await import_aiohttp_cookies(cookies) session = aiohttp.ClientSession(connector=connector, trust_env=True, cookie_jar=cookie_jar) if logger.level == logging.DEBUG: future = session.get(url='https://icanhazip.com') ip, status, error, expection = await get_response(future, None, logger) if ip: logger.debug(f'My IP is: {ip.strip()}') else: logger.debug(f'IP requesting {error}: {expection}') # Results from analysis of all sites results_total = {} # First create futures for all requests. This allows for the requests to run in parallel for site_name, site in site_dict.items(): if site.type != id_type: continue if site.disabled and not forced: logger.debug(f'Site {site.name} is disabled, skipping...') continue # Results from analysis of this specific site results_site = {} # Record URL of main site and username results_site['username'] = username results_site['parsing_enabled'] = is_parsing_enabled results_site['url_main'] = site.url_main results_site['cookies'] = cookie_jar and cookie_jar.filter_cookies(site.url_main) or None headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0', } headers.update(site.headers) if not 'url' in site.__dict__: logger.error('No URL for site %s', site.name) # URL of user on site (if it exists) url = site.url.format( urlMain=site.url_main, urlSubpath=site.url_subpath, username=username ) # workaround to prevent slash errors url = re.sub('(? 
bool: sem = asyncio.Semaphore(max_connections) tasks = [] all_sites = site_data def disabled_count(lst): return len(list(filter(lambda x: x.disabled, lst))) disabled_old_count = disabled_count(all_sites.values()) for _, site in all_sites.items(): check_coro = site_self_check(site, logger, sem, db, silent) future = asyncio.ensure_future(check_coro) tasks.append(future) for f in tqdm.asyncio.tqdm.as_completed(tasks): await f disabled_new_count = disabled_count(all_sites.values()) total_disabled = disabled_new_count - disabled_old_count if total_disabled >= 0: message = 'Disabled' else: message = 'Enabled' total_disabled *= -1 if not silent: print( f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. Run with `--info` flag to get more information') return total_disabled != 0