import asyncio import json import re import os import logging from typing import Any, Dict, List, Optional, Tuple from aiohttp import ClientSession, TCPConnector from aiohttp_socks import ProxyConnector import cloudscraper from colorama import Fore, Style from .activation import import_aiohttp_cookies from .result import MaigretCheckResult from .settings import Settings from .sites import MaigretDatabase, MaigretEngine, MaigretSite from .utils import get_random_user_agent from .checking import site_self_check from .utils import get_match_ratio, generate_random_username class CloudflareSession: def __init__(self): self.scraper = cloudscraper.create_scraper() async def get(self, *args, **kwargs): await asyncio.sleep(0) res = self.scraper.get(*args, **kwargs) self.last_text = res.text self.status = res.status_code return self def status_code(self): return self.status async def text(self): await asyncio.sleep(0) return self.last_text async def close(self): pass class Submitter: HEADERS = { "User-Agent": get_random_user_agent(), } SEPARATORS = "\"'\n" RATIO = 0.6 TOP_FEATURES = 5 URL_RE = re.compile(r"https?://(www\.)?") def __init__(self, db: MaigretDatabase, settings: Settings, logger, args): self.settings = settings self.args = args self.db = db self.logger = logger from aiohttp_socks import ProxyConnector proxy = self.args.proxy cookie_jar = None if args.cookie_file: if not os.path.exists(args.cookie_file): logger.error(f"Cookie file {args.cookie_file} does not exist!") else: cookie_jar = import_aiohttp_cookies(args.cookie_file) connector = ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=False) connector.verify_ssl = False self.session = ClientSession( connector=connector, trust_env=True, cookie_jar=cookie_jar ) async def close(self): await self.session.close() @staticmethod def get_alexa_rank(site_url_main): import requests import xml.etree.ElementTree as ElementTree url = f"http://data.alexa.com/data?cli=10&url={site_url_main}" xml_data = requests.get(url).text root = ElementTree.fromstring(xml_data) alexa_rank = 0 try: alexa_rank = int(root.find('.//REACH').attrib['RANK']) except Exception: pass return alexa_rank @staticmethod def extract_mainpage_url(url): return "/".join(url.split("/", 3)[:3]) async def site_self_check(self, site, semaphore, silent=False): # Call the general function from the checking.py changes = await site_self_check( site=site, logger=self.logger, semaphore=semaphore, db=self.db, silent=silent, proxy=self.args.proxy, cookies=self.args.cookie_file, # Don't skip errors in submit mode - we need check both false positives/true negatives skip_errors=False, ) return changes def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog): fields = {} if 'urlSubpath' in engine.site.get('url', ''): msg = ( 'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). ' 'Enter in manually if it exists: ' ) subpath = input(msg).strip('/') if subpath: fields['urlSubpath'] = f'/{subpath}' return fields async def detect_known_engine( self, url_exists, url_mainpage, session, follow_redirects, headers ) -> [List[MaigretSite], str]: session = session or self.session resp_text, _ = await self.get_html_response_to_compare( url_exists, session, follow_redirects, headers ) for engine in self.db.engines: strs_to_check = engine.__dict__.get("presenseStrs") if strs_to_check and resp_text: all_strs_in_response = True for s in strs_to_check: if s not in resp_text: all_strs_in_response = False sites = [] if all_strs_in_response: engine_name = engine.__dict__.get("name") print(f"Detected engine {engine_name} for site {url_mainpage}") usernames_to_check = self.settings.supposed_usernames supposed_username = self.extract_username_dialog(url_exists) if supposed_username: usernames_to_check = [supposed_username] + usernames_to_check add_fields = self.generate_additional_fields_dialog( engine, url_exists ) for u in usernames_to_check: site_data = { "urlMain": url_mainpage, "name": url_mainpage.split("//")[1].split("/")[0], "engine": engine_name, "usernameClaimed": u, "usernameUnclaimed": "noonewouldeverusethis7", **add_fields, } self.logger.info(site_data) maigret_site = MaigretSite( url_mainpage.split("/")[-1], site_data ) maigret_site.update_from_engine( self.db.engines_dict[engine_name] ) sites.append(maigret_site) return sites, resp_text return [], resp_text @staticmethod def extract_username_dialog(url): url_parts = url.rstrip("/").split("/") supposed_username = url_parts[-1].strip('@') entered_username = input( f"{Fore.GREEN}[?] Is \"{supposed_username}\" a valid username? If not, write it manually: {Style.RESET_ALL}" ) return entered_username if entered_username else supposed_username @staticmethod async def get_html_response_to_compare( url: str, session: ClientSession = None, redirects=False, headers: Dict = None ): async with session.get( url, allow_redirects=redirects, headers=headers ) as response: # Try different encodings or fallback to 'ignore' errors try: html_response = await response.text(encoding='utf-8') except UnicodeDecodeError: try: html_response = await response.text(encoding='latin1') except UnicodeDecodeError: html_response = await response.text(errors='ignore') return html_response, response.status async def check_features_manually( self, username: str, url_exists: str, cookie_filename="", # TODO: use cookies session: ClientSession = None, follow_redirects=False, headers: dict = None, ) -> Tuple[List[str], List[str], str, str]: random_username = generate_random_username() url_of_non_existing_account = url_exists.lower().replace( username.lower(), random_username ) try: session = session or self.session first_html_response, first_status = await self.get_html_response_to_compare( url_exists, session, follow_redirects, headers ) second_html_response, second_status = ( await self.get_html_response_to_compare( url_of_non_existing_account, session, follow_redirects, headers ) ) await session.close() except Exception as e: self.logger.error( f"Error while getting HTTP response for username {username}: {e}", exc_info=True, ) return None, None, str(e), random_username self.logger.info(f"URL with existing account: {url_exists}") self.logger.info( f"HTTP response status for URL with existing account: {first_status}" ) self.logger.info( f"HTTP response length URL with existing account: {len(first_html_response)}" ) self.logger.debug(first_html_response) self.logger.info(f"URL with existing account: {url_of_non_existing_account}") self.logger.info( f"HTTP response status for URL with non-existing account: {second_status}" ) self.logger.info( f"HTTP response length URL with non-existing account: {len(second_html_response)}" ) self.logger.debug(second_html_response) # TODO: filter by errors, move to dialog function if ( "/cdn-cgi/challenge-platform" in first_html_response or "\t\t\t\tnow: " in first_html_response or "Sorry, you have been blocked" in first_html_response ): self.logger.info("Cloudflare detected, skipping") return None, None, "Cloudflare detected, skipping", random_username tokens_a = set(re.split(f'[{self.SEPARATORS}]', first_html_response)) tokens_b = set(re.split(f'[{self.SEPARATORS}]', second_html_response)) a_minus_b = tokens_a.difference(tokens_b) b_minus_a = tokens_b.difference(tokens_a) a_minus_b = list(map(lambda x: x.strip('\\'), a_minus_b)) b_minus_a = list(map(lambda x: x.strip('\\'), b_minus_a)) # Filter out strings containing usernames a_minus_b = [s for s in a_minus_b if username.lower() not in s.lower()] b_minus_a = [s for s in b_minus_a if random_username.lower() not in s.lower()] def filter_tokens(token: str, html_response: str) -> bool: is_in_html = token in html_response is_long_str = len(token) >= 50 is_number = re.match(r'^\d\.?\d+$', token) or re.match(r':^\d+$', token) is_whitelisted_number = token in ['200', '404', '403'] return not ( is_in_html or is_long_str or (is_number and not is_whitelisted_number) ) a_minus_b = list( filter(lambda t: filter_tokens(t, second_html_response), a_minus_b) ) b_minus_a = list( filter(lambda t: filter_tokens(t, first_html_response), b_minus_a) ) if len(a_minus_b) == len(b_minus_a) == 0: return ( None, None, "HTTP responses for pages with existing and non-existing accounts are the same", random_username, ) match_fun = get_match_ratio(self.settings.presence_strings) presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[ : self.TOP_FEATURES ] absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[ : self.TOP_FEATURES ] self.logger.info(f"Detected presence features: {presence_list}") self.logger.info(f"Detected absence features: {absence_list}") return presence_list, absence_list, "Found", random_username async def add_site(self, site): sem = asyncio.Semaphore(1) print( f"{Fore.BLUE}{Style.BRIGHT}[*] Adding site {site.name}, let's check it...{Style.RESET_ALL}" ) result = await self.site_self_check(site, sem) if result["disabled"]: print(f"Checks failed for {site.name}, please, verify them manually.") return { "valid": False, "reason": "checks_failed", } while True: print("\nAvailable fields to edit:") editable_fields = { '1': 'name', '2': 'tags', '3': 'url', '4': 'url_main', '5': 'username_claimed', '6': 'username_unclaimed', '7': 'presense_strs', '8': 'absence_strs', } for num, field in editable_fields.items(): current_value = getattr(site, field) print(f"{num}. {field} (current: {current_value})") print("0. finish editing") print("10. reject and block domain") print("11. invalid params, remove") choice = input("\nSelect field number to edit (0-8): ").strip() if choice == '0': break if choice == '10': return { "valid": False, "reason": "manual block", } if choice == '11': return { "valid": False, "reason": "remove", } if choice in editable_fields: field = editable_fields[choice] current_value = getattr(site, field) new_value = input( f"Enter new value for {field} (current: {current_value}): " ).strip() if field in ['tags', 'presense_strs', 'absence_strs']: new_value = list(map(str.strip, new_value.split(','))) if new_value: setattr(site, field, new_value) print(f"Updated {field} to: {new_value}") self.logger.info(site.json) self.db.update_site(site) return { "valid": True, } async def dialog(self, url_exists, cookie_file): """ An implementation of the submit mode: - User provides a URL of a existing social media account - Maigret tries to detect the site engine and understand how to check for account presence with HTTP responses analysis - If detection succeeds, Maigret generates a new site entry/replace old one in the database """ old_site = None additional_options_enabled = self.logger.level in ( logging.DEBUG, logging.WARNING, ) domain_raw = self.URL_RE.sub("", url_exists).strip().strip("/") domain_raw = domain_raw.split("/")[0] self.logger.info('Domain is %s', domain_raw) # check for existence matched_sites = list( filter(lambda x: domain_raw in x.url_main + x.url, self.db.sites) ) if matched_sites: # TODO: update the existing site print( f"{Fore.YELLOW}[!] Sites with domain \"{domain_raw}\" already exists in the Maigret database!{Style.RESET_ALL}" ) status = lambda s: "(disabled)" if s.disabled else "" url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}" print( "\n".join( [ f"{site.name} {status(site)}{url_block(site)}" for site in matched_sites ] ) ) if ( input( f"{Fore.GREEN}[?] Do you want to continue? [yN] {Style.RESET_ALL}" ).lower() in "n" ): return False site_names = [site.name for site in matched_sites] site_name = ( input( f"{Fore.GREEN}[?] Which site do you want to update in case of success? 1st by default. [{', '.join(site_names)}] {Style.RESET_ALL}" ) or matched_sites[0].name ) old_site = next( (site for site in matched_sites if site.name == site_name), None ) print( f'{Fore.GREEN}[+] We will update site "{old_site.name}" in case of success.{Style.RESET_ALL}' ) # Check if the site check is ordinary or not if old_site and (old_site.url_probe or old_site.activation): skip = input(f"{Fore.RED}[!] The site check depends on activation / probing mechanism! Consider to update it manually. Continue? [yN]{Style.RESET_ALL}") if skip.lower() in ['n', '']: return False # TODO: urlProbe support # TODO: activation support url_mainpage = self.extract_mainpage_url(url_exists) # headers update custom_headers = dict(self.HEADERS) while additional_options_enabled: header_key = input( f'{Fore.GREEN}[?] Specify custom header if you need or just press Enter to skip. Header name: {Style.RESET_ALL}' ) if not header_key: break header_value = input(f'{Fore.GREEN}[?] Header value: {Style.RESET_ALL}') custom_headers[header_key.strip()] = header_value.strip() # redirects settings update redirects = False if additional_options_enabled: redirects = ( 'y' in input( f'{Fore.GREEN}[?] Should we do redirects automatically? [yN] {Style.RESET_ALL}' ).lower() ) print('Detecting site engine, please wait...') sites = [] text = None try: sites, text = await self.detect_known_engine( url_exists, url_exists, session=None, follow_redirects=redirects, headers=custom_headers, ) except KeyboardInterrupt: print('Engine detect process is interrupted.') if 'cloudflare' in text.lower(): print( 'Cloudflare protection detected. I will use cloudscraper for further work' ) # self.session = CloudflareSession() if not sites: print("Unable to detect site engine, lets generate checking features") supposed_username = self.extract_username_dialog(url_exists) self.logger.info(f"Supposed username: {supposed_username}") # TODO: pass status_codes # check it here and suggest to enable / auto-enable redirects presence_list, absence_list, status, non_exist_username = ( await self.check_features_manually( username=supposed_username, url_exists=url_exists, cookie_filename=cookie_file, follow_redirects=redirects, headers=custom_headers, ) ) if status == "Found": site_data = { "absenceStrs": absence_list, "presenseStrs": presence_list, "url": url_exists.replace(supposed_username, '{username}'), "urlMain": url_mainpage, "usernameClaimed": supposed_username, "usernameUnclaimed": non_exist_username, "headers": custom_headers, "checkType": "message", } self.logger.info(json.dumps(site_data, indent=4)) if custom_headers != self.HEADERS: site_data['headers'] = custom_headers site = MaigretSite(url_mainpage.split("/")[-1], site_data) sites.append(site) else: print( f"{Fore.RED}[!] The check for site failed! Reason: {status}{Style.RESET_ALL}" ) return False self.logger.debug(sites[0].__dict__) sem = asyncio.Semaphore(1) print(f"{Fore.GREEN}[*] Checking, please wait...{Style.RESET_ALL}") found = False chosen_site = None for s in sites: chosen_site = s result = await self.site_self_check(s, sem) if not result["disabled"]: found = True break if not found: print( f"{Fore.RED}[!] The check for site '{chosen_site.name}' failed!{Style.RESET_ALL}" ) print( "Try to run this mode again and increase features count or choose others." ) self.logger.debug(json.dumps(chosen_site.json)) return False else: if ( input( f"{Fore.GREEN}[?] Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] {Style.RESET_ALL}" ) .lower() .strip("y") ): return False if self.args.verbose: self.logger.info( "Verbose mode is enabled, additional settings are available" ) source = input( f"{Fore.GREEN}[?] Name the source site if it is mirror: {Style.RESET_ALL}" ) if source: chosen_site.source = source default_site_name = old_site.name if old_site else chosen_site.name new_name = ( input( f"{Fore.GREEN}[?] Change site name if you want [{default_site_name}]: {Style.RESET_ALL}" ) or default_site_name ) if new_name != default_site_name: self.logger.info(f"New site name is {new_name}") chosen_site.name = new_name default_tags_str = "" if old_site: default_tags_str = f' [{", ".join(old_site.tags)}]' new_tags = input(f"{Fore.GREEN}[?] Site tags{default_tags_str}: {Style.RESET_ALL}") if new_tags: chosen_site.tags = list(map(str.strip, new_tags.split(','))) else: chosen_site.tags = [] self.logger.info(f"Site tags are: {', '.join(chosen_site.tags)}") # rank = Submitter.get_alexa_rank(chosen_site.url_main) # if rank: # print(f'New alexa rank: {rank}') # chosen_site.alexa_rank = rank self.logger.info(chosen_site.json) site_data = chosen_site.strip_engine_data() self.logger.info(site_data.json) if old_site: # Update old site with new values and log changes fields_to_check = { 'url': 'URL', 'url_main': 'Main URL', 'username_claimed': 'Username claimed', 'username_unclaimed': 'Username unclaimed', 'check_type': 'Check type', 'presense_strs': 'Presence strings', 'absence_strs': 'Absence strings', 'tags': 'Tags', 'source': 'Source', 'headers': 'Headers', } for field, display_name in fields_to_check.items(): old_value = getattr(old_site, field) new_value = getattr(site_data, field) if field == 'tags' and not new_tags: continue if str(old_value) != str(new_value): print( f"{Fore.YELLOW}[*] '{display_name}' updated: {Fore.RED}{old_value} {Fore.YELLOW}to {Fore.GREEN}{new_value}{Style.RESET_ALL}" ) old_site.__dict__[field] = new_value # update the site final_site = old_site if old_site else site_data self.db.update_site(final_site) # save the db in file if self.args.db_file != self.settings.sites_db_path: print( f"{Fore.GREEN}[+] Maigret DB is saved to {self.args.db}.{Style.RESET_ALL}" ) self.db.save_to_file(self.args.db) return True