From 81a817a39f9830f307d6444f1545950db2a9abae Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:02:43 +0100 Subject: [PATCH] Improved "submit new site" mode, added tests, fixed top-500 sites (#1952) --- maigret/resources/data.json | 153 +++++++++----- maigret/result.py | 2 +- maigret/submit.py | 383 ++++++++++++++++++++++++------------ maigret/utils.py | 5 + sites.md | 44 ++--- tests/conftest.py | 7 + tests/db.json | 8 +- tests/test_activation.py | 1 + tests/test_data.py | 2 + tests/test_executors.py | 1 + tests/test_maigret.py | 1 + tests/test_submit.py | 278 ++++++++++++++++++++++++++ 12 files changed, 691 insertions(+), 194 deletions(-) create mode 100644 tests/test_submit.py diff --git a/maigret/resources/data.json b/maigret/resources/data.json index c2d4bc0..f10f3b6 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -1970,6 +1970,7 @@ "usernameUnclaimed": "noonewouldeverusethis7" }, "BeerMoneyForum": { + "disabled": true, "ignore403": true, "tags": [ "finance", @@ -2366,19 +2367,30 @@ "usernameUnclaimed": "noonewouldeverusethis7" }, "BoardGameGeek": { + "checkType": "message", "tags": [ "gaming", "us" ], - "checkType": "message", "absenceStrs": [ - "User does not exist." + "\t\tUser not found", + "messagebox error", + ">\t
Profile | BoardGameGeek", + "\t
" ], "alexaRank": 4327, - "urlMain": "https://www.boardgamegeek.com", - "url": "https://www.boardgamegeek.com/user/{username}", - "usernameClaimed": "adam", - "usernameUnclaimed": "noonewouldeverusethis7" + "urlMain": "https://boardgamegeek.com", + "url": "https://boardgamegeek.com/user/{username}", + "usernameClaimed": "ZakuBG", + "usernameUnclaimed": "uzytnhstvj", + "presenseStrs": [ + "username", + " style=", + "mail", + " \tstyle=", + " data-username=" + ] }, "Bobrdobr": { "tags": [ @@ -3005,7 +3017,8 @@ "alexaRank": 2689, "urlMain": "https://community.cbr.com", "usernameClaimed": "red", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "noonewouldeverusethis7", + "disabled": true }, "Ccdi": { "tags": [ @@ -4645,21 +4658,6 @@ "usernameUnclaimed": "noonewouldeverusethis7", "alexaRank": 301125 }, - "Eksisozluk": { - "tags": [ - "tr" - ], - "checkType": "message", - "absenceStrs": [ - "isimli bir yazar kayd\u0131 mevcut de\u011fil", - "olmaz \u00f6yle \u015fey" - ], - "alexaRank": 977, - "urlMain": "https://eksisozluk.com/biri/", - "url": "https://eksisozluk.com/biri/{username}", - "usernameClaimed": "adam", - "usernameUnclaimed": "noonewouldeverusethis7" - }, "Elakiri": { "tags": [ "lk" @@ -5747,6 +5745,7 @@ "usernameUnclaimed": "noonewouldeverusethis7" }, "Folkd": { + "disabled": true, "tags": [ "eu", "in" @@ -7678,17 +7677,28 @@ }, "Hotcopper": { "tags": [ - "au" + "finance" ], "checkType": "message", "absenceStrs": [ - "The following error occurred" + "error-page", + "error-page home container", + "card-footer-item", + ">

No such player

This username doesn", + "})()", + "IR0Cf7qpkpcOhvI9r03a0QbI" ], "alexaRank": 2374, "urlMain": "https://lichess.org", "url": "https://lichess.org/@/{username}", - "usernameClaimed": "blue", - "usernameUnclaimed": "noonewouldeverusethis7", + "usernameClaimed": "adam", + "usernameUnclaimed": "efxvyhnwrh", "tags": [ "gaming", "hobby" + ], + "presenseStrs": [ + "us_profile", + "og:title", + "profile-side", + " data-username=", + "og:site_name" ] }, "Liebe69": { @@ -14739,16 +14760,25 @@ "usernameUnclaimed": "noonewouldeverusethis7" }, "SlideShare": { - "tags": [ - "documents", - "sharing" - ], - "checkType": "status_code", + "checkType": "message", "alexaRank": 158, - "urlMain": "https://slideshare.net/", - "url": "https://slideshare.net/{username}", - "usernameClaimed": "blue", - "usernameUnclaimed": "noonewouldeverusethis7" + "urlMain": "https://www.slideshare.net", + "url": "https://www.slideshare.net/{username}", + "usernameClaimed": "KumarSurya7", + "usernameUnclaimed": "kwbmsonxvp", + "presenseStrs": [ + "user-name", + "pageInfo", + "listitem", + "polite", + "strippedTitle" + ], + "absenceStrs": [ + "blankProfile", + "username-available", + "robots", + "noindex,nofollow" + ] }, "Slides": { "tags": [ @@ -15447,7 +15477,8 @@ "urlMain": "https://www.strava.com/", "url": "https://www.strava.com/athletes/{username}", "usernameClaimed": "adam", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "noonewouldeverusethis7", + "disabled": true }, "Studfile": { "tags": [ @@ -16771,13 +16802,20 @@ "regexCheck": "^[^\\.]+$", "checkType": "message", "absenceStrs": [ - "There's nothing here." + "Not found.", + ":404,", + "userAgent", + ",displayStatus:" ], "alexaRank": 112, - "urlMain": "https://tumblr.com/", - "url": "https://{username}.tumblr.com/", - "usernameClaimed": "red", - "usernameUnclaimed": "noonewouldeverusethis7" + "urlMain": "https://www.tumblr.com", + "url": "https://www.tumblr.com/{username}", + "usernameClaimed": "soxoj", + "usernameUnclaimed": "zdbimdoqyt", + "presenseStrs": [ + "profile", + " title=" + ] }, "Tunefind": { "checkType": "message", @@ -17114,7 +17152,8 @@ "urlMain": "https://vc.ru", "url": "https://vc.ru/search/v2/subsite/relevant?query={username}", "usernameClaimed": "adam", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "noonewouldeverusethis7", + "disabled": true }, "Viddler": { "checkType": "message", @@ -17377,7 +17416,7 @@ "video" ], "headers": { - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3MzM2MTc5MjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiNGYxM2M4N2ItYWMwMy00Y2JhLWExMDctNmNiODhmM2U3NjZjIn0.Y7CWEWckdSMsmJ8ROPmhHR6el2QCYJRDl0RLPpdJOKc" + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3MzM4MzkwODAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiOWNjMjk0ZjktZGZhOS00NDI0LWE0OGEtN2JjYzkwYjM2NTMyIn0.wG0kC7fWtrdKI9ccS-LE81lVgQRfYobrqCAPWxr1wzc" }, "activation": { "url": "https://vimeo.com/_rv/viewer", @@ -18971,7 +19010,8 @@ "urlMain": "https://aminoapps.com/", "url": "https://aminoapps.com/u/{username}", "usernameClaimed": "blue", - "usernameUnclaimed": "noonewouldeverusethis77777" + "usernameUnclaimed": "noonewouldeverusethis77777", + "disabled": true }, "analitika-forex.ru": { "engine": "uCoz", @@ -35419,6 +35459,27 @@ "Cache-Control": "no-cache", "TE": "trailers" } + }, + "Eksisozluk": { + "absenceStrs": [ + "

b\u00f6yle bir yazar yok

\r" + ], + "presenseStrs": [ + "profile-dots", + "profile-logo", + "profile-cards", + "profile-biography", + " data-title=" + ], + "alexaRank": 977, + "url": "https://eksisozluk.com/biri/{username}", + "urlMain": "https://eksisozluk.com", + "usernameClaimed": "kartalbafilerrr", + "usernameUnclaimed": "rlcvuwlxqh", + "checkType": "message", + "tags": [ + "tr" + ] } }, "engines": { diff --git a/maigret/result.py b/maigret/result.py index 0ce9127..5346d6c 100644 --- a/maigret/result.py +++ b/maigret/result.py @@ -96,7 +96,7 @@ class MaigretCheckResult: return self.status == MaigretCheckStatus.CLAIMED def __repr__(self): - return f"'{self.__str__()}'" + return f"<{self.__str__()}>" def __str__(self): """Convert Object To String. diff --git a/maigret/submit.py b/maigret/submit.py index fbe86ae..a3ab8fd 100644 --- a/maigret/submit.py +++ b/maigret/submit.py @@ -2,7 +2,8 @@ import asyncio import json import re import os -from typing import Any, Dict, List, Optional +import logging +from typing import Any, Dict, List, Optional, Tuple from aiohttp import ClientSession, TCPConnector from aiohttp_socks import ProxyConnector @@ -15,7 +16,7 @@ from .settings import Settings from .sites import MaigretDatabase, MaigretEngine, MaigretSite from .utils import get_random_user_agent from .checking import site_self_check -from .utils import get_match_ratio +from .utils import get_match_ratio, generate_random_username class CloudflareSession: @@ -125,21 +126,13 @@ class Submitter: return fields async def detect_known_engine( - self, url_exists, url_mainpage + self, url_exists, url_mainpage, session, follow_redirects, headers ) -> [List[MaigretSite], str]: - resp_text = '' - - try: - r = await self.session.get(url_mainpage) - content = await r.content.read() - charset = r.charset or "utf-8" - resp_text = content.decode(charset, "ignore") - self.logger.debug(resp_text) - except Exception as e: - self.logger.warning(e, exc_info=True) - print(f"Some error while checking main page: {e}") - return [], resp_text + session = session or self.session + resp_text, _ = await self.get_html_response_to_compare( + url_exists, session, follow_redirects, headers + ) for engine in self.db.engines: strs_to_check = engine.__dict__.get("presenseStrs") @@ -195,113 +188,134 @@ class Submitter: ) return entered_username if entered_username else supposed_username - async def check_features_manually( - self, url_exists, url_mainpage, cookie_file, redirects=False + @staticmethod + async def get_html_response_to_compare( + url: str, session: ClientSession = None, redirects=False, headers: Dict = None ): - custom_headers = {} - while self.args.verbose: - header_key = input( - 'Specify custom header if you need or just press Enter to skip. Header name: ' + async with session.get( + url, allow_redirects=redirects, headers=headers + ) as response: + # Try different encodings or fallback to 'ignore' errors + try: + html_response = await response.text(encoding='utf-8') + except UnicodeDecodeError: + try: + html_response = await response.text(encoding='latin1') + except UnicodeDecodeError: + html_response = await response.text(errors='ignore') + return html_response, response.status + + async def check_features_manually( + self, + username: str, + url_exists: str, + cookie_filename="", # TODO: use cookies + session: ClientSession = None, + follow_redirects=False, + headers: dict = None, + ) -> Tuple[List[str], List[str], str, str]: + + random_username = generate_random_username() + url_of_non_existing_account = url_exists.lower().replace( + username.lower(), random_username + ) + + try: + session = session or self.session + first_html_response, first_status = await self.get_html_response_to_compare( + url_exists, session, follow_redirects, headers ) - if not header_key: - break - header_value = input('Header value: ') - custom_headers[header_key.strip()] = header_value.strip() + second_html_response, second_status = ( + await self.get_html_response_to_compare( + url_of_non_existing_account, session, follow_redirects, headers + ) + ) + await session.close() + except Exception as e: + self.logger.error( + f"Error while getting HTTP response for username {username}: {e}", + exc_info=True, + ) + return None, None, str(e), random_username - supposed_username = self.extract_username_dialog(url_exists) - non_exist_username = "noonewouldeverusethis7" - - url_user = url_exists.replace(supposed_username, "{username}") - url_not_exists = url_exists.replace(supposed_username, non_exist_username) - - headers = dict(self.HEADERS) - headers.update(custom_headers) - - exists_resp = await self.session.get( - url_exists, - headers=headers, - allow_redirects=redirects, + self.logger.info(f"URL with existing account: {url_exists}") + self.logger.info( + f"HTTP response status for URL with existing account: {first_status}" ) - exists_resp_text = await exists_resp.text() - self.logger.debug(url_exists) - self.logger.debug(exists_resp.status) - self.logger.debug(exists_resp_text) - - non_exists_resp = await self.session.get( - url_not_exists, - headers=headers, - allow_redirects=redirects, + self.logger.info( + f"HTTP response length URL with existing account: {len(first_html_response)}" ) - non_exists_resp_text = await non_exists_resp.text() - self.logger.debug(url_not_exists) - self.logger.debug(non_exists_resp.status) - self.logger.debug(non_exists_resp_text) + self.logger.debug(first_html_response) - a = exists_resp_text - b = non_exists_resp_text + self.logger.info(f"URL with existing account: {url_of_non_existing_account}") + self.logger.info( + f"HTTP response status for URL with non-existing account: {second_status}" + ) + self.logger.info( + f"HTTP response length URL with non-existing account: {len(second_html_response)}" + ) + self.logger.debug(second_html_response) - tokens_a = set(re.split(f'[{self.SEPARATORS}]', a)) - tokens_b = set(re.split(f'[{self.SEPARATORS}]', b)) + # TODO: filter by errors, move to dialog function + if ( + "/cdn-cgi/challenge-platform" in first_html_response + or "\t\t\t\tnow: " in first_html_response + or "Sorry, you have been blocked" in first_html_response + ): + self.logger.info("Cloudflare detected, skipping") + return None, None, "Cloudflare detected, skipping", random_username + + tokens_a = set(re.split(f'[{self.SEPARATORS}]', first_html_response)) + tokens_b = set(re.split(f'[{self.SEPARATORS}]', second_html_response)) a_minus_b = tokens_a.difference(tokens_b) b_minus_a = tokens_b.difference(tokens_a) - # additional filtering by html response - a_minus_b = [t for t in a_minus_b if t not in non_exists_resp_text] - b_minus_a = [t for t in b_minus_a if t not in exists_resp_text] + a_minus_b = list(map(lambda x: x.strip('\\'), a_minus_b)) + b_minus_a = list(map(lambda x: x.strip('\\'), b_minus_a)) + + # Filter out strings containing usernames + a_minus_b = [s for s in a_minus_b if username.lower() not in s.lower()] + b_minus_a = [s for s in b_minus_a if random_username.lower() not in s.lower()] + + def filter_tokens(token: str, html_response: str) -> bool: + is_in_html = token in html_response + is_long_str = len(token) >= 50 + is_number = re.match(r'^\d\.?\d+$', token) or re.match(r':^\d+$', token) + is_whitelisted_number = token in ['200', '404', '403'] + + return not ( + is_in_html or is_long_str or (is_number and not is_whitelisted_number) + ) + + a_minus_b = list( + filter(lambda t: filter_tokens(t, second_html_response), a_minus_b) + ) + b_minus_a = list( + filter(lambda t: filter_tokens(t, first_html_response), b_minus_a) + ) if len(a_minus_b) == len(b_minus_a) == 0: - print("The pages for existing and non-existing account are the same!") - - top_features_count = int( - input( - f"Specify count of features to extract [default {self.TOP_FEATURES}]: " + return ( + None, + None, + "HTTP responses for pages with existing and non-existing accounts are the same", + random_username, ) - or self.TOP_FEATURES - ) match_fun = get_match_ratio(self.settings.presence_strings) presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[ - :top_features_count + : self.TOP_FEATURES ] - - self.logger.debug([(keyword, match_fun(keyword)) for keyword in presence_list]) - - print("Detected text features of existing account: " + ", ".join(presence_list)) - features = input("If features was not detected correctly, write it manually: ") - - if features: - presence_list = list(map(str.strip, features.split(","))) - absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[ - :top_features_count + : self.TOP_FEATURES ] - self.logger.debug([(keyword, match_fun(keyword)) for keyword in absence_list]) - print( - "Detected text features of non-existing account: " + ", ".join(absence_list) - ) - features = input("If features was not detected correctly, write it manually: ") + self.logger.info(f"Detected presence features: {presence_list}") + self.logger.info(f"Detected absence features: {absence_list}") - if features: - absence_list = list(map(str.strip, features.split(","))) - - site_data = { - "absenceStrs": absence_list, - "presenseStrs": presence_list, - "url": url_user, - "urlMain": url_mainpage, - "usernameClaimed": supposed_username, - "usernameUnclaimed": non_exist_username, - "checkType": "message", - } - - if headers != self.HEADERS: - site_data['headers'] = headers - - site = MaigretSite(url_mainpage.split("/")[-1], site_data) - return site + return presence_list, absence_list, "Found", random_username async def add_site(self, site): sem = asyncio.Semaphore(1) @@ -376,6 +390,12 @@ class Submitter: } async def dialog(self, url_exists, cookie_file): + old_site = None + additional_options_enabled = self.logger.level in ( + logging.DEBUG, + logging.WARNING, + ) + domain_raw = self.URL_RE.sub("", url_exists).strip().strip("/") domain_raw = domain_raw.split("/")[0] self.logger.info('Domain is %s', domain_raw) @@ -386,9 +406,11 @@ class Submitter: ) if matched_sites: + # TODO: update the existing site print( - f'Sites with domain "{domain_raw}" already exists in the Maigret database!' + f"{Fore.YELLOW}[!] Sites with domain \"{domain_raw}\" already exists in the Maigret database!{Style.RESET_ALL}" ) + status = lambda s: "(disabled)" if s.disabled else "" url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}" print( @@ -400,16 +422,62 @@ class Submitter: ) ) - if input("Do you want to continue? [yN] ").lower() in "n": + if ( + input( + f"{Fore.GREEN}[?] Do you want to continue? [yN] {Style.RESET_ALL}" + ).lower() + in "n" + ): return False + site_names = [site.name for site in matched_sites] + site_name = ( + input( + f"{Fore.GREEN}[?] Which site do you want to update in case of success? 1st by default. [{', '.join(site_names)}] {Style.RESET_ALL}" + ) + or matched_sites[0].name + ) + old_site = next( + (site for site in matched_sites if site.name == site_name), None + ) + print( + f'{Fore.GREEN}[+] We will update site "{old_site.name}" in case of success.{Style.RESET_ALL}' + ) + url_mainpage = self.extract_mainpage_url(url_exists) + # headers update + custom_headers = dict(self.HEADERS) + while additional_options_enabled: + header_key = input( + f'{Fore.GREEN}[?] Specify custom header if you need or just press Enter to skip. Header name: {Style.RESET_ALL}' + ) + if not header_key: + break + header_value = input(f'{Fore.GREEN}[?] Header value: {Style.RESET_ALL}') + custom_headers[header_key.strip()] = header_value.strip() + + # redirects settings update + redirects = False + if additional_options_enabled: + redirects = ( + 'y' + in input( + f'{Fore.GREEN}[?] Should we do redirects automatically? [yN] {Style.RESET_ALL}' + ).lower() + ) + print('Detecting site engine, please wait...') sites = [] text = None try: - sites, text = await self.detect_known_engine(url_exists, url_exists) + sites, text = await self.detect_known_engine( + url_exists, + url_exists, + session=None, + follow_redirects=redirects, + headers=custom_headers, + ) except KeyboardInterrupt: print('Engine detect process is interrupted.') @@ -422,26 +490,48 @@ class Submitter: if not sites: print("Unable to detect site engine, lets generate checking features") - redirects = False - if self.args.verbose: - redirects = ( - 'y' in input('Should we do redirects automatically? [yN] ').lower() - ) + supposed_username = self.extract_username_dialog(url_exists) + self.logger.info(f"Supposed username: {supposed_username}") - sites = [ + presence_list, absence_list, status, non_exist_username = ( await self.check_features_manually( - url_exists, - url_mainpage, - cookie_file, - redirects, + username=supposed_username, + url_exists=url_exists, + cookie_filename=cookie_file, + follow_redirects=redirects, + headers=custom_headers, ) - ] + ) + + if status == "Found": + site_data = { + "absenceStrs": absence_list, + "presenseStrs": presence_list, + "url": url_exists.replace(supposed_username, '{username}'), + "urlMain": url_mainpage, + "usernameClaimed": supposed_username, + "usernameUnclaimed": non_exist_username, + "checkType": "message", + } + self.logger.info(json.dumps(site_data, indent=4)) + + if custom_headers != self.HEADERS: + site_data['headers'] = custom_headers + + site = MaigretSite(url_mainpage.split("/")[-1], site_data) + sites.append(site) + + else: + print( + f"{Fore.RED}[!] The check for site failed! Reason: {status}{Style.RESET_ALL}" + ) + return False self.logger.debug(sites[0].__dict__) sem = asyncio.Semaphore(1) - print("Checking, please wait...") + print(f"{Fore.GREEN}[*] Checking, please wait...{Style.RESET_ALL}") found = False chosen_site = None for s in sites: @@ -463,7 +553,7 @@ class Submitter: else: if ( input( - f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] " + f"{Fore.GREEN}[?] Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] {Style.RESET_ALL}" ) .lower() .strip("y") @@ -471,22 +561,73 @@ class Submitter: return False if self.args.verbose: - source = input("Name the source site if it is mirror: ") + self.logger.info( + "Verbose mode is enabled, additional settings are available" + ) + source = input( + f"{Fore.GREEN}[?] Name the source site if it is mirror: {Style.RESET_ALL}" + ) if source: chosen_site.source = source - chosen_site.name = input("Change site name if you want: ") or chosen_site.name - chosen_site.tags = list(map(str.strip, input("Site tags: ").split(','))) + default_site_name = old_site.name if old_site else chosen_site.name + new_name = ( + input( + f"{Fore.GREEN}[?] Change site name if you want [{default_site_name}]: {Style.RESET_ALL}" + ) + or default_site_name + ) + if new_name != default_site_name: + self.logger.info(f"New site name is {new_name}") + chosen_site.name = new_name + + # TODO: remove empty tags + new_tags = input(f"{Fore.GREEN}[?] Site tags: {Style.RESET_ALL}") + if new_tags: + chosen_site.tags = list(map(str.strip, new_tags.split(','))) + else: + chosen_site.tags = [] + self.logger.info(f"Site tags are: {', '.join(chosen_site.tags)}") # rank = Submitter.get_alexa_rank(chosen_site.url_main) # if rank: # print(f'New alexa rank: {rank}') # chosen_site.alexa_rank = rank - self.logger.debug(chosen_site.json) + self.logger.info(chosen_site.json) site_data = chosen_site.strip_engine_data() - self.logger.debug(site_data.json) - self.db.update_site(site_data) + self.logger.info(site_data.json) + if old_site: + # Update old site with new values and log changes + fields_to_check = { + 'url': 'URL', + 'url_main': 'Main URL', + 'username_claimed': 'Username claimed', + 'username_unclaimed': 'Username unclaimed', + 'check_type': 'Check type', + 'presense_strs': 'Presence strings', + 'absence_strs': 'Absence strings', + 'tags': 'Tags', + 'source': 'Source', + 'headers': 'Headers', + } + + for field, display_name in fields_to_check.items(): + old_value = getattr(old_site, field) + new_value = getattr(site_data, field) + if field == 'tags' and not new_tags: + continue + if str(old_value) != str(new_value): + print( + f"{Fore.YELLOW}[*] '{display_name}' updated: {Fore.RED}{old_value} {Fore.YELLOW}to {Fore.GREEN}{new_value}{Style.RESET_ALL}" + ) + old_site.__dict__[field] = new_value + + # update the site + final_site = old_site if old_site else site_data + self.db.update_site(final_site) + + # save the db in file if self.args.db_file != self.settings.sites_db_path: print( f"{Fore.GREEN}[+] Maigret DB is saved to {self.args.db}.{Style.RESET_ALL}" diff --git a/maigret/utils.py b/maigret/utils.py index 8367c55..4cb326c 100644 --- a/maigret/utils.py +++ b/maigret/utils.py @@ -3,6 +3,7 @@ import ast import difflib import re import random +import string from typing import Any @@ -119,3 +120,7 @@ def get_match_ratio(base_strs: list): ) return get_match_inner + + +def generate_random_username(): + return ''.join(random.choices(string.ascii_lowercase, k=10)) diff --git a/sites.md b/sites.md index 15b8335..3d3d768 100644 --- a/sites.md +++ b/sites.md @@ -77,7 +77,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://open.spotify.com/) [Spotify (https://open.spotify.com/)](https://open.spotify.com/)*: top 100, music, us*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://www.tiktok.com/) [TikTok (https://www.tiktok.com/)](https://www.tiktok.com/)*: top 100, video* 1. ![](https://www.google.com/s2/favicons?domain=https://xvideos.com/) [Xvideos (https://xvideos.com/)](https://xvideos.com/)*: top 500, porn, us* -1. ![](https://www.google.com/s2/favicons?domain=https://tumblr.com/) [Tumblr (https://tumblr.com/)](https://tumblr.com/)*: top 500, blog* +1. ![](https://www.google.com/s2/favicons?domain=https://www.tumblr.com) [Tumblr (https://www.tumblr.com)](https://www.tumblr.com)*: top 500, blog* 1. ![](https://www.google.com/s2/favicons?domain=https://www.roblox.com/) [Roblox (https://www.roblox.com/)](https://www.roblox.com/)*: top 500, gaming, us* 1. ![](https://www.google.com/s2/favicons?domain=https://soundcloud.com/) [SoundCloud (https://soundcloud.com/)](https://soundcloud.com/)*: top 500, music* 1. ![](https://www.google.com/s2/favicons?domain=https://www.udemy.com) [Udemy (https://www.udemy.com)](https://www.udemy.com)*: top 500, in* @@ -92,7 +92,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://www.pinterest.com/) [Pinterest (https://www.pinterest.com/)](https://www.pinterest.com/)*: top 500, art, photo, sharing* 1. ![](https://www.google.com/s2/favicons?domain=https://www.fiverr.com/) [Fiverr (https://www.fiverr.com/)](https://www.fiverr.com/)*: top 500, shopping, us* 1. ![](https://www.google.com/s2/favicons?domain=https://t.me/) [Telegram (https://t.me/)](https://t.me/)*: top 500, messaging* -1. ![](https://www.google.com/s2/favicons?domain=https://slideshare.net/) [SlideShare (https://slideshare.net/)](https://slideshare.net/)*: top 500, documents, sharing* +1. ![](https://www.google.com/s2/favicons?domain=https://www.slideshare.net) [SlideShare (https://www.slideshare.net)](https://www.slideshare.net)*: top 500* 1. ![](https://www.google.com/s2/favicons?domain=https://theguardian.com) [TheGuardian (https://theguardian.com)](https://theguardian.com)*: top 500, news, us*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://trello.com/) [Trello (https://trello.com/)](https://trello.com/)*: top 500, tasks* 1. ![](https://www.google.com/s2/favicons?domain=https://support.mozilla.org) [Mozilla Support (https://support.mozilla.org)](https://support.mozilla.org)*: top 500, us* @@ -187,7 +187,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://community.brave.com) [community.brave.com (https://community.brave.com)](https://community.brave.com)*: top 1K, forum, us* 1. ![](https://www.google.com/s2/favicons?domain=https://tinder.com/) [Tinder (https://tinder.com/)](https://tinder.com/)*: top 1K, dating, us* 1. ![](https://www.google.com/s2/favicons?domain=https://community.cloudflare.com/) [CloudflareCommunity (https://community.cloudflare.com/)](https://community.cloudflare.com/)*: top 1K, forum, tech* -1. ![](https://www.google.com/s2/favicons?domain=https://eksisozluk.com/biri/) [Eksisozluk (https://eksisozluk.com/biri/)](https://eksisozluk.com/biri/)*: top 1K, tr* +1. ![](https://www.google.com/s2/favicons?domain=https://eksisozluk.com) [Eksisozluk (https://eksisozluk.com)](https://eksisozluk.com)*: top 1K, tr* 1. ![](https://www.google.com/s2/favicons?domain=https://www.allrecipes.com/) [AllRecipes (https://www.allrecipes.com/)](https://www.allrecipes.com/)*: top 1K, us* 1. ![](https://www.google.com/s2/favicons?domain=https://support.t-mobile.com) [T-MobileSupport (https://support.t-mobile.com)](https://support.t-mobile.com)*: top 1K, us*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://www.tinkoff.ru/invest/) [Tinkoff Invest (https://www.tinkoff.ru/invest/)](https://www.tinkoff.ru/invest/)*: top 5K, ru* @@ -195,7 +195,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://discuss.python.org/) [DiscussPython (https://discuss.python.org/)](https://discuss.python.org/)*: top 5K, coding, forum, us* 1. ![](https://www.google.com/s2/favicons?domain=https://www.nairaland.com/) [Nairaland Forum (https://www.nairaland.com/)](https://www.nairaland.com/)*: top 5K, ng* 1. ![](https://www.google.com/s2/favicons?domain=https://ru.redtube.com/) [Redtube (https://ru.redtube.com/)](https://ru.redtube.com/)*: top 5K, porn, us* -1. ![](https://www.google.com/s2/favicons?domain=https://www.strava.com/) [Strava (https://www.strava.com/)](https://www.strava.com/)*: top 5K, us* +1. ![](https://www.google.com/s2/favicons?domain=https://www.strava.com/) [Strava (https://www.strava.com/)](https://www.strava.com/)*: top 5K, us*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://profile.ameba.jp) [Ameba (https://profile.ameba.jp)](https://profile.ameba.jp)*: top 5K, jp* 1. ![](https://www.google.com/s2/favicons?domain=https://adblockplus.org) [adblockplus.org (https://adblockplus.org)](https://adblockplus.org)*: top 5K, us* 1. ![](https://www.google.com/s2/favicons?domain=https://houzz.com/) [Houzz (https://houzz.com/)](https://houzz.com/)*: top 5K, us*, search is disabled @@ -265,7 +265,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://lichess.org) [Lichess (https://lichess.org)](https://lichess.org)*: top 5K, gaming, hobby* 1. ![](https://www.google.com/s2/favicons?domain=https://jsfiddle.net) [jsfiddle.net (https://jsfiddle.net)](https://jsfiddle.net)*: top 5K, coding, sharing* 1. ![](https://www.google.com/s2/favicons?domain=https://ru.pathofexile.com) [Pathofexile (https://ru.pathofexile.com)](https://ru.pathofexile.com)*: top 5K, ru, us* -1. ![](https://www.google.com/s2/favicons?domain=https://vc.ru) [VC.ru (https://vc.ru)](https://vc.ru)*: top 5K, ru* +1. ![](https://www.google.com/s2/favicons?domain=https://vc.ru) [VC.ru (https://vc.ru)](https://vc.ru)*: top 5K, ru*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://www.metacritic.com/) [metacritic (https://www.metacritic.com/)](https://www.metacritic.com/)*: top 5K, us*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://www.digitalocean.com/) [DigitalOcean (https://www.digitalocean.com/)](https://www.digitalocean.com/)*: top 5K, forum, in, tech* 1. ![](https://www.google.com/s2/favicons?domain=http://www.jeuxvideo.com) [jeuxvideo (http://www.jeuxvideo.com)](http://www.jeuxvideo.com)*: top 5K, fr, gaming* @@ -281,7 +281,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://archiveofourown.org) [ArchiveOfOurOwn (https://archiveofourown.org)](https://archiveofourown.org)*: top 5K, us* 1. ![](https://www.google.com/s2/favicons?domain=https://bit.ly) [Bit.ly (https://bit.ly)](https://bit.ly)*: top 5K, links* 1. ![](https://www.google.com/s2/favicons?domain=https://infourok.ru) [Infourok (https://infourok.ru)](https://infourok.ru)*: top 5K, ru* -1. ![](https://www.google.com/s2/favicons?domain=https://community.cbr.com) [Cbr (https://community.cbr.com)](https://community.cbr.com)*: top 5K, forum, us* +1. ![](https://www.google.com/s2/favicons?domain=https://community.cbr.com) [Cbr (https://community.cbr.com)](https://community.cbr.com)*: top 5K, forum, us*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://segmentfault.com/) [segmentfault (https://segmentfault.com/)](https://segmentfault.com/)*: top 5K, cn*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://www.warriorforum.com/) [Warrior Forum (https://www.warriorforum.com/)](https://www.warriorforum.com/)*: top 5K, forum, us* 1. ![](https://www.google.com/s2/favicons?domain=https://hub.docker.com/) [Docker Hub (https://hub.docker.com/)](https://hub.docker.com/)*: top 5K, coding* @@ -295,7 +295,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://creativemarket.com/) [CreativeMarket (https://creativemarket.com/)](https://creativemarket.com/)*: top 5K, art, stock* 1. ![](https://www.google.com/s2/favicons?domain=https://bitbucket.org/) [BitBucket (https://bitbucket.org/)](https://bitbucket.org/)*: top 5K, coding* 1. ![](https://www.google.com/s2/favicons?domain=https://www.techrepublic.com) [Techrepublic (https://www.techrepublic.com)](https://www.techrepublic.com)*: top 5K, us* -1. ![](https://www.google.com/s2/favicons?domain=https://aminoapps.com/) [aminoapp (https://aminoapps.com/)](https://aminoapps.com/)*: top 5K, br, us* +1. ![](https://www.google.com/s2/favicons?domain=https://aminoapps.com/) [aminoapp (https://aminoapps.com/)](https://aminoapps.com/)*: top 5K, br, us*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://www.mixcloud.com/) [MixCloud (https://www.mixcloud.com/)](https://www.mixcloud.com/)*: top 5K, music* 1. ![](https://www.google.com/s2/favicons?domain=https://forum.xda-developers.com) [XDA (https://forum.xda-developers.com)](https://forum.xda-developers.com)*: top 5K, apps, forum*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://thechive.com/) [Thechive (https://thechive.com/)](https://thechive.com/)*: top 5K, us* @@ -321,7 +321,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=http://forums.bulbagarden.net) [forums.bulbagarden.net (http://forums.bulbagarden.net)](http://forums.bulbagarden.net)*: top 5K, forum, us* 1. ![](https://www.google.com/s2/favicons?domain=https://videohive.net) [videohive.net (https://videohive.net)](https://videohive.net)*: top 5K, video* 1. ![](https://www.google.com/s2/favicons?domain=https://imginn.com) [ImgInn (https://imginn.com)](https://imginn.com)*: top 5K, photo* -1. ![](https://www.google.com/s2/favicons?domain=https://www.boardgamegeek.com) [BoardGameGeek (https://www.boardgamegeek.com)](https://www.boardgamegeek.com)*: top 5K, gaming, us* +1. ![](https://www.google.com/s2/favicons?domain=https://boardgamegeek.com) [BoardGameGeek (https://boardgamegeek.com)](https://boardgamegeek.com)*: top 5K, gaming, us* 1. ![](https://www.google.com/s2/favicons?domain=https://osu.ppy.sh/) [osu! (https://osu.ppy.sh/)](https://osu.ppy.sh/)*: top 5K, us* 1. ![](https://www.google.com/s2/favicons?domain=https://app.pluralsight.com) [Pluralsight (https://app.pluralsight.com)](https://app.pluralsight.com)*: top 5K, in, us* 1. ![](https://www.google.com/s2/favicons?domain=https://www.techpowerup.com) [TechPowerUp (https://www.techpowerup.com)](https://www.techpowerup.com)*: top 5K, us* @@ -406,7 +406,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://www.reverbnation.com/) [ReverbNation (https://www.reverbnation.com/)](https://www.reverbnation.com/)*: top 10K, us* 1. ![](https://www.google.com/s2/favicons?domain=https://www.glavbukh.ru) [Scorcher (https://www.glavbukh.ru)](https://www.glavbukh.ru)*: top 10K, ru*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://www.trakt.tv/) [Trakt (https://www.trakt.tv/)](https://www.trakt.tv/)*: top 10K, de, fr* -1. ![](https://www.google.com/s2/favicons?domain=https://hotcopper.com.au) [Hotcopper (https://hotcopper.com.au)](https://hotcopper.com.au)*: top 10K, au* +1. ![](https://www.google.com/s2/favicons?domain=https://hotcopper.com.au) [Hotcopper (https://hotcopper.com.au)](https://hotcopper.com.au)*: top 10K, finance* 1. ![](https://www.google.com/s2/favicons?domain=https://pandia.ru) [Pandia (https://pandia.ru)](https://pandia.ru)*: top 10K, news, ru* 1. ![](https://www.google.com/s2/favicons?domain=https://forums.majorgeeks.com) [forums.majorgeeks.com (https://forums.majorgeeks.com)](https://forums.majorgeeks.com)*: top 10K, forum, us* 1. ![](https://www.google.com/s2/favicons?domain=https://www.hackerearth.com) [Hackerearth (https://www.hackerearth.com)](https://www.hackerearth.com)*: top 10K, freelance* @@ -472,7 +472,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://3ddd.ru) [3ddd (https://3ddd.ru)](https://3ddd.ru)*: top 100K, ru* 1. ![](https://www.google.com/s2/favicons?domain=https://namemc.com/) [NameMC (https://namemc.com/)](https://namemc.com/)*: top 100K, us* 1. ![](https://www.google.com/s2/favicons?domain=https://www.b17.ru/) [B17 (https://www.b17.ru/)](https://www.b17.ru/)*: top 100K, ru* -1. ![](https://www.google.com/s2/favicons?domain=https://www.beermoneyforum.com) [BeerMoneyForum (https://www.beermoneyforum.com)](https://www.beermoneyforum.com)*: top 100K, finance, forum, gambling* +1. ![](https://www.google.com/s2/favicons?domain=https://www.beermoneyforum.com) [BeerMoneyForum (https://www.beermoneyforum.com)](https://www.beermoneyforum.com)*: top 100K, finance, forum, gambling*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://diary.ru) [Diary.ru (https://diary.ru)](https://diary.ru)*: top 100K, blog, nl, ru* 1. ![](https://www.google.com/s2/favicons?domain=https://www.americanthinker.com/) [Americanthinker (https://www.americanthinker.com/)](https://www.americanthinker.com/)*: top 100K* 1. ![](https://www.google.com/s2/favicons?domain=https://contently.com/) [Contently (https://contently.com/)](https://contently.com/)*: top 100K, freelance, in* @@ -497,7 +497,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://pbase.com/) [Pbase (https://pbase.com/)](https://pbase.com/)*: top 100K, in* 1. ![](https://www.google.com/s2/favicons?domain=https://www.native-instruments.com/forum/) [NICommunityForum (https://www.native-instruments.com/forum/)](https://www.native-instruments.com/forum/)*: top 100K, forum* 1. ![](https://www.google.com/s2/favicons?domain=https://spletnik.ru/) [spletnik (https://spletnik.ru/)](https://spletnik.ru/)*: top 100K, ru* -1. ![](https://www.google.com/s2/favicons?domain=http://www.folkd.com/profile/) [Folkd (http://www.folkd.com/profile/)](http://www.folkd.com/profile/)*: top 100K, eu, in* +1. ![](https://www.google.com/s2/favicons?domain=http://www.folkd.com/profile/) [Folkd (http://www.folkd.com/profile/)](http://www.folkd.com/profile/)*: top 100K, eu, in*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://www.iphones.ru) [Iphones.ru (https://www.iphones.ru)](https://www.iphones.ru)*: top 100K, ru* 1. ![](https://www.google.com/s2/favicons?domain=https://www.oper.ru/) [Oper (https://www.oper.ru/)](https://www.oper.ru/)*: top 100K, ru* 1. ![](https://www.google.com/s2/favicons?domain=https://www.interpals.net/) [interpals (https://www.interpals.net/)](https://www.interpals.net/)*: top 100K, dating* @@ -3141,20 +3141,20 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://pubg.op.gg) [OP.GG [PUBG] (https://pubg.op.gg)](https://pubg.op.gg)*: top 100M, gaming* 1. ![](https://www.google.com/s2/favicons?domain=https://valorant.op.gg) [OP.GG [Valorant] (https://valorant.op.gg)](https://valorant.op.gg)*: top 100M, gaming* -The list was updated at (2024-12-09) +The list was updated at (2024-12-10) ## Statistics -Enabled/total sites: 2699/3137 = 86.04% +Enabled/total sites: 2693/3137 = 85.85% -Incomplete message checks: 406/2699 = 15.04% (false positive risks) +Incomplete message checks: 397/2693 = 14.74% (false positive risks) -Status code checks: 720/2699 = 26.68% (false positive risks) +Status code checks: 719/2693 = 26.7% (false positive risks) -False positive risk (total): 41.72% +False positive risk (total): 41.44% Top 20 profile URLs: - (796) `{urlMain}/index/8-0-{username} (uCoz)` -- (300) `/{username}` +- (301) `/{username}` - (221) `{urlMain}{urlSubpath}/members/?username={username} (XenForo)` - (161) `/user/{username}` - (133) `{urlMain}{urlSubpath}/member.php?username={username} (vBulletin)` @@ -3165,8 +3165,8 @@ Top 20 profile URLs: - (87) `{urlMain}/u/{username}/summary (Discourse)` - (54) `/wiki/User:{username}` - (52) `/@{username}` -- (42) `SUBDOMAIN` - (41) `/members/?username={username}` +- (41) `SUBDOMAIN` - (32) `/members/{username}` - (29) `/author/{username}` - (27) `{urlMain}{urlSubpath}/memberlist.php?username={username} (phpBB)` @@ -3177,21 +3177,21 @@ Top 20 profile URLs: Top 20 tags: - (328) `NO_TAGS` (non-standard) - (307) `forum` -- (52) `gaming` +- (50) `gaming` - (26) `coding` - (21) `photo` -- (21) `blog` +- (20) `blog` - (19) `news` - (15) `music` - (14) `tech` -- (12) `sharing` - (12) `freelance` - (12) `finance` +- (11) `sharing` - (10) `dating` - (10) `art` - (10) `shopping` - (10) `movies` -- (8) `hobby` - (8) `crypto` - (7) `sport` +- (7) `hobby` - (7) `hacking` diff --git a/tests/conftest.py b/tests/conftest.py index cbef55d..966c4a3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -79,6 +79,13 @@ def reports_autoclean(): remove_test_reports() +@pytest.fixture(scope='session') +def settings(): + settings = Settings() + settings.load([SETTINGS_FILE]) + return settings + + @pytest.fixture(scope='session') def argparser(): settings = Settings() diff --git a/tests/db.json b/tests/db.json index 93e5969..91d1a51 100644 --- a/tests/db.json +++ b/tests/db.json @@ -26,7 +26,7 @@ "alexaRank": 1, "url": "https://play.google.com/store/apps/developer?id={username}", "urlMain": "https://play.google.com/store", - "usernameClaimed": "OpenAI", + "usernameClaimed": "KONAMI", "usernameUnclaimed": "noonewouldeverusethis7" }, "InvalidActive": { @@ -36,7 +36,7 @@ "alexaRank": 1, "url": "https://play.google.com/store/apps/dev?id={username}", "urlMain": "https://play.google.com/store", - "usernameClaimed": "OpenAI", + "usernameClaimed": "KONAMI", "usernameUnclaimed": "noonewouldeverusethis7" }, "ValidInactive": { @@ -46,7 +46,7 @@ "alexaRank": 1, "url": "https://play.google.com/store/apps/developer?id={username}", "urlMain": "https://play.google.com/store", - "usernameClaimed": "OpenAI", + "usernameClaimed": "KONAMI", "usernameUnclaimed": "noonewouldeverusethis7" }, "InvalidInactive": { @@ -56,7 +56,7 @@ "alexaRank": 1, "url": "https://play.google.com/store/apps/dev?id={username}", "urlMain": "https://play.google.com/store", - "usernameClaimed": "OpenAI", + "usernameClaimed": "KONAMI", "usernameUnclaimed": "noonewouldeverusethis7" } } diff --git a/tests/test_activation.py b/tests/test_activation.py index 650a3f3..4e71933 100644 --- a/tests/test_activation.py +++ b/tests/test_activation.py @@ -34,6 +34,7 @@ def test_vimeo_activation(default_db): assert token1 != token2 +@pytest.mark.slow @pytest.mark.asyncio async def test_import_aiohttp_cookies(): cookies_filename = 'cookies_test.txt' diff --git a/tests/test_data.py b/tests/test_data.py index b296def..1d7bb0c 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -1,8 +1,10 @@ """Maigret data test functions""" +import pytest from maigret.utils import is_country_tag +@pytest.mark.slow def test_tags_validity(default_db): unknown_tags = set() diff --git a/tests/test_executors.py b/tests/test_executors.py index 3ad276d..e893773 100644 --- a/tests/test_executors.py +++ b/tests/test_executors.py @@ -49,6 +49,7 @@ async def test_asyncio_progressbar_semaphore_executor(): assert executor.execution_time < 0.4 +@pytest.mark.slow @pytest.mark.asyncio async def test_asyncio_progressbar_queue_executor(): tasks = [(func, [n], {}) for n in range(10)] diff --git a/tests/test_maigret.py b/tests/test_maigret.py index 4c2f8ac..c3e2bcf 100644 --- a/tests/test_maigret.py +++ b/tests/test_maigret.py @@ -84,6 +84,7 @@ def test_maigret_results(test_db): assert results == RESULTS_EXAMPLE +@pytest.mark.slow def test_extract_ids_from_url(default_db): assert default_db.extract_ids_from_url('https://www.reddit.com/user/test') == { 'test': 'username' diff --git a/tests/test_submit.py b/tests/test_submit.py new file mode 100644 index 0000000..2588300 --- /dev/null +++ b/tests/test_submit.py @@ -0,0 +1,278 @@ +import pytest +from unittest.mock import AsyncMock, MagicMock, patch +from maigret.submit import Submitter, MaigretSite, MaigretEngine +from aiohttp import ClientSession +from maigret.sites import MaigretDatabase +from maigret.settings import Settings +import logging + + +@pytest.mark.slow +@pytest.mark.asyncio +async def test_detect_known_engine(test_db, local_test_db): + # Use the database fixture instead of mocking + mock_db = test_db + mock_settings = MagicMock() + mock_logger = MagicMock() + mock_args = MagicMock() + mock_args.cookie_file = "" + mock_args.proxy = "" + + # Mock the supposed usernames + mock_settings.supposed_usernames = ["adam"] + # Create the Submitter instance + submitter = Submitter(test_db, mock_settings, mock_logger, mock_args) + + # Call the method with test URLs + url_exists = "https://devforum.zoom.us/u/adam" + url_mainpage = "https://devforum.zoom.us/" + # Mock extract_username_dialog to return "adam" + submitter.extract_username_dialog = MagicMock(return_value="adam") + + sites, resp_text = await submitter.detect_known_engine( + url_exists, url_mainpage, session=None, follow_redirects=False, headers=None + ) + + # Assertions + assert len(sites) == 2 + assert sites[0].name == "devforum.zoom.us" + assert sites[0].url_main == "https://devforum.zoom.us/" + assert sites[0].engine == "Discourse" + assert sites[0].username_claimed == "adam" + assert sites[0].username_unclaimed == "noonewouldeverusethis7" + assert resp_text != "" + + await submitter.close() + + # Create the Submitter instance without engines + submitter = Submitter(local_test_db, mock_settings, mock_logger, mock_args) + sites, resp_text = await submitter.detect_known_engine( + url_exists, url_mainpage, session=None, follow_redirects=False, headers=None + ) + assert len(sites) == 0 + + await submitter.close() + + +@pytest.mark.slow +@pytest.mark.asyncio +async def test_check_features_manually_success(settings): + # Setup + db = MaigretDatabase() + logger = logging.getLogger("test_logger") + args = type( + 'Args', (object,), {'proxy': None, 'cookie_file': None, 'verbose': False} + )() + + submitter = Submitter(db, settings, logger, args) + + username = "KONAMI" + url_exists = "https://play.google.com/store/apps/developer?id=KONAMI" + + # Execute + presence_list, absence_list, status, random_username = ( + await submitter.check_features_manually( + username=username, + url_exists=url_exists, + session=ClientSession(), + follow_redirects=False, + headers=None, + ) + ) + await submitter.close() + # Assert + assert status == "Found", "Expected status to be 'Found'" + assert isinstance(presence_list, list), "Presence list should be a list" + assert isinstance(absence_list, list), "Absence list should be a list" + assert isinstance(random_username, str), "Random username should be a string" + assert ( + random_username != username + ), "Random username should not be the same as the input username" + assert sorted(presence_list) == sorted( + [ + ' title=', + 'og:title', + 'display: none;', + '4;0', + 'main-title', + ] + ) + assert sorted(absence_list) == sorted( + [ + ' body {', + ' ', + '>Not Found', + '