diff --git a/.githooks/pre-commit b/.githooks/pre-commit index e075419..207c24a 100755 --- a/.githooks/pre-commit +++ b/.githooks/pre-commit @@ -1,2 +1,3 @@ #!/bin/sh -python3 ./utils/update_site_data.py +echo 'Activating update_sitesmd hook script...' +poetry run update_sitesmd \ No newline at end of file diff --git a/maigret/activation.py b/maigret/activation.py index 67d6f9c..65d9024 100644 --- a/maigret/activation.py +++ b/maigret/activation.py @@ -1,3 +1,4 @@ +import json from http.cookiejar import MozillaCookieJar from http.cookies import Morsel @@ -25,6 +26,7 @@ class ParsingActivator: import requests r = requests.get(site.activation["url"], headers=headers) + logger.debug(f"Vimeo viewer activation: {json.dumps(r.json(), indent=4)}") jwt_token = r.json()["jwt"] site.headers["Authorization"] = "jwt " + jwt_token diff --git a/maigret/resources/data.json b/maigret/resources/data.json index f10f3b6..5222056 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -2757,16 +2757,27 @@ ], "checkType": "message", "absenceStrs": [ - "

null null

" + "error_404", + "c-error404", + "Author not found", + "c-error404_back", + "c-error404_header" ], "presenseStrs": [ - "Joined CNET:" + "},firstName:", + "#email", + ",cmsDisplayName:", + "og:title", + "c-pageProfile" ], "alexaRank": 181, - "urlMain": "https://www.cnet.com/", + "urlMain": "https://www.cnet.com", "url": "https://www.cnet.com/profiles/{username}/", "usernameClaimed": "leadicicco", - "usernameUnclaimed": "noonewouldeverusethis" + "usernameUnclaimed": "chexowcxzm", + "headers": { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36" + } }, "CORSAIR": { "urlSubpath": "/v3", @@ -3187,16 +3198,27 @@ ], "checkType": "message", "absenceStrs": [ - "The page you are looking for doesn\u2019t exist. (404)" + "error image", + "

404 Page not found

", + "_404-header", + "_404-inner-container", + " no-nav " ], "presenseStrs": [ - "Full Stats" + "profile-top", + "og:title", + " style=", + "view-profile", + " data-username=" ], "alexaRank": 211, - "urlMain": "https://www.chess.com/", + "urlMain": "https://www.chess.com", "url": "https://www.chess.com/member/{username}", - "usernameClaimed": "blue", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameClaimed": "sexytwerker69", + "usernameUnclaimed": "aublurbrxm", + "headers": { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36" + } }, "Chess-russia": { "tags": [ @@ -3832,18 +3854,31 @@ }, "DailyMotion": { "tags": [ - "us", "video" ], "checkType": "message", "presenseStrs": [ - "al:ios:app_name" + " style=", + "", + "og:title", + "Twitter", + "og:site_name" ], "alexaRank": 263, - "urlMain": "https://www.dailymotion.com/", + "urlMain": "https://www.dailymotion.com", "url": "https://www.dailymotion.com/{username}", "usernameClaimed": "blue", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "rstnodkwzr", + "absenceStrs": [ + "Page not found", + "profile", + "error404", + "bodyall", + "No matches found" + ], + "headers": { + "User-Agent": "" + } }, "Dalnoboi": { "tags": [ @@ -7037,7 +7072,7 @@ "alexaRank": 1, "urlMain": "https://play.google.com/store", "url": "https://play.google.com/store/apps/developer?id={username}", - "usernameClaimed": "OpenAI", + "usernameClaimed": "KONAMI", "usernameUnclaimed": "noonewouldeverusethis7" }, "Gorod.dp.ua": { @@ -10583,13 +10618,27 @@ ], "checkType": "message", "absenceStrs": [ - "Page Not Found | Mozilla" + ">Page Not Found</h1>", + "error-page", + "sumo-page-intro", + "search-results-visible page-not-found", + "search-empty" ], "alexaRank": 172, "urlMain": "https://support.mozilla.org", - "url": "https://support.mozilla.org/en-US/user/{username}", - "usernameClaimed": "adam", - "usernameUnclaimed": 
"noonewouldeverusethis7" + "url": "https://support.mozilla.org/en-US/user/{username}/", + "usernameClaimed": "derekmarable", + "usernameUnclaimed": "tasgcxxxcz", + "presenseStrs": [ + "user-nav", + "</article>", + "sidebar-nav", + "noindex", + "sidebar-nav--item" + ], + "headers": { + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36" + } }, "Mpgh": { "urlSubpath": "/forum", @@ -17412,26 +17461,25 @@ }, "Vimeo": { "tags": [ - "us", "video" ], - "headers": { - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3MzM4MzkwODAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiOWNjMjk0ZjktZGZhOS00NDI0LWE0OGEtN2JjYzkwYjM2NTMyIn0.wG0kC7fWtrdKI9ccS-LE81lVgQRfYobrqCAPWxr1wzc" - }, "activation": { "url": "https://vimeo.com/_rv/viewer", "marks": [ - "Something strange occurred. Please get in touch with the app's creator." + "Something strange occurred. 
Please get in touch" ], "method": "vimeo" }, + "headers": { + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3MzM4NzYyODAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiMDk0ZjY5MjctZDJhMy00ZTY3LWI1N2EtN2IwMjBlOTcyZjQ5In0.dxgGrY7vQs6DW3sfKaOJy4UL8MKjMK-ssr_kndr9_vY" + }, "urlProbe": "https://api.vimeo.com/users/{username}?fields=name%2Cgender%2Cbio%2Curi%2Clink%2Cbackground_video%2Clocation_details%2Cpictures%2Cverified%2Cmetadata.public_videos.total%2Cavailable_for_hire%2Ccan_work_remotely%2Cmetadata.connections.videos.total%2Cmetadata.connections.albums.total%2Cmetadata.connections.followers.total%2Cmetadata.connections.following.total%2Cmetadata.public_videos.total%2Cmetadata.connections.vimeo_experts.is_enrolled%2Ctotal_collection_count%2Ccreated_time%2Cprofile_preferences%2Cmembership%2Cclients%2Cskills%2Cproject_types%2Crates%2Ccategories%2Cis_expert%2Cprofile_discovery%2Cwebsites%2Ccontact_emails&fetch_user_profile=1", "checkType": "status_code", "alexaRank": 148, - "urlMain": "https://vimeo.com/", + "urlMain": "https://vimeo.com", "url": "https://vimeo.com/{username}", "usernameClaimed": "blue", - "usernameUnclaimed": "noonewouldeverusethis7" + "usernameUnclaimed": "smbepezbrg" }, "Virgool": { "disabled": true, @@ -30424,6 +30472,9 @@ "presenseStrs": [ "collectionName" ], + "errors": { + "recaptchaKey": "Captcha detected" + }, "url": "https://www.istockphoto.com/ru/portfolio/{username}", "urlMain": "https://www.istockphoto.com", "usernameClaimed": "leowilde", diff --git a/maigret/sites.py b/maigret/sites.py index 17216bd..b0bfdb3 100644 --- a/maigret/sites.py +++ b/maigret/sites.py @@ -560,6 +560,17 @@ class MaigretDatabase: checks_perc = round(100 * message_checks_one_factor / enabled_count, 2) status_checks_perc = round(100 * status_checks / enabled_count, 2) + # Sites with probing and activation (kinda special cases, let's watch them) + site_with_probing = [] + 
site_with_activation = [] + for site in sites_dict.values(): + def get_site_label(site): + return f"{site.name}{' (disabled)' if site.disabled else ''}" + if site.url_probe: + site_with_probing.append(get_site_label(site)) + if site.activation: + site_with_activation.append(get_site_label(site)) + # Format output separator = "\n\n" output = [ @@ -567,6 +578,8 @@ class MaigretDatabase: f"Incomplete message checks: {message_checks_one_factor}/{enabled_count} = {checks_perc}% (false positive risks)", f"Status code checks: {status_checks}/{enabled_count} = {status_checks_perc}% (false positive risks)", f"False positive risk (total): {checks_perc + status_checks_perc:.2f}%", + f"Sites with probing: {', '.join(sorted(site_with_probing))}", + f"Sites with activation: {', '.join(sorted(site_with_activation))}", self._format_top_items("profile URLs", urls, 20, is_markdown), self._format_top_items("tags", tags, 20, is_markdown, self._tags), ] diff --git a/maigret/submit.py b/maigret/submit.py index a3ab8fd..31750ec 100644 --- a/maigret/submit.py +++ b/maigret/submit.py @@ -184,7 +184,7 @@ class Submitter: url_parts = url.rstrip("/").split("/") supposed_username = url_parts[-1].strip('@') entered_username = input( - f'Is "{supposed_username}" a valid username? If not, write it manually: ' + f"{Fore.GREEN}[?] Is \"{supposed_username}\" a valid username? 
If not, write it manually: {Style.RESET_ALL}" ) return entered_username if entered_username else supposed_username @@ -390,6 +390,13 @@ class Submitter: } async def dialog(self, url_exists, cookie_file): + """ + An implementation of the submit mode: + - User provides a URL of an existing social media account + - Maigret tries to detect the site engine and understand how to check + for account presence by analyzing HTTP responses + - If detection succeeds, Maigret generates a new site entry or replaces the old one in the database + """ old_site = None additional_options_enabled = self.logger.level in ( logging.DEBUG, @@ -444,6 +451,15 @@ class Submitter: f'{Fore.GREEN}[+] We will update site "{old_site.name}" in case of success.{Style.RESET_ALL}' ) + # Check if the site check is ordinary or not + if old_site and (old_site.url_probe or old_site.activation): + skip = input(f"{Fore.RED}[!] The site check depends on activation / probing mechanism! Consider to update it manually. Continue? [yN]{Style.RESET_ALL}") + if skip.lower() in ['n', '']: + return False + + # TODO: urlProbe support + # TODO: activation support + url_mainpage = self.extract_mainpage_url(url_exists) # headers update @@ -493,6 +509,8 @@ class Submitter: supposed_username = self.extract_username_dialog(url_exists) self.logger.info(f"Supposed username: {supposed_username}") + # TODO: pass status_codes + # check it here and suggest to enable / auto-enable redirects presence_list, absence_list, status, non_exist_username = ( await self.check_features_manually( username=supposed_username, @@ -511,6 +529,7 @@ class Submitter: "urlMain": url_mainpage, "usernameClaimed": supposed_username, "usernameUnclaimed": non_exist_username, + "headers": custom_headers, "checkType": "message", } self.logger.info(json.dumps(site_data, indent=4)) @@ -581,8 +600,11 @@ class Submitter: self.logger.info(f"New site name is {new_name}") chosen_site.name = new_name - # TODO: remove empty tags - new_tags = input(f"{Fore.GREEN}[?] 
Site tags: {Style.RESET_ALL}") + default_tags_str = "" + if old_site: + default_tags_str = f' [{", ".join(old_site.tags)}]' + + new_tags = input(f"{Fore.GREEN}[?] Site tags{default_tags_str}: {Style.RESET_ALL}") if new_tags: chosen_site.tags = list(map(str.strip, new_tags.split(','))) else: diff --git a/pyproject.toml b/pyproject.toml index 9aabe56..fef716c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,3 +90,4 @@ coverage = "^7.6.9" [tool.poetry.scripts] # Run with: poetry run maigret <username> maigret = "maigret.maigret:run" +update_sitesmd = "utils.update_site_data:main" \ No newline at end of file diff --git a/sites.md b/sites.md index 3d3d768..f0fd2db 100644 --- a/sites.md +++ b/sites.md @@ -88,7 +88,7 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://www.blogger.com) [Blogger (by GAIA id) (https://www.blogger.com)](https://www.blogger.com)*: top 500, blog* 1. ![](https://www.google.com/s2/favicons?domain=https://www.researchgate.net/) [ResearchGate (https://www.researchgate.net/)](https://www.researchgate.net/)*: top 500, in, us* 1. ![](https://www.google.com/s2/favicons?domain=https://www.freepik.com) [Freepik (https://www.freepik.com)](https://www.freepik.com)*: top 500, art, photo, stock* -1. ![](https://www.google.com/s2/favicons?domain=https://vimeo.com/) [Vimeo (https://vimeo.com/)](https://vimeo.com/)*: top 500, us, video* +1. ![](https://www.google.com/s2/favicons?domain=https://vimeo.com) [Vimeo (https://vimeo.com)](https://vimeo.com)*: top 500, video* 1. ![](https://www.google.com/s2/favicons?domain=https://www.pinterest.com/) [Pinterest (https://www.pinterest.com/)](https://www.pinterest.com/)*: top 500, art, photo, sharing* 1. ![](https://www.google.com/s2/favicons?domain=https://www.fiverr.com/) [Fiverr (https://www.fiverr.com/)](https://www.fiverr.com/)*: top 500, shopping, us* 1. 
![](https://www.google.com/s2/favicons?domain=https://t.me/) [Telegram (https://t.me/)](https://t.me/)*: top 500, messaging* @@ -96,18 +96,18 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://theguardian.com) [TheGuardian (https://theguardian.com)](https://theguardian.com)*: top 500, news, us*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://trello.com/) [Trello (https://trello.com/)](https://trello.com/)*: top 500, tasks* 1. ![](https://www.google.com/s2/favicons?domain=https://support.mozilla.org) [Mozilla Support (https://support.mozilla.org)](https://support.mozilla.org)*: top 500, us* -1. ![](https://www.google.com/s2/favicons?domain=https://www.cnet.com/) [CNET (https://www.cnet.com/)](https://www.cnet.com/)*: top 500, news, tech, us* +1. ![](https://www.google.com/s2/favicons?domain=https://www.cnet.com) [CNET (https://www.cnet.com)](https://www.cnet.com)*: top 500, news, tech, us* 1. ![](https://www.google.com/s2/favicons?domain=https://www.shutterstock.com) [Shutterstock (https://www.shutterstock.com)](https://www.shutterstock.com)*: top 500, music, photo, stock, us* 1. ![](https://www.google.com/s2/favicons?domain=https://wix.com/) [Wix (https://wix.com/)](https://wix.com/)*: top 500, us* 1. ![](https://www.google.com/s2/favicons?domain=https://slack.com) [Slack (https://slack.com)](https://slack.com)*: top 500, messaging* -1. ![](https://www.google.com/s2/favicons?domain=https://www.chess.com/) [Chess (https://www.chess.com/)](https://www.chess.com/)*: top 500, gaming, hobby* +1. ![](https://www.google.com/s2/favicons?domain=https://www.chess.com) [Chess (https://www.chess.com)](https://www.chess.com)*: top 500, gaming, hobby* 1. ![](https://www.google.com/s2/favicons?domain=https://upwork.com) [upwork.com (https://upwork.com)](https://upwork.com)*: top 500, us* 1. 
![](https://www.google.com/s2/favicons?domain=https://archive.org) [Archive.org (https://archive.org)](https://archive.org)*: top 500*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://www.figma.com/) [Figma (https://www.figma.com/)](https://www.figma.com/)*: top 500, design* 1. ![](https://www.google.com/s2/favicons?domain=https://www.istockphoto.com) [iStock (https://www.istockphoto.com)](https://www.istockphoto.com)*: top 500, photo, stock* 1. ![](https://www.google.com/s2/favicons?domain=https://www.scribd.com/) [Scribd (https://www.scribd.com/)](https://www.scribd.com/)*: top 500, reading* 1. ![](https://www.google.com/s2/favicons?domain=https://opensea.io) [opensea.io (https://opensea.io)](https://opensea.io)*: top 500, us* -1. ![](https://www.google.com/s2/favicons?domain=https://www.dailymotion.com/) [DailyMotion (https://www.dailymotion.com/)](https://www.dailymotion.com/)*: top 500, us, video* +1. ![](https://www.google.com/s2/favicons?domain=https://www.dailymotion.com) [DailyMotion (https://www.dailymotion.com)](https://www.dailymotion.com)*: top 500, video* 1. ![](https://www.google.com/s2/favicons?domain=https://www.behance.net/) [Behance (https://www.behance.net/)](https://www.behance.net/)*: top 500, business* 1. ![](https://www.google.com/s2/favicons?domain=http://www.yelp.com) [Yelp (http://www.yelp.com)](http://www.yelp.com)*: top 500, review*, search is disabled 1. ![](https://www.google.com/s2/favicons?domain=https://www.yelp.com) [Yelp (by id) (https://www.yelp.com)](https://www.yelp.com)*: top 500, review* @@ -3141,16 +3141,20 @@ Rank data fetched from Alexa by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://pubg.op.gg) [OP.GG [PUBG] (https://pubg.op.gg)](https://pubg.op.gg)*: top 100M, gaming* 1. 
![](https://www.google.com/s2/favicons?domain=https://valorant.op.gg) [OP.GG [Valorant] (https://valorant.op.gg)](https://valorant.op.gg)*: top 100M, gaming* -The list was updated at (2024-12-10) +The list was updated at (2024-12-11) ## Statistics Enabled/total sites: 2693/3137 = 85.85% -Incomplete message checks: 397/2693 = 14.74% (false positive risks) +Incomplete message checks: 395/2693 = 14.67% (false positive risks) -Status code checks: 719/2693 = 26.7% (false positive risks) +Status code checks: 616/2693 = 22.87% (false positive risks) -False positive risk (total): 41.44% +False positive risk (total): 37.54% + +Sites with probing: 500px, Aparat, BinarySearch (disabled), BongaCams, BuyMeACoffee, Cent, Disqus, Docker Hub, Duolingo, Gab, GitHub, GitLab, Google Plus (archived), Gravatar, Imgur, Issuu, Keybase, Livejasmin, LocalCryptos (disabled), MixCloud, Niftygateway, Reddit Search (Pushshift) (disabled), SportsTracker, Spotify (disabled), TAP'D, Trello, Twitch, Twitter, Twitter Shadowban (disabled), UnstoppableDomains, Vimeo, Weibo, Yapisal (disabled), YouNow, nightbot, notabug.org, polarsteps, qiwi.me (disabled) + +Sites with activation: Spotify (disabled), Twitter, Vimeo, Weibo Top 20 profile URLs: - (796) `{urlMain}/index/8-0-{username} (uCoz)` @@ -3174,24 +3178,25 @@ Top 20 profile URLs: - (17) `/forum/members/?username={username}` - (17) `/search.php?keywords=&terms=all&author={username}` + Top 20 tags: -- (328) `NO_TAGS` (non-standard) -- (307) `forum` -- (50) `gaming` -- (26) `coding` -- (21) `photo` -- (20) `blog` -- (19) `news` -- (15) `music` -- (14) `tech` -- (12) `freelance` -- (12) `finance` -- (11) `sharing` -- (10) `dating` -- (10) `art` -- (10) `shopping` -- (10) `movies` -- (8) `crypto` -- (7) `sport` -- (7) `hobby` -- (7) `hacking` +- (1105) `NO_TAGS` (non-standard) +- (735) `forum` +- (92) `gaming` +- (48) `photo` +- (41) `coding` +- (30) `tech` +- (29) `news` +- (28) `blog` +- (23) `music` +- (19) `finance` +- (18) `crypto` +- (16) 
`sharing` +- (16) `freelance` +- (15) `art` +- (15) `shopping` +- (13) `sport` +- (13) `business` +- (12) `movies` +- (11) `hobby` +- (11) `education` diff --git a/utils/update_site_data.py b/utils/update_site_data.py index dd275fe..c1d5273 100755 --- a/utils/update_site_data.py +++ b/utils/update_site_data.py @@ -67,7 +67,7 @@ def get_step_rank(rank): return get_readable_rank(list(filter(lambda x: x >= rank, valid_step_ranks))[0]) -if __name__ == '__main__': +def main(): parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter ) parser.add_argument("--base","-b", metavar="BASE_FILE", @@ -86,6 +86,8 @@ if __name__ == '__main__': db = MaigretDatabase() sites_subset = db.load_from_file(args.base_file).sites + print(f"\nUpdating supported sites list (don't worry, it's needed)...") + with open("sites.md", "w") as site_file: site_file.write(f""" ## List of supported sites (search methods): total {len(sites_subset)}\n @@ -144,4 +146,8 @@ Rank data fetched from Alexa by domains. site_file.write('## Statistics\n\n') site_file.write(statistics_text) - print("\nFinished updating supported site listing!") + print("Finished updating supported site listing!") + + +if __name__ == '__main__': + main()