diff --git a/maigret/maigret.py b/maigret/maigret.py index 5c64abc..16a4b01 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -368,7 +368,6 @@ async def maigret(username, site_dict, query_notify, logger, results_site['parsing_enabled'] = recursive_search results_site['url_main'] = site.url_main - headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0', } @@ -506,95 +505,102 @@ def timeout_check(value): return timeout -async def site_self_check(site_name, site_data, logger, no_progressbar=False): +async def site_self_check(site, logger, semaphore, db: MaigretDatabase, no_progressbar=False): query_notify = Mock() changes = { 'disabled': False, } - check_data = [ - (site_data.username_claimed, QueryStatus.CLAIMED), - (site_data.username_unclaimed, QueryStatus.AVAILABLE), - ] + try: + check_data = [ + (site.username_claimed, QueryStatus.CLAIMED), + (site.username_unclaimed, QueryStatus.AVAILABLE), + ] + except: + print(site.__dict__) - logger.info(f'Checking {site_name}...') + logger.info(f'Checking {site.name}...') for username, status in check_data: - results = await maigret( - username, - {site_name: site_data}, - query_notify, - logger, - timeout=30, - forced=True, - no_progressbar=no_progressbar, - ) + async with semaphore: + results_dict = await maigret( + username, + {site.name: site}, + query_notify, + logger, + timeout=30, + forced=True, + no_progressbar=True, + ) - # don't disable entries with other ids types - if site_name not in results: - logger.info(results) - changes['disabled'] = True - continue + # don't disable entries with other ids types + # TODO: make normal checking + if site.name not in results_dict: + logger.info(results_dict) + changes['disabled'] = True + continue + + result = results_dict[site.name]['status'] + + + site_status = result.status - site_status = results[site_name]['status'].status if site_status != status: if site_status == QueryStatus.UNKNOWN: - msgs = site_data.absence_strs - etype = site_data.check_type - logger.info(f'Error while searching {username} in {site_name}: {msgs}, type {etype}') + msgs = site.absence_strs + etype = site.check_type + logger.warning(f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}') # don't disable in case of available username if status == QueryStatus.CLAIMED: changes['disabled'] = True elif status == QueryStatus.CLAIMED: - logger.info(f'Not found `{username}` in {site_name}, must be claimed') - logger.info(results[site_name]) + logger.warning(f'Not found `{username}` in {site.name}, must be claimed') + logger.info(results_dict[site.name]) changes['disabled'] = True else: - logger.info(f'Found `{username}` in {site_name}, must be available') - logger.info(results[site_name]) + logger.warning(f'Found `{username}` in {site.name}, must be available') + logger.info(results_dict[site.name]) changes['disabled'] = True - logger.info(f'Site {site_name} checking is finished') + logger.info(f'Site {site.name} checking is finished') + + if changes['disabled'] != site.disabled: + site.disabled = changes['disabled'] + db.update_site(site) + action = 'Disabled' if not site.disabled else 'Enabled' + print(f'{action} site {site.name}...') + return changes -async def self_check(json_file, logger): - db = MaigretDatabase() - db.load_from_file(json_file) - sites = db.sites - all_sites = {} +async def self_check(db: MaigretDatabase, site_data: dict, logger): + sem = asyncio.Semaphore(10) + tasks = [] + all_sites = site_data - def disabled_count(data): - return len(list(filter(lambda x: x.get('disabled', False), data))) - - async def update_site_data(site_name, site_data, all_sites, logger): - updates = await site_self_check(site_name, dict(site_data), logger) - all_sites[site_name].update(updates) - - for site in sites: - all_sites[site.name] = site.information + def disabled_count(lst): + return len(list(filter(lambda x: x.disabled, lst))) disabled_old_count = disabled_count(all_sites.values()) - tasks = [] - for site_name, site_data in all_sites.items(): - future = asyncio.ensure_future(update_site_data(site_name, site_data, all_sites, logger)) + for _, site in all_sites.items(): + check_coro = site_self_check(site, logger, sem, db) + future = asyncio.ensure_future(check_coro) tasks.append(future) - await asyncio.gather(*tasks) + for f in tqdm.asyncio.tqdm.as_completed(tasks): + await f disabled_new_count = disabled_count(all_sites.values()) total_disabled = disabled_new_count - disabled_old_count - if total_disabled > 0: + + if total_disabled >= 0: message = 'Disabled' else: message = 'Enabled' total_disabled *= -1 - print(f'{message} {total_disabled} checked sites. Run with `--info` flag to get more information') - with open(json_file, 'w') as f: - data['sites'] = all_sites - json.dump(data, f, indent=4) + print(f'{message} {total_disabled} checked sites. Run with `--info` flag to get more information') async def main(): @@ -621,9 +627,6 @@ async def main(): action="store_true", dest="debug", default=False, help="Saving debugging information and sites responses in debug.txt." ) - parser.add_argument("--rank", "-r", - action="store_true", dest="rank", default=False, - help="Present websites ordered by their Alexa.com global rank in popularity.") parser.add_argument("--folderoutput", "-fo", dest="folderoutput", default="reports", help="If using multiple usernames, the output of the results will be saved to this folder." ) @@ -637,7 +640,7 @@ async def main(): ) parser.add_argument("--site", action="append", metavar='SITE_NAME', - dest="site_list", default=None, + dest="site_list", default=[], help="Limit analysis to just the listed sites (use several times to specify more than one)" ) parser.add_argument("--proxy", "-p", metavar='PROXY_URL', @@ -758,7 +761,7 @@ async def main(): usernames[v] = k if args.tags: - args.tags = set(str(args.tags).split(',')) + args.tags = list(set(str(args.tags).split(','))) if args.json_file is None: args.json_file = \ @@ -766,53 +769,40 @@ async def main(): "resources/data.json" ) - # Database self-checking - if args.self_check: - print('Maigret sites database self-checking...') - await self_check(args.json_file, logger) + if args.top_sites == 0: + args.top_sites = sys.maxsize # Create object with all information about sites we are aware of. try: db = MaigretDatabase().load_from_file(args.json_file) - site_data_all = db.ranked_sites_dict(top=args.top_sites) + site_data = db.ranked_sites_dict(top=args.top_sites, tags=args.tags, names=args.site_list) except Exception as error: print(f"ERROR: {error}") sys.exit(1) - if args.site_list is None: - # Not desired to look at a sub-set of sites - site_data = site_data_all - else: - # User desires to selectively run queries on a sub-set of the site list. - - # Make sure that the sites are supported & build up pruned site database. - site_data = {} - site_missing = [] - for site in args.site_list: - for existing_site in site_data_all: - if site.lower() == existing_site.lower(): - site_data[existing_site] = site_data_all[existing_site] - if not site_data: - # Build up list of sites not supported for future error message. - site_missing.append(f"'{site}'") - - if site_missing: - print( - f"Error: Desired sites not found: {', '.join(site_missing)}.") - sys.exit(1) - - if args.rank: - # Sort data by rank - site_dataCpy = dict(site_data) - ranked_sites = sorted(site_data, key=lambda k: ("rank" not in k, site_data[k].get("rank", sys.maxsize))) - site_data = {} - for site in ranked_sites: - site_data[site] = site_dataCpy.get(site) + # Database self-checking + if args.self_check: + print('Maigret sites database self-checking...') + await self_check(db, site_data, logger) + if input('Do you want to save changes permanently? [yYnN]\n').lower() == 'y': + db.save_to_file(args.json_file) + print('Database was successfully updated.') + else: + print('Updates will be applied only for current search session.') # Database consistency enabled_count = len(list(filter(lambda x: not x.disabled, site_data.values()))) print(f'Sites in database, enabled/total: {enabled_count}/{len(site_data)}') + if not enabled_count: + print('No sites to check, exiting!') + sys.exit(2) + + if usernames == ['-']: + # magic params to exit after init + print('No usernames to check, exiting.') + sys.exit(0) + # Create notify object for query results. query_notify = QueryNotifyPrint(result=None, verbose=args.verbose, diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 6827447..5ab47fd 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -5,7 +5,7 @@ "ru" ], "engine": "XenForo", - "alexaRank": 7547465, + "alexaRank": 7547627, "urlMain": "http://0-3.ru", "usernameClaimed": "donna", "usernameUnclaimed": "noonewouldeverusethis7" @@ -15,7 +15,7 @@ "ru" ], "engine": "uCoz", - "alexaRank": 8000787, + "alexaRank": 7993480, "urlMain": "http://0k.clan.su", "usernameClaimed": "eruzz", "usernameUnclaimed": "noonewouldeverusethis7" @@ -26,7 +26,7 @@ ], "checkType": "message", "absenceStrs": "\u042d\u0442\u043e\u0442 \u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044c \u043d\u0435 \u0441\u0443\u0449\u0435\u0441\u0442\u0432\u0443\u0435\u0442, \u0438\u043b\u0438 \u0437\u0430\u0431\u043b\u043e\u043a\u0438\u0440\u043e\u0432\u0430\u043d.", - "alexaRank": 1807380, + "alexaRank": 1791104, "url": "http://1001mem.ru/{username}", "urlMain": "http://1001mem.ru", "usernameClaimed": "adam", @@ -43,7 +43,7 @@ "absenceStrs": [ "Sorry, the requested user is not valid!" ], - "alexaRank": 32925, + "alexaRank": 33209, "url": "https://www.1001tracklists.com/user/{username}/index.html", "urlMain": "https://www.1001tracklists.com", "usernameClaimed": "JacoWilles", @@ -56,7 +56,7 @@ "ru" ], "engine": "XenForo", - "alexaRank": 138628, + "alexaRank": 133302, "urlMain": "https://forum-ru.101xp.com", "usernameClaimed": "aida", "usernameUnclaimed": "noonewouldeverusethis7" @@ -66,7 +66,7 @@ "global" ], "checkType": "status_code", - "alexaRank": 1827622, + "alexaRank": 1834012, "url": "https://11x2.com/user/home/{username}", "urlMain": "https://11x2.com", "usernameClaimed": "hazelamy", @@ -80,7 +80,7 @@ "us" ], "checkType": "response_url", - "alexaRank": 951, + "alexaRank": 987, "url": "https://ru.123rf.com/profile_{username}", "urlMain": "https://ru.123rf.com", "usernameClaimed": "rawpixel", @@ -94,7 +94,7 @@ ], "checkType": "message", "absenceStrs": "Error something went wrong", - "alexaRank": 477, + "alexaRank": 483, "url": "https://1337x.to/user/{username}/", "urlMain": "https://1337x.to", "usernameClaimed": "adam", @@ -108,7 +108,7 @@ ], "checkType": "message", "absenceStrs": "This user does not exist or is not approved yet. Come back later.", - "alexaRank": 218377, + "alexaRank": 181244, "url": "https://1x.com/member/{username}", "urlMain": "https://1x.com", "usernameClaimed": "blue", @@ -119,7 +119,7 @@ "ru" ], "engine": "vBulletin", - "alexaRank": 1602938, + "alexaRank": 1586622, "urlMain": "https://1xforum.com", "usernameClaimed": "adam", "usernameUnclaimed": "noonewouldeverusethis7" @@ -130,7 +130,7 @@ "us" ], "checkType": "status_code", - "alexaRank": 2243, + "alexaRank": 2118, "url": "https://247sports.com/user/{username}/", "urlMain": "https://247sports.com", "usernameClaimed": "adam", @@ -143,7 +143,7 @@ "us" ], "checkType": "status_code", - "alexaRank": 50692, + "alexaRank": 53077, "url": "https://24open.ru/user/{username}/", "urlMain": "https://24open.ru", "usernameClaimed": "niko3193", @@ -162,7 +162,7 @@ ], "checkType": "message", "absenceStrs": "\u041f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044c \u043d\u0435 \u043d\u0430\u0439\u0434\u0435\u043d", - "alexaRank": 760796, + "alexaRank": 718982, "url": "https://2berega.spb.ru/user/{username}", "urlMain": "https://2berega.spb.ru", "usernameClaimed": "adam", @@ -173,7 +173,7 @@ "ru" ], "checkType": "status_code", - "alexaRank": 339362, + "alexaRank": 356718, "url": "https://www.2d-3d.ru/user/{username}/", "urlMain": "https://www.2d-3d.ru", "usernameClaimed": "adam", @@ -185,7 +185,7 @@ ], "checkType": "message", "absenceStrs": "Deze gebruiker is niet geregistreerd, zodat je zijn of haar profiel niet kunt bekijken.", - "alexaRank": 6235500, + "alexaRank": 6234503, "url": "https://www.2fast4u.be/members/?username={username}", "urlMain": "https://www.2fast4u.be", "usernameClaimed": "Schussboelie", @@ -201,7 +201,7 @@ "\u041f\u0440\u043e\u0444\u0438\u043b\u044c \u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044f" ], "absenceStrs": "\u0418\u0437\u0432\u0438\u043d\u0438\u0442\u0435, \u0442\u0430\u043a\u043e\u0433\u043e \u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044f \u043d\u0435 \u0441\u0443\u0449\u0435\u0441\u0442\u0432\u0443\u0435\u0442", - "alexaRank": 1456238, + "alexaRank": 1463655, "url": "http://{username}.33bru.com/", "urlMain": "http://33bru.com/", "usernameClaimed": "adam", @@ -224,7 +224,7 @@ ], "checkType": "message", "absenceStrs": "The specified member cannot be found", - "alexaRank": 1336816, + "alexaRank": 1345435, "url": "https://www.3dcadforums.com/members/?username={username}", "urlMain": "https://www.3dcadforums.com/", "usernameClaimed": "adam", @@ -235,7 +235,7 @@ "ru" ], "checkType": "status_code", - "alexaRank": 12873, + "alexaRank": 12962, "url": "https://3ddd.ru/users/{username}", "urlMain": "https://3ddd.ru", "usernameClaimed": "adam", @@ -246,7 +246,7 @@ "ru" ], "engine": "vBulletin", - "alexaRank": 8355, + "alexaRank": 8129, "urlMain": "http://forum.3dnews.ru/", "usernameClaimed": "red", "usernameUnclaimed": "noonewouldeverusethis7" @@ -256,7 +256,7 @@ "ru" ], "checkType": "response_url", - "alexaRank": 74713, + "alexaRank": 71510, "url": "https://3dtoday.ru/blogs/{username}", "urlMain": "https://3dtoday.ru/", "usernameClaimed": "adam", @@ -267,7 +267,7 @@ "ru" ], "engine": "vBulletin", - "alexaRank": 347384, + "alexaRank": 324399, "urlMain": "https://4cheat.ru", "usernameClaimed": "adam", "usernameUnclaimed": "noonewouldeverusethis7" @@ -277,7 +277,7 @@ "ru" ], "engine": "XenForo", - "alexaRank": 87134, + "alexaRank": 84167, "urlMain": "https://4gameforum.com", "usernameClaimed": "persty", "usernameUnclaimed": "noonewouldeverusethis7" @@ -288,7 +288,7 @@ ], "checkType": "message", "absenceStrs": "\u041a \u0441\u043e\u0436\u0430\u043b\u0435\u043d\u0438\u044e, \u0412\u0430\u0448 \u043f\u043e\u0438\u0441\u043a \u043d\u0435 \u0434\u0430\u043b \u043d\u0438\u043a\u0430\u043a\u0438\u0445 \u0440\u0435\u0437\u0443\u043b\u044c\u0442\u0430\u0442\u043e\u0432.", - "alexaRank": 2874, + "alexaRank": 2743, "url": "https://4pda.ru/forum/index.php?act=search&source=pst&noform=1&username={username}", "urlMain": "https://4pda.ru/", "usernameClaimed": "green", @@ -299,7 +299,7 @@ "ru" ], "checkType": "status_code", - "alexaRank": 214524, + "alexaRank": 198703, "url": "https://4stor.ru/user/{username}", "urlMain": "https://4stor.ru", "usernameClaimed": "adam", @@ -307,9 +307,9 @@ }, "500px": { "tags": [ - "photos", + "global", "in", - "global" + "photos" ], "errors": { "INTERNAL_SERVER_ERROR": "Site error", @@ -318,7 +318,7 @@ "urlProbe": "https://api.500px.com/graphql?operationName=ProfileRendererQuery&variables=%7B%22username%22%3A%22{username}%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%225a17a9af1830b58b94a912995b7947b24f27f1301c6ea8ab71a9eb1a6a86585b%22%7D%7D", "checkType": "message", "absenceStrs": "No message available", - "alexaRank": 2981, + "alexaRank": 2970, "url": "https://500px.com/p/{username}", "urlMain": "https://500px.com/", "usernameClaimed": "blue", @@ -326,7 +326,7 @@ }, "50cc.com.ua": { "engine": "uCoz", - "alexaRank": 9871592, + "alexaRank": 7002071, "urlMain": "http://50cc.com.ua", "usernameClaimed": "noonewouldeverusethis7", "usernameUnclaimed": "alex" @@ -339,7 +339,7 @@ }, "74507.ucoz.ru": { "engine": "uCoz", - "alexaRank": 8687644, + "alexaRank": 8661177, "urlMain": "http://74507.ucoz.ru", "usernameClaimed": "noonewouldeverusethis7", "usernameUnclaimed": "alex" @@ -351,7 +351,7 @@ "us" ], "checkType": "status_code", - "alexaRank": 34978, + "alexaRank": 36192, "url": "https://www.7cups.com/@{username}", "urlMain": "https://www.7cups.com/", "usernameClaimed": "blue", @@ -362,7 +362,7 @@ "ru" ], "checkType": "status_code", - "alexaRank": 14197, + "alexaRank": 15616, "url": "https://7dach.ru/profile/{username}", "urlMain": "https://7dach.ru/", "usernameClaimed": "adam", @@ -373,7 +373,7 @@ "ru" ], "checkType": "status_code", - "alexaRank": 47803, + "alexaRank": 50336, "url": "https://blog.7ya.ru/{username}/", "urlMain": "https://blog.7ya.ru", "usernameClaimed": "trotter", @@ -385,7 +385,7 @@ "us" ], "checkType": "status_code", - "alexaRank": 401, + "alexaRank": 407, "url": "https://www.9gag.com/u/{username}", "urlMain": "https://www.9gag.com/", "usernameClaimed": "blue", @@ -409,7 +409,7 @@ ], "checkType": "message", "absenceStrs": "\u041f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044c \u0441 \u0442\u0430\u043a\u0438\u043c \u0438\u043c\u0435\u043d\u0435\u043c \u043d\u0435 \u043d\u0430\u0439\u0434\u0435\u043d.", - "alexaRank": 9855006, + "alexaRank": 9823851, "url": "https://aback.com.ua/user/{username}", "urlMain": "https://aback.com.ua", "usernameClaimed": "adam", @@ -421,7 +421,7 @@ "social" ], "checkType": "status_code", - "alexaRank": 12196, + "alexaRank": 12200, "url": "https://about.me/{username}", "urlMain": "https://about.me/", "usernameClaimed": "blue", @@ -433,7 +433,7 @@ ], "checkType": "message", "absenceStrs": "\u041f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044c \u043d\u0435 \u0437\u0430\u0440\u0435\u0433\u0438\u0441\u0442\u0440\u0438\u0440\u043e\u0432\u0430\u043d \u0438 \u043d\u0435 \u0438\u043c\u0435\u0435\u0442 \u043f\u0440\u043e\u0444\u0438\u043b\u044f \u0434\u043b\u044f \u043f\u0440\u043e\u0441\u043c\u043e\u0442\u0440\u0430.", - "alexaRank": 3701492, + "alexaRank": 3712080, "url": "http://aboutcar.ru/members/{username}.html", "urlMain": "http://aboutcar.ru", "usernameClaimed": "krolenya", @@ -445,7 +445,7 @@ ], "regexCheck": "^[^\\.]+$", "checkType": "status_code", - "alexaRank": 276, + "alexaRank": 280, "url": "https://independent.academia.edu/{username}", "urlMain": "https://www.academia.edu/", "usernameClaimed": "blue", @@ -456,7 +456,7 @@ "ru" ], "checkType": "status_code", - "alexaRank": 161291, + "alexaRank": 163888, "url": "https://acomics.ru/-{username}", "urlMain": "https://acomics.ru", "usernameClaimed": "Garage", @@ -470,7 +470,7 @@ ], "checkType": "message", "absenceStrs": "