Improved self-check mode (#1887)

This commit is contained in:
Soxoj
2024-11-25 18:27:59 +01:00
committed by GitHub
parent d8a05807ba
commit 13c20afe5b
7 changed files with 80 additions and 26 deletions
+13 -1
View File
@@ -869,6 +869,11 @@ async def site_self_check(
action = "Disabled" if site.disabled else "Enabled"
print(f"{action} site {site.name}...")
# remove service tag "unchecked"
if "unchecked" in site.tags:
site.tags.remove("unchecked")
db.update_site(site)
return changes
@@ -889,6 +894,7 @@ async def self_check(
def disabled_count(lst):
    """Return how many items in ``lst`` have a truthy ``disabled`` attribute.

    Args:
        lst: Any iterable of objects exposing a ``disabled`` attribute
            (the callers pass ``all_sites.values()``).

    Returns:
        int: The number of items whose ``disabled`` attribute is truthy.
    """
    # Count lazily rather than materializing a filtered list only to measure it.
    return sum(1 for item in lst if item.disabled)
unchecked_old_count = len([site for site in all_sites.values() if "unchecked" in site.tags])
disabled_old_count = disabled_count(all_sites.values())
for _, site in all_sites.items():
@@ -898,12 +904,15 @@ async def self_check(
future = asyncio.ensure_future(check_coro)
tasks.append(future)
if tasks:
for f in tqdm.asyncio.tqdm.as_completed(tasks):
await f
unchecked_new_count = len([site for site in all_sites.values() if "unchecked" in site.tags])
disabled_new_count = disabled_count(all_sites.values())
total_disabled = disabled_new_count - disabled_old_count
if total_disabled:
if total_disabled >= 0:
message = "Disabled"
else:
@@ -916,4 +925,7 @@ async def self_check(
"Run with `--info` flag to get more information"
)
return total_disabled != 0
if unchecked_new_count != unchecked_old_count:
print(f"Unchecked sites verified: {unchecked_old_count - unchecked_new_count}")
return total_disabled != 0 or unchecked_new_count != unchecked_old_count
+8 -2
View File
@@ -569,7 +569,11 @@ async def main():
# Database self-checking
if args.self_check:
print('Maigret sites database self-checking...')
if len(site_data) == 0:
query_notify.warning('No sites to self-check with the current filters! Exiting...')
return
query_notify.success(f'Maigret sites database self-check started for {len(site_data)} sites...')
is_need_update = await self_check(
db,
site_data,
@@ -588,7 +592,9 @@ async def main():
print('Database was successfully updated.')
else:
print('Updates will be applied only for current search session.')
print('Scan sessions flags stats: ' + str(db.get_scan_stats(site_data)))
if args.verbose or args.debug:
query_notify.info('Scan sessions flags stats: ' + str(db.get_scan_stats(site_data)))
# Database statistics
if args.stats:
+4
View File
@@ -211,6 +211,10 @@ class QueryNotifyPrint(QueryNotify):
else:
print(msg)
def success(self, message, symbol="+"):
    """Report a success message.

    The text is formatted as ``[symbol] message`` (``symbol`` defaults to
    ``"+"``) and passed to ``self._colored_print`` with ``Fore.GREEN``.
    """
    self._colored_print(Fore.GREEN, f"[{symbol}] {message}")
# Report a warning message: format it as "[symbol] message" (symbol defaults
# to "-") and pass it to self._colored_print together with Fore.YELLOW.
def warning(self, message, symbol="-"):
msg = f"[{symbol}] {message}"
self._colored_print(Fore.YELLOW, msg)
+10 -1
View File
@@ -1,16 +1,25 @@
{
"presence_strings": [
"user not found",
"404",
"Page not found",
"error 404",
"username",
"not found",
"пользователь",
"profile",
"lastname",
"firstname",
"DisplayName",
"biography",
"title",
"birthday",
"репутация",
"информация",
"e-mail"
"e-mail",
"body",
"html",
"style"
],
"supposed_usernames": [
"alex", "god", "admin", "red", "blue", "john"
+30 -9
View File
@@ -43,7 +43,7 @@ class Submitter:
"User-Agent": get_random_user_agent(),
}
SEPARATORS = "\"'"
SEPARATORS = "\"'\n"
RATIO = 0.6
TOP_FEATURES = 5
@@ -138,17 +138,19 @@ class Submitter:
if status == QueryStatus.CLAIMED:
changes["disabled"] = True
elif status == QueryStatus.CLAIMED:
self.logger.warning(
f"Not found `{username}` in {site.name}, must be claimed"
print(
f"{Fore.YELLOW}[!] Not found `{username}` in {site.name}, must be claimed{Style.RESET_ALL}"
)
self.logger.info(results_dict[site.name])
self.logger.warning(site.json)
changes["disabled"] = True
else:
self.logger.warning(
f"Found `{username}` in {site.name}, must be available"
print(
f"{Fore.YELLOW}[!] Found `{username}` in {site.name}, must be available{Style.RESET_ALL}"
)
self.logger.info(results_dict[site.name])
self.logger.warning(site.json)
changes["disabled"] = True
else:
print(f"{Fore.GREEN}[+] {username} is successfully checked: {status} in {site.name}{Style.RESET_ALL}")
self.logger.info(f"Site {site.name} checking is finished")
@@ -286,6 +288,10 @@ class Submitter:
a_minus_b = tokens_a.difference(tokens_b)
b_minus_a = tokens_b.difference(tokens_a)
# additional filtering by html response
a_minus_b = [t for t in a_minus_b if not t in non_exists_resp_text]
b_minus_a = [t for t in b_minus_a if not t in exists_resp_text]
if len(a_minus_b) == len(b_minus_a) == 0:
print("The pages for existing and non-existing account are the same!")
@@ -302,6 +308,8 @@ class Submitter:
:top_features_count
]
self.logger.debug([(keyword, match_fun(keyword)) for keyword in presence_list])
print("Detected text features of existing account: " + ", ".join(presence_list))
features = input("If features was not detected correctly, write it manually: ")
@@ -311,6 +319,8 @@ class Submitter:
absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[
:top_features_count
]
self.logger.debug([(keyword, match_fun(keyword)) for keyword in absence_list])
print(
"Detected text features of non-existing account: " + ", ".join(absence_list)
)
@@ -338,7 +348,6 @@ class Submitter:
async def add_site(self, site):
sem = asyncio.Semaphore(1)
print(f"{Fore.BLUE}{Style.BRIGHT}[*] Adding site {site.name}, let's check it...{Style.RESET_ALL}")
print(site.json)
result = await self.site_self_check(site, sem)
if result["disabled"]:
@@ -369,6 +378,7 @@ class Submitter:
print("0. finish editing")
print("10. reject and block domain")
print("11. invalid params, remove")
choice = input("\nSelect field number to edit (0-8): ").strip()
@@ -381,6 +391,12 @@ class Submitter:
"reason": "manual block",
}
if choice == '11':
return {
"valid": False,
"reason": "remove",
}
if choice in editable_fields:
field = editable_fields[choice]
current_value = getattr(site, field)
@@ -477,7 +493,7 @@ class Submitter:
if not found:
print(
f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
f"{Fore.RED}[!] The check for site '{chosen_site.name}' failed!{Style.RESET_ALL}"
)
print(
"Try to run this mode again and increase features count or choose others."
@@ -510,4 +526,9 @@ class Submitter:
site_data = chosen_site.strip_engine_data()
self.logger.debug(site_data.json)
self.db.update_site(site_data)
if self.args.db:
print(f"{Fore.GREEN}[+] Maigret DB is saved to {self.args.db}.{Style.RESET_ALL}")
self.db.save_to_file(self.args.db)
return True
-1
View File
@@ -3117,7 +3117,6 @@ Rank data fetched from Alexa by domains.
1. ![](https://www.google.com/s2/favicons?domain=https://www.stopstalk.com) [www.stopstalk.com (https://www.stopstalk.com)](https://www.stopstalk.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://www.polywork.com) [www.polywork.com (https://www.polywork.com)](https://www.polywork.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://oshwlab.com) [oshwlab.com (https://oshwlab.com)](https://oshwlab.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://www.xshaker.net) [www.xshaker.net (https://www.xshaker.net)](https://www.xshaker.net)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://chaturbator.su) [chaturbator.su (https://chaturbator.su)](https://chaturbator.su)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://imgflip.com) [imgflip.com (https://imgflip.com)](https://imgflip.com)*: top 100M*
1. ![](https://www.google.com/s2/favicons?domain=https://www.flickr.com) [www.flickr.com (https://www.flickr.com)](https://www.flickr.com)*: top 100M*
+3
View File
@@ -13,4 +13,7 @@ def test_tags_validity(default_db):
if tag not in tags:
unknown_tags.add(tag)
# make sure all tags are known
# if you see an "unchecked" tag error, run:
# maigret --db `pwd`/maigret/resources/data.json --self-check --tag unchecked --use-disabled-sites
assert unknown_tags == set()