mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-07 06:24:35 +00:00
Experimental site submit mode
This commit is contained in:
@@ -0,0 +1,161 @@
|
||||
import difflib
|
||||
|
||||
import requests
|
||||
from mock import Mock
|
||||
|
||||
from .checking import *
|
||||
|
||||
# Keywords that page tokens are compared against (see get_match_ratio) when
# ranking candidate presence/absence features of a profile page.
DESIRED_STRINGS = [
    "username",
    "not found",
    "пользователь",
    "profile",
    "lastname",
    "firstname",
    "biography",
    "birthday",
    "репутация",
    "информация",
    "e-mail",
]

# NOTE(review): not referenced anywhere in the visible code; presumably a
# similarity threshold for match ratios -- confirm before relying on it.
RATIO = 0.6

# Default number of top-ranked tokens offered as features in submit_dialog.
TOP_FEATURES = 5
|
||||
|
||||
|
||||
def get_match_ratio(x):
    """Return the best similarity of *x* to any entry of DESIRED_STRINGS.

    *x* is lowercased, compared against every keyword with
    ``difflib.SequenceMatcher``, and the highest ratio is returned rounded
    to two decimal places.
    """
    lowered = x.lower()
    best = max(
        difflib.SequenceMatcher(a=lowered, b=keyword).ratio()
        for keyword in DESIRED_STRINGS
    )
    return round(best, 2)
|
||||
|
||||
|
||||
def extract_domain(url):
    """Return the ``scheme://host`` prefix of *url*, dropping path and beyond.

    ``'https://example.com/user/name'`` -> ``'https://example.com'``.
    Protocol-relative URLs keep their leading ``//``.  A URL given without a
    scheme (``'example.com/user'``) yields just the host part instead of the
    whole string, so callers deriving a site name from the result get the
    domain rather than the last path segment.
    """
    head, sep, tail = url.partition('//')
    # Treat '//' as the scheme separator only when what precedes it looks
    # like a scheme ('https:') or is empty; a '//' that occurs later in the
    # path or query must not be mistaken for it.
    if sep and '/' not in head and (not head or head.endswith(':')):
        return head + sep + tail.split('/', 1)[0]
    # No scheme: the host is everything before the first slash.
    return url.split('/', 1)[0]
|
||||
|
||||
|
||||
async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
    """Check that *site* detects its claimed and unclaimed usernames correctly.

    Runs ``maigret`` once for ``site.username_claimed`` (expected status
    ``QueryStatus.CLAIMED``) and once for ``site.username_unclaimed``
    (expected ``QueryStatus.AVAILABLE``), each run guarded by *semaphore*.

    Returns a dict ``{'disabled': bool}``; the flag is True when any of the
    checks indicates the site definition does not work.  *db* and *silent*
    are accepted but not used by this function.
    """
    notifier = Mock()
    disabled = False

    expectations = [
        (site.username_claimed, QueryStatus.CLAIMED),
        (site.username_unclaimed, QueryStatus.AVAILABLE),
    ]

    logger.info(f'Checking {site.name}...')

    for username, expected in expectations:
        async with semaphore:
            found = await maigret(
                username,
                {site.name: site},
                notifier,
                logger,
                timeout=30,
                id_type=site.type,
                forced=True,
                no_progressbar=True,
            )

            # A site missing from the results is treated as a failed check.
            # TODO: make normal checking
            if site.name not in found:
                logger.info(found)
                disabled = True
                continue

            check = found[site.name]['status']

        actual = check.status

        # Nothing to report when the site answered as expected.
        if actual == expected:
            continue

        if actual == QueryStatus.UNKNOWN:
            absence_markers = site.absence_strs
            check_type = site.check_type
            logger.warning(
                f'Error while searching {username} in {site.name}: {check.context}, {absence_markers}, type {check_type}')
            # An unknown status only disables the site for the claimed username.
            if expected == QueryStatus.CLAIMED:
                disabled = True
            continue

        if expected == QueryStatus.CLAIMED:
            logger.warning(f'Not found `{username}` in {site.name}, must be claimed')
        else:
            logger.warning(f'Found `{username}` in {site.name}, must be available')
        logger.info(found[site.name])
        disabled = True

    logger.info(f'Site {site.name} checking is finished')

    return {'disabled': disabled}
|
||||
|
||||
|
||||
async def submit_dialog(db, url_exists):
    """Interactively build a site entry from a profile URL and offer to save it.

    The user confirms the username found in *url_exists*; the page is then
    fetched for that username and for a made-up one, the two responses are
    split on double quotes, and the tokens unique to each response are
    ranked with ``get_match_ratio`` to propose presence/absence features.
    The resulting ``MaigretSite`` is verified with ``site_self_check``.

    Args:
        db: database object; ``db.update_site(site)`` is called on success.
        url_exists: URL of an existing account page on the site to add.

    Returns:
        True if the site passed the check and the user chose to save it,
        False otherwise.
    """
    # Strip trailing slashes before guessing: otherwise the guess would be
    # empty and str.replace('') below would corrupt the URL templates.
    supposed_username = url_exists.rstrip('/').split('/')[-1]
    new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
    if new_name:
        supposed_username = new_name
    non_exist_username = 'noonewouldeverusethis7'

    url_user = url_exists.replace(supposed_username, '{username}')
    url_not_exists = url_exists.replace(supposed_username, non_exist_username)

    a = requests.get(url_exists).text
    b = requests.get(url_not_exists).text

    tokens_a = set(a.split('"'))
    tokens_b = set(b.split('"'))

    a_minus_b = tokens_a.difference(tokens_b)
    b_minus_a = tokens_b.difference(tokens_a)

    # An empty answer means "use the default"; int('') would raise, so the
    # raw text is inspected before conversion.
    raw_count = input(f'Specify count of features to extract [default {TOP_FEATURES}]: ').strip()
    top_features_count = TOP_FEATURES
    if raw_count:
        try:
            top_features_count = int(raw_count)
        except ValueError:
            print(f'"{raw_count}" is not a number, using default {TOP_FEATURES}.')
            top_features_count = TOP_FEATURES
    # Zero or a negative count makes no sense as a slice length.
    if top_features_count < 1:
        top_features_count = TOP_FEATURES

    presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[:top_features_count]

    print('Detected text features of existing account: ' + ', '.join(presence_list))
    features = input('If features was not detected correctly, write it manually: ')

    if features:
        presence_list = features.split(',')

    absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[:top_features_count]
    print('Detected text features of non-existing account: ' + ', '.join(absence_list))
    features = input('If features was not detected correctly, write it manually: ')

    if features:
        absence_list = features.split(',')

    url_main = extract_domain(url_exists)

    site_data = {
        'absenceStrs': absence_list,
        'presenseStrs': presence_list,
        'url': url_user,
        'urlMain': url_main,
        'usernameClaimed': supposed_username,
        'usernameUnclaimed': non_exist_username,
        'checkType': 'message',
    }

    # The site name is the last component of the main URL.
    site = MaigretSite(url_main.split('/')[-1], site_data)

    print(site.__dict__)

    # A single-slot semaphore: the two self-check requests run one at a time.
    sem = asyncio.Semaphore(1)
    log_level = logging.INFO
    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level
    )
    logger = logging.getLogger('site-submit')
    logger.setLevel(log_level)

    result = await site_self_check(site, logger, sem, db)

    if result['disabled']:
        print(f'Sorry, we couldn\'t find params to detect account presence/absence in {site.name}.')
        print('Try to run this mode again and increase features count or choose others.')
    else:
        # Substring test: an empty answer (just Enter) also counts as "yes".
        if input(f'Site {site.name} successfully checked. Do you want to save it in the Maigret DB? [yY] ') in 'yY':
            db.update_site(site)
            return True

    return False
|
||||
Reference in New Issue
Block a user