Improved submit mode, several sites added

This commit is contained in:
Soxoj
2021-03-21 13:59:59 +03:00
parent b586a4cd06
commit 2bb01f7019
6 changed files with 2746 additions and 2864 deletions
+2 -1
View File
@@ -44,6 +44,7 @@ unsupported_characters = '#'
QueryDraft = Tuple[Callable, Any, Any] QueryDraft = Tuple[Callable, Any, Any]
QueriesDraft = Iterable[QueryDraft] QueriesDraft = Iterable[QueryDraft]
class AsyncExecutor: class AsyncExecutor:
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
self.logger = kwargs['logger'] self.logger = kwargs['logger']
@@ -472,7 +473,7 @@ async def maigret(username, site_dict, query_notify, logger,
headers.update(site.headers) headers.update(site.headers)
if not 'url' in site.__dict__: if 'url' not in site.__dict__:
logger.error('No URL for site %s', site.name) logger.error('No URL for site %s', site.name)
# URL of user on site (if it exists) # URL of user on site (if it exists)
url = site.url.format( url = site.url.format(
+7 -4
View File
@@ -1,19 +1,23 @@
""" """
Maigret main module Maigret main module
""" """
import aiohttp
import asyncio
import logging
import os import os
import sys
import platform import platform
from argparse import ArgumentParser, RawDescriptionHelpFormatter from argparse import ArgumentParser, RawDescriptionHelpFormatter
import requests import requests
from socid_extractor import parse, __version__ as socid_version from socid_extractor import extract, parse, __version__ as socid_version
from .checking import * from .checking import timeout_check, supported_recursive_search_ids, self_check, unsupported_characters, maigret
from .notify import QueryNotifyPrint from .notify import QueryNotifyPrint
from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \ from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \
generate_report_context, save_txt_report, SUPPORTED_JSON_REPORT_FORMATS, check_supported_json_format, \ generate_report_context, save_txt_report, SUPPORTED_JSON_REPORT_FORMATS, check_supported_json_format, \
save_json_report save_json_report
from .sites import MaigretDatabase
from .submit import submit_dialog from .submit import submit_dialog
__version__ = '0.1.15' __version__ = '0.1.15'
@@ -273,7 +277,6 @@ async def main():
# Make reports folder is not exists # Make reports folder is not exists
os.makedirs(args.folderoutput, exist_ok=True) os.makedirs(args.folderoutput, exist_ok=True)
report_path = args.folderoutput
# Define one report filename template # Define one report filename template
report_filepath_tpl = os.path.join(args.folderoutput, 'report_{username}{postfix}') report_filepath_tpl = os.path.join(args.folderoutput, 'report_{username}{postfix}')
+1751 -1950
View File
File diff suppressed because it is too large Load Diff
+2
View File
@@ -15,6 +15,7 @@ SUPPORTED_TAGS = [
'discussion', 'sharing', 'writing', 'wiki', 'business', 'shopping', 'sport', 'discussion', 'sharing', 'writing', 'wiki', 'business', 'shopping', 'sport',
'books', 'news', 'documents', 'travel', 'maps', 'hobby', 'apps', 'classified', 'books', 'news', 'documents', 'travel', 'maps', 'hobby', 'apps', 'classified',
'career', 'geosocial', 'streaming', 'education', 'networking', 'torrent', 'career', 'geosocial', 'streaming', 'education', 'networking', 'torrent',
'science', 'medicine',
] ]
@@ -154,6 +155,7 @@ class MaigretSite:
# remove dict keys # remove dict keys
if isinstance(engine_data[k], dict) and is_exists: if isinstance(engine_data[k], dict) and is_exists:
for f in engine_data[k].keys(): for f in engine_data[k].keys():
if f in self_copy.__dict__[field]:
del self_copy.__dict__[field][f] del self_copy.__dict__[field][f]
continue continue
# remove list items # remove list items
+80 -23
View File
@@ -1,13 +1,15 @@
import difflib import difflib
import requests import requests
from mock import Mock
from .checking import * from .checking import *
DESIRED_STRINGS = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography", DESIRED_STRINGS = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
"birthday", "репутация", "информация", "e-mail"] "birthday", "репутация", "информация", "e-mail"]
SUPPOSED_USERNAMES = ['alex', 'god', 'admin', 'red', 'blue', 'john']
RATIO = 0.6 RATIO = 0.6
TOP_FEATURES = 5 TOP_FEATURES = 5
URL_RE = re.compile(r'https?://(www\.)?') URL_RE = re.compile(r'https?://(www\.)?')
@@ -20,7 +22,7 @@ def get_match_ratio(x):
]), 2) ]), 2)
def extract_mainpage_url(url):
    """Return the main page (scheme and host) of *url*.

    ``'https://example.com/user/alex'`` becomes ``'https://example.com'``.
    A URL given without a scheme (``'example.com/user/alex'``) is reduced to
    its host (``'example.com'``) instead of being returned with its path.

    :param url: profile or page URL as a string.
    :return: the URL truncated after the host part.
    """
    parts = url.split('/', 3)
    if parts[0] and not parts[0].endswith(':'):
        # No "scheme:" prefix, so the first segment is already the host;
        # keeping three segments here would drag the path along.
        return parts[0]
    # "scheme:", "" and "host" are the first three '/'-separated segments.
    return '/'.join(parts[:3])
@@ -38,7 +40,6 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
logger.info(f'Checking {site.name}...') logger.info(f'Checking {site.name}...')
for username, status in check_data: for username, status in check_data:
async with semaphore:
results_dict = await maigret( results_dict = await maigret(
username, username,
{site.name: site}, {site.name: site},
@@ -84,18 +85,45 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
return changes return changes
async def detect_known_engine(db, url_exists, url_mainpage):
    """Try to recognise which known engine powers the site at *url_mainpage*.

    The main page is downloaded and checked against every engine in
    ``db.engines``: an engine matches when all of its ``presenseStrs`` occur
    in the response text. For the first matching engine one candidate
    ``MaigretSite`` is built per name in ``SUPPOSED_USERNAMES`` (used as the
    claimed username) and updated from the engine's data.

    :param db: database object providing ``engines`` and ``engines_dict``.
    :param url_exists: URL of an existing profile; not used by this function.
    :param url_mainpage: main page URL of the site, e.g. ``https://host``.
    :return: list of candidate sites for the first matching engine, or
        ``None`` if the page could not be fetched or no engine matched.
    """
    try:
        # NOTE: this is a blocking call inside a coroutine. The timeout keeps
        # an unresponsive host from hanging the submit dialog indefinitely.
        r = requests.get(url_mainpage, timeout=10)
    except Exception as error:
        print(error)
        print('Some error while checking main page')
        return None

    # Nothing to match against: falsy response object or empty body.
    if not (r and r.text):
        return None

    for engine in db.engines:
        strs_to_check = engine.__dict__.get('presenseStrs')
        if not strs_to_check:
            continue
        if not all(s in r.text for s in strs_to_check):
            continue

        engine_name = engine.__dict__.get('name')
        print(f'Detected engine {engine_name} for site {url_mainpage}')

        # Host part of the main page URL. Taking element [0] of the '//'
        # split would yield the scheme ("https:") rather than the host.
        site_name = url_mainpage.split('//', 1)[-1]

        sites = []
        for u in SUPPOSED_USERNAMES:
            site_data = {
                'urlMain': url_mainpage,
                'name': site_name,
                'engine': engine_name,
                'usernameClaimed': u,
                'usernameUnclaimed': 'noonewouldeverusethis7',
            }

            maigret_site = MaigretSite(url_mainpage.split('/')[-1], site_data)
            maigret_site.update_from_engine(db.engines_dict[engine_name])
            sites.append(maigret_site)

        # Only the first matching engine is used.
        return sites

    return None
async def check_features_manually(db, url_exists, url_mainpage, cookie_file):
url_parts = url_exists.split('/') url_parts = url_exists.split('/')
supposed_username = url_parts[-1] supposed_username = url_parts[-1]
new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ') new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
@@ -138,21 +166,40 @@ async def submit_dialog(db, url_exists, cookie_file):
if features: if features:
absence_list = features.split(',') absence_list = features.split(',')
url_main = extract_domain(url_exists)
site_data = { site_data = {
'absenceStrs': absence_list, 'absenceStrs': absence_list,
'presenseStrs': presence_list, 'presenseStrs': presence_list,
'url': url_user, 'url': url_user,
'urlMain': url_main, 'urlMain': url_mainpage,
'usernameClaimed': supposed_username, 'usernameClaimed': supposed_username,
'usernameUnclaimed': non_exist_username, 'usernameUnclaimed': non_exist_username,
'checkType': 'message', 'checkType': 'message',
} }
site = MaigretSite(url_main.split('/')[-1], site_data) site = MaigretSite(url_mainpage.split('/')[-1], site_data)
return site
print(site.__dict__) async def submit_dialog(db, url_exists, cookie_file):
domain_raw = URL_RE.sub('', url_exists).strip().strip('/')
domain_raw = domain_raw.split('/')[0]
# check for existence
matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
if matched_sites:
print(f'Sites with domain "{domain_raw}" already exists in the Maigret database!')
status = lambda s: '(disabled)' if s.disabled else ''
url_block = lambda s: f'\n\t{s.url_main}\n\t{s.url}'
print('\n'.join([f'{site.name} {status(site)}{url_block(site)}' for site in matched_sites]))
return False
url_mainpage = extract_mainpage_url(url_exists)
sites = await detect_known_engine(db, url_exists, url_mainpage)
if not sites:
print('Unable to detect site engine, lets generate checking features')
sites = [await check_features_manually(db, url_exists, url_mainpage, cookie_file)]
print(sites[0].__dict__)
sem = asyncio.Semaphore(1) sem = asyncio.Semaphore(1)
log_level = logging.INFO log_level = logging.INFO
@@ -164,14 +211,24 @@ async def submit_dialog(db, url_exists, cookie_file):
logger = logging.getLogger('site-submit') logger = logging.getLogger('site-submit')
logger.setLevel(log_level) logger.setLevel(log_level)
result = await site_self_check(site, logger, sem, db) found = False
chosen_site = None
for s in sites:
chosen_site = s
result = await site_self_check(s, logger, sem, db)
if not result['disabled']:
found = True
break
if result['disabled']: if not found:
print(f'Sorry, we couldn\'t find params to detect account presence/absence in {site.name}.') print(f'Sorry, we couldn\'t find params to detect account presence/absence in {chosen_site.name}.')
print('Try to run this mode again and increase features count or choose others.') print('Try to run this mode again and increase features count or choose others.')
else: else:
if input(f'Site {site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] ').lower() in 'y': if input(f'Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] ').lower() in 'y':
db.update_site(site) print(chosen_site.json)
site_data = chosen_site.strip_engine_data()
print(site_data.json)
db.update_site(site_data)
return True return True
return False return False
+886 -868
View File
File diff suppressed because it is too large Load Diff