mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-07 06:24:35 +00:00
Improved submit mode, several sites added
This commit is contained in:
+2
-1
@@ -44,6 +44,7 @@ unsupported_characters = '#'
|
|||||||
QueryDraft = Tuple[Callable, Any, Any]
|
QueryDraft = Tuple[Callable, Any, Any]
|
||||||
QueriesDraft = Iterable[QueryDraft]
|
QueriesDraft = Iterable[QueryDraft]
|
||||||
|
|
||||||
|
|
||||||
class AsyncExecutor:
|
class AsyncExecutor:
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self.logger = kwargs['logger']
|
self.logger = kwargs['logger']
|
||||||
@@ -472,7 +473,7 @@ async def maigret(username, site_dict, query_notify, logger,
|
|||||||
|
|
||||||
headers.update(site.headers)
|
headers.update(site.headers)
|
||||||
|
|
||||||
if not 'url' in site.__dict__:
|
if 'url' not in site.__dict__:
|
||||||
logger.error('No URL for site %s', site.name)
|
logger.error('No URL for site %s', site.name)
|
||||||
# URL of user on site (if it exists)
|
# URL of user on site (if it exists)
|
||||||
url = site.url.format(
|
url = site.url.format(
|
||||||
|
|||||||
+7
-4
@@ -1,19 +1,23 @@
|
|||||||
"""
|
"""
|
||||||
Maigret main module
|
Maigret main module
|
||||||
"""
|
"""
|
||||||
|
import aiohttp
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
import platform
|
import platform
|
||||||
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from socid_extractor import parse, __version__ as socid_version
|
from socid_extractor import extract, parse, __version__ as socid_version
|
||||||
|
|
||||||
from .checking import *
|
from .checking import timeout_check, supported_recursive_search_ids, self_check, unsupported_characters, maigret
|
||||||
from .notify import QueryNotifyPrint
|
from .notify import QueryNotifyPrint
|
||||||
from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \
|
from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \
|
||||||
generate_report_context, save_txt_report, SUPPORTED_JSON_REPORT_FORMATS, check_supported_json_format, \
|
generate_report_context, save_txt_report, SUPPORTED_JSON_REPORT_FORMATS, check_supported_json_format, \
|
||||||
save_json_report
|
save_json_report
|
||||||
|
from .sites import MaigretDatabase
|
||||||
from .submit import submit_dialog
|
from .submit import submit_dialog
|
||||||
|
|
||||||
__version__ = '0.1.15'
|
__version__ = '0.1.15'
|
||||||
@@ -273,7 +277,6 @@ async def main():
|
|||||||
|
|
||||||
# Make reports folder is not exists
|
# Make reports folder is not exists
|
||||||
os.makedirs(args.folderoutput, exist_ok=True)
|
os.makedirs(args.folderoutput, exist_ok=True)
|
||||||
report_path = args.folderoutput
|
|
||||||
|
|
||||||
# Define one report filename template
|
# Define one report filename template
|
||||||
report_filepath_tpl = os.path.join(args.folderoutput, 'report_{username}{postfix}')
|
report_filepath_tpl = os.path.join(args.folderoutput, 'report_{username}{postfix}')
|
||||||
|
|||||||
+1751
-1950
File diff suppressed because it is too large
Load Diff
@@ -15,6 +15,7 @@ SUPPORTED_TAGS = [
|
|||||||
'discussion', 'sharing', 'writing', 'wiki', 'business', 'shopping', 'sport',
|
'discussion', 'sharing', 'writing', 'wiki', 'business', 'shopping', 'sport',
|
||||||
'books', 'news', 'documents', 'travel', 'maps', 'hobby', 'apps', 'classified',
|
'books', 'news', 'documents', 'travel', 'maps', 'hobby', 'apps', 'classified',
|
||||||
'career', 'geosocial', 'streaming', 'education', 'networking', 'torrent',
|
'career', 'geosocial', 'streaming', 'education', 'networking', 'torrent',
|
||||||
|
'science', 'medicine',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -154,6 +155,7 @@ class MaigretSite:
|
|||||||
# remove dict keys
|
# remove dict keys
|
||||||
if isinstance(engine_data[k], dict) and is_exists:
|
if isinstance(engine_data[k], dict) and is_exists:
|
||||||
for f in engine_data[k].keys():
|
for f in engine_data[k].keys():
|
||||||
|
if f in self_copy.__dict__[field]:
|
||||||
del self_copy.__dict__[field][f]
|
del self_copy.__dict__[field][f]
|
||||||
continue
|
continue
|
||||||
# remove list items
|
# remove list items
|
||||||
|
|||||||
+80
-23
@@ -1,13 +1,15 @@
|
|||||||
import difflib
|
import difflib
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from mock import Mock
|
|
||||||
|
|
||||||
from .checking import *
|
from .checking import *
|
||||||
|
|
||||||
|
|
||||||
DESIRED_STRINGS = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
|
DESIRED_STRINGS = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
|
||||||
"birthday", "репутация", "информация", "e-mail"]
|
"birthday", "репутация", "информация", "e-mail"]
|
||||||
|
|
||||||
|
SUPPOSED_USERNAMES = ['alex', 'god', 'admin', 'red', 'blue', 'john']
|
||||||
|
|
||||||
RATIO = 0.6
|
RATIO = 0.6
|
||||||
TOP_FEATURES = 5
|
TOP_FEATURES = 5
|
||||||
URL_RE = re.compile(r'https?://(www\.)?')
|
URL_RE = re.compile(r'https?://(www\.)?')
|
||||||
@@ -20,7 +22,7 @@ def get_match_ratio(x):
|
|||||||
]), 2)
|
]), 2)
|
||||||
|
|
||||||
|
|
||||||
def extract_domain(url):
|
def extract_mainpage_url(url):
|
||||||
return '/'.join(url.split('/', 3)[:3])
|
return '/'.join(url.split('/', 3)[:3])
|
||||||
|
|
||||||
|
|
||||||
@@ -38,7 +40,6 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
|
|||||||
logger.info(f'Checking {site.name}...')
|
logger.info(f'Checking {site.name}...')
|
||||||
|
|
||||||
for username, status in check_data:
|
for username, status in check_data:
|
||||||
async with semaphore:
|
|
||||||
results_dict = await maigret(
|
results_dict = await maigret(
|
||||||
username,
|
username,
|
||||||
{site.name: site},
|
{site.name: site},
|
||||||
@@ -84,18 +85,45 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
|
|||||||
return changes
|
return changes
|
||||||
|
|
||||||
|
|
||||||
async def submit_dialog(db, url_exists, cookie_file):
|
async def detect_known_engine(db, url_exists, url_mainpage):
|
||||||
domain_raw = URL_RE.sub('', url_exists).strip().strip('/')
|
try:
|
||||||
domain_raw = domain_raw.split('/')[0]
|
r = requests.get(url_mainpage)
|
||||||
|
except Exception as e:
|
||||||
|
print(e)
|
||||||
|
print('Some error while checking main page')
|
||||||
|
return None
|
||||||
|
|
||||||
matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
|
for e in db.engines:
|
||||||
if matched_sites:
|
strs_to_check = e.__dict__.get('presenseStrs')
|
||||||
print(f'Sites with domain "{domain_raw}" already exists in the Maigret database!')
|
if strs_to_check and r and r.text:
|
||||||
status = lambda s: '(disabled)' if s.disabled else ''
|
all_strs_in_response = True
|
||||||
url_block = lambda s: f'\n\t{s.url_main}\n\t{s.url}'
|
for s in strs_to_check:
|
||||||
print('\n'.join([f'{site.name} {status(site)}{url_block(site)}' for site in matched_sites]))
|
if not s in r.text:
|
||||||
return False
|
all_strs_in_response = False
|
||||||
|
if all_strs_in_response:
|
||||||
|
engine_name = e.__dict__.get('name')
|
||||||
|
print(f'Detected engine {engine_name} for site {url_mainpage}')
|
||||||
|
|
||||||
|
sites = []
|
||||||
|
for u in SUPPOSED_USERNAMES:
|
||||||
|
site_data = {
|
||||||
|
'urlMain': url_mainpage,
|
||||||
|
'name': url_mainpage.split('//')[0],
|
||||||
|
'engine': engine_name,
|
||||||
|
'usernameClaimed': u,
|
||||||
|
'usernameUnclaimed': 'noonewouldeverusethis7',
|
||||||
|
}
|
||||||
|
|
||||||
|
maigret_site = MaigretSite(url_mainpage.split('/')[-1], site_data)
|
||||||
|
maigret_site.update_from_engine(db.engines_dict[engine_name])
|
||||||
|
sites.append(maigret_site)
|
||||||
|
|
||||||
|
return sites
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def check_features_manually(db, url_exists, url_mainpage, cookie_file):
|
||||||
url_parts = url_exists.split('/')
|
url_parts = url_exists.split('/')
|
||||||
supposed_username = url_parts[-1]
|
supposed_username = url_parts[-1]
|
||||||
new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
|
new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
|
||||||
@@ -138,21 +166,40 @@ async def submit_dialog(db, url_exists, cookie_file):
|
|||||||
if features:
|
if features:
|
||||||
absence_list = features.split(',')
|
absence_list = features.split(',')
|
||||||
|
|
||||||
url_main = extract_domain(url_exists)
|
|
||||||
|
|
||||||
site_data = {
|
site_data = {
|
||||||
'absenceStrs': absence_list,
|
'absenceStrs': absence_list,
|
||||||
'presenseStrs': presence_list,
|
'presenseStrs': presence_list,
|
||||||
'url': url_user,
|
'url': url_user,
|
||||||
'urlMain': url_main,
|
'urlMain': url_mainpage,
|
||||||
'usernameClaimed': supposed_username,
|
'usernameClaimed': supposed_username,
|
||||||
'usernameUnclaimed': non_exist_username,
|
'usernameUnclaimed': non_exist_username,
|
||||||
'checkType': 'message',
|
'checkType': 'message',
|
||||||
}
|
}
|
||||||
|
|
||||||
site = MaigretSite(url_main.split('/')[-1], site_data)
|
site = MaigretSite(url_mainpage.split('/')[-1], site_data)
|
||||||
|
return site
|
||||||
|
|
||||||
print(site.__dict__)
|
async def submit_dialog(db, url_exists, cookie_file):
|
||||||
|
domain_raw = URL_RE.sub('', url_exists).strip().strip('/')
|
||||||
|
domain_raw = domain_raw.split('/')[0]
|
||||||
|
|
||||||
|
# check for existence
|
||||||
|
matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
|
||||||
|
if matched_sites:
|
||||||
|
print(f'Sites with domain "{domain_raw}" already exists in the Maigret database!')
|
||||||
|
status = lambda s: '(disabled)' if s.disabled else ''
|
||||||
|
url_block = lambda s: f'\n\t{s.url_main}\n\t{s.url}'
|
||||||
|
print('\n'.join([f'{site.name} {status(site)}{url_block(site)}' for site in matched_sites]))
|
||||||
|
return False
|
||||||
|
|
||||||
|
url_mainpage = extract_mainpage_url(url_exists)
|
||||||
|
|
||||||
|
sites = await detect_known_engine(db, url_exists, url_mainpage)
|
||||||
|
if not sites:
|
||||||
|
print('Unable to detect site engine, lets generate checking features')
|
||||||
|
sites = [await check_features_manually(db, url_exists, url_mainpage, cookie_file)]
|
||||||
|
|
||||||
|
print(sites[0].__dict__)
|
||||||
|
|
||||||
sem = asyncio.Semaphore(1)
|
sem = asyncio.Semaphore(1)
|
||||||
log_level = logging.INFO
|
log_level = logging.INFO
|
||||||
@@ -164,14 +211,24 @@ async def submit_dialog(db, url_exists, cookie_file):
|
|||||||
logger = logging.getLogger('site-submit')
|
logger = logging.getLogger('site-submit')
|
||||||
logger.setLevel(log_level)
|
logger.setLevel(log_level)
|
||||||
|
|
||||||
result = await site_self_check(site, logger, sem, db)
|
found = False
|
||||||
|
chosen_site = None
|
||||||
|
for s in sites:
|
||||||
|
chosen_site = s
|
||||||
|
result = await site_self_check(s, logger, sem, db)
|
||||||
|
if not result['disabled']:
|
||||||
|
found = True
|
||||||
|
break
|
||||||
|
|
||||||
if result['disabled']:
|
if not found:
|
||||||
print(f'Sorry, we couldn\'t find params to detect account presence/absence in {site.name}.')
|
print(f'Sorry, we couldn\'t find params to detect account presence/absence in {chosen_site.name}.')
|
||||||
print('Try to run this mode again and increase features count or choose others.')
|
print('Try to run this mode again and increase features count or choose others.')
|
||||||
else:
|
else:
|
||||||
if input(f'Site {site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] ').lower() in 'y':
|
if input(f'Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] ').lower() in 'y':
|
||||||
db.update_site(site)
|
print(chosen_site.json)
|
||||||
|
site_data = chosen_site.strip_engine_data()
|
||||||
|
print(site_data.json)
|
||||||
|
db.update_site(site_data)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|||||||
Reference in New Issue
Block a user