Refactoring and linting, added notifications about frequent search errors

Soxoj
2021-04-30 12:03:13 +03:00
parent bfaf276f6e
commit bfa6afac32
20 changed files with 1351 additions and 787 deletions
Executable
+5
@@ -0,0 +1,5 @@
#!/bin/sh
FILES="maigret wizard.py maigret.py"
echo 'black'
black --skip-string-normalization $FILES
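A note on the flag above: black's `--skip-string-normalization` (short form `-S`) keeps whatever quote style a string already has, while default black rewrites single-quoted strings to double quotes. An illustrative snippet, not part of the commit:

# With --skip-string-normalization, black leaves both lines unchanged;
# with default settings it rewrites the first one to double quotes,
# which is the kind of quote churn visible in several diffs in this commit.
name = 'single-quoted'  # preserved by -S / --skip-string-normalization
path = "double-quoted"  # black's default target style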
Executable
+11
@@ -0,0 +1,11 @@
#!/bin/sh
FILES="maigret wizard.py maigret.py"
echo 'syntax errors or undefined names'
flake8 --count --select=E9,F63,F7,F82 --show-source --statistics $FILES
echo 'warning'
flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503 $FILES
echo 'mypy'
mypy ./maigret
+25 -23
@@ -9,46 +9,48 @@ class ParsingActivator:
    @staticmethod
    def twitter(site, logger, cookies={}):
        headers = dict(site.headers)
        del headers["x-guest-token"]
        r = requests.post(site.activation["url"], headers=headers)
        logger.info(r)
        j = r.json()
        guest_token = j[site.activation["src"]]
        site.headers["x-guest-token"] = guest_token

    @staticmethod
    def vimeo(site, logger, cookies={}):
        headers = dict(site.headers)
        if "Authorization" in headers:
            del headers["Authorization"]
        r = requests.get(site.activation["url"], headers=headers)
        jwt_token = r.json()["jwt"]
        site.headers["Authorization"] = "jwt " + jwt_token

    @staticmethod
    def spotify(site, logger, cookies={}):
        headers = dict(site.headers)
        if "Authorization" in headers:
            del headers["Authorization"]
        r = requests.get(site.activation["url"])
        bearer_token = r.json()["accessToken"]
        site.headers["authorization"] = f"Bearer {bearer_token}"

    @staticmethod
    def xssis(site, logger, cookies={}):
        if not cookies:
            logger.debug("You must have cookies to activate xss.is parsing!")
            return
        headers = dict(site.headers)
        post_data = {
            "_xfResponseType": "json",
            "_xfToken": "1611177919,a2710362e45dad9aa1da381e21941a38",
        }
        headers["content-type"] = "application/x-www-form-urlencoded; charset=UTF-8"
        r = requests.post(
            site.activation["url"], headers=headers, cookies=cookies, data=post_data
        )
        csrf = r.json()["csrf"]
        site.get_params["_xfToken"] = csrf
@@ -62,8 +64,8 @@ async def import_aiohttp_cookies(cookiestxt_filename):
        for key, cookie in list(domain.values())[0].items():
            c = Morsel()
            c.set(key, cookie.value, cookie.value)
            c["domain"] = cookie.domain
            c["path"] = cookie.path
            cookies_list.append((key, c))

    cookies.update_cookies(cookies_list)
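For orientation, this is how the activation methods above get invoked: checking.py (next file) resolves `site.activation['method']` to a `ParsingActivator` static method by name. A hypothetical sketch of that dispatch, assuming the package is importable as `maigret`:

# Hypothetical sketch of the dispatch done in checking.py below:
# the site's activation config names a ParsingActivator static method.
from maigret.activation import ParsingActivator

method = "twitter"  # would come from site.activation['method']
activate_fun = getattr(ParsingActivator(), method)
# activate_fun(site, logger) would then refresh site.headers['x-guest-token']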
+208 -162
@@ -5,135 +5,138 @@ import re
import ssl
import sys
import tqdm
from typing import Tuple, Optional

import aiohttp
import tqdm.asyncio
from aiohttp_socks import ProxyConnector
from python_socks import _errors as proxy_errors
from socid_extractor import extract

from .activation import ParsingActivator, import_aiohttp_cookies
from . import errors
from .errors import CheckError
from .executors import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor
from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite
from .utils import get_random_user_agent

supported_recursive_search_ids = (
    "yandex_public_id",
    "gaia_id",
    "vk_id",
    "ok_id",
    "wikimapia_uid",
    "steam_id",
    "uidme_uguid",
)

unsupported_characters = "#"


async def get_response(
    request_future, site_name, logger
) -> Tuple[str, int, Optional[CheckError]]:
    html_text = None
    status_code = 0
    error: Optional[CheckError] = CheckError("Error")

    try:
        response = await request_future

        status_code = response.status
        response_content = await response.content.read()
        charset = response.charset or "utf-8"
        decoded_content = response_content.decode(charset, "ignore")
        html_text = decoded_content

        if status_code == 0:
            error = CheckError("Connection lost")
        else:
            error = None

        logger.debug(html_text)

    except asyncio.TimeoutError as e:
        error = CheckError("Request timeout", str(e))
    except aiohttp.client_exceptions.ClientConnectorError as e:
        error = CheckError("Connecting failure", str(e))
    except aiohttp.http_exceptions.BadHttpMessage as e:
        error = CheckError("HTTP", str(e))
    except proxy_errors.ProxyError as e:
        error = CheckError("Proxy", str(e))
    except KeyboardInterrupt:
        error = CheckError("Interrupted")
    except Exception as e:
        # python-specific exceptions
        if sys.version_info.minor > 6:
            if isinstance(e, ssl.SSLCertVerificationError) or isinstance(
                e, ssl.SSLError
            ):
                error = CheckError("SSL", str(e))
        else:
            logger.warning(f"Unhandled error while requesting {site_name}: {e}")
            logger.debug(e, exc_info=True)
            error = CheckError("Error", str(e))

    # TODO: return only needed information
    return str(html_text), status_code, error


async def update_site_dict_from_response(
    sitename, site_dict, results_info, logger, query_notify
):
    site_obj = site_dict[sitename]
    future = site_obj.request_future
    if not future:
        # ignore: search by incompatible id type
        return

    response = await get_response(
        request_future=future, site_name=sitename, logger=logger
    )

    return sitename, process_site_result(
        response, query_notify, logger, results_info, site_obj
    )


# TODO: move to separate class
def detect_error_page(
    html_text, status_code, fail_flags, ignore_403
) -> Optional[CheckError]:
    # Detect service restrictions such as a country restriction
    for flag, msg in fail_flags.items():
        if flag in html_text:
            return CheckError("Site-specific", msg)

    # Detect common restrictions such as provider censorship and bot protection
    err = errors.detect(html_text)
    if err:
        return err

    # Detect common site errors
    if status_code == 403 and not ignore_403:
        return CheckError("Access denied", "403 status code, use proxy/vpn")
    elif status_code >= 500:
        return CheckError("Server", f"{status_code} status code")

    return None


def process_site_result(
    response, query_notify, logger, results_info, site: MaigretSite
):
    if not response:
        return results_info

    fulltags = site.tags

    # Retrieve other site information again
    username = results_info["username"]
    is_parsing_enabled = results_info["parsing_enabled"]
    url = results_info.get("url_user")
    logger.debug(url)
@@ -147,7 +150,7 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
    # TODO: refactor
    if not response:
        logger.error(f"No response for {site.name}")
        return results_info

    html_text, status_code, check_error = response
@@ -156,28 +159,34 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
    response_time = None

    if logger.level == logging.DEBUG:
        with open("debug.txt", "a") as f:
            status = status_code or "No response"
            f.write(f"url: {url}\nerror: {check_error}\nr: {status}\n")
            if html_text:
                f.write(f"code: {status}\nresponse: {str(html_text)}\n")

    # additional check for errors
    if status_code and not check_error:
        check_error = detect_error_page(
            html_text, status_code, site.errors, site.ignore403
        )

    if site.activation and html_text:
        is_need_activation = any(
            [s for s in site.activation["marks"] if s in html_text]
        )

        if is_need_activation:
            method = site.activation["method"]
            try:
                activate_fun = getattr(ParsingActivator(), method)
                # TODO: async call
                activate_fun(site, logger)
            except AttributeError:
                logger.warning(
                    f"Activation method {method} for site {site.name} not found!"
                )
            except Exception as e:
                logger.warning(f"Failed activation {method} for site {site.name}: {e}")

    site_name = site.pretty_name
    # presense flags
@@ -187,56 +196,75 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
    if html_text:
        if not presense_flags:
            is_presense_detected = True
            site.stats["presense_flag"] = None
        else:
            for presense_flag in presense_flags:
                if presense_flag in html_text:
                    is_presense_detected = True
                    site.stats["presense_flag"] = presense_flag
                    logger.debug(presense_flag)
                    break

    if check_error:
        logger.debug(check_error)
        result = QueryResult(
            username,
            site_name,
            url,
            QueryStatus.UNKNOWN,
            query_time=response_time,
            error=check_error,
            context=str(CheckError),
            tags=fulltags,
        )
    elif check_type == "message":
        absence_flags = site.absence_strs
        is_absence_flags_list = isinstance(absence_flags, list)
        absence_flags_set = (
            set(absence_flags) if is_absence_flags_list else {absence_flags}
        )
        # Checks if the error message is in the HTML
        is_absence_detected = any(
            [(absence_flag in html_text) for absence_flag in absence_flags_set]
        )
        if not is_absence_detected and is_presense_detected:
            result = QueryResult(
                username,
                site_name,
                url,
                QueryStatus.CLAIMED,
                query_time=response_time,
                tags=fulltags,
            )
        else:
            result = QueryResult(
                username,
                site_name,
                url,
                QueryStatus.AVAILABLE,
                query_time=response_time,
                tags=fulltags,
            )
    elif check_type == "status_code":
        # Checks if the status code of the response is 2XX
        if (not status_code >= 300 or status_code < 200) and is_presense_detected:
            result = QueryResult(
                username,
                site_name,
                url,
                QueryStatus.CLAIMED,
                query_time=response_time,
                tags=fulltags,
            )
        else:
            result = QueryResult(
                username,
                site_name,
                url,
                QueryStatus.AVAILABLE,
                query_time=response_time,
                tags=fulltags,
            )
    elif check_type == "response_url":
        # For this detection method, we have turned off the redirect.
        # So, there is no need to check the response URL: it will always
@@ -244,21 +272,28 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
        # code indicates that the request was successful (i.e. no 404, or
        # forward to some odd redirect).
        if 200 <= status_code < 300 and is_presense_detected:
            result = QueryResult(
                username,
                site_name,
                url,
                QueryStatus.CLAIMED,
                query_time=response_time,
                tags=fulltags,
            )
        else:
            result = QueryResult(
                username,
                site_name,
                url,
                QueryStatus.AVAILABLE,
                query_time=response_time,
                tags=fulltags,
            )
    else:
        # It should be impossible to ever get here...
        raise ValueError(
            f"Unknown check type '{check_type}' for " f"site '{site.name}'"
        )

    extracted_ids_data = {}
@@ -266,39 +301,49 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
        try:
            extracted_ids_data = extract(html_text)
        except Exception as e:
            logger.warning(f"Error while parsing {site.name}: {e}", exc_info=True)

        if extracted_ids_data:
            new_usernames = {}
            for k, v in extracted_ids_data.items():
                if "username" in k:
                    new_usernames[v] = "username"
                if k in supported_recursive_search_ids:
                    new_usernames[v] = k

            results_info["ids_usernames"] = new_usernames
            results_info["ids_links"] = eval(extracted_ids_data.get("links", "[]"))
            result.ids_data = extracted_ids_data

    # Notify caller about results of query.
    query_notify.update(result, site.similar_search)

    # Save status of request
    results_info["status"] = result

    # Save results from request
    results_info["http_status"] = status_code
    results_info["is_similar"] = site.similar_search
    # results_site['response_text'] = html_text
    results_info["rank"] = site.alexa_rank
    return results_info


async def maigret(
    username,
    site_dict,
    logger,
    query_notify=None,
    proxy=None,
    timeout=None,
    is_parsing_enabled=False,
    id_type="username",
    debug=False,
    forced=False,
    max_connections=100,
    no_progressbar=False,
    cookies=None,
):
    """Main search func

    Checks for existence of username on certain sites.
@@ -342,24 +387,28 @@ async def maigret(username, site_dict, logger, query_notify=None,
    query_notify.start(username, id_type)

    # TODO: connector
    connector = (
        ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
    )
    # connector = aiohttp.TCPConnector(ssl=False)
    connector.verify_ssl = False

    cookie_jar = None
    if cookies:
        logger.debug(f"Using cookies jar file {cookies}")
        cookie_jar = await import_aiohttp_cookies(cookies)

    session = aiohttp.ClientSession(
        connector=connector, trust_env=True, cookie_jar=cookie_jar
    )

    if logger.level == logging.DEBUG:
        future = session.get(url="https://icanhazip.com")
        ip, status, check_error = await get_response(future, None, logger)
        if ip:
            logger.debug(f"My IP is: {ip.strip()}")
        else:
            logger.debug(f"IP requesting {check_error[0]}: {check_error[1]}")

    # Results from analysis of all sites
    results_total = {}
@@ -371,46 +420,45 @@ async def maigret(username, site_dict, logger, query_notify=None,
            continue

        if site.disabled and not forced:
            logger.debug(f"Site {site.name} is disabled, skipping...")
            continue

        # Results from analysis of this specific site
        results_site = {}

        # Record URL of main site and username
        results_site["username"] = username
        results_site["parsing_enabled"] = is_parsing_enabled
        results_site["url_main"] = site.url_main
        results_site["cookies"] = (
            cookie_jar and cookie_jar.filter_cookies(site.url_main) or None
        )

        headers = {
            "User-Agent": get_random_user_agent(),
        }

        headers.update(site.headers)

        if "url" not in site.__dict__:
            logger.error("No URL for site %s", site.name)

        # URL of user on site (if it exists)
        url = site.url.format(
            urlMain=site.url_main, urlSubpath=site.url_subpath, username=username
        )

        # workaround to prevent slash errors
        url = re.sub("(?<!:)/+", "/", url)

        # Don't make request if username is invalid for the site
        if site.regex_check and re.search(site.regex_check, username) is None:
            # No need to do the check at the site: this user name is not allowed.
            results_site["status"] = QueryResult(
                username, site_name, url, QueryStatus.ILLEGAL
            )
            results_site["url_user"] = ""
            results_site["http_status"] = ""
            results_site["response_text"] = ""
            query_notify.update(results_site["status"])
        else:
            # URL of user on site (if it exists)
            results_site["url_user"] = url
@@ -428,9 +476,9 @@ async def maigret(username, site_dict, logger, query_notify=None,
                )
            for k, v in site.get_params.items():
                url_probe += f"&{k}={v}"

            if site.check_type == "status_code" and site.request_head_only:
                # In most cases when we are detecting by status code,
                # it is not necessary to get the entire body: we can
                # detect fine with just the HEAD response.
@@ -451,7 +499,9 @@ async def maigret(username, site_dict, logger, query_notify=None,
                # The final result of the request will be what is available.
                allow_redirects = True

            future = request_method(
                url=url_probe,
                headers=headers,
                allow_redirects=allow_redirects,
                timeout=timeout,
            )
@@ -465,35 +515,25 @@ async def maigret(username, site_dict, logger, query_notify=None,
    coroutines = []
    for sitename, result_obj in results_total.items():
        coroutines.append(
            (
                update_site_dict_from_response,
                [sitename, site_dict, result_obj, logger, query_notify],
                {},
            )
        )

    if no_progressbar:
        executor = AsyncioSimpleExecutor(logger=logger)
    else:
        executor = AsyncioProgressbarQueueExecutor(
            logger=logger, in_parallel=max_connections, timeout=timeout + 0.5
        )

    results = await executor.run(coroutines)

    await session.close()

    # Notify caller that all queries are finished.
    query_notify.finish()
@@ -537,7 +577,7 @@ def timeout_check(value):
async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
    changes = {
        "disabled": False,
    }

    try:
@@ -550,7 +590,7 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
        logger.error(site.__dict__)
        check_data = []

    logger.info(f"Checking {site.name}...")

    for username, status in check_data:
        async with semaphore:
@@ -568,10 +608,10 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
            # TODO: make normal checking
            if site.name not in results_dict:
                logger.info(results_dict)
                changes["disabled"] = True
                continue

            result = results_dict[site.name]["status"]
            site_status = result.status
@@ -580,33 +620,37 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
                    msgs = site.absence_strs
                    etype = site.check_type
                    logger.warning(
                        f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}"
                    )
                    # don't disable in case of available username
                    if status == QueryStatus.CLAIMED:
                        changes["disabled"] = True
                elif status == QueryStatus.CLAIMED:
                    logger.warning(
                        f"Not found `{username}` in {site.name}, must be claimed"
                    )
                    logger.info(results_dict[site.name])
                    changes["disabled"] = True
                else:
                    logger.warning(f"Found `{username}` in {site.name}, must be available")
                    logger.info(results_dict[site.name])
                    changes["disabled"] = True

    logger.info(f"Site {site.name} checking is finished")

    if changes["disabled"] != site.disabled:
        site.disabled = changes["disabled"]
        db.update_site(site)
        if not silent:
            action = "Disabled" if site.disabled else "Enabled"
            print(f"{action} site {site.name}...")

    return changes


async def self_check(
    db: MaigretDatabase, site_data: dict, logger, silent=False, max_connections=10
) -> bool:
    sem = asyncio.Semaphore(max_connections)
    tasks = []
    all_sites = site_data
@@ -628,13 +672,15 @@ async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False,
    total_disabled = disabled_new_count - disabled_old_count

    if total_disabled >= 0:
        message = "Disabled"
    else:
        message = "Enabled"
        total_disabled *= -1

    if not silent:
        print(
            f"{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. "
            "Run with `--info` flag to get more information"
        )

    return total_disabled != 0
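The net effect of the checking.py changes: the module-level `common_errors` table is gone, and `detect_error_page` now delegates to the new `errors` module introduced in the next file. A minimal sketch of that call path, using a flag the module actually registers (assuming the package is importable as `maigret`):

from maigret import errors

html = "<html><title>Attention Required! | Cloudflare</title></html>"
err = errors.detect(html)  # matches a COMMON_ERRORS flag
if err:
    print(err.type)  # 'Captcha'
    print(err)       # 'Captcha error: Cloudflare'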
+104
@@ -0,0 +1,104 @@
from typing import Dict, List, Any

from .result import QueryResult


# error got as a result of completed search query
class CheckError:
    _type = 'Unknown'
    _desc = ''

    def __init__(self, typename, desc=''):
        self._type = typename
        self._desc = desc

    def __str__(self):
        if not self._desc:
            return f'{self._type} error'

        return f'{self._type} error: {self._desc}'

    @property
    def type(self):
        return self._type

    @property
    def desc(self):
        return self._desc


COMMON_ERRORS = {
    '<title>Attention Required! | Cloudflare</title>': CheckError(
        'Captcha', 'Cloudflare'
    ),
    'Please stand by, while we are checking your browser': CheckError(
        'Bot protection', 'Cloudflare'
    ),
    '<title>Доступ ограничен</title>': CheckError('Censorship', 'Rostelecom'),
    'document.getElementById(\'validate_form_submit\').disabled=true': CheckError(
        'Captcha', 'Mail.ru'
    ),
    'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': CheckError(
        'Bot protection', 'Blazingfast'
    ),
    '404</h1><p class="error-card__description">Мы&nbsp;не&nbsp;нашли страницу': CheckError(
        'Resolving', 'MegaFon 404 page'
    ),
    'Доступ к информационному ресурсу ограничен на основании Федерального закона': CheckError(
        'Censorship', 'MGTS'
    ),
    'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
}

ERRORS_TYPES = {
    'Captcha': 'Try to switch to another IP address or to use service cookies',
    'Bot protection': 'Try to switch to another IP address',
    'Censorship': 'switch to another internet service provider',
    'Request timeout': 'Try to increase timeout or to switch to another internet service provider',
}

THRESHOLD = 3  # percent


def is_important(err_data):
    return err_data['perc'] >= THRESHOLD


def is_not_permanent(err_data):
    return True


def detect(text):
    for flag, err in COMMON_ERRORS.items():
        if flag in text:
            return err
    return None


def solution_of(err_type) -> str:
    return ERRORS_TYPES.get(err_type, '')


def extract_and_group(search_res: dict) -> List[Dict[str, Any]]:
    errors_counts: Dict[str, int] = {}
    for r in search_res:
        if r and isinstance(r, dict) and r.get('status'):
            if not isinstance(r['status'], QueryResult):
                continue

            err = r['status'].error
            if not err:
                continue

            errors_counts[err.type] = errors_counts.get(err.type, 0) + 1

    counts = []
    for err, count in sorted(errors_counts.items(), key=lambda x: x[1], reverse=True):
        counts.append(
            {
                'err': err,
                'count': count,
                'perc': round(count / len(search_res), 2) * 100,
            }
        )

    return counts
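A small usage sketch of the grouping helpers above, mirroring how `notify_about_errors` in maigret.py (further down in this commit) consumes them. The two result dicts are synthetic stand-ins for real search results, and the QueryResult arguments follow the call sites visible in checking.py:

from maigret import errors
from maigret.result import QueryResult, QueryStatus

search_res = [
    # one check failed with a captcha...
    {'status': QueryResult('user', 'SiteA', 'https://a.example/user',
                           QueryStatus.UNKNOWN,
                           error=errors.CheckError('Captcha', 'Cloudflare'))},
    # ...and one succeeded
    {'status': QueryResult('user', 'SiteB', 'https://b.example/user',
                           QueryStatus.CLAIMED)},
]

for e in errors.extract_and_group(search_res):
    # here 50% of checks failed with a captcha, far above the 3% THRESHOLD
    if errors.is_important(e):
        print(e['err'], e['count'], f"{e['perc']}%", errors.solution_of(e['err']))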
+3 -4
@@ -2,7 +2,7 @@ import asyncio
import time
import tqdm
import sys
from typing import Iterable, Any, List

from .types import QueryDraft
@@ -100,14 +100,13 @@ class AsyncioProgressbarQueueExecutor(AsyncExecutor):
            self.queue.task_done()

    async def _run(self, queries: Iterable[QueryDraft]):
        self.results: List[Any] = []

        queries_list = list(queries)

        min_workers = min(len(queries_list), self.workers_count)

        workers = [create_task_func()(self.worker()) for _ in range(min_workers)]

        self.progress = self.progress_func(total=len(queries_list))
        for t in queries_list:
+340 -164
@@ -12,11 +12,26 @@ from argparse import ArgumentParser, RawDescriptionHelpFormatter
import requests
from socid_extractor import extract, parse, __version__ as socid_version

from .checking import (
    timeout_check,
    supported_recursive_search_ids,
    self_check,
    unsupported_characters,
    maigret,
)
from . import errors
from .notify import QueryNotifyPrint
from .report import (
    save_csv_report,
    save_xmind_report,
    save_html_report,
    save_pdf_report,
    generate_report_context,
    save_txt_report,
    SUPPORTED_JSON_REPORT_FORMATS,
    check_supported_json_format,
    save_json_report,
)
from .sites import MaigretDatabase
from .submit import submit_dialog
from .utils import get_dict_ascii_tree
@@ -24,167 +39,300 @@ from .utils import get_dict_ascii_tree
__version__ = '0.1.19'


def notify_about_errors(search_results, query_notify):
    errs = errors.extract_and_group(search_results.values())
    was_errs_displayed = False
    for e in errs:
        if not errors.is_important(e):
            continue
        text = f'Too many errors of type "{e["err"]}" ({e["perc"]}%)'
        solution = errors.solution_of(e['err'])
        if solution:
            text = '. '.join([text, solution])
        query_notify.warning(text, '!')
        was_errs_displayed = True

    if was_errs_displayed:
        query_notify.warning(
            'You can see detailed site check errors with a flag `--print-errors`'
        )


async def main():
    version_string = '\n'.join(
        [
            f'%(prog)s {__version__}',
            f'Socid-extractor: {socid_version}',
            f'Aiohttp: {aiohttp.__version__}',
            f'Requests: {requests.__version__}',
            f'Python: {platform.python_version()}',
        ]
    )

    parser = ArgumentParser(
        formatter_class=RawDescriptionHelpFormatter,
        description=f"Maigret v{__version__}",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=version_string,
        help="Display version information and dependencies.",
    )
    parser.add_argument(
        "--info",
        "-vv",
        action="store_true",
        dest="info",
        default=False,
        help="Display service information.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        dest="verbose",
        default=False,
        help="Display extra information and metrics.",
    )
    parser.add_argument(
        "-d",
        "--debug",
        "-vvv",
        action="store_true",
        dest="debug",
        default=False,
        help="Saving debugging information and sites responses in debug.txt.",
    )
    parser.add_argument(
        "--site",
        action="append",
        metavar='SITE_NAME',
        dest="site_list",
        default=[],
        help="Limit analysis to just the listed sites (use several times to specify more than one)",
    )
    parser.add_argument(
        "--proxy",
        "-p",
        metavar='PROXY_URL',
        action="store",
        dest="proxy",
        default=None,
        help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080",
    )
    parser.add_argument(
        "--db",
        metavar="DB_FILE",
        dest="db_file",
        default=None,
        help="Load Maigret database from a JSON file or an online, valid, JSON file.",
    )
    parser.add_argument(
        "--cookies-jar-file",
        metavar="COOKIE_FILE",
        dest="cookie_file",
        default=None,
        help="File with cookies.",
    )
    parser.add_argument(
        "--timeout",
        action="store",
        metavar='TIMEOUT',
        dest="timeout",
        type=timeout_check,
        default=30,
        help="Time (in seconds) to wait for response to requests. "
        "Default timeout of 30.0s. "
        "A longer timeout will be more likely to get results from slow sites. "
        "On the other hand, this may cause a long delay to gather all results. ",
    )
    parser.add_argument(
        "-n",
        "--max-connections",
        action="store",
        type=int,
        dest="connections",
        default=100,
        help="Allowed number of concurrent connections.",
    )
    parser.add_argument(
        "-a",
        "--all-sites",
        action="store_true",
        dest="all_sites",
        default=False,
        help="Use all sites for scan.",
    )
    parser.add_argument(
        "--top-sites",
        action="store",
        default=500,
        type=int,
        help="Count of sites for scan ranked by Alexa Top (default: 500).",
    )
    parser.add_argument(
        "--print-not-found",
        action="store_true",
        dest="print_not_found",
        default=False,
        help="Print sites where the username was not found.",
    )
    parser.add_argument(
        "--print-errors",
        action="store_true",
        dest="print_check_errors",
        default=False,
        help="Print errors messages: connection, captcha, site country ban, etc.",
    )
    parser.add_argument(
        "--submit",
        metavar='EXISTING_USER_URL',
        type=str,
        dest="new_site_to_submit",
        default=False,
        help="URL of existing profile in new site to submit.",
    )
    parser.add_argument(
        "--no-color",
        action="store_true",
        dest="no_color",
        default=False,
        help="Don't color terminal output",
    )
    parser.add_argument(
        "--no-progressbar",
        action="store_true",
        dest="no_progressbar",
        default=False,
        help="Don't show progressbar.",
    )
    parser.add_argument(
        "--browse",
        "-b",
        action="store_true",
        dest="browse",
        default=False,
        help="Browse to all results on default bowser.",
    )
    parser.add_argument(
        "--no-recursion",
        action="store_true",
        dest="disable_recursive_search",
        default=False,
        help="Disable recursive search by additional data extracted from pages.",
    )
    parser.add_argument(
        "--no-extracting",
        action="store_true",
        dest="disable_extracting",
        default=False,
        help="Disable parsing pages for additional data and other usernames.",
    )
    parser.add_argument(
        "--self-check",
        action="store_true",
        default=False,
        help="Do self check for sites and database and disable non-working ones.",
    )
    parser.add_argument(
        "--stats", action="store_true", default=False, help="Show database statistics."
    )
    parser.add_argument(
        "--use-disabled-sites",
        action="store_true",
        default=False,
        help="Use disabled sites to search (may cause many false positives).",
    )
    parser.add_argument(
        "--parse",
        dest="parse_url",
        default='',
        help="Parse page by URL and extract username and IDs to use for search.",
    )
    parser.add_argument(
        "--id-type",
        dest="id_type",
        default='username',
        help="Specify identifier(s) type (default: username).",
    )
    parser.add_argument(
        "--ignore-ids",
        action="append",
        metavar='IGNORED_IDS',
        dest="ignore_ids_list",
        default=[],
        help="Do not make search by the specified username or other ids.",
    )
    parser.add_argument(
        "username",
        nargs='+',
        metavar='USERNAMES',
        action="store",
        help="One or more usernames to check with social networks.",
    )
    parser.add_argument(
        "--tags", dest="tags", default='', help="Specify tags of sites."
    )
    # reports options
    parser.add_argument(
        "--folderoutput",
        "-fo",
        dest="folderoutput",
        default="reports",
        help="If using multiple usernames, the output of the results will be saved to this folder.",
    )
    parser.add_argument(
        "-T",
        "--txt",
        action="store_true",
        dest="txt",
        default=False,
        help="Create a TXT report (one report per username).",
    )
    parser.add_argument(
        "-C",
        "--csv",
        action="store_true",
        dest="csv",
        default=False,
        help="Create a CSV report (one report per username).",
    )
    parser.add_argument(
        "-H",
        "--html",
        action="store_true",
        dest="html",
        default=False,
        help="Create an HTML report file (general report on all usernames).",
    )
    parser.add_argument(
        "-X",
        "--xmind",
        action="store_true",
        dest="xmind",
        default=False,
        help="Generate an XMind 8 mindmap report (one report per username).",
    )
    parser.add_argument(
        "-P",
        "--pdf",
        action="store_true",
        dest="pdf",
        default=False,
        help="Generate a PDF report (general report on all usernames).",
    )
    parser.add_argument(
        "-J",
        "--json",
        action="store",
        metavar='REPORT_TYPE',
        dest="json",
        default='',
        type=check_supported_json_format,
        help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
        " (one report per username).",
    )

    args = parser.parse_args()
@@ -194,7 +342,7 @@ async def main():
    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level,
    )

    if args.debug:
@@ -211,8 +359,7 @@ async def main():
    usernames = {
        u: args.id_type
        for u in args.username
        if u not in ['-'] and u not in args.ignore_ids_list
    }

    parsing_enabled = not args.disable_extracting
@@ -228,8 +375,10 @@ async def main():
        try:
            # temporary workaround for URL mutations MVP
            from socid_extractor import mutate_url

            reqs += list(mutate_url(args.parse_url))
        except Exception as e:
            logger.warning(e)
            pass

        for req in reqs:
@@ -251,38 +400,47 @@ async def main():
    args.tags = list(set(str(args.tags).split(',')))

    if args.db_file is None:
        args.db_file = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), "resources/data.json"
        )

    if args.top_sites == 0 or args.all_sites:
        args.top_sites = sys.maxsize

    # Create notify object for query results.
    query_notify = QueryNotifyPrint(
        result=None,
        verbose=args.verbose,
        print_found_only=not args.print_not_found,
        skip_check_errors=not args.print_check_errors,
        color=not args.no_color,
    )

    # Create object with all information about sites we are aware of.
    db = MaigretDatabase().load_from_file(args.db_file)
    get_top_sites_for_id = lambda x: db.ranked_sites_dict(
        top=args.top_sites,
        tags=args.tags,
        names=args.site_list,
        disabled=False,
        id_type=x,
    )

    site_data = get_top_sites_for_id(args.id_type)

    if args.new_site_to_submit:
        is_submitted = await submit_dialog(
            db, args.new_site_to_submit, args.cookie_file, logger
        )
        if is_submitted:
            db.save_to_file(args.db_file)

    # Database self-checking
    if args.self_check:
        print('Maigret sites database self-checking...')
        is_need_update = await self_check(
            db, site_data, logger, max_connections=args.connections
        )
        if is_need_update:
            if input('Do you want to save changes permanently? [Yn]\n').lower() == 'y':
                db.save_to_file(args.db_file)
@@ -314,9 +472,13 @@ async def main():
        query_notify.warning('No sites to check, exiting!')
        sys.exit(2)
    else:
        query_notify.warning(
            f'Starting a search on top {len(site_data)} sites from the Maigret database...'
        )
        if not args.all_sites:
            query_notify.warning(
                'You can run search by full list of sites with flag `-a`', '!'
            )

    already_checked = set()
    general_results = []
@@ -331,21 +493,29 @@ async def main():
        already_checked.add(username.lower())

        if username in args.ignore_ids_list:
            query_notify.warning(
                f'Skip a search by username {username} cause it\'s marked as ignored.'
            )
            continue

        # check for characters do not supported by sites generally
        found_unsupported_chars = set(unsupported_characters).intersection(
            set(username)
        )

        if found_unsupported_chars:
            pretty_chars_str = ','.join(
                map(lambda s: f'"{s}"', found_unsupported_chars)
            )
            query_notify.warning(
                f'Found unsupported URL characters: {pretty_chars_str}, skip search by username "{username}"'
            )
            continue

        sites_to_check = get_top_sites_for_id(id_type)

        results = await maigret(
            username=username,
            site_dict=dict(sites_to_check),
            query_notify=query_notify,
            proxy=args.proxy,
@@ -360,6 +530,8 @@ async def main():
            no_progressbar=args.no_progressbar,
        )

        notify_about_errors(results, query_notify)

        general_results.append((username, id_type, results))

        # TODO: tests
@@ -397,9 +569,13 @@ async def main():
            query_notify.warning(f'TXT report for {username} saved in {filename}')

        if args.json:
            filename = report_filepath_tpl.format(
                username=username, postfix=f'_{args.json}.json'
            )
            save_json_report(filename, username, results, report_type=args.json)
            query_notify.warning(
                f'JSON {args.json} report for {username} saved in {filename}'
            )

    # reporting for all the result
    if general_results:
+61 -36
@@ -11,7 +11,7 @@ from .result import QueryStatus
from .utils import get_dict_ascii_tree from .utils import get_dict_ascii_tree
class QueryNotify(): class QueryNotify:
"""Query Notify Object. """Query Notify Object.
Base class that describes methods available to notify the results of Base class that describes methods available to notify the results of
@@ -39,7 +39,7 @@ class QueryNotify():
return return
def start(self, message=None, id_type='username'): def start(self, message=None, id_type="username"):
"""Notify Start. """Notify Start.
Notify method for start of query. This method will be called before Notify method for start of query. This method will be called before
@@ -116,8 +116,14 @@ class QueryNotifyPrint(QueryNotify):
    Query notify class that prints results.
    """

    def __init__(
        self,
        result=None,
        verbose=False,
        print_found_only=False,
        skip_check_errors=False,
        color=True,
    ):
        """Create Query Notify Print Object.

        Contains information about a specific method of notifying the results
@@ -162,22 +168,29 @@ class QueryNotifyPrint(QueryNotify):
        title = f"Checking {id_type}"

        if self.color:
            print(
                Style.BRIGHT
                + Fore.GREEN
                + "["
                + Fore.YELLOW
                + "*"
                + Fore.GREEN
                + f"] {title}"
                + Fore.WHITE
                + f" {message}"
                + Fore.GREEN
                + " on:"
            )
        else:
            print(f"[*] {title} {message} on:")

    def warning(self, message, symbol="-"):
        msg = f"[{symbol}] {message}"

        if self.color:
            print(Style.BRIGHT + Fore.YELLOW + msg)
        else:
            print(msg)

    def update(self, result, is_similar=False):
        """Notify Update.
@@ -196,18 +209,20 @@ class QueryNotifyPrint(QueryNotify):
        if not self.result.ids_data:
            ids_data_text = ""
        else:
            ids_data_text = get_dict_ascii_tree(self.result.ids_data.items(), " ")

        def make_colored_terminal_notify(
            status, text, status_color, text_color, appendix
        ):
            text = [
                f"{Style.BRIGHT}{Fore.WHITE}[{status_color}{status}{Fore.WHITE}]"
                + f"{text_color} {text}: {Style.RESET_ALL}"
                + f"{appendix}"
            ]
            return "".join(text)

        def make_simple_terminal_notify(status, text, appendix):
            return f"[{status}] {text}: {appendix}"

        def make_terminal_notify(is_colored=True, *args):
            if is_colored:
@@ -220,45 +235,55 @@ class QueryNotifyPrint(QueryNotify):
        # Output to the terminal is desired.
        if result.status == QueryStatus.CLAIMED:
            color = Fore.BLUE if is_similar else Fore.GREEN
            status = "?" if is_similar else "+"
            notify = make_terminal_notify(
                self.color,
                status,
                result.site_name,
                color,
                color,
                result.site_url_user + ids_data_text,
            )
        elif result.status == QueryStatus.AVAILABLE:
            if not self.print_found_only:
                notify = make_terminal_notify(
                    self.color,
                    "-",
                    result.site_name,
                    Fore.RED,
                    Fore.YELLOW,
                    "Not found!" + ids_data_text,
                )
        elif result.status == QueryStatus.UNKNOWN:
            if not self.skip_check_errors:
                notify = make_terminal_notify(
                    self.color,
                    "?",
                    result.site_name,
                    Fore.RED,
                    Fore.RED,
                    str(self.result.error) + ids_data_text,
                )
        elif result.status == QueryStatus.ILLEGAL:
            if not self.print_found_only:
                text = "Illegal Username Format For This Site!"
                notify = make_terminal_notify(
                    self.color,
                    "-",
                    result.site_name,
                    Fore.RED,
                    Fore.YELLOW,
                    text + ids_data_text,
                )
        else:
            # It should be impossible to ever get here...
            raise ValueError(
                f"Unknown Query Status '{str(result.status)}' for "
                f"site '{self.result.site_name}'"
            )

        if notify:
            sys.stdout.write("\x1b[1K\r")
            print(notify)

        return
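A short usage sketch of the reformatted notifier above; identifiers are the ones shown in this commit, sample values are illustrative and the printed lines approximate.

# Usage sketch for QueryNotifyPrint (values illustrative).
from maigret.notify import QueryNotifyPrint
from maigret.result import QueryResult, QueryStatus

notifier = QueryNotifyPrint(color=False, print_found_only=False)
notifier.start('soxoj', id_type='username')

result = QueryResult(
    username='soxoj',
    site_name='GitHub',
    site_url_user='https://github.com/soxoj',
    status=QueryStatus.CLAIMED,
)
notifier.update(result)              # prints roughly "[+] GitHub: https://github.com/soxoj"
notifier.warning('Search finished')  # "[-] Search finished"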
+100 -88
@@ -5,6 +5,7 @@ import logging
import os
from argparse import ArgumentTypeError
from datetime import datetime
from typing import Dict, Any

import pycountry
import xmind
@@ -16,61 +17,63 @@ from .result import QueryStatus
from .utils import is_country_tag, CaseConverter, enrich_link_str

SUPPORTED_JSON_REPORT_FORMATS = [
    "simple",
    "ndjson",
]

"""
UTILS
"""


def filter_supposed_data(data):
    # interesting fields
    allowed_fields = ["fullname", "gender", "location", "age"]
    filtered_supposed_data = {
        CaseConverter.snake_to_title(k): v[0]
        for k, v in data.items()
        if k in allowed_fields
    }
    return filtered_supposed_data


"""
REPORTS SAVING
"""


def save_csv_report(filename: str, username: str, results: dict):
    with open(filename, "w", newline="", encoding="utf-8") as f:
        generate_csv_report(username, results, f)


def save_txt_report(filename: str, username: str, results: dict):
    with open(filename, "w", encoding="utf-8") as f:
        generate_txt_report(username, results, f)


def save_html_report(filename: str, context: dict):
    template, _ = generate_report_template(is_pdf=False)
    filled_template = template.render(**context)
    with open(filename, "w") as f:
        f.write(filled_template)


def save_pdf_report(filename: str, context: dict):
    template, css = generate_report_template(is_pdf=True)
    filled_template = template.render(**context)
    with open(filename, "w+b") as f:
        pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css)


def save_json_report(filename: str, username: str, results: dict, report_type: str):
    with open(filename, "w", encoding="utf-8") as f:
        generate_json_report(username, results, f, report_type=report_type)


"""
REPORTS GENERATING
"""


def generate_report_template(is_pdf: bool):
@@ -79,20 +82,20 @@ def generate_report_template(is_pdf: bool):
    """

    def get_resource_content(filename):
        return open(os.path.join(maigret_path, "resources", filename)).read()

    maigret_path = os.path.dirname(os.path.realpath(__file__))

    if is_pdf:
        template_content = get_resource_content("simple_report_pdf.tpl")
        css_content = get_resource_content("simple_report_pdf.css")
    else:
        template_content = get_resource_content("simple_report.tpl")
        css_content = None

    template = Template(template_content)
    template.globals["title"] = CaseConverter.snake_to_title  # type: ignore
    template.globals["detect_link"] = enrich_link_str  # type: ignore

    return template, css_content
@@ -100,15 +103,15 @@ def generate_report_context(username_results: list):
    brief_text = []
    usernames = {}
    extended_info_count = 0
    tags: Dict[str, int] = {}
    supposed_data: Dict[str, Any] = {}

    first_seen = None

    for username, id_type, results in username_results:
        found_accounts = 0
        new_ids = []
        usernames[username] = {"type": id_type}

        for website_name in results:
            dictionary = results[website_name]
@@ -116,19 +119,19 @@ def generate_report_context(username_results: list):
            if not dictionary:
                continue

            if dictionary.get("is_similar"):
                continue

            status = dictionary.get("status")
            if not status:  # FIXME: currently in case of timeout
                continue

            if status.ids_data:
                dictionary["ids_data"] = status.ids_data
                extended_info_count += 1

                # detect first seen
                created_at = status.ids_data.get("created_at")
                if created_at:
                    if first_seen is None:
                        first_seen = created_at
@@ -138,37 +141,46 @@ def generate_report_context(username_results: list):
                        new_time = parse_datetime_str(created_at)
                        if new_time < known_time:
                            first_seen = created_at
                    except Exception as e:
                        logging.debug(
                            "Problems with converting datetime %s/%s: %s",
                            first_seen,
                            created_at,
                            str(e),
                        )

                for k, v in status.ids_data.items():
                    # suppose target data
                    field = "fullname" if k == "name" else k
                    if field not in supposed_data:
                        supposed_data[field] = []
                    supposed_data[field].append(v)
                    # suppose country
                    if k in ["country", "locale"]:
                        try:
                            if is_country_tag(k):
                                tag = pycountry.countries.get(alpha_2=v).alpha_2.lower()
                            else:
                                tag = pycountry.countries.search_fuzzy(v)[
                                    0
                                ].alpha_2.lower()
                            # TODO: move countries to another struct
                            tags[tag] = tags.get(tag, 0) + 1
                        except Exception as e:
                            logging.debug(
                                "Pycountry exception: %s", str(e), exc_info=True
                            )

            new_usernames = dictionary.get("ids_usernames")
            if new_usernames:
                for u, utype in new_usernames.items():
                    if u not in usernames:
                        new_ids.append((u, utype))
                        usernames[u] = {"type": utype}

            if status.status == QueryStatus.CLAIMED:
                found_accounts += 1
                dictionary["found"] = True
            else:
                continue
@@ -177,22 +189,24 @@ def generate_report_context(username_results: list):
            for t in status.tags:
                tags[t] = tags.get(t, 0) + 1

        brief_text.append(
            f"Search by {id_type} {username} returned {found_accounts} accounts."
        )

        if new_ids:
            ids_list = []
            for u, t in new_ids:
                ids_list.append(f"{u} ({t})" if t != "username" else u)
            brief_text.append("Found target's other IDs: " + ", ".join(ids_list) + ".")

    brief_text.append(f"Extended info extracted from {extended_info_count} accounts.")

    brief = " ".join(brief_text).strip()

    tuple_sort = lambda d: sorted(d, key=lambda x: x[1], reverse=True)

    if "global" in tags:
        # remove the 'global' tag, useless for country detection
        del tags["global"]

    first_username = username_results[0][0]
    countries_lists = list(filter(lambda x: is_country_tag(x[0]), tags.items()))
@@ -201,35 +215,33 @@ def generate_report_context(username_results: list):
    filtered_supposed_data = filter_supposed_data(supposed_data)

    return {
        "username": first_username,
        "brief": brief,
        "results": username_results,
        "first_seen": first_seen,
        "interests_tuple_list": tuple_sort(interests_list),
        "countries_tuple_list": tuple_sort(countries_lists),
        "supposed_data": filtered_supposed_data,
        "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }


def generate_csv_report(username: str, results: dict, csvfile):
    writer = csv.writer(csvfile)
    writer.writerow(
        ["username", "name", "url_main", "url_user", "exists", "http_status"]
    )
    for site in results:
        writer.writerow(
            [
                username,
                site,
                results[site]["url_main"],
                results[site]["url_user"],
                str(results[site]["status"].status),
                results[site]["http_status"],
            ]
        )


def generate_txt_report(username: str, results: dict, file):
@@ -242,12 +254,11 @@ def generate_txt_report(username: str, results: dict, file):
if dictionary.get("status").status == QueryStatus.CLAIMED: if dictionary.get("status").status == QueryStatus.CLAIMED:
exists_counter += 1 exists_counter += 1
file.write(dictionary["url_user"] + "\n") file.write(dictionary["url_user"] + "\n")
file.write(f'Total Websites Username Detected On : {exists_counter}') file.write(f"Total Websites Username Detected On : {exists_counter}")
def generate_json_report(username: str, results: dict, file, report_type): def generate_json_report(username: str, results: dict, file, report_type):
exists_counter = 0 is_report_per_line = report_type.startswith("ndjson")
is_report_per_line = report_type.startswith('ndjson')
all_json = {} all_json = {}
for sitename in results: for sitename in results:
@@ -257,11 +268,11 @@ def generate_json_report(username: str, results: dict, file, report_type):
            continue

        data = dict(site_result)
        data["status"] = data["status"].json()

        if is_report_per_line:
            data["sitename"] = sitename
            file.write(json.dumps(data) + "\n")
        else:
            all_json[sitename] = data
@@ -269,9 +280,9 @@ def generate_json_report(username: str, results: dict, file, report_type):
        file.write(json.dumps(all_json))


"""
XMIND 8 Functions
"""


def save_xmind_report(filename, username, results):
@@ -284,7 +295,6 @@ def save_xmind_report(filename, username, results):
def design_sheet(sheet, username, results):
    alltags = {}
    supposed_data = {}
@@ -300,7 +310,7 @@ def design_sheet(sheet, username, results):
        dictionary = results[website_name]

        if dictionary.get("status").status == QueryStatus.CLAIMED:
            # first time this entry is found
            for tag in dictionary.get("status").tags:
                if tag.strip() == "":
                    continue
@@ -329,22 +339,22 @@ def design_sheet(sheet, username, results):
                    # suppose target data
                    if not isinstance(v, list):
                        currentsublabel = userlink.addSubTopic()
                        field = "fullname" if k == "name" else k
                        if field not in supposed_data:
                            supposed_data[field] = []
                        supposed_data[field].append(v)
                        currentsublabel.setTitle("%s: %s" % (k, v))
                    else:
                        for currentval in v:
                            currentsublabel = userlink.addSubTopic()
                            field = "fullname" if k == "name" else k
                            if field not in supposed_data:
                                supposed_data[field] = []
                            supposed_data[field].append(currentval)
                            currentsublabel.setTitle("%s: %s" % (k, currentval))

    # add supposed data
    filtered_supposed_data = filter_supposed_data(supposed_data)
    if len(filtered_supposed_data) > 0:
        undefinedsection = root_topic1.addSubTopic()
        undefinedsection.setTitle("SUPPOSED DATA")
        for k, v in filtered_supposed_data.items():
@@ -353,7 +363,9 @@ def design_sheet(sheet, username, results):
def check_supported_json_format(value):
    if value and value not in SUPPORTED_JSON_REPORT_FORMATS:
        raise ArgumentTypeError(
            "JSON report type must be one of the following types: "
            + ", ".join(SUPPORTED_JSON_REPORT_FORMATS)
        )
    return value
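For reference, a minimal sketch of driving the savers and the argparse validator above, with a hand-built results dict shaped the way generate_csv_report() and generate_json_report() expect it (the filenames and sample values are illustrative assumptions).

# Sketch only: a minimal per-site results dict fed to the savers above.
from maigret.report import save_csv_report, save_json_report, check_supported_json_format
from maigret.result import QueryResult, QueryStatus

status = QueryResult(
    username='soxoj',
    site_name='GitHub',
    site_url_user='https://github.com/soxoj',
    status=QueryStatus.CLAIMED,
)
results = {
    'GitHub': {
        'url_main': 'https://github.com',
        'url_user': 'https://github.com/soxoj',
        'status': status,
        'http_status': 200,
    }
}

save_csv_report('report_soxoj.csv', 'soxoj', results)
report_type = check_supported_json_format('ndjson')  # raises ArgumentTypeError otherwise
save_json_report('report_soxoj_ndjson.json', 'soxoj', results, report_type=report_type)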
+69 -6
@@ -12148,7 +12148,7 @@
"us"
],
"headers": {
"authorization": "Bearer BQCe5Yx_Evl2m1Td_86SzknoVan7OZxN6y6WaR7xNrJb8vnZ5B7VZY401MdivLmCQcyv0LUkfo1M-15_m-E"
},
"errors": {
"Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
@@ -13458,7 +13458,7 @@
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"x-guest-token": "1388029767388106752"
},
"errors": {
"Bad guest token": "x-guest-token update required"
@@ -13661,6 +13661,7 @@
"type": "vk_id",
"checkType": "response_url",
"alexaRank": 26,
"source": "VK",
"url": "https://vk.com/id{username}",
"urlMain": "https://vk.com/",
"usernameClaimed": "270433952",
@@ -13672,6 +13673,7 @@
],
"checkType": "status_code",
"alexaRank": 28938,
"source": "VK",
"url": "https://vkfaces.com/vk/user/{username}",
"urlMain": "https://vkfaces.com",
"usernameClaimed": "adam",
@@ -13835,7 +13837,7 @@
"video"
],
"headers": {
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTk3NzM3NDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.4O4QL4IsoiKl0Cz1310Qjo9WablDr5LIyMOPQgMS1XE"
},
"activation": {
"url": "https://vimeo.com/_rv/viewer",
@@ -16125,10 +16127,8 @@
"gb",
"uk"
],
"engine": "XenForo",
"alexaRank": 12725,
"urlMain": "https://forums.overclockers.co.uk",
"usernameClaimed": "adam",
"usernameUnclaimed": "noonewouldeverusethis7"
@@ -23749,6 +23749,69 @@
"urlMain": "https://opensea.io",
"usernameClaimed": "admin",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"SmiHub": {
"checkType": "message",
"presenseStrs": [
"profile",
"user-page",
"user",
" data-name=",
"user__img"
],
"absenceStrs": [
"text-lg mb-3"
],
"source": "Instagram",
"url": "https://smihub.com/v/{username}",
"urlMain": "https://smihub.com",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"do100verno.info": {
"checkType": "message",
"presenseStrs": [
"white-space: nowrap;"
],
"absenceStrs": [
"l-main",
" l-mainDcL",
" l-usrMenu"
],
"url": "https://do100verno.info/card/{username}",
"urlMain": "https://do100verno.info",
"usernameClaimed": "ekostyle",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"www.kinokopilka.pro": {
"checkType": "message",
"presenseStrs": [
"profile",
"user",
"people",
"users",
"/people"
],
"url": "https://www.kinokopilka.pro/users/{username}",
"urlMain": "https://www.kinokopilka.pro",
"usernameClaimed": "admin",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"www.turpravda.com": {
"checkType": "message",
"presenseStrs": [
"email",
" name"
],
"absenceStrs": [
"Title",
" Shortcut Icon",
" submit"
],
"url": "https://www.turpravda.com/profile/{username}",
"urlMain": "https://www.turpravda.com",
"usernameClaimed": "admin",
"usernameUnclaimed": "noonewouldeverusethis7"
}
},
"engines": {
+20 -9
@@ -10,6 +10,7 @@ class QueryStatus(Enum):
    Describes status of query about a given username.
    """

    CLAIMED = "Claimed"  # Username Detected
    AVAILABLE = "Available"  # Username Not Detected
    UNKNOWN = "Unknown"  # Error Occurred While Trying To Detect Username
@@ -27,14 +28,24 @@ class QueryStatus(Enum):
        return self.value


class QueryResult:
    """Query Result Object.

    Describes result of query about a given username.
    """

    def __init__(
        self,
        username,
        site_name,
        site_url_user,
        status,
        ids_data=None,
        query_time=None,
        context=None,
        error=None,
        tags=[],
    ):
        """Create Query Result Object.

        Contains information about a specific method of detecting usernames on
@@ -77,12 +88,12 @@ class QueryResult():
    def json(self):
        return {
            "username": self.username,
            "site_name": self.site_name,
            "url": self.site_url_user,
            "status": str(self.status),
            "ids": self.ids_data or {},
            "tags": self.tags,
        }

    def is_found(self):
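The json() method above is what the ndjson report writer consumes; a quick serialization sketch (sample values illustrative, output approximate):

# Sketch: serializing a result with the json() method above.
from maigret.result import QueryResult, QueryStatus

result = QueryResult(
    username='soxoj',
    site_name='GitHub',
    site_url_user='https://github.com/soxoj',
    status=QueryStatus.CLAIMED,
    ids_data={'fullname': 'Soxoj'},
    tags=['coding'],
)
print(result.json())
# {'username': 'soxoj', 'site_name': 'GitHub', 'url': 'https://github.com/soxoj',
#  'status': 'Claimed', 'ids': {'fullname': 'Soxoj'}, 'tags': ['coding']}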
+137 -76
@@ -1,8 +1,9 @@
"""Maigret Sites Information"""
import copy
import json
import sys
from typing import Optional

import requests
@@ -10,12 +11,48 @@ from .utils import CaseConverter, URLMatcher, is_country_tag
# TODO: move to data.json
SUPPORTED_TAGS = [
    "gaming",
    "coding",
    "photo",
    "music",
    "blog",
    "finance",
    "freelance",
    "dating",
    "tech",
    "forum",
    "porn",
    "erotic",
    "webcam",
    "video",
    "movies",
    "hacking",
    "art",
    "discussion",
    "sharing",
    "writing",
    "wiki",
    "business",
    "shopping",
    "sport",
    "books",
    "news",
    "documents",
    "travel",
    "maps",
    "hobby",
    "apps",
    "classified",
    "career",
    "geosocial",
    "streaming",
    "education",
    "networking",
    "torrent",
    "science",
    "medicine",
    "reading",
    "stock",
]
@@ -32,13 +69,13 @@ class MaigretEngine:
class MaigretSite:
    NOT_SERIALIZABLE_FIELDS = [
        "name",
        "engineData",
        "requestFuture",
        "detectedEngine",
        "engineObj",
        "stats",
        "urlRegexp",
    ]

    def __init__(self, name, information):
@@ -49,15 +86,15 @@ class MaigretSite:
        self.ignore403 = False
        self.tags = []

        self.type = "username"
        self.headers = {}
        self.errors = {}
        self.activation = {}
        self.url_subpath = ""
        self.regex_check = None
        self.url_probe = None
        self.check_type = ""
        self.request_head_only = ""
        self.get_params = {}

        self.presense_strs = []
@@ -84,26 +121,29 @@ class MaigretSite:
return f"{self.name} ({self.url_main})" return f"{self.name} ({self.url_main})"
def update_detectors(self): def update_detectors(self):
if 'url' in self.__dict__: if "url" in self.__dict__:
url = self.url url = self.url
for group in ['urlMain', 'urlSubpath']: for group in ["urlMain", "urlSubpath"]:
if group in url: if group in url:
url = url.replace('{' + group + '}', self.__dict__[CaseConverter.camel_to_snake(group)]) url = url.replace(
"{" + group + "}",
self.__dict__[CaseConverter.camel_to_snake(group)],
)
self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check) self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check)
def detect_username(self, url: str) -> str: def detect_username(self, url: str) -> Optional[str]:
if self.url_regexp: if self.url_regexp:
match_groups = self.url_regexp.match(url) match_groups = self.url_regexp.match(url)
if match_groups: if match_groups:
return match_groups.groups()[-1].rstrip('/') return match_groups.groups()[-1].rstrip("/")
return None return None
@property @property
def pretty_name(self): def pretty_name(self):
if self.source: if self.source:
return f'{self.name} [{self.source}]' return f"{self.name} [{self.source}]"
return self.name return self.name
@property @property
@@ -113,7 +153,7 @@ class MaigretSite:
            # convert to camelCase
            field = CaseConverter.snake_to_camel(k)
            # strip empty elements
            if v in (False, "", [], {}, None, sys.maxsize, "username"):
                continue
            if field in self.NOT_SERIALIZABLE_FIELDS:
                continue
@@ -121,13 +161,13 @@ class MaigretSite:
        return result

    def update(self, updates: "dict") -> "MaigretSite":
        self.__dict__.update(updates)
        self.update_detectors()

        return self

    def update_from_engine(self, engine: MaigretEngine) -> "MaigretSite":
        engine_data = engine.site
        for k, v in engine_data.items():
            field = CaseConverter.camel_to_snake(k)
@@ -145,7 +185,7 @@ class MaigretSite:
        return self

    def strip_engine_data(self) -> "MaigretSite":
        if not self.engine_obj:
            return self
@@ -190,8 +230,15 @@ class MaigretDatabase:
    def sites_dict(self):
        return {site.name: site for site in self._sites}

    def ranked_sites_dict(
        self,
        reverse=False,
        top=sys.maxsize,
        tags=[],
        names=[],
        disabled=True,
        id_type="username",
    ):
        """
        Ranking and filtering of the sites list
        """
@@ -200,20 +247,30 @@ class MaigretDatabase:
        is_name_ok = lambda x: x.name.lower() in normalized_names
        is_source_ok = lambda x: x.source and x.source.lower() in normalized_names
        is_engine_ok = (
            lambda x: isinstance(x.engine, str) and x.engine.lower() in normalized_tags
        )
        is_tags_ok = lambda x: set(x.tags).intersection(set(normalized_tags))
        is_disabled_needed = lambda x: not x.disabled or (
            "disabled" in tags or disabled
        )
        is_id_type_ok = lambda x: x.type == id_type

        filter_tags_engines_fun = lambda x: not tags or is_engine_ok(x) or is_tags_ok(x)
        filter_names_fun = lambda x: not names or is_name_ok(x) or is_source_ok(x)

        filter_fun = (
            lambda x: filter_tags_engines_fun(x)
            and filter_names_fun(x)
            and is_disabled_needed(x)
            and is_id_type_ok(x)
        )

        filtered_list = [s for s in self.sites if filter_fun(s)]

        sorted_list = sorted(
            filtered_list, key=lambda x: x.alexa_rank, reverse=reverse
        )[:top]
        return {site.name: site for site in sorted_list}

    @property
@@ -224,7 +281,7 @@ class MaigretDatabase:
    def engines_dict(self):
        return {engine.name: engine for engine in self._engines}

    def update_site(self, site: MaigretSite) -> "MaigretDatabase":
        for s in self._sites:
            if s.name == site.name:
                s = site
@@ -233,20 +290,20 @@ class MaigretDatabase:
        self._sites.append(site)
        return self

    def save_to_file(self, filename: str) -> "MaigretDatabase":
        db_data = {
            "sites": {site.name: site.strip_engine_data().json for site in self._sites},
            "engines": {engine.name: engine.json for engine in self._engines},
        }

        json_data = json.dumps(db_data, indent=4)

        with open(filename, "w") as f:
            f.write(json_data)

        return self

    def load_from_json(self, json_data: dict) -> "MaigretDatabase":
        # Add all of site information from the json file to internal site list.
        site_data = json_data.get("sites", {})
        engines_data = json_data.get("engines", {})
@@ -258,30 +315,32 @@ class MaigretDatabase:
            try:
                maigret_site = MaigretSite(site_name, site_data[site_name])

                engine = site_data[site_name].get("engine")
                if engine:
                    maigret_site.update_from_engine(self.engines_dict[engine])

                self._sites.append(maigret_site)
            except KeyError as error:
                raise ValueError(
                    f"Problem parsing json content for site {site_name}: "
                    f"Missing attribute {str(error)}."
                )

        return self

    def load_from_str(self, db_str: "str") -> "MaigretDatabase":
        try:
            data = json.loads(db_str)
        except Exception as error:
            raise ValueError(
                f"Problem parsing json contents from str"
                f"'{db_str[:50]}'...: {str(error)}."
            )

        return self.load_from_json(data)

    def load_from_url(self, url: str) -> "MaigretDatabase":
        is_url_valid = url.startswith("http://") or url.startswith("https://")

        if not is_url_valid:
            raise FileNotFoundError(f"Invalid data file URL '{url}'.")
@@ -289,7 +348,8 @@ class MaigretDatabase:
        try:
            response = requests.get(url=url)
        except Exception as error:
            raise FileNotFoundError(
                f"Problem while attempting to access "
                f"data file URL '{url}': "
                f"{str(error)}"
            )
@@ -298,29 +358,30 @@ class MaigretDatabase:
            try:
                data = response.json()
            except Exception as error:
                raise ValueError(
                    f"Problem parsing json contents at " f"'{url}': {str(error)}."
                )
        else:
            raise FileNotFoundError(
                f"Bad response while accessing " f"data file URL '{url}'."
            )

        return self.load_from_json(data)

    def load_from_file(self, filename: "str") -> "MaigretDatabase":
        try:
            with open(filename, "r", encoding="utf-8") as file:
                try:
                    data = json.load(file)
                except Exception as error:
                    raise ValueError(
                        f"Problem parsing json contents from "
                        f"file '{filename}': {str(error)}."
                    )
        except FileNotFoundError as error:
            raise FileNotFoundError(
                f"Problem while attempting to access " f"data file '{filename}'."
            ) from error

        return self.load_from_json(data)
@@ -328,8 +389,8 @@ class MaigretDatabase:
        sites = sites_dict or self.sites_dict

        found_flags = {}
        for _, s in sites.items():
            if "presense_flag" in s.stats:
                flag = s.stats["presense_flag"]
                found_flags[flag] = found_flags.get(flag, 0) + 1

        return found_flags
@@ -338,7 +399,7 @@ class MaigretDatabase:
        if not sites_dict:
            sites_dict = self.sites_dict()

        output = ""
        disabled_count = 0
        total_count = len(sites_dict)
        urls = {}
@@ -349,18 +410,18 @@ class MaigretDatabase:
                disabled_count += 1

            url = URLMatcher.extract_main_part(site.url)
            if url.startswith("{username}"):
                url = "SUBDOMAIN"
            elif url == "":
                url = f"{site.url} ({site.engine})"
            else:
                parts = url.split("/")
                url = "/" + "/".join(parts[1:])

            urls[url] = urls.get(url, 0) + 1

            if not site.tags:
                tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1

            for tag in site.tags:
                if is_country_tag(tag):
@@ -368,17 +429,17 @@ class MaigretDatabase:
                    continue
                tags[tag] = tags.get(tag, 0) + 1

        output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n"
        output += "Top sites' profile URLs:\n"
        for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:20]:
            if count == 1:
                break
            output += f"{count}\t{url}\n"

        output += "Top sites' tags:\n"
        for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True):
            mark = ""
            if tag not in SUPPORTED_TAGS:
                mark = " (non-standard)"
            output += f"{count}\t{tag}{mark}\n"

        return output
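Taken together with the data.json additions earlier in this commit, a short usage sketch (the path and sample entry are illustrative; it also assumes MaigretSite.__init__ wires up update_detectors(), as the update() method shown above does):

# Usage sketch for MaigretSite / MaigretDatabase; path and entry illustrative.
from maigret.sites import MaigretDatabase, MaigretSite

db = MaigretDatabase().load_from_file('maigret/resources/data.json')

# 50 most popular (lowest Alexa rank) enabled username sites tagged "coding"
top_coding = db.ranked_sites_dict(top=50, tags=['coding'], disabled=False)
for site in list(top_coding.values())[:3]:
    print(site.pretty_name, site.url_main)

# A "message"-type record like the SmiHub entry added to data.json above
site = MaigretSite('SmiHub', {
    'checkType': 'message',
    'presenseStrs': ['profile', 'user-page'],  # expected on an existing profile page
    'absenceStrs': ['text-lg mb-3'],           # expected on a "not found" page
    'source': 'Instagram',
    'url': 'https://smihub.com/v/{username}',
    'urlMain': 'https://smihub.com',
    'usernameClaimed': 'blue',
    'usernameUnclaimed': 'noonewouldeverusethis7',
})
print(site.pretty_name)                                   # "SmiHub [Instagram]"
print(site.detect_username('https://smihub.com/v/blue'))  # "blue"

db.update_site(site).save_to_file('data-copy.json')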
+130 -73
@@ -1,39 +1,57 @@
import asyncio
import difflib
import re

import requests

from .activation import import_aiohttp_cookies
from .checking import maigret
from .result import QueryStatus
from .sites import MaigretDatabase, MaigretSite
from .utils import get_random_user_agent

DESIRED_STRINGS = [
    "username",
    "not found",
    "пользователь",
    "profile",
    "lastname",
    "firstname",
    "biography",
    "birthday",
    "репутация",
    "информация",
    "e-mail",
]

SUPPOSED_USERNAMES = ["alex", "god", "admin", "red", "blue", "john"]

HEADERS = {
    "User-Agent": get_random_user_agent(),
}

RATIO = 0.6
TOP_FEATURES = 5
URL_RE = re.compile(r"https?://(www\.)?")


def get_match_ratio(x):
    return round(
        max(
            [difflib.SequenceMatcher(a=x.lower(), b=y).ratio() for y in DESIRED_STRINGS]
        ),
        2,
    )


def extract_mainpage_url(url):
    return "/".join(url.split("/", 3)[:3])


async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
    changes = {
        "disabled": False,
    }

    check_data = [
@@ -41,7 +59,7 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
        (site.username_unclaimed, QueryStatus.AVAILABLE),
    ]

    logger.info(f"Checking {site.name}...")

    for username, status in check_data:
        results_dict = await maigret(
@@ -58,10 +76,10 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
        # TODO: make normal checking
        if site.name not in results_dict:
            logger.info(results_dict)
            changes["disabled"] = True
            continue

        result = results_dict[site.name]["status"]

        site_status = result.status
@@ -70,20 +88,23 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
                msgs = site.absence_strs
                etype = site.check_type
                logger.warning(
                    f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}"
                )
                # don't disable in case of available username
                if status == QueryStatus.CLAIMED:
                    changes["disabled"] = True
            elif status == QueryStatus.CLAIMED:
                logger.warning(
                    f"Not found `{username}` in {site.name}, must be claimed"
                )
                logger.info(results_dict[site.name])
                changes["disabled"] = True
            else:
                logger.warning(f"Found `{username}` in {site.name}, must be available")
                logger.info(results_dict[site.name])
                changes["disabled"] = True

    logger.info(f"Site {site.name} checking is finished")

    return changes
@@ -93,31 +114,31 @@ async def detect_known_engine(db, url_exists, url_mainpage):
        r = requests.get(url_mainpage)
    except Exception as e:
        print(e)
        print("Some error while checking main page")
        return None

    for engine in db.engines:
        strs_to_check = engine.__dict__.get("presenseStrs")
        if strs_to_check and r and r.text:
            all_strs_in_response = True
            for s in strs_to_check:
                if s not in r.text:
                    all_strs_in_response = False
            if all_strs_in_response:
                engine_name = engine.__dict__.get("name")
                print(f"Detected engine {engine_name} for site {url_mainpage}")

                sites = []
                for u in SUPPOSED_USERNAMES:
                    site_data = {
                        "urlMain": url_mainpage,
                        "name": url_mainpage.split("//")[0],
                        "engine": engine_name,
                        "usernameClaimed": u,
                        "usernameUnclaimed": "noonewouldeverusethis7",
                    }

                    maigret_site = MaigretSite(url_mainpage.split("/")[-1], site_data)
                    maigret_site.update_from_engine(db.engines_dict[engine_name])
                    sites.append(maigret_site)
@@ -126,15 +147,19 @@ async def detect_known_engine(db, url_exists, url_mainpage):
    return None


async def check_features_manually(
    db, url_exists, url_mainpage, cookie_file, logger, redirects=True
):
    url_parts = url_exists.split("/")
    supposed_username = url_parts[-1]
    new_name = input(
        f'Is "{supposed_username}" a valid username? If not, write it manually: '
    )
    if new_name:
        supposed_username = new_name
    non_exist_username = "noonewouldeverusethis7"

    url_user = url_exists.replace(supposed_username, "{username}")
    url_not_exists = url_exists.replace(supposed_username, non_exist_username)

    # cookies
@@ -143,15 +168,18 @@ async def check_features_manually(db, url_exists, url_mainpage, cookie_file, log
        cookie_jar = await import_aiohttp_cookies(cookie_file)
        cookie_dict = {c.key: c.value for c in cookie_jar}

    exists_resp = requests.get(
        url_exists, cookies=cookie_dict, headers=HEADERS, allow_redirects=redirects
    )
    logger.debug(exists_resp.status_code)
    logger.debug(exists_resp.text)

    non_exists_resp = requests.get(
        url_not_exists, cookies=cookie_dict, headers=HEADERS, allow_redirects=redirects
    )
    logger.debug(non_exists_resp.status_code)
    logger.debug(non_exists_resp.text)

    a = exists_resp.text
    b = non_exists_resp.text
@@ -162,61 +190,81 @@ async def check_features_manually(db, url_exists, url_mainpage, cookie_file, log
    b_minus_a = tokens_b.difference(tokens_a)

    if len(a_minus_b) == len(b_minus_a) == 0:
        print("The pages for existing and non-existing accounts are the same!")

    top_features_count = int(
        input(f"Specify count of features to extract [default {TOP_FEATURES}]: ")
        or TOP_FEATURES
    )
    presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[
        :top_features_count
    ]

    print("Detected text features of existing account: " + ", ".join(presence_list))
    features = input("If features were not detected correctly, write them manually: ")

    if features:
        presence_list = features.split(",")

    absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[
        :top_features_count
    ]
    print("Detected text features of non-existing account: " + ", ".join(absence_list))
    features = input("If features were not detected correctly, write them manually: ")

    if features:
        absence_list = features.split(",")

    site_data = {
        "absenceStrs": absence_list,
        "presenseStrs": presence_list,
        "url": url_user,
        "urlMain": url_mainpage,
        "usernameClaimed": supposed_username,
        "usernameUnclaimed": non_exist_username,
        "checkType": "message",
    }

    site = MaigretSite(url_mainpage.split("/")[-1], site_data)
    return site


async def submit_dialog(db, url_exists, cookie_file, logger):
    domain_raw = URL_RE.sub("", url_exists).strip().strip("/")
    domain_raw = domain_raw.split("/")[0]

    # check for existence
    matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))

    if matched_sites:
        print(
            f'Sites with domain "{domain_raw}" already exist in the Maigret database!'
        )

        status = lambda s: "(disabled)" if s.disabled else ""
        url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
        print(
            "\n".join(
                [
                    f"{site.name} {status(site)}{url_block(site)}"
                    for site in matched_sites
                ]
            )
        )

        if input("Do you want to continue? [yN] ").lower() in "n":
            return False

    url_mainpage = extract_mainpage_url(url_exists)

    sites = await detect_known_engine(db, url_exists, url_mainpage)
    if not sites:
        print("Unable to detect site engine, let's generate checking features")
        sites = [
            await check_features_manually(
                db, url_exists, url_mainpage, cookie_file, logger
            )
        ]

    logger.debug(sites[0].__dict__)
@@ -227,15 +275,24 @@ async def submit_dialog(db, url_exists, cookie_file, logger):
    for s in sites:
        chosen_site = s
        result = await site_self_check(s, logger, sem, db)
        if not result["disabled"]:
            found = True
            break

    if not found:
        print(
            f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
        )
        print(
            "Try to run this mode again and increase features count or choose others."
        )
    else:
        if (
            input(
                f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
            ).lower()
            in "y"
        ):
            logger.debug(chosen_site.json)
            site_data = chosen_site.strip_engine_data()
            logger.debug(site_data.json)
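The feature ranking inside check_features_manually() is worth isolating: tokens unique to the "account exists" page are ranked by difflib similarity to DESIRED_STRINGS. A standalone sketch with illustrative token sets:

# Standalone sketch of the difflib-based feature ranking above
# (token sets are made-up sample data).
import difflib

DESIRED_STRINGS = ['username', 'not found', 'profile']

def get_match_ratio(x):
    return round(
        max(difflib.SequenceMatcher(a=x.lower(), b=y).ratio() for y in DESIRED_STRINGS),
        2,
    )

tokens_a = {'profile-card', 'logout', 'footer'}  # page of an existing account
tokens_b = {'error-404', 'footer'}               # page of a missing account

presence_list = sorted(tokens_a - tokens_b, key=get_match_ratio, reverse=True)[:5]
print(presence_list)  # 'profile-card' ranks first, being closest to 'profile'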
-23
@@ -3,26 +3,3 @@ from typing import Callable, Any, Tuple
# search query
QueryDraft = Tuple[Callable, Any, Any]
# error got as a result of completed search query
class CheckError:
_type = 'Unknown'
_desc = ''
def __init__(self, typename, desc=''):
self._type = typename
self._desc = desc
def __str__(self):
if not self._desc:
return f'{self._type} error'
return f'{self._type} error: {self._desc}'
@property
def type(self):
return self._type
@property
def desc(self):
return self._desc
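For context, the removed CheckError was used roughly like this (the class presumably moves elsewhere in this refactoring; its new home is not shown in this diff, so treat this as a sketch of the definition above only):

# Sketch of the removed CheckError's behavior, per the definition above.
err = CheckError('Connection', 'timeout')
print(str(err))               # "Connection error: timeout"
print(err.type)               # "Connection"
print(CheckError('Unknown'))  # "Unknown error" (no description set)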
+23 -23
@@ -3,80 +3,80 @@ import random
DEFAULT_USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
]


class CaseConverter:
    @staticmethod
    def camel_to_snake(camelcased_string: str) -> str:
        return re.sub(r"(?<!^)(?=[A-Z])", "_", camelcased_string).lower()

    @staticmethod
    def snake_to_camel(snakecased_string: str) -> str:
        formatted = "".join(word.title() for word in snakecased_string.split("_"))
        result = formatted[0].lower() + formatted[1:]
        return result

    @staticmethod
    def snake_to_title(snakecased_string: str) -> str:
        words = snakecased_string.split("_")
        words[0] = words[0].title()
        return " ".join(words)
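A quick sanity check of the three converters, with an example input (results these implementations produce are shown as comments):

CaseConverter.camel_to_snake('checkType')   # 'check_type'
CaseConverter.snake_to_camel('check_type')  # 'checkType'
CaseConverter.snake_to_title('check_type')  # 'Check type'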
def is_country_tag(tag: str) -> bool:
    """detect if a tag represents a country"""
    return bool(re.match("^([a-zA-Z]){2}$", tag)) or tag == "global"


def enrich_link_str(link: str) -> str:
    link = link.strip()
    if link.startswith("www.") or (link.startswith("http") and "//" in link):
        return f'<a class="auto-link" href="{link}">{link}</a>'
    return link
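Likewise for the two helpers above; the tags and link are made-up examples:

is_country_tag('us')      # True (two-letter code)
is_country_tag('global')  # True (special-cased)
is_country_tag('usa')     # False
enrich_link_str(' www.example.com ')
# '<a class="auto-link" href="www.example.com">www.example.com</a>'
enrich_link_str('not a link')  # returned unchanged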
class URLMatcher:
    _HTTP_URL_RE_STR = "^https?://(www.)?(.+)$"
    HTTP_URL_RE = re.compile(_HTTP_URL_RE_STR)
    UNSAFE_SYMBOLS = ".?"

    @classmethod
    def extract_main_part(self, url: str) -> str:
        match = self.HTTP_URL_RE.search(url)
        if match and match.group(2):
            return match.group(2).rstrip("/")
        return ""

    @classmethod
    def make_profile_url_regexp(self, url: str, username_regexp: str = ""):
        url_main_part = self.extract_main_part(url)
        for c in self.UNSAFE_SYMBOLS:
            url_main_part = url_main_part.replace(c, f"\\{c}")
        username_regexp = username_regexp or ".+?"

        url_regexp = url_main_part.replace("{username}", f"({username_regexp})")
        regexp_str = self._HTTP_URL_RE_STR.replace("(.+)", url_regexp)

        return re.compile(regexp_str)
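A sketch of how URLMatcher is meant to be used, with a hypothetical profile URL template:

pattern = URLMatcher.make_profile_url_regexp('https://example.com/u/{username}')
match = pattern.match('https://example.com/u/soxoj')
if match:
    print(match.group(2))  # 'soxoj' (group 1 is the optional 'www.' prefix)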
def get_dict_ascii_tree(items, prepend="", new_line=True):
    text = ""
    for num, item in enumerate(items):
        box_symbol = "┣╸" if num != len(items) - 1 else "┗╸"

        if type(item) == tuple:
            field_name, field_value = item
            if field_value.startswith("['"):
                is_last_item = num == len(items) - 1
                prepend_symbols = " " * 3 if is_last_item else ""
                field_value = get_dict_ascii_tree(eval(field_value), prepend_symbols)
            text += f"\n{prepend}{box_symbol}{field_name}: {field_value}"
        else:
            text += f"\n{prepend}{box_symbol} {item}"

    if not new_line:
        text = text[1:]

    return text
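An illustrative call with hypothetical field names; note the result starts with a newline unless new_line=False:

print(get_dict_ascii_tree([('uid', '12345'), ('username', 'soxoj')], new_line=False))
# ┣╸uid: 12345
# ┗╸username: soxoj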
+6
@@ -1,3 +1,9 @@
[egg_info]
tag_build =
tag_date = 0

[flake8]
per-file-ignores = __init__.py:F401

[mypy]
ignore_missing_imports = True
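These additions configure the linting: F401 ("imported but unused") is suppressed only in __init__.py, where imports exist to re-export the public API, and mypy is told not to fail on third-party packages that ship without type stubs. A hypothetical re-export of the kind the flake8 rule covers:

# maigret/__init__.py (illustrative module path)
from .maigret import search  # would trigger F401 without the per-file ignore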
Executable
+2
@@ -0,0 +1,2 @@
#!/bin/sh
pytest tests
+11 -4
@@ -26,7 +26,9 @@ if __name__ == '__main__':
    # user input
    username = input('Enter username to search: ')
    sites_count_raw = input(
        f'Select the number of sites to search ({TOP_SITES_COUNT} for default, {len(db.sites_dict)} max): '
    )
    sites_count = int(sites_count_raw or TOP_SITES_COUNT)

    sites = db.ranked_sites_dict(top=sites_count)
@@ -34,10 +36,14 @@ if __name__ == '__main__':
    show_progressbar_raw = input('Do you want to show a progressbar? [Yn] ')
    show_progressbar = show_progressbar_raw.lower() != 'n'

    extract_info_raw = input(
        'Do you want to extract additional info from accounts\' pages? [Yn] '
    )
    extract_info = extract_info_raw.lower() != 'n'

    use_notifier_raw = input(
        'Do you want to use a notifier for displaying results while searching? [Yn] '
    )
    use_notifier = use_notifier_raw.lower() != 'n'

    notifier = None
@@ -45,7 +51,8 @@ if __name__ == '__main__':
        notifier = maigret.Notifier(print_found_only=True, skip_check_errors=True)

    # search!
    search_func = maigret.search(
        username=username,
        site_dict=sites,
        timeout=TIMEOUT,
        logger=logger,