Mirror of https://github.com/soxoj/maigret.git, synced 2026-05-06 14:08:59 +00:00
Merge pull request #51 from soxoj/submit-mode
Experimental site submit mode
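The new mode takes the URL of an existing profile on an as-yet-unsupported site, interactively extracts presence/absence text features, self-checks them against a claimed and an unclaimed username, and offers to save the resulting entry into the sites database. A minimal sketch of the programmatic equivalent of `maigret --submit <url>`; the database path and profile URL are placeholder assumptions:

import asyncio

from maigret.sites import MaigretDatabase
from maigret.submit import submit_dialog

# Placeholder path; the real CLI resolves its own DB file.
DB_FILE = 'maigret/resources/data.json'

db = MaigretDatabase().load_from_file(DB_FILE)

# Interactive dialog; returns True if the new site entry passed
# self-checking and the user chose to save it.
if asyncio.run(submit_dialog(db, 'https://example.com/someuser')):
    db.save_to_file(DB_FILE)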
@@ -0,0 +1,601 @@
import asyncio
import logging
import re
import ssl

import aiohttp
import tqdm.asyncio
from aiohttp_socks import ProxyConnector
from mock import Mock
from python_socks import _errors as proxy_errors
from socid_extractor import extract

from .activation import ParsingActivator, import_aiohttp_cookies
from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite

supported_recursive_search_ids = (
    'yandex_public_id',
    'gaia_id',
    'vk_id',
    'ok_id',
    'wikimapia_uid',
)

common_errors = {
    '<title>Attention Required! | Cloudflare</title>': 'Cloudflare captcha',
    'Please stand by, while we are checking your browser': 'Cloudflare captcha',
    '<title>Доступ ограничен</title>': 'Rostelecom censorship',
    'document.getElementById(\'validate_form_submit\').disabled=true': 'Mail.ru captcha',
    'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': 'Blazingfast protection',
    '404</h1><p class="error-card__description">Мы не нашли страницу': 'MegaFon 404 page',
    'Доступ к информационному ресурсу ограничен на основании Федерального закона': 'MGTS censorship',
    'Incapsula incident ID': 'Incapsula antibot protection',
}

unsupported_characters = '#'


async def get_response(request_future, site_name, logger):
    html_text = None
    status_code = 0

    error_text = "General Unknown Error"
    exception_text = None

    try:
        response = await request_future

        status_code = response.status
        response_content = await response.content.read()
        charset = response.charset or 'utf-8'
        decoded_content = response_content.decode(charset, 'ignore')
        html_text = decoded_content

        if status_code > 0:
            error_text = None

        logger.debug(html_text)

    except asyncio.TimeoutError as errt:
        error_text = "Timeout Error"
        exception_text = str(errt)
    except (ssl.SSLCertVerificationError, ssl.SSLError) as err:
        error_text = "SSL Error"
        exception_text = str(err)
    except aiohttp.client_exceptions.ClientConnectorError as err:
        error_text = "Error Connecting"
        exception_text = str(err)
    except aiohttp.http_exceptions.BadHttpMessage as err:
        error_text = "HTTP Error"
        exception_text = str(err)
    except proxy_errors.ProxyError as err:
        error_text = "Proxy Error"
        exception_text = str(err)
    except Exception as err:
        logger.warning(f'Unhandled error while requesting {site_name}: {err}')
        logger.debug(err, exc_info=True)
        error_text = "Some Error"
        exception_text = str(err)

    # TODO: return only needed information
    return html_text, status_code, error_text, exception_text


async def update_site_dict_from_response(sitename, site_dict, results_info, semaphore, logger, query_notify):
    async with semaphore:
        site_obj = site_dict[sitename]
        future = site_obj.request_future
        if not future:
            # ignore: search by incompatible id type
            return

        response = await get_response(request_future=future,
                                      site_name=sitename,
                                      logger=logger)

        site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)


# TODO: move into separate module
def detect_error_page(html_text, status_code, fail_flags, ignore_403):
    # Detect service restrictions such as a country restriction
    for flag, msg in fail_flags.items():
        if flag in html_text:
            return 'Some site error', msg

    # Detect common restrictions such as provider censorship and bot protection
    for flag, msg in common_errors.items():
        if flag in html_text:
            return 'Error', msg

    # Detect common site errors
    if status_code == 403 and not ignore_403:
        return 'Access denied', 'Access denied, use proxy/vpn'
    elif status_code >= 500:
        return f'Error {status_code}', f'Site error {status_code}'

    return None, None


def process_site_result(response, query_notify, logger, results_info, site: MaigretSite):
    if not response:
        return results_info

    fulltags = site.tags

    # Retrieve other site information again
    username = results_info['username']
    is_parsing_enabled = results_info['parsing_enabled']
    url = results_info.get("url_user")
    logger.debug(url)

    status = results_info.get("status")
    if status is not None:
        # We have already determined the user doesn't exist here
        return results_info

    # Get the expected check type
    check_type = site.check_type

    # Get the failure messages and comments
    failure_errors = site.errors

    # TODO: refactor
    if not response:
        logger.error(f'No response for {site.name}')
        return results_info

    html_text, status_code, error_text, exception_text = response
    site_error_text = '?'

    # TODO: add elapsed request time counting
    response_time = None

    if logger.level == logging.DEBUG:
        with open('debug.txt', 'a') as f:
            status = status_code or 'No response'
            f.write(f'url: {url}\nerror: {str(error_text)}\nr: {status}\n')
            if html_text:
                f.write(f'code: {status}\nresponse: {str(html_text)}\n')

    if status_code and not error_text:
        error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors,
                                                        site.ignore_403)

    if site.activation and html_text:
        is_need_activation = any(s in html_text for s in site.activation['marks'])
        if is_need_activation:
            method = site.activation['method']
            try:
                activate_fun = getattr(ParsingActivator(), method)
                # TODO: async call
                activate_fun(site, logger)
            except AttributeError:
                logger.warning(f'Activation method {method} for site {site.name} not found!')

    # presence flags
    # True by default
    presense_flags = site.presense_strs
    is_presense_detected = False
    if html_text:
        if not presense_flags:
            is_presense_detected = True
            site.stats['presense_flag'] = None
        else:
            for presense_flag in presense_flags:
                if presense_flag in html_text:
                    is_presense_detected = True
                    site.stats['presense_flag'] = presense_flag
                    logger.info(presense_flag)
                    break

    if error_text is not None:
        logger.debug(error_text)
        result = QueryResult(username,
                             site.name,
                             url,
                             QueryStatus.UNKNOWN,
                             query_time=response_time,
                             context=f'{error_text}: {site_error_text}', tags=fulltags)
    elif check_type == "message":
        absence_flags = site.absence_strs
        is_absence_flags_list = isinstance(absence_flags, list)
        absence_flags_set = set(absence_flags) if is_absence_flags_list else {absence_flags}
        # Checks if the error message is in the HTML
        is_absence_detected = any(absence_flag in html_text for absence_flag in absence_flags_set)
        if not is_absence_detected and is_presense_detected:
            result = QueryResult(username,
                                 site.name,
                                 url,
                                 QueryStatus.CLAIMED,
                                 query_time=response_time, tags=fulltags)
        else:
            result = QueryResult(username,
                                 site.name,
                                 url,
                                 QueryStatus.AVAILABLE,
                                 query_time=response_time, tags=fulltags)
    elif check_type == "status_code":
        # Checks if the status code of the response is 2XX
        if 200 <= status_code < 300 and is_presense_detected:
            result = QueryResult(username,
                                 site.name,
                                 url,
                                 QueryStatus.CLAIMED,
                                 query_time=response_time, tags=fulltags)
        else:
            result = QueryResult(username,
                                 site.name,
                                 url,
                                 QueryStatus.AVAILABLE,
                                 query_time=response_time, tags=fulltags)
    elif check_type == "response_url":
        # For this detection method, we have turned off the redirect.
        # So, there is no need to check the response URL: it will always
        # match the request. Instead, we will ensure that the response
        # code indicates that the request was successful (i.e. no 404, or
        # forward to some odd redirect).
        if 200 <= status_code < 300 and is_presense_detected:
            result = QueryResult(username,
                                 site.name,
                                 url,
                                 QueryStatus.CLAIMED,
                                 query_time=response_time, tags=fulltags)
        else:
            result = QueryResult(username,
                                 site.name,
                                 url,
                                 QueryStatus.AVAILABLE,
                                 query_time=response_time, tags=fulltags)
    else:
        # It should be impossible to ever get here...
        raise ValueError(f"Unknown check type '{check_type}' for "
                         f"site '{site.name}'")

    extracted_ids_data = {}

    if is_parsing_enabled and result.status == QueryStatus.CLAIMED:
        try:
            extracted_ids_data = extract(html_text)
        except Exception as e:
            logger.warning(f'Error while parsing {site.name}: {e}', exc_info=True)

        if extracted_ids_data:
            new_usernames = {}
            for k, v in extracted_ids_data.items():
                if 'username' in k:
                    new_usernames[v] = 'username'
                if k in supported_recursive_search_ids:
                    new_usernames[v] = k

            results_info['ids_usernames'] = new_usernames
            result.ids_data = extracted_ids_data

    # Notify caller about results of query.
    query_notify.update(result, site.similar_search)

    # Save status of request
    results_info['status'] = result

    # Save results from request
    results_info['http_status'] = status_code
    results_info['is_similar'] = site.similar_search
    # results_site['response_text'] = html_text
    results_info['rank'] = site.alexa_rank
    return results_info


async def maigret(username, site_dict, query_notify, logger,
                  proxy=None, timeout=None, recursive_search=False,
                  id_type='username', debug=False, forced=False,
                  max_connections=100, no_progressbar=False,
                  cookies=None):
    """Main search function

    Checks for existence of username on various social media sites.

    Keyword Arguments:
    username         -- String indicating username that report
                        should be created against.
    site_dict        -- Dictionary containing all of the site data.
    query_notify     -- Object with base type of QueryNotify().
                        This will be used to notify the caller about
                        query results.
    proxy            -- String indicating the proxy URL
    timeout          -- Time in seconds to wait before timing out request.
                        Default is no timeout.
    recursive_search -- Search for other usernames in website pages & recursive search by them.

    Return Value:
    Dictionary containing results from report. Key of dictionary is the name
    of the social network site, and the value is another dictionary with
    the following keys:
        url_main:      URL of main site.
        url_user:      URL of user on site (if account exists).
        status:        QueryResult() object indicating results of test for
                       account existence.
        http_status:   HTTP status code of query which checked for existence on
                       site.
        response_text: Text that came back from request. May be None if
                       there was an HTTP error when checking for existence.
    """

    # Notify caller that we are starting the query.
    query_notify.start(username, id_type)

    # TODO: connector
    connector = ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
    # connector = aiohttp.TCPConnector(ssl=False)
    connector.verify_ssl = False

    cookie_jar = None
    if cookies:
        cookie_jar = await import_aiohttp_cookies(cookies)

    session = aiohttp.ClientSession(connector=connector, trust_env=True, cookie_jar=cookie_jar)

    if logger.level == logging.DEBUG:
        future = session.get(url='https://icanhazip.com')
        ip, status, error, exception = await get_response(future, None, logger)
        if ip:
            logger.debug(f'My IP is: {ip.strip()}')
        else:
            logger.debug(f'IP requesting {error}: {exception}')

    # Results from analysis of all sites
    results_total = {}

    # First create futures for all requests. This allows for the requests to run in parallel
    for site_name, site in site_dict.items():

        if site.type != id_type:
            continue

        if site.disabled and not forced:
            logger.debug(f'Site {site.name} is disabled, skipping...')
            continue

        # Results from analysis of this specific site
        results_site = {}

        # Record URL of main site and username
        results_site['username'] = username
        results_site['parsing_enabled'] = recursive_search
        results_site['url_main'] = site.url_main
        results_site['cookies'] = cookie_jar and cookie_jar.filter_cookies(site.url_main) or None

        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0',
        }

        headers.update(site.headers)

        if 'url' not in site.__dict__:
            logger.error('No URL for site %s', site.name)
            # skip the entry: formatting site.url below would fail anyway
            continue
        # URL of user on site (if it exists)
        url = site.url.format(
            urlMain=site.url_main,
            urlSubpath=site.url_subpath,
            username=username
        )
        # workaround to prevent slash errors
        url = re.sub('(?<!:)/+', '/', url)

        # Don't make request if username is invalid for the site
        if site.regex_check and re.search(site.regex_check, username) is None:
            # No need to do the check at the site: this username is not allowed.
            results_site['status'] = QueryResult(username,
                                                 site_name,
                                                 url,
                                                 QueryStatus.ILLEGAL)
            results_site["url_user"] = ""
            results_site['http_status'] = ""
            results_site['response_text'] = ""
            query_notify.update(results_site['status'])
        else:
            # URL of user on site (if it exists)
            results_site["url_user"] = url
            url_probe = site.url_probe
            if url_probe is None:
                # Probe URL is normal one seen by people out on the web.
                url_probe = url
            else:
                # There is a special URL for probing existence separate
                # from where the user profile normally can be found.
                url_probe = url_probe.format(
                    urlMain=site.url_main,
                    urlSubpath=site.url_subpath,
                    username=username,
                )

            for k, v in site.get_params.items():
                url_probe += f'&{k}={v}'

            if site.check_type == 'status_code' and site.request_head_only:
                # In most cases when we are detecting by status code,
                # it is not necessary to get the entire body: we can
                # detect fine with just the HEAD response.
                request_method = session.head
            else:
                # Either this detect method needs the content associated
                # with the GET response, or this specific website will
                # not respond properly unless we request the whole page.
                request_method = session.get

            if site.check_type == "response_url":
                # Site forwards request to a different URL if username not
                # found. Disallow the redirect so we can capture the
                # HTTP status from the original URL request.
                allow_redirects = False
            else:
                # Allow whatever redirect that the site wants to do.
                # The final result of the request will be what is available.
                allow_redirects = True

            future = request_method(url=url_probe, headers=headers,
                                    allow_redirects=allow_redirects,
                                    timeout=timeout,
                                    )

            # Store future in data for access later
            # TODO: move to separate obj
            site.request_future = future

        # Add this site's results into final dictionary with all of the other results.
        results_total[site_name] = results_site

    # TODO: move into top-level function

    sem = asyncio.Semaphore(max_connections)

    tasks = []
    for sitename, result_obj in results_total.items():
        update_site_coro = update_site_dict_from_response(sitename, site_dict, result_obj, sem, logger, query_notify)
        future = asyncio.ensure_future(update_site_coro)
        tasks.append(future)

    if no_progressbar:
        await asyncio.gather(*tasks)
    else:
        for f in tqdm.asyncio.tqdm.as_completed(tasks):
            await f

    await session.close()

    # Notify caller that all queries are finished.
    query_notify.finish()

    return results_total


def timeout_check(value):
    """Check Timeout Argument.

    Checks timeout for validity.

    Keyword Arguments:
    value -- Time in seconds to wait before timing out request.

    Return Value:
    Floating point number representing the time (in seconds) that should be
    used for the timeout.

    NOTE: Will raise an exception if the timeout is invalid.
    """
    from argparse import ArgumentTypeError

    try:
        timeout = float(value)
    except ValueError:
        raise ArgumentTypeError(f"Timeout '{value}' must be a number.")
    if timeout <= 0:
        raise ArgumentTypeError(f"Timeout '{value}' must be greater than 0.0s.")
    return timeout


async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
    query_notify = Mock()
    changes = {
        'disabled': False,
    }

    try:
        check_data = [
            (site.username_claimed, QueryStatus.CLAIMED),
            (site.username_unclaimed, QueryStatus.AVAILABLE),
        ]
    except Exception as e:
        logger.error(e)
        logger.error(site.__dict__)
        check_data = []

    logger.info(f'Checking {site.name}...')

    for username, status in check_data:
        async with semaphore:
            results_dict = await maigret(
                username,
                {site.name: site},
                query_notify,
                logger,
                timeout=30,
                id_type=site.type,
                forced=True,
                no_progressbar=True,
            )

            # don't disable entries with other id types
            # TODO: make normal checking
            if site.name not in results_dict:
                logger.info(results_dict)
                changes['disabled'] = True
                continue

            result = results_dict[site.name]['status']

            site_status = result.status

            if site_status != status:
                if site_status == QueryStatus.UNKNOWN:
                    msgs = site.absence_strs
                    etype = site.check_type
                    logger.warning(
                        f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}')
                    # don't disable in case of available username
                    if status == QueryStatus.CLAIMED:
                        changes['disabled'] = True
                elif status == QueryStatus.CLAIMED:
                    logger.warning(f'Not found `{username}` in {site.name}, must be claimed')
                    logger.info(results_dict[site.name])
                    changes['disabled'] = True
                else:
                    logger.warning(f'Found `{username}` in {site.name}, must be available')
                    logger.info(results_dict[site.name])
                    changes['disabled'] = True

    logger.info(f'Site {site.name} checking is finished')

    if changes['disabled'] != site.disabled:
        site.disabled = changes['disabled']
        db.update_site(site)
        if not silent:
            action = 'Disabled' if site.disabled else 'Enabled'
            print(f'{action} site {site.name}...')

    return changes


async def self_check(db: MaigretDatabase, site_data: dict, logger, silent=False,
                     max_connections=10) -> bool:
    sem = asyncio.Semaphore(max_connections)
    tasks = []
    all_sites = site_data

    def disabled_count(lst):
        return len(list(filter(lambda x: x.disabled, lst)))

    disabled_old_count = disabled_count(all_sites.values())

    for _, site in all_sites.items():
        check_coro = site_self_check(site, logger, sem, db, silent)
        future = asyncio.ensure_future(check_coro)
        tasks.append(future)

    for f in tqdm.asyncio.tqdm.as_completed(tasks):
        await f

    disabled_new_count = disabled_count(all_sites.values())
    total_disabled = disabled_new_count - disabled_old_count

    if total_disabled >= 0:
        message = 'Disabled'
    else:
        message = 'Enabled'
        total_disabled *= -1

    if not silent:
        print(
            f'{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. Run with `--info` flag to get more information')

    return total_disabled != 0
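For orientation, a minimal sketch of driving this module from a script. The username and database path are illustrative assumptions; Mock() swallows query notifications exactly as site_self_check above does, while a real run would pass a QueryNotifyPrint object from .notify:

import asyncio
import logging

from mock import Mock

from maigret.checking import maigret
from maigret.result import QueryStatus
from maigret.sites import MaigretDatabase


async def demo():
    logger = logging.getLogger('demo')
    # Assumption: the bundled JSON database path and the `sites` accessor.
    db = MaigretDatabase().load_from_file('maigret/resources/data.json')
    site_dict = {site.name: site for site in db.sites}

    results = await maigret('soxoj', site_dict, Mock(), logger,
                            timeout=10, no_progressbar=True)

    # Print only confirmed accounts, using the result keys documented above.
    for name, info in results.items():
        status = info.get('status')
        if status and status.status == QueryStatus.CLAIMED:
            print(name, info.get('url_user'))


asyncio.run(demo())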
+17 -600
@@ -2,616 +2,22 @@
Maigret main module
"""

import asyncio
import logging
import os
import platform
import re
import ssl
import sys
from argparse import ArgumentParser, RawDescriptionHelpFormatter

import aiohttp
import requests
import tqdm.asyncio
from aiohttp_socks import ProxyConnector
from mock import Mock
from python_socks import _errors as proxy_errors
-from socid_extractor import parse, extract, __version__ as socid_version
+from socid_extractor import parse, __version__ as socid_version

from .activation import ParsingActivator, import_aiohttp_cookies
+from .checking import *
from .notify import QueryNotifyPrint
from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \
    generate_report_context, save_txt_report
from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite
+from .submit import submit_dialog

__version__ = '0.1.13'
[~600 removed lines omitted: the same checking logic as the new checking.py above (get_response, update_site_dict_from_response, detect_error_page, process_site_result, maigret, timeout_check, site_self_check, self_check), moved out of maigret.py]
async def main():
    version_string = '\n'.join([
@@ -685,6 +91,10 @@ async def main():
                        action="store_true", dest="print_check_errors", default=False,
                        help="Print error messages: connection, captcha, site country ban, etc."
                        )
    parser.add_argument("--submit",
                        type=str, dest="new_site_to_submit", default=False,
                        help="URL of an existing profile on the new site to submit."
                        )
    parser.add_argument("--no-color",
                        action="store_true", dest="no_color", default=False,
                        help="Don't color terminal output"
@@ -738,7 +148,7 @@ async def main():
                        action="store_true", dest="html", default=False,
                        help="Create an HTML report file (general report on all usernames)."
                        )
-    parser.add_argument("-X","--xmind",
+    parser.add_argument("-X", "--xmind",
                        action="store_true",
                        dest="xmind", default=False,
                        help="Generate an XMind 8 mindmap report (one report per username)."
@@ -820,6 +230,11 @@ async def main():
    site_data = get_top_sites_for_id(args.id_type)

    if args.new_site_to_submit:
        is_submitted = await submit_dialog(db, args.new_site_to_submit)
        if is_submitted:
            db.save_to_file(args.json_file)

    # Database self-checking
    if args.self_check:
        print('Maigret sites database self-checking...')
@@ -874,7 +289,8 @@ async def main():
        if found_unsupported_chars:
            pretty_chars_str = ','.join(map(lambda s: f'"{s}"', found_unsupported_chars))
-            query_notify.warning(f'Found unsupported URL characters: {pretty_chars_str}, skip search by username "{username}"')
+            query_notify.warning(
+                f'Found unsupported URL characters: {pretty_chars_str}, skip search by username "{username}"')
            continue

        sites_to_check = get_top_sites_for_id(id_type)
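The reflowed warning above sits in the per-username loop; `unsupported_characters = '#'` is defined in checking.py. A plausible reconstruction of the guard, where only the warning text is taken from the diff and the intersection line is an assumption:

unsupported_characters = '#'

username = 'user#name'
# assumption: how found_unsupported_chars is computed upstream of this hunk
found_unsupported_chars = set(unsupported_characters).intersection(set(username))

if found_unsupported_chars:
    pretty_chars_str = ','.join(map(lambda s: f'"{s}"', found_unsupported_chars))
    print(f'Found unsupported URL characters: {pretty_chars_str}, skip search by username "{username}"')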
@@ -952,5 +368,6 @@ def run():
        print('Maigret is interrupted.')
        sys.exit(1)


if __name__ == "__main__":
    run()
@@ -13590,7 +13590,7 @@
      "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
      "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
      "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
-     "x-guest-token": "1358064134064140290"
+     "x-guest-token": "1358893858789208065"
    },
    "errors": {
      "Bad guest token": "x-guest-token update required"
@@ -13956,7 +13956,7 @@
      "video"
    ],
    "headers": {
-     "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTI2MjQ4NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.kgp8r380d1aDWcd-ROncr0Tqf8EdA-l35EeEY9is6TI"
+     "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTI4MjE0MjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.TXUhqilVT25xN4lZeoki6hEmbtcOiy7FKxTm5PWOMVs"
    },
    "activation": {
      "url": "https://vimeo.com/_rv/viewer",
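Both rotated tokens above expire periodically; the "activation" block on the vimeo entry is what lets the checker refresh them at runtime (see the activation handling in checking.py). A hedged sketch of the refresh idea, where the "jwt" response field and header shape are assumptions and the real logic lives in maigret/activation.py:

import requests


def refresh_vimeo_jwt(site_headers: dict):
    # The activation URL from the entry above returns JSON that
    # contains a fresh viewer token (field name assumed).
    data = requests.get('https://vimeo.com/_rv/viewer').json()
    site_headers['Authorization'] = 'jwt ' + data['jwt']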
@@ -23070,6 +23070,32 @@
    "urlMain": "https://protovary.style",
    "usernameClaimed": "alex",
    "usernameUnclaimed": "noonewouldeverusethis7"
  },
  "beacons.ai": {
    "checkType": "message",
    "presenseStrs": [
      "https://cdn.beacons.ai/profile_pictures"
    ],
    "absenceStrs": [
      "https://beacons.ai/bw_logo_full.png"
    ],
    "url": "https://beacons.ai/{username}",
    "urlMain": "https://beacons.ai",
    "usernameClaimed": "pasteljellies",
    "usernameUnclaimed": "noonewouldeverusethis7"
  },
  "are.na": {
    "checkType": "message",
    "presenseStrs": [
      "Profile--view"
    ],
    "absenceStrs": [
      "Are.na home"
    ],
    "url": "https://www.are.na/{username}",
    "urlMain": "https://www.are.na",
    "usernameClaimed": "nate-cassel",
    "usernameUnclaimed": "noonewouldeverusethis7"
  }
},
"engines": {
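Both new entries are "message"-type checks: process_site_result in checking.py reports CLAIMED when a presence string occurs in the page and no absence string does, otherwise AVAILABLE. A self-contained illustration with the beacons.ai flags (the HTML snippets are simulated):

# Flags copied from the beacons.ai entry; the "presense" spelling follows the DB schema.
presense_strs = ["https://cdn.beacons.ai/profile_pictures"]
absence_strs = ["https://beacons.ai/bw_logo_full.png"]


def message_check(html_text):
    is_presense_detected = any(s in html_text for s in presense_strs)
    is_absence_detected = any(s in html_text for s in absence_strs)
    return 'CLAIMED' if is_presense_detected and not is_absence_detected else 'AVAILABLE'


assert message_check('<img src="https://cdn.beacons.ai/profile_pictures/p/x.png">') == 'CLAIMED'
assert message_check('<img src="https://beacons.ai/bw_logo_full.png">') == 'AVAILABLE'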
@@ -0,0 +1,161 @@
import difflib

import requests
from mock import Mock

from .checking import *

DESIRED_STRINGS = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
                   "birthday", "репутация", "информация", "e-mail"]

RATIO = 0.6
TOP_FEATURES = 5


def get_match_ratio(x):
    return round(max([
        difflib.SequenceMatcher(a=x.lower(), b=y).ratio()
        for y in DESIRED_STRINGS
    ]), 2)


def extract_domain(url):
    return '/'.join(url.split('/', 3)[:3])


async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
    query_notify = Mock()
    changes = {
        'disabled': False,
    }

    check_data = [
        (site.username_claimed, QueryStatus.CLAIMED),
        (site.username_unclaimed, QueryStatus.AVAILABLE),
    ]

    logger.info(f'Checking {site.name}...')

    for username, status in check_data:
        async with semaphore:
            results_dict = await maigret(
                username,
                {site.name: site},
                query_notify,
                logger,
                timeout=30,
                id_type=site.type,
                forced=True,
                no_progressbar=True,
            )

            # don't disable entries with other id types
            # TODO: make normal checking
            if site.name not in results_dict:
                logger.info(results_dict)
                changes['disabled'] = True
                continue

            result = results_dict[site.name]['status']

            site_status = result.status

            if site_status != status:
                if site_status == QueryStatus.UNKNOWN:
                    msgs = site.absence_strs
                    etype = site.check_type
                    logger.warning(
                        f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}')
                    # don't disable in case of available username
                    if status == QueryStatus.CLAIMED:
                        changes['disabled'] = True
                elif status == QueryStatus.CLAIMED:
                    logger.warning(f'Not found `{username}` in {site.name}, must be claimed')
                    logger.info(results_dict[site.name])
                    changes['disabled'] = True
                else:
                    logger.warning(f'Found `{username}` in {site.name}, must be available')
                    logger.info(results_dict[site.name])
                    changes['disabled'] = True

    logger.info(f'Site {site.name} checking is finished')

    return changes


async def submit_dialog(db, url_exists):
    url_parts = url_exists.split('/')
    supposed_username = url_parts[-1]
    new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
    if new_name:
        supposed_username = new_name
    non_exist_username = 'noonewouldeverusethis7'

    url_user = url_exists.replace(supposed_username, '{username}')
    url_not_exists = url_exists.replace(supposed_username, non_exist_username)

    a = requests.get(url_exists).text
    b = requests.get(url_not_exists).text

    tokens_a = set(a.split('"'))
    tokens_b = set(b.split('"'))

    a_minus_b = tokens_a.difference(tokens_b)
    b_minus_a = tokens_b.difference(tokens_a)

    top_features_count_str = input(f'Specify count of features to extract [default {TOP_FEATURES}]: ')
    top_features_count = int(top_features_count_str) if top_features_count_str else TOP_FEATURES

    presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[:top_features_count]

    print('Detected text features of existing account: ' + ', '.join(presence_list))
    features = input('If the features were not detected correctly, enter them manually (comma-separated): ')

    if features:
        presence_list = features.split(',')

    absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[:top_features_count]
    print('Detected text features of non-existing account: ' + ', '.join(absence_list))
    features = input('If the features were not detected correctly, enter them manually (comma-separated): ')

    if features:
        absence_list = features.split(',')

    url_main = extract_domain(url_exists)

    site_data = {
        'absenceStrs': absence_list,
        'presenseStrs': presence_list,
        'url': url_user,
        'urlMain': url_main,
        'usernameClaimed': supposed_username,
        'usernameUnclaimed': non_exist_username,
        'checkType': 'message',
    }

    site = MaigretSite(url_main.split('/')[-1], site_data)

    print(site.__dict__)

    sem = asyncio.Semaphore(1)
    log_level = logging.INFO
    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level
    )
    logger = logging.getLogger('site-submit')
    logger.setLevel(log_level)

    result = await site_self_check(site, logger, sem, db)

    if result['disabled']:
        print(f'Sorry, we couldn\'t find params to detect account presence/absence in {site.name}.')
        print('Try running this mode again with a larger features count, or choose other features.')
    else:
        if input(f'Site {site.name} successfully checked. Do you want to save it in the Maigret DB? [yY] ') in ('y', 'Y'):
            db.update_site(site)
            return True

    return False
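The heart of submit_dialog, in isolation: fetch the same page for an existing and a non-existing account, split both bodies on double quotes, and rank the tokens unique to each side by difflib similarity to DESIRED_STRINGS; the top candidates become presenseStrs/absenceStrs. A standalone sketch, with placeholder profile URLs:

import difflib

import requests

DESIRED_STRINGS = ["username", "not found", "profile"]  # shortened for the example


def get_match_ratio(x):
    return round(max(difflib.SequenceMatcher(a=x.lower(), b=y).ratio()
                     for y in DESIRED_STRINGS), 2)


a = requests.get('https://example.com/soxoj').text                    # existing profile
b = requests.get('https://example.com/noonewouldeverusethis7').text   # absent profile

presence_candidates = set(a.split('"')) - set(b.split('"'))
absence_candidates = set(b.split('"')) - set(a.split('"'))

print(sorted(presence_candidates, key=get_match_ratio, reverse=True)[:5])
print(sorted(absence_candidates, key=get_match_ratio, reverse=True)[:5])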