Errors stats MVP, some fp fixes

This commit is contained in:
Soxoj
2021-04-25 01:08:23 +03:00
parent 83cda9e37f
commit f43dc5bd6f
5 changed files with 83 additions and 181 deletions
+71 -173
View File
@@ -6,7 +6,6 @@ import ssl
import sys
import tqdm
import time
from typing import Callable, Any, Iterable, Tuple
import aiohttp
import tqdm.asyncio
@@ -16,8 +15,11 @@ from python_socks import _errors as proxy_errors
from socid_extractor import extract
from .activation import ParsingActivator, import_aiohttp_cookies
from .executors import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor
from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite
from .types import CheckError
supported_recursive_search_ids = (
'yandex_public_id',
@@ -30,139 +32,23 @@ supported_recursive_search_ids = (
)
common_errors = {
'<title>Attention Required! | Cloudflare</title>': 'Cloudflare captcha',
'Please stand by, while we are checking your browser': 'Cloudflare captcha',
'<title>Доступ ограничен</title>': 'Rostelecom censorship',
'document.getElementById(\'validate_form_submit\').disabled=true': 'Mail.ru captcha',
'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': 'Blazingfast protection',
'404</h1><p class="error-card__description">Мы&nbsp;не&nbsp;нашли страницу': 'MegaFon 404 page',
'Доступ к информационному ресурсу ограничен на основании Федерального закона': 'MGTS censorship',
'Incapsula incident ID': 'Incapsula antibot protection',
'<title>Attention Required! | Cloudflare</title>': CheckError('Captcha', 'Cloudflare'),
'Please stand by, while we are checking your browser': CheckError('Bot protection', 'Cloudflare'),
'<title>Доступ ограничен</title>': CheckError('Censorship', 'Rostelecom'),
'document.getElementById(\'validate_form_submit\').disabled=true': CheckError('Captcha', 'Mail.ru'),
'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': CheckError('Bot protection', 'Blazingfast'),
'404</h1><p class="error-card__description">Мы&nbsp;не&nbsp;нашли страницу': CheckError('Resolving', 'MegaFon 404 page'),
'Доступ к информационному ресурсу ограничен на основании Федерального закона': CheckError('Censorship', 'MGTS'),
'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
}
unsupported_characters = '#'
QueryDraft = Tuple[Callable, Any, Any]
QueriesDraft = Iterable[QueryDraft]
def create_task_func():
    """Return the callable used to schedule a coroutine as a Task.

    ``asyncio.create_task`` exists only since Python 3.7; on older
    interpreters fall back to ``create_task`` of the current event loop.
    """
    # Compare the full version tuple: the previous check
    # (`sys.version_info.minor > 6`) looked only at the minor component,
    # which would misbehave on any major-version bump (e.g. 4.0).
    if sys.version_info >= (3, 7):
        return asyncio.create_task
    # Pre-3.7: bind to the current event loop's create_task.
    loop = asyncio.get_event_loop()
    return loop.create_task
class AsyncExecutor:
    """Base class for query executors.

    Subclasses implement ``_run``; ``run`` wraps it with wall-clock timing
    and debug logging of the elapsed time.
    """

    def __init__(self, *args, **kwargs):
        # A logger is mandatory — a missing 'logger' kwarg raises KeyError,
        # matching the previous behavior.
        self.logger = kwargs['logger']
        # Initialized up front so reading it before the first run() does
        # not raise AttributeError (previously it was only created inside
        # run()).
        self.execution_time = 0.0

    async def run(self, tasks: 'QueriesDraft'):
        """Execute *tasks* via ``_run``; record and log the elapsed time."""
        start_time = time.time()
        results = await self._run(tasks)
        self.execution_time = time.time() - start_time
        self.logger.debug(f'Spent time: {self.execution_time}')
        return results

    async def _run(self, tasks: 'QueriesDraft'):
        # Default implementation: yield control to the loop once and
        # return None; subclasses override this with real scheduling.
        await asyncio.sleep(0)
class AsyncioSimpleExecutor(AsyncExecutor):
    """Simplest executor: launches every query at once via asyncio.gather.

    No progressbar; results come back in the same order as the input tasks.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    async def _run(self, tasks: 'QueriesDraft'):
        # Build every coroutine up front, then await the whole batch.
        coros = []
        for func, func_args, func_kwargs in tasks:
            coros.append(func(*func_args, **func_kwargs))
        return await asyncio.gather(*coros)
class AsyncioProgressbarExecutor(AsyncExecutor):
    """Executor with a tqdm progressbar over all queries at once.

    NOTE: results are appended in completion order, so the output order
    does not match the input order.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    async def _run(self, tasks: 'QueriesDraft'):
        coros = [func(*func_args, **func_kwargs)
                 for func, func_args, func_kwargs in tasks]
        collected = []
        # tqdm's as_completed wrapper ticks the bar as each query finishes.
        for finished in tqdm.asyncio.tqdm.as_completed(coros):
            collected.append(await finished)
        return collected
class AsyncioProgressbarSemaphoreExecutor(AsyncExecutor):
    """Executor with a tqdm progressbar and a concurrency cap.

    At most ``in_parallel`` queries run simultaneously (default 1, i.e.
    serial). Results are collected in completion order.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Only remember the limit here. Previously the asyncio.Semaphore
        # was built in __init__, outside any running loop; on Python < 3.10
        # asyncio primitives bind the current loop at construction time, so
        # the semaphore could attach to the wrong loop when run() is later
        # driven by asyncio.run().
        self._in_parallel = kwargs.get('in_parallel', 1)

    async def _run(self, tasks: 'QueriesDraft'):
        # Create the semaphore inside the running event loop.
        self.semaphore = asyncio.Semaphore(self._in_parallel)

        async def _wrap_query(q: 'QueryDraft'):
            # Hold one semaphore slot for the full duration of a query.
            async with self.semaphore:
                f, args, kwargs = q
                return await f(*args, **kwargs)

        async def semaphore_gather(tasks: 'QueriesDraft'):
            coros = [_wrap_query(q) for q in tasks]
            results = []
            for f in tqdm.asyncio.tqdm.as_completed(coros):
                results.append(await f)
            return results

        return await semaphore_gather(tasks)
class AsyncioProgressbarQueueExecutor(AsyncExecutor):
    """Bounded worker-pool executor with a progressbar.

    ``in_parallel`` workers pull queries from an asyncio.Queue; each query
    is awaited with an optional per-query ``timeout``. Results are appended
    in completion order (NOT input order); a timed-out or failed query
    contributes None.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workers_count = kwargs.get('in_parallel', 10)
        # dict.get eagerly evaluates its default, so the previous
        # `kwargs.get('progress_func', tqdm.tqdm)` required tqdm to be
        # importable even when a custom progress factory was supplied.
        if 'progress_func' in kwargs:
            self.progress_func = kwargs['progress_func']
        else:
            self.progress_func = tqdm.tqdm
        self.timeout = kwargs.get('timeout')

    async def worker(self):
        """Drain the queue until empty, recording one result per query."""
        while True:
            try:
                f, args, kwargs = self.queue.get_nowait()
            except asyncio.QueueEmpty:
                return

            try:
                try:
                    # asyncio.wait_for wraps the coroutine in a Task itself,
                    # so the previous explicit create_task step is redundant.
                    result = await asyncio.wait_for(f(*args, **kwargs),
                                                    timeout=self.timeout)
                except asyncio.TimeoutError:
                    result = None
                except Exception as e:
                    # Previously an unexpected exception killed the worker
                    # before task_done(), so queue.join() hung forever.
                    self.logger.error(f'Worker error: {e}', exc_info=True)
                    result = None
                self.results.append(result)
                self.progress.update(1)
            finally:
                # Always acknowledge the item so _run's queue.join() returns.
                self.queue.task_done()

    async def _run(self, queries: 'QueriesDraft'):
        self.results = []
        # Create the queue inside the running loop: on Python < 3.10
        # asyncio primitives bind the current loop at construction, and
        # building it in __init__ (as before) could attach it to the wrong
        # loop. The maxsize bounds in-flight backpressure on put().
        self.queue = asyncio.Queue(self.workers_count)
        queries_list = list(queries)
        min_workers = min(len(queries_list), self.workers_count)
        # ensure_future creates a Task on the running loop on every
        # supported Python version (equivalent to create_task here).
        workers = [asyncio.ensure_future(self.worker())
                   for _ in range(min_workers)]
        self.progress = self.progress_func(total=len(queries_list))
        for t in queries_list:
            await self.queue.put(t)
        await self.queue.join()
        for w in workers:
            w.cancel()
        self.progress.close()
        return self.results
async def get_response(request_future, site_name, logger):
async def get_response(request_future, site_name, logger) -> (str, int, CheckError):
html_text = None
status_code = 0
error_text = "General Unknown Error"
expection_text = None
error = CheckError('Error')
try:
response = await request_future
@@ -173,37 +59,33 @@ async def get_response(request_future, site_name, logger):
decoded_content = response_content.decode(charset, 'ignore')
html_text = decoded_content
if status_code > 0:
error_text = None
if status_code == 0:
error = CheckError('Connection lost')
else:
error = None
logger.debug(html_text)
except asyncio.TimeoutError as errt:
error_text = "Timeout Error"
expection_text = str(errt)
except aiohttp.client_exceptions.ClientConnectorError as err:
error_text = "Error Connecting"
expection_text = str(err)
except aiohttp.http_exceptions.BadHttpMessage as err:
error_text = "HTTP Error"
expection_text = str(err)
except proxy_errors.ProxyError as err:
error_text = "Proxy Error"
expection_text = str(err)
except Exception as err:
except asyncio.TimeoutError as e:
error = CheckError('Request timeout', str(e))
except aiohttp.client_exceptions.ClientConnectorError as e:
error = CheckError('Connecting failure', str(e))
except aiohttp.http_exceptions.BadHttpMessage as e:
error = CheckError('HTTP', str(e))
except proxy_errors.ProxyError as e:
error = CheckError('Proxy', str(e))
except Exception as e:
# python-specific exceptions
if sys.version_info.minor > 6:
if isinstance(err, ssl.SSLCertVerificationError) or isinstance(err, ssl.SSLError):
error_text = "SSL Error"
expection_text = str(err)
if isinstance(e, ssl.SSLCertVerificationError) or isinstance(e, ssl.SSLError):
error = CheckError('SSL', str(e))
else:
logger.warning(f'Unhandled error while requesting {site_name}: {err}')
logger.debug(err, exc_info=True)
error_text = "Some Error"
expection_text = str(err)
logger.warning(f'Unhandled error while requesting {site_name}: {e}')
logger.debug(e, exc_info=True)
error = CheckError('Error', str(e))
# TODO: return only needed information
return html_text, status_code, error_text, expection_text
return html_text, status_code, error
async def update_site_dict_from_response(sitename, site_dict, results_info, logger, query_notify):
@@ -221,24 +103,25 @@ async def update_site_dict_from_response(sitename, site_dict, results_info, logg
# TODO: move to separate class
def detect_error_page(html_text, status_code, fail_flags, ignore_403):
def detect_error_page(html_text, status_code, fail_flags, ignore_403) -> CheckError:
# Detect service restrictions such as a country restriction
for flag, msg in fail_flags.items():
if flag in html_text:
return 'Some site error', msg
return CheckError('Site-specific', msg)
# Detect common restrictions such as provider censorship and bot protection
for flag, msg in common_errors.items():
for flag, err in common_errors.items():
if flag in html_text:
return 'Error', msg
return err
# Detect common site errors
if status_code == 403 and not ignore_403:
return 'Access denied', 'Access denied, use proxy/vpn'
elif status_code >= 500:
return f'Error {status_code}', f'Site error {status_code}'
return CheckError('Access denied', '403 status code, use proxy/vpn')
return None, None
elif status_code >= 500:
return CheckError(f'Server', f'{status_code} status code')
return None
def process_site_result(response, query_notify, logger, results_info, site: MaigretSite):
@@ -261,16 +144,12 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
# Get the expected check type
check_type = site.check_type
# Get the failure messages and comments
failure_errors = site.errors
# TODO: refactor
if not response:
logger.error(f'No response for {site.name}')
return results_info
html_text, status_code, error_text, expection_text = response
site_error_text = '?'
html_text, status_code, check_error = response
# TODO: add elapsed request time counting
response_time = None
@@ -278,13 +157,13 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
if logger.level == logging.DEBUG:
with open('debug.txt', 'a') as f:
status = status_code or 'No response'
f.write(f'url: {url}\nerror: {str(error_text)}\nr: {status}\n')
f.write(f'url: {url}\nerror: {check_error}\nr: {status}\n')
if html_text:
f.write(f'code: {status}\nresponse: {str(html_text)}\n')
if status_code and not error_text:
error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors,
site.ignore403)
# additional check for errors
if status_code and not check_error:
check_error = detect_error_page(html_text, status_code, site.errors, site.ignore403)
if site.activation and html_text:
is_need_activation = any([s for s in site.activation['marks'] if s in html_text])
@@ -312,17 +191,18 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
if presense_flag in html_text:
is_presense_detected = True
site.stats['presense_flag'] = presense_flag
logger.info(presense_flag)
logger.debug(presense_flag)
break
if error_text is not None:
logger.debug(error_text)
if check_error:
logger.debug(check_error)
result = QueryResult(username,
site.name,
url,
QueryStatus.UNKNOWN,
query_time=response_time,
context=f'{error_text}: {site_error_text}', tags=fulltags)
error=check_error,
context=str(CheckError), tags=fulltags)
elif check_type == "message":
absence_flags = site.absence_strs
is_absence_flags_list = isinstance(absence_flags, list)
@@ -473,11 +353,11 @@ async def maigret(username, site_dict, logger, query_notify=None,
if logger.level == logging.DEBUG:
future = session.get(url='https://icanhazip.com')
ip, status, error, expection = await get_response(future, None, logger)
ip, status, check_error = await get_response(future, None, logger)
if ip:
logger.debug(f'My IP is: {ip.strip()}')
else:
logger.debug(f'IP requesting {error}: {expection}')
logger.debug(f'IP requesting {check_error[0]}: {check_error[1]}')
# Results from analysis of all sites
results_total = {}
@@ -594,6 +474,24 @@ async def maigret(username, site_dict, logger, query_notify=None,
await session.close()
# TODO: move to separate function
errors = {}
for el in results:
if not el:
continue
_, r = el
if r and isinstance(r, dict) and r.get('status'):
if not isinstance(r['status'], QueryResult):
continue
err = r['status'].error
if not err:
continue
errors[err.type] = errors.get(err.type, 0) + 1
for err, count in sorted(errors.items(), key=lambda x: x[1], reverse=True):
logger.warning(f'Errors of type "{err}": {count}')
# Notify caller that all queries are finished.
query_notify.finish()
+1 -1
View File
@@ -241,7 +241,7 @@ class QueryNotifyPrint(QueryNotify):
self.color,
'?', result.site_name,
Fore.RED, Fore.RED,
self.result.context + ids_data_text
str(self.result.error) + ids_data_text
)
elif result.status == QueryStatus.ILLEGAL:
if not self.print_found_only:
+7 -5
View File
@@ -294,9 +294,10 @@
],
"errors": {
"INTERNAL_SERVER_ERROR": "Site error",
"Something just went wrong": "Site error"
"Something just went wrong": "Site error",
"PersistedQueryNotFound": "Site error"
},
"urlProbe": "https://api.500px.com/graphql?operationName=ProfileRendererQuery&variables=%7B%22username%22%3A%22{username}%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22105058632482dd2786fd5775745908dc928f537b28e28356b076522757d65c19%22%7D%7D",
"urlProbe": "https://api.500px.com/graphql?operationName=ProfileRendererQuery&variables=%7B%22username%22%3A%22{username}%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22fcecc7028c308115b0defebc63acec3fe3c12df86a602c3e1785ba5cfb8fff47%22%7D%7D",
"checkType": "message",
"absenceStrs": "No message available",
"alexaRank": 3029,
@@ -1919,6 +1920,7 @@
"usernameUnclaimed": "noonewouldeverusethis7"
},
"Blu-ray": {
"disabled": true,
"tags": [
"us"
],
@@ -12149,7 +12151,7 @@
"us"
],
"headers": {
"authorization": "Bearer BQC38O_gP1qY7X8Ui7RsgyutAuZ2QissgeFgsDEX6siaE_dAFmzV0mWMSziGB_dLErQwtfJZa7qM9IsmNHI"
"authorization": "Bearer BQAjb32z4TLh0t19LDuYfk2BV3gUXCpqyUuy2gBOyJTN_2xoZlN4AW1B6ZVmdKMDcI3Hc8agrrQsKbQZE90"
},
"errors": {
"Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
@@ -13453,7 +13455,7 @@
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"x-guest-token": "1383863637358432258"
"x-guest-token": "1386060728566681601"
},
"errors": {
"Bad guest token": "x-guest-token update required"
@@ -13830,7 +13832,7 @@
"video"
],
"headers": {
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTg3NzQ2ODAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.q3cuo2lPiamc4wj5NgWOl3JPr_d33y5C06epHB92thk"
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTkzMDI0NDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.fN8PQIEkzQjfu7znGoIaLEP9Qr6bV8JbA2ZwpBSFI5E"
},
"activation": {
"url": "https://vimeo.com/_rv/viewer",
+2 -1
View File
@@ -34,7 +34,7 @@ class QueryResult():
"""
def __init__(self, username, site_name, site_url_user, status, ids_data=None,
query_time=None, context=None, tags=[]):
query_time=None, context=None, error=None, tags=[]):
"""Create Query Result Object.
Contains information about a specific method of detecting usernames on
@@ -73,6 +73,7 @@ class QueryResult():
self.context = context
self.ids_data = ids_data
self.tags = tags
self.error = error
def json(self):
return {
@@ -2,7 +2,8 @@
import pytest
import asyncio
import logging
from maigret.checking import AsyncioSimpleExecutor, AsyncioProgressbarExecutor, AsyncioProgressbarSemaphoreExecutor, AsyncioProgressbarQueueExecutor
from maigret.executors import AsyncioSimpleExecutor, AsyncioProgressbarExecutor, \
AsyncioProgressbarSemaphoreExecutor, AsyncioProgressbarQueueExecutor
logger = logging.getLogger(__name__)