Merge pull request #113 from soxoj/errors-stats

Errors stats MVP, some false-positive fixes
This commit is contained in:
soxoj
2021-04-25 01:13:39 +03:00
committed by GitHub
7 changed files with 230 additions and 181 deletions
+71 -173
View File
@@ -6,7 +6,6 @@ import ssl
import sys import sys
import tqdm import tqdm
import time import time
from typing import Callable, Any, Iterable, Tuple
import aiohttp import aiohttp
import tqdm.asyncio import tqdm.asyncio
@@ -16,8 +15,11 @@ from python_socks import _errors as proxy_errors
from socid_extractor import extract from socid_extractor import extract
from .activation import ParsingActivator, import_aiohttp_cookies from .activation import ParsingActivator, import_aiohttp_cookies
from .executors import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor
from .result import QueryResult, QueryStatus from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite from .sites import MaigretDatabase, MaigretSite
from .types import CheckError
supported_recursive_search_ids = ( supported_recursive_search_ids = (
'yandex_public_id', 'yandex_public_id',
@@ -30,139 +32,23 @@ supported_recursive_search_ids = (
) )
common_errors = { common_errors = {
'<title>Attention Required! | Cloudflare</title>': 'Cloudflare captcha', '<title>Attention Required! | Cloudflare</title>': CheckError('Captcha', 'Cloudflare'),
'Please stand by, while we are checking your browser': 'Cloudflare captcha', 'Please stand by, while we are checking your browser': CheckError('Bot protection', 'Cloudflare'),
'<title>Доступ ограничен</title>': 'Rostelecom censorship', '<title>Доступ ограничен</title>': CheckError('Censorship', 'Rostelecom'),
'document.getElementById(\'validate_form_submit\').disabled=true': 'Mail.ru captcha', 'document.getElementById(\'validate_form_submit\').disabled=true': CheckError('Captcha', 'Mail.ru'),
'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': 'Blazingfast protection', 'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': CheckError('Bot protection', 'Blazingfast'),
'404</h1><p class="error-card__description">Мы&nbsp;не&nbsp;нашли страницу': 'MegaFon 404 page', '404</h1><p class="error-card__description">Мы&nbsp;не&nbsp;нашли страницу': CheckError('Resolving', 'MegaFon 404 page'),
'Доступ к информационному ресурсу ограничен на основании Федерального закона': 'MGTS censorship', 'Доступ к информационному ресурсу ограничен на основании Федерального закона': CheckError('Censorship', 'MGTS'),
'Incapsula incident ID': 'Incapsula antibot protection', 'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
} }
unsupported_characters = '#' unsupported_characters = '#'
QueryDraft = Tuple[Callable, Any, Any]
QueriesDraft = Iterable[QueryDraft]
async def get_response(request_future, site_name, logger) -> (str, int, CheckError):
def create_task_func():
    """Return the task-creation callable appropriate for this interpreter.

    ``asyncio.create_task`` exists only since Python 3.7; on older
    interpreters fall back to the event loop's ``create_task``.
    """
    # Compare the full version tuple rather than only the minor component:
    # `sys.version_info.minor > 6` would misbehave on any major-version bump.
    if sys.version_info >= (3, 7):
        return asyncio.create_task
    return asyncio.get_event_loop().create_task
class AsyncExecutor:
    """Base class for query executors.

    Times a batch of queries and delegates the actual scheduling
    strategy to ``_run``, which subclasses override.
    """

    def __init__(self, *args, **kwargs):
        # A logger is mandatory; raises KeyError if not supplied.
        self.logger = kwargs['logger']

    async def run(self, tasks: QueriesDraft):
        """Execute *tasks*, record elapsed wall-clock time, return results."""
        started = time.time()
        outcome = await self._run(tasks)
        self.execution_time = time.time() - started
        self.logger.debug(f'Spent time: {self.execution_time}')
        return outcome

    async def _run(self, tasks: QueriesDraft):
        # Default implementation does no work; just yields control once.
        await asyncio.sleep(0)
class AsyncioSimpleExecutor(AsyncExecutor):
    """Runs every query concurrently via ``asyncio.gather``, no progressbar."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    async def _run(self, tasks: QueriesDraft):
        # Each task is a (callable, args, kwargs) triple.
        coroutines = [func(*f_args, **f_kwargs) for func, f_args, f_kwargs in tasks]
        return await asyncio.gather(*coroutines)
class AsyncioProgressbarExecutor(AsyncExecutor):
    """Runs all queries concurrently while showing a tqdm progressbar.

    NOTE(review): ``as_completed`` yields results in completion order,
    not submission order — presumably callers do not rely on ordering;
    confirm before depending on it.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    async def _run(self, tasks: QueriesDraft):
        pending = [func(*f_args, **f_kwargs) for func, f_args, f_kwargs in tasks]
        collected = []
        for finished in tqdm.asyncio.tqdm.as_completed(pending):
            collected.append(await finished)
        return collected
class AsyncioProgressbarSemaphoreExecutor(AsyncExecutor):
    """Caps concurrency with a semaphore and shows a tqdm progressbar."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 'in_parallel' caps how many queries may run at the same time.
        self.semaphore = asyncio.Semaphore(kwargs.get('in_parallel', 1))

    async def _run(self, tasks: QueriesDraft):
        async def _guarded(query: QueryDraft):
            # Hold a semaphore slot for the duration of one query.
            async with self.semaphore:
                func, f_args, f_kwargs = query
                return await func(*f_args, **f_kwargs)

        async def semaphore_gather(tasks: QueriesDraft):
            wrapped = [_guarded(query) for query in tasks]
            collected = []
            for finished in tqdm.asyncio.tqdm.as_completed(wrapped):
                collected.append(await finished)
            return collected

        return await semaphore_gather(tasks)
class AsyncioProgressbarQueueExecutor(AsyncExecutor):
    """Fixed pool of worker coroutines consuming queries from a bounded
    queue, updating a progressbar as each query completes.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workers_count = kwargs.get('in_parallel', 10)
        self.progress_func = kwargs.get('progress_func', tqdm.tqdm)
        # Bounded queue gives backpressure while _run enqueues tasks.
        self.queue = asyncio.Queue(self.workers_count)
        self.timeout = kwargs.get('timeout')

    async def worker(self):
        """Consume (callable, args, kwargs) items until the queue is empty."""
        while True:
            try:
                f, args, kwargs = self.queue.get_nowait()
            except asyncio.QueueEmpty:
                return
            try:
                query_future = f(*args, **kwargs)
                query_task = create_task_func()(query_future)
                try:
                    result = await asyncio.wait_for(query_task, timeout=self.timeout)
                except asyncio.TimeoutError:
                    result = None
                except Exception as exc:
                    # A crashed query must not kill the worker: otherwise the
                    # remaining queue items would never be consumed and
                    # queue.join() in _run() would hang forever.
                    self.logger.warning(f'Unhandled error inside worker: {exc}')
                    result = None
                self.results.append(result)
                self.progress.update(1)
            finally:
                # Always mark the item done so queue.join() can complete.
                self.queue.task_done()

    async def _run(self, queries: QueriesDraft):
        self.results = []

        queries_list = list(queries)

        # Never spawn more workers than there are queries.
        min_workers = min(len(queries_list), self.workers_count)

        workers = [create_task_func()(self.worker())
                   for _ in range(min_workers)]

        self.progress = self.progress_func(total=len(queries_list))
        for t in queries_list:
            await self.queue.put(t)
        await self.queue.join()
        for w in workers:
            w.cancel()
        self.progress.close()
        return self.results
async def get_response(request_future, site_name, logger):
html_text = None html_text = None
status_code = 0 status_code = 0
error = CheckError('Error')
error_text = "General Unknown Error"
expection_text = None
try: try:
response = await request_future response = await request_future
@@ -173,37 +59,33 @@ async def get_response(request_future, site_name, logger):
decoded_content = response_content.decode(charset, 'ignore') decoded_content = response_content.decode(charset, 'ignore')
html_text = decoded_content html_text = decoded_content
if status_code > 0: if status_code == 0:
error_text = None error = CheckError('Connection lost')
else:
error = None
logger.debug(html_text) logger.debug(html_text)
except asyncio.TimeoutError as errt: except asyncio.TimeoutError as e:
error_text = "Timeout Error" error = CheckError('Request timeout', str(e))
expection_text = str(errt) except aiohttp.client_exceptions.ClientConnectorError as e:
except aiohttp.client_exceptions.ClientConnectorError as err: error = CheckError('Connecting failure', str(e))
error_text = "Error Connecting" except aiohttp.http_exceptions.BadHttpMessage as e:
expection_text = str(err) error = CheckError('HTTP', str(e))
except aiohttp.http_exceptions.BadHttpMessage as err: except proxy_errors.ProxyError as e:
error_text = "HTTP Error" error = CheckError('Proxy', str(e))
expection_text = str(err) except Exception as e:
except proxy_errors.ProxyError as err:
error_text = "Proxy Error"
expection_text = str(err)
except Exception as err:
# python-specific exceptions # python-specific exceptions
if sys.version_info.minor > 6: if sys.version_info.minor > 6:
if isinstance(err, ssl.SSLCertVerificationError) or isinstance(err, ssl.SSLError): if isinstance(e, ssl.SSLCertVerificationError) or isinstance(e, ssl.SSLError):
error_text = "SSL Error" error = CheckError('SSL', str(e))
expection_text = str(err)
else: else:
logger.warning(f'Unhandled error while requesting {site_name}: {err}') logger.warning(f'Unhandled error while requesting {site_name}: {e}')
logger.debug(err, exc_info=True) logger.debug(e, exc_info=True)
error_text = "Some Error" error = CheckError('Error', str(e))
expection_text = str(err)
# TODO: return only needed information # TODO: return only needed information
return html_text, status_code, error_text, expection_text return html_text, status_code, error
async def update_site_dict_from_response(sitename, site_dict, results_info, logger, query_notify): async def update_site_dict_from_response(sitename, site_dict, results_info, logger, query_notify):
@@ -221,24 +103,25 @@ async def update_site_dict_from_response(sitename, site_dict, results_info, logg
# TODO: move to separate class # TODO: move to separate class
def detect_error_page(html_text, status_code, fail_flags, ignore_403): def detect_error_page(html_text, status_code, fail_flags, ignore_403) -> CheckError:
# Detect service restrictions such as a country restriction # Detect service restrictions such as a country restriction
for flag, msg in fail_flags.items(): for flag, msg in fail_flags.items():
if flag in html_text: if flag in html_text:
return 'Some site error', msg return CheckError('Site-specific', msg)
# Detect common restrictions such as provider censorship and bot protection # Detect common restrictions such as provider censorship and bot protection
for flag, msg in common_errors.items(): for flag, err in common_errors.items():
if flag in html_text: if flag in html_text:
return 'Error', msg return err
# Detect common site errors # Detect common site errors
if status_code == 403 and not ignore_403: if status_code == 403 and not ignore_403:
return 'Access denied', 'Access denied, use proxy/vpn' return CheckError('Access denied', '403 status code, use proxy/vpn')
elif status_code >= 500:
return f'Error {status_code}', f'Site error {status_code}'
return None, None elif status_code >= 500:
return CheckError(f'Server', f'{status_code} status code')
return None
def process_site_result(response, query_notify, logger, results_info, site: MaigretSite): def process_site_result(response, query_notify, logger, results_info, site: MaigretSite):
@@ -261,16 +144,12 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
# Get the expected check type # Get the expected check type
check_type = site.check_type check_type = site.check_type
# Get the failure messages and comments
failure_errors = site.errors
# TODO: refactor # TODO: refactor
if not response: if not response:
logger.error(f'No response for {site.name}') logger.error(f'No response for {site.name}')
return results_info return results_info
html_text, status_code, error_text, expection_text = response html_text, status_code, check_error = response
site_error_text = '?'
# TODO: add elapsed request time counting # TODO: add elapsed request time counting
response_time = None response_time = None
@@ -278,13 +157,13 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
if logger.level == logging.DEBUG: if logger.level == logging.DEBUG:
with open('debug.txt', 'a') as f: with open('debug.txt', 'a') as f:
status = status_code or 'No response' status = status_code or 'No response'
f.write(f'url: {url}\nerror: {str(error_text)}\nr: {status}\n') f.write(f'url: {url}\nerror: {check_error}\nr: {status}\n')
if html_text: if html_text:
f.write(f'code: {status}\nresponse: {str(html_text)}\n') f.write(f'code: {status}\nresponse: {str(html_text)}\n')
if status_code and not error_text: # additional check for errors
error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors, if status_code and not check_error:
site.ignore403) check_error = detect_error_page(html_text, status_code, site.errors, site.ignore403)
if site.activation and html_text: if site.activation and html_text:
is_need_activation = any([s for s in site.activation['marks'] if s in html_text]) is_need_activation = any([s for s in site.activation['marks'] if s in html_text])
@@ -312,17 +191,18 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
if presense_flag in html_text: if presense_flag in html_text:
is_presense_detected = True is_presense_detected = True
site.stats['presense_flag'] = presense_flag site.stats['presense_flag'] = presense_flag
logger.info(presense_flag) logger.debug(presense_flag)
break break
if error_text is not None: if check_error:
logger.debug(error_text) logger.debug(check_error)
result = QueryResult(username, result = QueryResult(username,
site.name, site.name,
url, url,
QueryStatus.UNKNOWN, QueryStatus.UNKNOWN,
query_time=response_time, query_time=response_time,
context=f'{error_text}: {site_error_text}', tags=fulltags) error=check_error,
context=str(CheckError), tags=fulltags)
elif check_type == "message": elif check_type == "message":
absence_flags = site.absence_strs absence_flags = site.absence_strs
is_absence_flags_list = isinstance(absence_flags, list) is_absence_flags_list = isinstance(absence_flags, list)
@@ -473,11 +353,11 @@ async def maigret(username, site_dict, logger, query_notify=None,
if logger.level == logging.DEBUG: if logger.level == logging.DEBUG:
future = session.get(url='https://icanhazip.com') future = session.get(url='https://icanhazip.com')
ip, status, error, expection = await get_response(future, None, logger) ip, status, check_error = await get_response(future, None, logger)
if ip: if ip:
logger.debug(f'My IP is: {ip.strip()}') logger.debug(f'My IP is: {ip.strip()}')
else: else:
logger.debug(f'IP requesting {error}: {expection}') logger.debug(f'IP requesting {check_error[0]}: {check_error[1]}')
# Results from analysis of all sites # Results from analysis of all sites
results_total = {} results_total = {}
@@ -594,6 +474,24 @@ async def maigret(username, site_dict, logger, query_notify=None,
await session.close() await session.close()
# TODO: move to separate function
errors = {}
for el in results:
if not el:
continue
_, r = el
if r and isinstance(r, dict) and r.get('status'):
if not isinstance(r['status'], QueryResult):
continue
err = r['status'].error
if not err:
continue
errors[err.type] = errors.get(err.type, 0) + 1
for err, count in sorted(errors.items(), key=lambda x: x[1], reverse=True):
logger.warning(f'Errors of type "{err}": {count}')
# Notify caller that all queries are finished. # Notify caller that all queries are finished.
query_notify.finish() query_notify.finish()
+119
View File
@@ -0,0 +1,119 @@
import asyncio
import time
import tqdm
import sys
from typing import Iterable
from .types import QueryDraft
def create_task_func():
    """Pick the right task-creation callable for the running interpreter.

    ``asyncio.create_task`` was added in Python 3.7; older versions must
    use ``loop.create_task`` instead.
    """
    # Full-tuple comparison is safer than inspecting only the minor
    # version number (which ignores the major version entirely).
    if sys.version_info >= (3, 7):
        create_asyncio_task = asyncio.create_task
    else:
        loop = asyncio.get_event_loop()
        create_asyncio_task = loop.create_task
    return create_asyncio_task
class AsyncExecutor:
    """Common executor skeleton: measures execution time of a task batch
    while subclasses provide the scheduling policy via ``_run``."""

    def __init__(self, *args, **kwargs):
        # Logger must be passed by keyword; absence is a programming error.
        self.logger = kwargs['logger']

    async def run(self, tasks: Iterable[QueryDraft]):
        """Run *tasks* through ``_run`` and record wall-clock duration."""
        begin = time.time()
        gathered = await self._run(tasks)
        self.execution_time = time.time() - begin
        self.logger.debug(f'Spent time: {self.execution_time}')
        return gathered

    async def _run(self, tasks: Iterable[QueryDraft]):
        # No-op default: yield to the event loop exactly once.
        await asyncio.sleep(0)
class AsyncioSimpleExecutor(AsyncExecutor):
    """Simplest strategy: launch everything at once with asyncio.gather."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    async def _run(self, tasks: Iterable[QueryDraft]):
        # Build one coroutine per (callable, args, kwargs) triple.
        pending = [fn(*fn_args, **fn_kwargs) for fn, fn_args, fn_kwargs in tasks]
        return await asyncio.gather(*pending)
class AsyncioProgressbarExecutor(AsyncExecutor):
    """All-at-once execution with a tqdm progressbar.

    NOTE(review): results are appended in completion order, which may
    differ from submission order — confirm callers tolerate this.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    async def _run(self, tasks: Iterable[QueryDraft]):
        launched = [fn(*fn_args, **fn_kwargs) for fn, fn_args, fn_kwargs in tasks]
        outcome = []
        for done in tqdm.asyncio.tqdm.as_completed(launched):
            outcome.append(await done)
        return outcome
class AsyncioProgressbarSemaphoreExecutor(AsyncExecutor):
    """Semaphore-bounded concurrency plus a tqdm progressbar."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # 'in_parallel' is the maximum number of simultaneous queries.
        self.semaphore = asyncio.Semaphore(kwargs.get('in_parallel', 1))

    async def _run(self, tasks: Iterable[QueryDraft]):
        async def _wrap_query(q: QueryDraft):
            # Each query holds one semaphore slot while it runs.
            async with self.semaphore:
                fn, fn_args, fn_kwargs = q
                return await fn(*fn_args, **fn_kwargs)

        async def semaphore_gather(tasks: Iterable[QueryDraft]):
            wrapped = [_wrap_query(q) for q in tasks]
            outcome = []
            for done in tqdm.asyncio.tqdm.as_completed(wrapped):
                outcome.append(await done)
            return outcome

        return await semaphore_gather(tasks)
class AsyncioProgressbarQueueExecutor(AsyncExecutor):
    """Worker-pool strategy: a fixed number of worker coroutines pull
    queries from a bounded queue and report progress via a progressbar.
    """
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workers_count = kwargs.get('in_parallel', 10)
        self.progress_func = kwargs.get('progress_func', tqdm.tqdm)
        # Bounded to workers_count so enqueueing in _run applies backpressure.
        self.queue = asyncio.Queue(self.workers_count)
        self.timeout = kwargs.get('timeout')

    async def worker(self):
        """Drain the queue; each item is a (callable, args, kwargs) triple."""
        while True:
            try:
                f, args, kwargs = self.queue.get_nowait()
            except asyncio.QueueEmpty:
                return
            try:
                query_future = f(*args, **kwargs)
                query_task = create_task_func()(query_future)
                try:
                    result = await asyncio.wait_for(query_task, timeout=self.timeout)
                except asyncio.TimeoutError:
                    result = None
                except Exception as exc:
                    # Swallow unexpected query failures: a dead worker would
                    # leave items unprocessed and queue.join() would hang.
                    self.logger.warning(f'Unhandled error inside worker: {exc}')
                    result = None
                self.results.append(result)
                self.progress.update(1)
            finally:
                # task_done() must run even on failure so join() can finish.
                self.queue.task_done()

    async def _run(self, queries: Iterable[QueryDraft]):
        self.results = []

        queries_list = list(queries)

        # Don't start more workers than there are queries.
        min_workers = min(len(queries_list), self.workers_count)

        workers = [create_task_func()(self.worker())
                   for _ in range(min_workers)]

        self.progress = self.progress_func(total=len(queries_list))
        for t in queries_list:
            await self.queue.put(t)
        await self.queue.join()
        for w in workers:
            w.cancel()
        self.progress.close()
        return self.results
+1 -1
View File
@@ -241,7 +241,7 @@ class QueryNotifyPrint(QueryNotify):
self.color, self.color,
'?', result.site_name, '?', result.site_name,
Fore.RED, Fore.RED, Fore.RED, Fore.RED,
self.result.context + ids_data_text str(self.result.error) + ids_data_text
) )
elif result.status == QueryStatus.ILLEGAL: elif result.status == QueryStatus.ILLEGAL:
if not self.print_found_only: if not self.print_found_only:
+7 -5
View File
@@ -294,9 +294,10 @@
], ],
"errors": { "errors": {
"INTERNAL_SERVER_ERROR": "Site error", "INTERNAL_SERVER_ERROR": "Site error",
"Something just went wrong": "Site error" "Something just went wrong": "Site error",
"PersistedQueryNotFound": "Site error"
}, },
"urlProbe": "https://api.500px.com/graphql?operationName=ProfileRendererQuery&variables=%7B%22username%22%3A%22{username}%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22105058632482dd2786fd5775745908dc928f537b28e28356b076522757d65c19%22%7D%7D", "urlProbe": "https://api.500px.com/graphql?operationName=ProfileRendererQuery&variables=%7B%22username%22%3A%22{username}%22%7D&extensions=%7B%22persistedQuery%22%3A%7B%22version%22%3A1%2C%22sha256Hash%22%3A%22fcecc7028c308115b0defebc63acec3fe3c12df86a602c3e1785ba5cfb8fff47%22%7D%7D",
"checkType": "message", "checkType": "message",
"absenceStrs": "No message available", "absenceStrs": "No message available",
"alexaRank": 3029, "alexaRank": 3029,
@@ -1919,6 +1920,7 @@
"usernameUnclaimed": "noonewouldeverusethis7" "usernameUnclaimed": "noonewouldeverusethis7"
}, },
"Blu-ray": { "Blu-ray": {
"disabled": true,
"tags": [ "tags": [
"us" "us"
], ],
@@ -12149,7 +12151,7 @@
"us" "us"
], ],
"headers": { "headers": {
"authorization": "Bearer BQC38O_gP1qY7X8Ui7RsgyutAuZ2QissgeFgsDEX6siaE_dAFmzV0mWMSziGB_dLErQwtfJZa7qM9IsmNHI" "authorization": "Bearer BQAjb32z4TLh0t19LDuYfk2BV3gUXCpqyUuy2gBOyJTN_2xoZlN4AW1B6ZVmdKMDcI3Hc8agrrQsKbQZE90"
}, },
"errors": { "errors": {
"Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn" "Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
@@ -13453,7 +13455,7 @@
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"", "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA", "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36", "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"x-guest-token": "1383863637358432258" "x-guest-token": "1386060728566681601"
}, },
"errors": { "errors": {
"Bad guest token": "x-guest-token update required" "Bad guest token": "x-guest-token update required"
@@ -13830,7 +13832,7 @@
"video" "video"
], ],
"headers": { "headers": {
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTg3NzQ2ODAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.q3cuo2lPiamc4wj5NgWOl3JPr_d33y5C06epHB92thk" "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTkzMDI0NDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.fN8PQIEkzQjfu7znGoIaLEP9Qr6bV8JbA2ZwpBSFI5E"
}, },
"activation": { "activation": {
"url": "https://vimeo.com/_rv/viewer", "url": "https://vimeo.com/_rv/viewer",
+2 -1
View File
@@ -34,7 +34,7 @@ class QueryResult():
""" """
def __init__(self, username, site_name, site_url_user, status, ids_data=None, def __init__(self, username, site_name, site_url_user, status, ids_data=None,
query_time=None, context=None, tags=[]): query_time=None, context=None, error=None, tags=[]):
"""Create Query Result Object. """Create Query Result Object.
Contains information about a specific method of detecting usernames on Contains information about a specific method of detecting usernames on
@@ -73,6 +73,7 @@ class QueryResult():
self.context = context self.context = context
self.ids_data = ids_data self.ids_data = ids_data
self.tags = tags self.tags = tags
self.error = error
def json(self): def json(self):
return { return {
+28
View File
@@ -0,0 +1,28 @@
from typing import Callable, Any, Tuple
# search query
QueryDraft = Tuple[Callable, Any, Any]
# error got as a result of completed search query
class CheckError:
    """Error detected as the result of a completed search query.

    Immutable-by-convention value object with a broad error ``type``
    (e.g. 'Captcha', 'Censorship') and an optional ``desc`` carrying
    details (e.g. 'Cloudflare').
    """
    _type = 'Unknown'
    _desc = ''

    def __init__(self, typename, desc=''):
        self._type = typename
        self._desc = desc

    def __str__(self):
        if not self._desc:
            return f'{self._type} error'
        return f'{self._type} error: {self._desc}'

    def __repr__(self):
        # Unambiguous form for logs and debugging.
        return f'CheckError(type={self._type!r}, desc={self._desc!r})'

    def __eq__(self, other):
        if not isinstance(other, CheckError):
            return NotImplemented
        return (self._type, self._desc) == (other._type, other._desc)

    def __hash__(self):
        # Defined together with __eq__ so errors can key dicts/sets
        # (useful for error-statistics aggregation).
        return hash((self._type, self._desc))

    @property
    def type(self):
        return self._type

    @property
    def desc(self):
        return self._desc
@@ -2,7 +2,8 @@
import pytest import pytest
import asyncio import asyncio
import logging import logging
from maigret.checking import AsyncioSimpleExecutor, AsyncioProgressbarExecutor, AsyncioProgressbarSemaphoreExecutor, AsyncioProgressbarQueueExecutor from maigret.executors import AsyncioSimpleExecutor, AsyncioProgressbarExecutor, \
AsyncioProgressbarSemaphoreExecutor, AsyncioProgressbarQueueExecutor
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)