Refactoring: updated data & sites storage, tests added

commit d389ba9e76
parent 9d3e2d114c
Author: Soxoj
Date: 2021-01-03 23:48:33 +03:00
7 changed files with 18572 additions and 20732 deletions
+31 -22
@@ -154,8 +154,8 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
# We have already determined the user doesn't exist here
return results_info
# Get the expected error type
error_type = site.check_type
# Get the expected check type
check_type = site.check_type
# Get the failure messages and comments
failure_errors = site.errors
@@ -196,7 +196,7 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
QueryStatus.UNKNOWN,
query_time=response_time,
context=f'{error_text}: {site_error_text}', tags=fulltags)
elif error_type == "message":
elif check_type == "message":
absence_flags = site.absence_strs
is_absence_flags_list = isinstance(absence_flags, list)
absence_flags_set = set(absence_flags) if is_absence_flags_list else {absence_flags}
@@ -214,7 +214,7 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
url,
QueryStatus.AVAILABLE,
query_time=response_time, tags=fulltags)
elif error_type == "status_code":
elif check_type == "status_code":
# Checks if the status code of the response is 2XX
if (not status_code >= 300 or status_code < 200) and is_presense_detected:
result = QueryResult(username,
@@ -228,7 +228,7 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
url,
QueryStatus.AVAILABLE,
query_time=response_time, tags=fulltags)
elif error_type == "response_url":
elif check_type == "response_url":
# For this detection method, we have turned off the redirect.
# So, there is no need to check the response URL: it will always
# match the request. Instead, we will ensure that the response
@@ -248,8 +248,8 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
query_time=response_time, tags=fulltags)
else:
# It should be impossible to ever get here...
raise ValueError(f"Unknown Error Type '{error_type}' for "
f"site '{site_name}'")
raise ValueError(f"Unknown check type '{check_type}' for "
f"site '{site.name}'")
extracted_ids_data = {}
@@ -257,7 +257,7 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
try:
extracted_ids_data = extract(html_text)
except Exception as e:
logger.warning(f'Error while parsing {site_name}: {e}', exc_info=True)
logger.warning(f'Error while parsing {site.name}: {e}', exc_info=True)
if extracted_ids_data:
new_usernames = {}
@@ -280,14 +280,14 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
results_info['http_status'] = status_code
results_info['is_similar'] = site.similar_search
# results_site['response_text'] = html_text
results_info['rank'] = site.popularity_rank
results_info['rank'] = site.alexa_rank
return results_info
async def maigret(username, site_dict, query_notify, logger,
proxy=None, timeout=None, recursive_search=False,
id_type='username', tags=None, debug=False, forced=False,
max_connections=100):
max_connections=100, no_progressbar=False):
"""Main search func
Checks for existence of username on various social media sites.
@@ -372,14 +372,16 @@ async def maigret(username, site_dict, query_notify, logger,
headers.update(site.headers)
if not 'url' in site.__dict__:
logger.error('No URL for site %s', site.name)
# URL of user on site (if it exists)
url = site.url_username_format.format(
url = site.url.format(
urlMain=site.url_main,
urlSubpath=site.url_subpath,
username=username
)
# workaround to prevent slash errors
url = url.replace('///', '/')
url = re.sub('(?<!:)/+', '/', url)
# Don't make request if username is invalid for the site
if site.regex_check and re.search(site.regex_check, username) is None:
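Side note on the new slash cleanup: the regex collapses any run of slashes that is not preceded by a colon, so the "//" of the scheme survives while accidental doubles from joining urlMain and urlSubpath are squashed. A minimal sketch of that behavior (the helper name and URLs are illustrative, not from the codebase):

    import re

    def normalize_slashes(url: str) -> str:
        # Collapse repeated '/' characters; the negative lookbehind (?<!:)
        # keeps the '//' that follows 'http:' or 'https:' intact.
        return re.sub('(?<!:)/+', '/', url)

    # illustrative URLs, not taken from the sites database
    assert normalize_slashes('https://example.com//forum///members/soxoj') \
        == 'https://example.com/forum/members/soxoj'
    assert normalize_slashes('https://example.com/users/soxoj') \
        == 'https://example.com/users/soxoj'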
@@ -462,8 +464,11 @@ async def maigret(username, site_dict, query_notify, logger,
future = asyncio.ensure_future(update_site_coro)
tasks.append(future)
for f in tqdm.asyncio.tqdm.as_completed(tasks):
await f
if no_progressbar:
await asyncio.gather(*tasks)
else:
for f in tqdm.asyncio.tqdm.as_completed(tasks):
await f
await session.close()
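The new no_progressbar flag only changes how the already-created futures are awaited. A self-contained sketch of that pattern, where the check coroutine is a stand-in for the real per-site request:

    import asyncio
    import tqdm.asyncio

    async def check(site: str) -> str:
        await asyncio.sleep(0.1)  # stand-in for an HTTP check
        return site

    async def run_all(coros, no_progressbar: bool = False):
        tasks = [asyncio.ensure_future(c) for c in coros]
        if no_progressbar:
            # Quiet mode (used by self-checks and tests): no tqdm output.
            return await asyncio.gather(*tasks)
        # Interactive mode: await tasks as they finish so tqdm can draw a bar.
        return [await f for f in tqdm.asyncio.tqdm.as_completed(tasks)]

    print(asyncio.run(run_all([check(s) for s in ('a', 'b', 'c')],
                              no_progressbar=True)))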
@@ -498,15 +503,15 @@ def timeout_check(value):
return timeout
async def site_self_check(site_name, site_data, logger):
async def site_self_check(site_name, site_data, logger, no_progressbar=False):
query_notify = Mock()
changes = {
'disabled': False,
}
check_data = [
(site_data['username_claimed'], QueryStatus.CLAIMED),
(site_data['username_unclaimed'], QueryStatus.AVAILABLE),
(site_data.username_claimed, QueryStatus.CLAIMED),
(site_data.username_unclaimed, QueryStatus.AVAILABLE),
]
logger.info(f'Checking {site_name}...')
@@ -519,29 +524,33 @@ async def site_self_check(site_name, site_data, logger):
logger,
timeout=30,
forced=True,
no_progressbar=no_progressbar,
)
# don't disable entries with other ids types
if site_name not in results:
logger.info(results)
changes['disabled'] = True
continue
site_status = results[site_name]['status'].status
if site_status != status:
if site_status == QueryStatus.UNKNOWN:
msg = site_data.get('errorMsg')
etype = site_data.get('errorType')
logger.info(f'Error while searching {username} in {site_name}: {msg}, type {etype}')
msgs = site_data.absence_strs
etype = site_data.check_type
logger.info(f'Error while searching {username} in {site_name}: {msgs}, type {etype}')
# don't disable in case of available username
if status == QueryStatus.CLAIMED:
changes['disabled'] = True
elif status == QueryStatus.CLAIMED:
logger.info(f'Not found `{username}` in {site_name}, must be claimed')
logger.info(results[site_name])
changes['disabled'] = True
else:
logger.info(f'Found `{username}` in {site_name}, must be available')
logger.info(results[site_name])
changes['disabled'] = True
logger.info(f'Site {site_name} is okay')
logger.info(f'Site {site_name} checking is finished')
return changes
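With the switch to MaigretSite objects and the new no_progressbar argument, a self-check for one entry could be driven roughly as below. The 'GitHub' key, the database path, and the module layout are assumptions, not taken from this diff:

    import asyncio
    import logging
    from maigret.maigret import site_self_check
    from maigret.sites import MaigretDatabase

    logger = logging.getLogger('maigret')
    db = MaigretDatabase().load_from_file('maigret/resources/data.json')  # assumed path
    site = db.sites_dict['GitHub']  # assumed site name

    changes = asyncio.run(
        site_self_check('GitHub', site, logger, no_progressbar=True))
    if changes['disabled']:
        logger.warning('GitHub checks look broken; the entry should be disabled')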
@@ -756,7 +765,7 @@ async def main():
logging.basicConfig(
format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
datefmt='%H:%M:%S',
level=logging.ERROR
level=log_level
)
if args.debug:
+18307 -19000
File diff suppressed because it is too large.
+125 -94
@@ -1,96 +1,118 @@
"""Maigret Sites Information"""
from __future__ import annotations
import copy
import json
import operator
import sys
import requests
from maigret.utils import CaseConverter
class MaigretEngine:
def __init__(self, name, *args, **kwargs):
def __init__(self, name, data):
self.name = name
self.__dict__.update(kwargs)
self.__dict__.update(data)
@property
def json(self):
return self.__dict__
class MaigretSite:
def __init__(self, name, url_main, url_username_format, popularity_rank,
username_claimed, username_unclaimed,
information):
"""Create Site Information Object.
Contains information about a specific web site.
Keyword Arguments:
self -- This object.
name -- String which identifies site.
url_main -- String containing URL for home of site.
url_username_format -- String containing URL for Username format
on site.
NOTE: The string should contain the
token "{}" where the username should
be substituted. For example, a string
of "https://somesite.com/users/{}"
indicates that the individual
usernames would show up under the
"https://somesite.com/users/" area of
the web site.
popularity_rank -- Integer indicating popularity of site.
In general, smaller numbers mean more
popular ("0" or None means ranking
information not available).
username_claimed -- String containing username which is known
to be claimed on web site.
username_unclaimed -- String containing username which is known
to be unclaimed on web site.
information -- Dictionary containing all known information
about web site.
NOTE: Custom information about how to
actually detect the existence of the
username will be included in this
dictionary. This information will
be needed by the detection method,
but it is only recorded in this
object for future use.
Return Value:
Nothing.
"""
def __init__(self, name, information):
self.name = name
self.url_main = url_main
self.url_username_format = url_username_format
if (popularity_rank is None) or (popularity_rank == 0):
# We do not know the popularity, so make site go to bottom of list.
popularity_rank = sys.maxsize
self.popularity_rank = popularity_rank
self.disabled = False
self.similar_search = False
self.ignore_403 = False
self.tags = []
self.username_claimed = username_claimed
self.username_unclaimed = username_unclaimed
self.information = information
self.disabled = information.get('disabled', False)
self.similar_search = information.get('similarSearch', False)
self.ignore_403 = information.get('ignore_403', False)
self.tags = information.get('tags', [])
self.type = 'username'
self.headers = {}
self.errors = {}
self.url_subpath = ''
self.regex_check = None
self.url_probe = None
self.check_type = ''
self.request_head_only = ''
self.type = information.get('type', 'username')
self.headers = information.get('headers', {})
self.errors = information.get('errors', {})
self.url_subpath = information.get('urlSubpath', '')
self.regex_check = information.get('regexCheck', None)
self.url_probe = information.get('urlProbe', None)
self.check_type = information.get('errorType', '')
self.request_head_only = information.get('request_head_only', '')
self.presense_strs = []
self.absence_strs = []
self.presense_strs = information.get('presenseStrs', [])
self.absence_strs = information.get('errorMsg', [])
self.engine = None
self.engine_data = {}
self.engine_obj = None
self.request_future = None
self.alexa_rank = None
for k, v in information.items():
self.__dict__[CaseConverter.camel_to_snake(k)] = v
if (self.alexa_rank is None) or (self.alexa_rank == 0):
# We do not know the popularity, so make site go to bottom of list.
self.alexa_rank = sys.maxsize
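In the new constructor, the raw JSON record is both read through explicit get() calls and mirrored onto snake_case attributes via CaseConverter, with unknown popularity pushed to the bottom of the ranking. A rough illustration, where the 'Example' record is invented but shaped like the entries in data.json:

    import sys
    from maigret.sites import MaigretSite

    # invented entry, shaped like the records in the sites database
    site = MaigretSite('Example', {
        'urlMain': 'https://example.com',
        'url': '{urlMain}/users/{username}',
        'errorType': 'status_code',
        'alexaRank': 0,  # popularity unknown
    })

    assert site.url_main == 'https://example.com'
    assert site.check_type == 'status_code'  # read from the errorType key
    assert site.alexa_rank == sys.maxsize    # unknown rank sinks to the bottom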
def __str__(self):
return f"{self.name} ({self.url_main})"
@property
def json(self):
result = {}
for k, v in self.__dict__.items():
# convert to camelCase
field = CaseConverter.snake_to_camel(k)
# strip empty elements
if v in (False, '', [], {}, None, sys.maxsize, 'username'):
continue
if field in ['name', 'engineData', 'requestFuture', 'detectedEngine', 'engineObj']:
continue
result[field] = v
return result
def update(self, updates: dict) -> MaigretSite:
self.__dict__.update(updates)
return self
def update_from_engine(self, engine: MaigretEngine) -> MaigretSite:
engine_data = engine.site
for k, v in engine_data.items():
field = CaseConverter.camel_to_snake(k)
if isinstance(v, dict):
# TODO: assertion of intersecting keys
# update dicts like errors
self.__dict__.get(field, {}).update(v)
else:
self.__dict__[field] = v
self.engine_obj = engine
return self
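How engine inheritance plays out: scalar fields from the engine's site block are copied onto the site, while dict fields such as errors are merged into the existing values. A minimal sketch mirroring the XenForo/Amperka fixtures used in the tests further down:

    from maigret.sites import MaigretEngine, MaigretSite

    # cut-down data, mirroring the test fixtures rather than the full database
    engine = MaigretEngine('XenForo', {
        'site': {
            'checkType': 'message',
            'errors': {'You must be logged-in to do that.': 'Login required'},
        },
    })
    site = MaigretSite('Amperka', {
        'urlMain': 'http://forum.amperka.ru',
        'errors': {'error1': 'text1'},
    })

    site.update_from_engine(engine)
    assert site.check_type == 'message'  # scalar fields are copied over
    assert len(site.errors) == 2         # dict fields are merged, not replaced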
def strip_engine_data(self) -> MaigretSite:
if not self.engine_obj:
return self
self.request_future = None
self_copy = copy.deepcopy(self)
engine_data = self_copy.engine_obj.site
for field in engine_data.keys():
if isinstance(engine_data[field], dict):
for k in engine_data[field].keys():
del self_copy.__dict__[field][k]
continue
if field in list(self_copy.__dict__.keys()):
del self_copy.__dict__[field]
if CaseConverter.camel_to_snake(field) in list(self_copy.__dict__.keys()):
del self_copy.__dict__[CaseConverter.camel_to_snake(field)]
return self_copy
class MaigretDatabase:
def __init__(self):
@@ -98,20 +120,43 @@ class MaigretDatabase:
self._engines = []
@property
def sites(self: MaigretDatabase):
def sites(self):
return self._sites
@property
def sites_dict(self):
return {site.name: site for site in self._sites}
@property
def engines(self: MaigretDatabase):
def engines(self):
return self._engines
@property
def engines_dict(self):
return {engine.name: engine for engine in self._engines}
def load_from_json(self: MaigretDatabase, json_data: dict) -> MaigretDatabase:
def update_site(self, site: MaigretSite) -> MaigretDatabase:
for s in self._sites:
if s.name == site.name:
s = site
return self
def save_to_file(self, filename: str) -> MaigretDatabase:
json_data = {
'sites': {site.name: site.strip_engine_data().json for site in self._sites},
'engines': {engine.name: engine.json for engine in self._engines},
}
json_data = json.dumps(json_data, indent=4)
with open(filename, 'w') as f:
f.write(json_data)
return self
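The new save_to_file pairs with strip_engine_data, so engine-inherited fields are not written back into every site record. A round-trip sketch (both file paths are assumptions):

    from maigret.sites import MaigretDatabase

    db = MaigretDatabase().load_from_file('maigret/resources/data.json')  # assumed path
    # Writes {'sites': {...}, 'engines': {...}} with engine-derived fields
    # stripped from each site entry before serialization.
    db.save_to_file('/tmp/data.json')  # assumed output path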
def load_from_json(self, json_data: dict) -> MaigretDatabase:
# Add all of site information from the json file to internal site list.
site_data = json_data.get("sites")
engines_data = json_data.get("engines")
@@ -121,25 +166,11 @@ class MaigretDatabase:
for site_name in site_data:
try:
site = {}
site_user_info = site_data[site_name]
# If popularity unknown, make site be at bottom of list.
popularity_rank = site_user_info.get("rank", sys.maxsize)
maigret_site = MaigretSite(site_name, site_data[site_name])
if 'engine' in site_user_info:
engine_info = engines_data[site_user_info['engine']]['site']
site.update(engine_info)
site.update(site_user_info)
maigret_site = MaigretSite(site_name,
site["urlMain"],
site["url"],
popularity_rank,
site["username_claimed"],
site["username_unclaimed"],
site
)
engine = site_data[site_name].get('engine')
if engine:
maigret_site.update_from_engine(self.engines_dict[engine])
self._sites.append(maigret_site)
except KeyError as error:
@@ -150,7 +181,7 @@ class MaigretDatabase:
return self
def load_from_str(self: MaigretDatabase, db_str: str) -> MaigretDatabase:
def load_from_str(self, db_str: str) -> MaigretDatabase:
try:
data = json.loads(db_str)
except Exception as error:
@@ -161,7 +192,7 @@ class MaigretDatabase:
return self.load_from_json(data)
def load_from_url(self: MaigretDatabase, url: str) -> MaigretDatabase:
def load_from_url(self, url: str) -> MaigretDatabase:
is_url_valid = url.startswith('http://') or url.startswith('https://')
if not is_url_valid:
@@ -190,7 +221,7 @@ class MaigretDatabase:
return self.load_from_json(data)
def load_from_file(self: MaigretDatabase, filename: str) -> MaigretDatabase:
def load_from_file(self, filename: str) -> MaigretDatabase:
try:
with open(filename, 'r', encoding='utf-8') as file:
try:
@@ -207,7 +238,7 @@ class MaigretDatabase:
return self.load_from_json(data)
def site_name_list(self: MaigretDatabase, popularity_rank=False):
def site_name_list(self, popularity_rank=False):
"""Get Site Name List.
Keyword Arguments:
+13
@@ -0,0 +1,13 @@
import re
class CaseConverter:
@staticmethod
def camel_to_snake(camelcased_string: str):
return re.sub(r'(?<!^)(?=[A-Z])', '_', camelcased_string).lower()
@staticmethod
def snake_to_camel(snakecased_string: str):
formatted = ''.join(word.title() for word in snakecased_string.split('_'))
result = formatted[0].lower() + formatted[1:]
return result
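These helpers back the camelCase-JSON to snake_case-attribute mapping used in sites.py; a quick sanity check with field names from the database schema:

    from maigret.utils import CaseConverter

    assert CaseConverter.camel_to_snake('urlMain') == 'url_main'
    assert CaseConverter.camel_to_snake('presenseStrs') == 'presense_strs'
    assert CaseConverter.snake_to_camel('check_type') == 'checkType'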
-1587
File diff suppressed because it is too large.
+81 -29
@@ -2,6 +2,37 @@
from maigret.sites import MaigretDatabase
EXAMPLE_DB = {
'engines': {
"XenForo": {
"presenseStrs": ["XenForo"],
"site": {
"absenceStrs": [
"The specified member cannot be found. Please enter a member's entire name.",
],
"checkType": "message",
"errors": {
"You must be logged-in to do that.": "Login required"
},
"url": "{urlMain}{urlSubpath}/members/?username={username}"
}
},
},
'sites': {
"Amperka": {
"engine": "XenForo",
"rank": 121613,
"tags": [
"ru"
],
"urlMain": "http://forum.amperka.ru",
"usernameClaimed": "adam",
"usernameUnclaimed": "noonewouldeverusethis7"
},
}
}
def test_load_empty_db_from_str():
db = MaigretDatabase()
db.load_from_str('{"engines": {}, "sites": {}}')
@@ -12,38 +43,59 @@ def test_load_empty_db_from_str():
def test_load_valid_db():
db = MaigretDatabase()
db.load_from_json({
'engines': {
"XenForo": {
"presenseStrs": ["XenForo"],
"site": {
"errorMsg": [
"The specified member cannot be found. Please enter a member's entire name.",
],
"errorType": "message",
"errors": {
"You must be logged-in to do that.": "Login required"
},
"url": "{urlMain}{urlSubpath}/members/?username={username}"
}
},
},
'sites': {
"Amperka": {
"engine": "XenForo",
"rank": 121613,
"tags": [
"ru"
],
"urlMain": "http://forum.amperka.ru",
"username_claimed": "adam",
"username_unclaimed": "noonewouldeverusethis7"
},
}
})
db.load_from_json(EXAMPLE_DB)
assert len(db.sites) == 1
assert len(db.engines) == 1
assert db.sites[0].name == 'Amperka'
assert db.engines[0].name == 'XenForo'
def test_site_json_dump():
db = MaigretDatabase()
db.load_from_json(EXAMPLE_DB)
init_keys = EXAMPLE_DB['sites']['Amperka'].keys()
# contains engine data
obj_keys = db.sites[0].json.keys()
assert set(init_keys).issubset(set(obj_keys))
def test_site_correct_initialization():
db = MaigretDatabase()
db.load_from_json(EXAMPLE_DB)
xenforo = db.engines[0]
assert xenforo.name == 'XenForo'
assert xenforo.site['checkType'] == 'message'
amperka = db.sites[0]
assert amperka.name == 'Amperka'
assert amperka.check_type == 'message'
def test_site_strip_engine_data():
db = MaigretDatabase()
db.load_from_json(EXAMPLE_DB)
amperka = db.sites[0]
amperka_stripped = amperka.strip_engine_data()
assert amperka_stripped.json == EXAMPLE_DB['sites']['Amperka']
def test_saving_site_error():
db = MaigretDatabase()
DB = dict(EXAMPLE_DB)
DB['sites']['Amperka']['errors'] = {'error1': 'text1'}
db.load_from_json(DB)
amperka = db.sites[0]
assert len(amperka.errors) == 2
assert amperka.strip_engine_data().errors == {'error1': 'text1'}
assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
+15
@@ -0,0 +1,15 @@
"""Maigret utils test functions"""
from maigret.utils import CaseConverter
def test_case_convert_camel_to_snake():
a = 'SnakeCasedString'
b = CaseConverter.camel_to_snake(a)
assert b == 'snake_cased_string'
def test_case_convert_snake_to_camel():
a = 'camel_cased_string'
b = CaseConverter.snake_to_camel(a)
assert b == 'camelCasedString'