Merge pull request #114 from soxoj/new-sites-source-feature

Added some new sites and introduced 'source' feature
This commit is contained in:
soxoj
2021-04-29 15:17:13 +03:00
committed by GitHub
4 changed files with 106 additions and 13 deletions
+8 -7
View File
@@ -178,6 +178,7 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
except Exception as e:
logger.warning(f'Failed activation {method} for site {site.name}: {e}')
site_name = site.pretty_name
# presence flags
# True by default
presense_flags = site.presense_strs
@@ -197,7 +198,7 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
if check_error:
logger.debug(check_error)
result = QueryResult(username,
site.name,
site_name,
url,
QueryStatus.UNKNOWN,
query_time=response_time,
@@ -211,13 +212,13 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
is_absence_detected = any([(absence_flag in html_text) for absence_flag in absence_flags_set])
if not is_absence_detected and is_presense_detected:
result = QueryResult(username,
site.name,
site_name,
url,
QueryStatus.CLAIMED,
query_time=response_time, tags=fulltags)
else:
result = QueryResult(username,
site.name,
site_name,
url,
QueryStatus.AVAILABLE,
query_time=response_time, tags=fulltags)
@@ -225,13 +226,13 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
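# Intended to check that the status code of the response is 2XX. NOTE(review): `not` applies only to `status_code >= 300`, so the condition below is true for any status code below 300 (1XX included).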
if (not status_code >= 300 or status_code < 200) and is_presense_detected:
result = QueryResult(username,
site.name,
site_name,
url,
QueryStatus.CLAIMED,
query_time=response_time, tags=fulltags)
else:
result = QueryResult(username,
site.name,
site_name,
url,
QueryStatus.AVAILABLE,
query_time=response_time, tags=fulltags)
@@ -243,13 +244,13 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
# forward to some odd redirect).
if 200 <= status_code < 300 and is_presense_detected:
result = QueryResult(username,
site.name,
site_name,
url,
QueryStatus.CLAIMED,
query_time=response_time, tags=fulltags)
else:
result = QueryResult(username,
site.name,
site_name,
url,
QueryStatus.AVAILABLE,
query_time=response_time, tags=fulltags)
+78 -3
View File
@@ -9835,6 +9835,7 @@
"<title>Error 404</title>"
],
"alexaRank": 2076,
"source": "Instagram",
"url": "https://www.picuki.com/profile/{username}",
"urlMain": "https://www.picuki.com/",
"usernameClaimed": "adam",
@@ -12151,7 +12152,7 @@
"us"
],
"headers": {
"authorization": "Bearer BQAjb32z4TLh0t19LDuYfk2BV3gUXCpqyUuy2gBOyJTN_2xoZlN4AW1B6ZVmdKMDcI3Hc8agrrQsKbQZE90"
"authorization": "Bearer BQAEeuyBT6S535Anlx4wU-pfPjjgiE8r2e7j0eOSnwZjSvjFvQgDzxwV__03-WNbwxPKyGehoJ5pQCBwUqs"
},
"errors": {
"Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
@@ -13455,7 +13456,7 @@
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"x-guest-token": "1386060728566681601"
"x-guest-token": "1387733472027070474"
},
"errors": {
"Bad guest token": "x-guest-token update required"
@@ -13832,7 +13833,7 @@
"video"
],
"headers": {
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTkzMDI0NDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.fN8PQIEkzQjfu7znGoIaLEP9Qr6bV8JbA2ZwpBSFI5E"
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MTk2OTczNjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.yLRq0lhenTYfe0EKKJsk5HZJZt3ykUVNBGuiMCC5HR4"
},
"activation": {
"url": "https://vimeo.com/_rv/viewer",
@@ -23602,6 +23603,80 @@
"urlMain": "https://tapd.co",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"wblitz.net": {
"checkType": "message",
"presenseStrs": [
"profileBlock",
"tournaments",
"serverna",
" role=",
" name="
],
"absenceStrs": [
"<html><head><title>404 \u0421\u0442\u0440\u0430\u043d\u0438\u0446\u0430 \u043d\u0435 \u043d\u0430\u0439\u0434\u0435\u043d\u0430</title></head><body><h2>404 \u0421\u0442\u0440\u0430\u043d\u0438\u0446\u0430 \u043d\u0435 \u043d\u0430\u0439\u0434\u0435\u043d\u0430</h2></body></html>"
],
"url": "https://wblitz.net/stat/ru/{username}",
"urlMain": "https://wblitz.net",
"usernameClaimed": "lucklev12",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"unc.ua": {
"checkType": "message",
"presenseStrs": [
"page-user_profile"
],
"absenceStrs": [
"Error Site"
],
"url": "https://unc.ua/{username}",
"urlMain": "https://unc.ua",
"usernameClaimed": "admin",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"kloomba.com": {
"checkType": "message",
"presenseStrs": [
"name",
" role=",
" main"
],
"absenceStrs": [
"error-page"
],
"url": "https://kloomba.com/users/{username}",
"urlMain": "https://kloomba.com",
"usernameClaimed": "dima",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"nevrotic.net": {
"checkType": "message",
"presenseStrs": [
"profile-tabs",
" profile-rating"
],
"absenceStrs": [
"table-404"
],
"url": "http://nevrotic.net/user/{username}",
"urlMain": "http://nevrotic.net",
"usernameClaimed": "admin",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"pikabu.monster": {
"checkType": "message",
"presenseStrs": [
"usertotalcomments",
" usertotalposts"
],
"absenceStrs": [
"<title>\u041e\u0448\u0438\u0431\u043a\u0430</title>"
],
"source": "Pikabu",
"url": "https://pikabu.monster/user/{username}-summary",
"urlMain": "https://pikabu.monster",
"usernameClaimed": "Avezenit",
"usernameUnclaimed": "noonewouldeverusethis7"
}
},
"engines": {
+7
View File
@@ -69,6 +69,7 @@ class MaigretSite:
self.engine_obj = None
self.request_future = None
self.alexa_rank = None
self.source = None
for k, v in information.items():
self.__dict__[CaseConverter.camel_to_snake(k)] = v
@@ -99,6 +100,12 @@ class MaigretSite:
return None
# Display name of the site: the plain `name`, with " [<source>]" appended
# when `source` is set to a truthy value.
@property
def pretty_name(self):
if self.source:
return f'{self.name} [{self.source}]'
return self.name
@property
def json(self):
result = {}
+13 -3
View File
@@ -10,6 +10,10 @@ DESIRED_STRINGS = ["username", "not found", "пользователь", "profile
# NOTE(review): not referenced in the visible hunks — presumably candidate usernames tried when guessing an existing account; confirm.
SUPPOSED_USERNAMES = ['alex', 'god', 'admin', 'red', 'blue', 'john']
# HTTP headers passed to the requests.get() calls in check_features_manually.
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0',
}
# NOTE(review): usage is not visible here — presumably a similarity threshold; confirm.
RATIO = 0.6
# Default used at the "Specify count of features to extract" prompt when the user enters nothing.
TOP_FEATURES = 5
# Matches an "http://" or "https://" scheme, optionally followed by "www.".
URL_RE = re.compile(r'https?://(www\.)?')
@@ -121,7 +125,7 @@ async def detect_known_engine(db, url_exists, url_mainpage):
return None
async def check_features_manually(db, url_exists, url_mainpage, cookie_file):
async def check_features_manually(db, url_exists, url_mainpage, cookie_file, redirects=False):
url_parts = url_exists.split('/')
supposed_username = url_parts[-1]
new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
@@ -138,8 +142,11 @@ async def check_features_manually(db, url_exists, url_mainpage, cookie_file):
cookie_jar = await import_aiohttp_cookies(cookie_file)
cookie_dict = {c.key: c.value for c in cookie_jar}
a = requests.get(url_exists, cookies=cookie_dict).text
b = requests.get(url_not_exists, cookies=cookie_dict).text
exists_resp = requests.get(url_exists, cookies=cookie_dict, headers=HEADERS, allow_redirects=redirects)
non_exists_resp = requests.get(url_not_exists, cookies=cookie_dict, headers=HEADERS, allow_redirects=redirects)
a = exists_resp.text
b = non_exists_resp.text
tokens_a = set(a.split('"'))
tokens_b = set(b.split('"'))
@@ -147,6 +154,9 @@ async def check_features_manually(db, url_exists, url_mainpage, cookie_file):
a_minus_b = tokens_a.difference(tokens_b)
b_minus_a = tokens_b.difference(tokens_a)
if len(a_minus_b) == len(b_minus_a) == 0:
print('The pages for existing and non-existing account are the same!')
top_features_count = int(input(f'Specify count of features to extract [default {TOP_FEATURES}]: ') or TOP_FEATURES)
presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[:top_features_count]