Refactoring of submit module, some fixes

Soxoj
2021-06-13 00:43:28 +03:00
parent eb721dc7e3
commit 9b0acc092a
11 changed files with 534 additions and 438 deletions
+1 -1
@@ -25,7 +25,7 @@ format:
 pull:
 	git stash
 	git checkout main
-	git pull origin head
+	git pull origin main
 	git stash pop

 clean:
+10 -4
@@ -36,9 +36,10 @@ from .report import (
     sort_report_by_data_points,
 )
 from .sites import MaigretDatabase
-from .submit import submit_dialog
+from .submit import Submitter
 from .types import QueryResultWrapper
 from .utils import get_dict_ascii_tree
+from .settings import Settings


 def notify_about_errors(search_results: QueryResultWrapper, query_notify):
@@ -496,6 +497,12 @@ async def main():
     if args.tags:
         args.tags = list(set(str(args.tags).split(',')))

+    settings = Settings(
+        os.path.join(
+            os.path.dirname(os.path.realpath(__file__)), "resources/settings.json"
+        )
+    )
+
     if args.db_file is None:
         args.db_file = os.path.join(
             os.path.dirname(os.path.realpath(__file__)), "resources/data.json"
@@ -526,9 +533,8 @@ async def main():
         site_data = get_top_sites_for_id(args.id_type)

     if args.new_site_to_submit:
-        is_submitted = await submit_dialog(
-            db, args.new_site_to_submit, args.cookie_file, logger
-        )
+        submitter = Submitter(db=db, logger=logger, settings=settings)
+        is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
         if is_submitted:
             db.save_to_file(args.db_file)
+64 -5
@@ -13036,7 +13036,7 @@
                 "us"
             ],
             "headers": {
-                "authorization": "Bearer BQCypIuUtz7zDFov8xN86mj1BelLf7Apf9WBaC5yYfNkmGe4r7Hz4Awp6dqPuCAP9K9F5yYtjbyZX_vlr4I"
+                "authorization": "Bearer BQAkHoH1XLhjIl6oh6r9YzH3kHC1OZg3UXgLiz39FzqRFh_xQrFaVrZcU-esM-t87B6Hqdc4L1HBgukKnWE"
             },
             "errors": {
                 "Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
@@ -13990,7 +13990,8 @@
                 "us"
             ],
             "errors": {
-                "Website unavailable": "Site error"
+                "Website unavailable": "Site error",
+                "is currently offline": "Site error"
             },
             "checkType": "message",
             "absenceStrs": [
@@ -14462,7 +14463,7 @@
                 "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
                 "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
                 "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
-                "x-guest-token": "1400174453577900043"
+                "x-guest-token": "1403829602053771266"
             },
             "errors": {
                 "Bad guest token": "x-guest-token update required"
@@ -14869,7 +14870,7 @@
                 "video"
             ],
             "headers": {
-                "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjI2NjcxMjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.V4VVbLzNwPU21rNP5moSxrPcPw--C7_Qz9VHgcJc1CA"
+                "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM1MzQ5NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.5T8_p_q9zXOHXI2FT_XtMhsZUJMtPgCIaqwVF2u4aZI"
             },
             "activation": {
                 "url": "https://vimeo.com/_rv/viewer",
@@ -28457,5 +28458,63 @@
                 ]
             }
         }
-    }
+    },
+    "tags": [
+        "gaming",
+        "coding",
+        "photo",
+        "music",
+        "blog",
+        "finance",
+        "freelance",
+        "dating",
+        "tech",
+        "forum",
+        "porn",
+        "erotic",
+        "webcam",
+        "video",
+        "movies",
+        "hacking",
+        "art",
+        "discussion",
+        "sharing",
+        "writing",
+        "wiki",
+        "business",
+        "shopping",
+        "sport",
+        "books",
+        "news",
+        "documents",
+        "travel",
+        "maps",
+        "hobby",
+        "apps",
+        "classified",
+        "career",
+        "geosocial",
+        "streaming",
+        "education",
+        "networking",
+        "torrent",
+        "science",
+        "medicine",
+        "reading",
+        "stock",
+        "messaging",
+        "trading",
+        "links",
+        "fashion",
+        "tasks",
+        "military",
+        "auto",
+        "gambling",
+        "cybercriminal",
+        "review",
+        "bookmarks",
+        "design",
+        "tor",
+        "i2p"
+    ]
 }
+17
@@ -0,0 +1,17 @@
+{
+    "presence_strings": [
+        "username",
+        "not found",
+        "пользователь",
+        "profile",
+        "lastname",
+        "firstname",
+        "biography",
+        "birthday",
+        "репутация",
+        "информация",
+        "e-mail"
+    ],
+    "supposed_usernames": [
+        "alex", "god", "admin", "red", "blue", "john"]
+}
+29
@@ -0,0 +1,29 @@
+import json
+
+
+class Settings:
+    presence_strings: list
+    supposed_usernames: list
+
+    def __init__(self, filename):
+        data = {}
+
+        try:
+            with open(filename, "r", encoding="utf-8") as file:
+                try:
+                    data = json.load(file)
+                except Exception as error:
+                    raise ValueError(
+                        f"Problem with parsing json contents of "
+                        f"settings file '{filename}': {str(error)}."
+                    )
+        except FileNotFoundError as error:
+            raise FileNotFoundError(
+                f"Problem while attempting to access settings file '{filename}'."
+            ) from error
+
+        self.__dict__.update(data)
+
+    @property
+    def json(self):
+        return self.__dict__
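Since the keys of the JSON land in the instance via self.__dict__.update(data), the class is effectively a typed view over the settings file. A minimal usage sketch (illustrative, not part of this commit; keys mirror resources/settings.json above):

from maigret.settings import Settings

# Load the bundled settings file (path is illustrative).
settings = Settings("resources/settings.json")

print(settings.presence_strings[:2])   # ['username', 'not found']
print(settings.supposed_usernames[0])  # 'alex'
print(settings.json)                   # the full settings dict back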
+11 -66
@@ -9,66 +9,6 @@ import requests

 from .utils import CaseConverter, URLMatcher, is_country_tag

-# TODO: move to data.json
-SUPPORTED_TAGS = [
-    "gaming",
-    "coding",
-    "photo",
-    "music",
-    "blog",
-    "finance",
-    "freelance",
-    "dating",
-    "tech",
-    "forum",
-    "porn",
-    "erotic",
-    "webcam",
-    "video",
-    "movies",
-    "hacking",
-    "art",
-    "discussion",
-    "sharing",
-    "writing",
-    "wiki",
-    "business",
-    "shopping",
-    "sport",
-    "books",
-    "news",
-    "documents",
-    "travel",
-    "maps",
-    "hobby",
-    "apps",
-    "classified",
-    "career",
-    "geosocial",
-    "streaming",
-    "education",
-    "networking",
-    "torrent",
-    "science",
-    "medicine",
-    "reading",
-    "stock",
-    "messaging",
-    "trading",
-    "links",
-    "fashion",
-    "tasks",
-    "military",
-    "auto",
-    "gambling",
-    "cybercriminal",
-    "review",
-    "bookmarks",
-    "design",
-    "tor",
-    "i2p",
-]
-

 class MaigretEngine:
     site: Dict[str, Any] = {}
@@ -204,12 +144,12 @@ class MaigretSite:
         errors.update(self.errors)
         return errors

-    def get_url_type(self) -> str:
+    def get_url_template(self) -> str:
         url = URLMatcher.extract_main_part(self.url)
         if url.startswith("{username}"):
             url = "SUBDOMAIN"
         elif url == "":
-            url = f"{self.url} ({self.engine})"
+            url = f"{self.url} ({self.engine or 'no engine'})"
         else:
             parts = url.split("/")
             url = "/" + "/".join(parts[1:])
@@ -273,8 +213,9 @@ class MaigretSite:

 class MaigretDatabase:
     def __init__(self):
-        self._sites = []
-        self._engines = []
+        self._tags: list = []
+        self._sites: list = []
+        self._engines: list = []

     @property
     def sites(self):
@@ -354,6 +295,7 @@ class MaigretDatabase:
         db_data = {
             "sites": {site.name: site.strip_engine_data().json for site in self._sites},
             "engines": {engine.name: engine.json for engine in self._engines},
+            "tags": self._tags,
         }

         json_data = json.dumps(db_data, indent=4)
@@ -367,6 +309,9 @@ class MaigretDatabase:
         # Add all of site information from the json file to internal site list.
         site_data = json_data.get("sites", {})
         engines_data = json_data.get("engines", {})
+        tags = json_data.get("tags", [])
+
+        self._tags += tags

         for engine_name in engines_data:
             self._engines.append(MaigretEngine(engine_name, engines_data[engine_name]))
@@ -469,7 +414,7 @@ class MaigretDatabase:
             if site.disabled:
                 disabled_count += 1

-            url_type = site.get_url_type()
+            url_type = site.get_url_template()
             urls[url_type] = urls.get(url_type, 0) + 1

             if not site.tags:
@@ -488,7 +433,7 @@ class MaigretDatabase:
         output += "Top tags:\n"
         for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
             mark = ""
-            if tag not in SUPPORTED_TAGS:
+            if tag not in self._tags:
                 mark = " (non-standard)"
             output += f"{count}\t{tag}{mark}\n"
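With the SUPPORTED_TAGS constant gone, the canonical tag list travels inside data.json itself. A minimal round-trip sketch, assuming the existing load_from_file/save_to_file helpers and the bundled database path:

from maigret.sites import MaigretDatabase

# Loading now also picks up the top-level "tags" key (see the load hunk above)
# and appends it to db._tags.
db = MaigretDatabase().load_from_file("resources/data.json")
print("gaming" in db._tags)  # True for the bundled database

# Saving writes the tags back alongside "sites" and "engines".
db.save_to_file("resources/data.json")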
+93 -101
@@ -1,5 +1,4 @@
 import asyncio
-import difflib
 import re
 from typing import List
 import xml.etree.ElementTree as ET
@@ -8,47 +7,29 @@ import requests
 from .activation import import_aiohttp_cookies
 from .checking import maigret
 from .result import QueryStatus
+from .settings import Settings
 from .sites import MaigretDatabase, MaigretSite, MaigretEngine
-from .utils import get_random_user_agent
+from .utils import get_random_user_agent, get_match_ratio


-DESIRED_STRINGS = [
-    "username",
-    "not found",
-    "пользователь",
-    "profile",
-    "lastname",
-    "firstname",
-    "biography",
-    "birthday",
-    "репутация",
-    "информация",
-    "e-mail",
-]
-
-SUPPOSED_USERNAMES = ["alex", "god", "admin", "red", "blue", "john"]
-
-HEADERS = {
-    "User-Agent": get_random_user_agent(),
-}
-
-SEPARATORS = "\"'"
-RATIO = 0.6
-TOP_FEATURES = 5
-URL_RE = re.compile(r"https?://(www\.)?")
+class Submitter:
+    HEADERS = {
+        "User-Agent": get_random_user_agent(),
+    }

+    SEPARATORS = "\"'"
+    RATIO = 0.6
+    TOP_FEATURES = 5
+    URL_RE = re.compile(r"https?://(www\.)?")

-def get_match_ratio(x):
-    return round(
-        max(
-            [difflib.SequenceMatcher(a=x.lower(), b=y).ratio() for y in DESIRED_STRINGS]
-        ),
-        2,
-    )
+    def __init__(self, db: MaigretDatabase, settings: Settings, logger):
+        self.settings = settings
+        self.db = db
+        self.logger = logger

-
-def get_alexa_rank(site_url_main):
+    @staticmethod
+    def get_alexa_rank(site_url_main):
         url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
         xml_data = requests.get(url).text
         root = ET.fromstring(xml_data)
@@ -61,12 +42,11 @@ def get_alexa_rank(site_url_main):
         return alexa_rank

-
-def extract_mainpage_url(url):
-    return "/".join(url.split("/", 3)[:3])
+    @staticmethod
+    def extract_mainpage_url(url):
+        return "/".join(url.split("/", 3)[:3])

-
-async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
+    async def site_self_check(self, site, semaphore, silent=False):
         changes = {
             "disabled": False,
         }
@@ -76,13 +56,13 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
             (site.username_unclaimed, QueryStatus.AVAILABLE),
         ]

-    logger.info(f"Checking {site.name}...")
+        self.logger.info(f"Checking {site.name}...")

         for username, status in check_data:
             results_dict = await maigret(
                 username=username,
                 site_dict={site.name: site},
-                logger=logger,
+                logger=self.logger,
                 timeout=30,
                 id_type=site.type,
                 forced=True,
@@ -92,7 +72,7 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
             # don't disable entries with other ids types
             # TODO: make normal checking
             if site.name not in results_dict:
-                logger.info(results_dict)
+                self.logger.info(results_dict)
                 changes["disabled"] = True
                 continue
@@ -104,7 +84,7 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
             if site_status == QueryStatus.UNKNOWN:
                 msgs = site.absence_strs
                 etype = site.check_type
-                logger.warning(
+                self.logger.warning(
                     "Error while searching '%s' in %s: %s, %s, check type %s",
                     username,
                     site.name,
@@ -116,22 +96,23 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
                 if status == QueryStatus.CLAIMED:
                     changes["disabled"] = True
             elif status == QueryStatus.CLAIMED:
-                logger.warning(
+                self.logger.warning(
                     f"Not found `{username}` in {site.name}, must be claimed"
                 )
-                logger.info(results_dict[site.name])
+                self.logger.info(results_dict[site.name])
                 changes["disabled"] = True
             else:
-                logger.warning(f"Found `{username}` in {site.name}, must be available")
-                logger.info(results_dict[site.name])
+                self.logger.warning(
+                    f"Found `{username}` in {site.name}, must be available"
+                )
+                self.logger.info(results_dict[site.name])
                 changes["disabled"] = True

-    logger.info(f"Site {site.name} checking is finished")
+        self.logger.info(f"Site {site.name} checking is finished")
         return changes

-def generate_additional_fields_dialog(engine: MaigretEngine, dialog):
+    def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog):
         fields = {}

         if 'urlSubpath' in engine.site.get('url', ''):
             msg = (
@@ -143,19 +124,16 @@ def generate_additional_fields_dialog(engine: MaigretEngine, dialog):
             fields['urlSubpath'] = f'/{subpath}'

         return fields

-async def detect_known_engine(
-    db, url_exists, url_mainpage, logger
-) -> List[MaigretSite]:
+    async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]:
         try:
             r = requests.get(url_mainpage)
-            logger.debug(r.text)
+            self.logger.debug(r.text)
         except Exception as e:
-            logger.warning(e)
+            self.logger.warning(e)
             print("Some error while checking main page")
             return []

-    for engine in db.engines:
+        for engine in self.db.engines:
             strs_to_check = engine.__dict__.get("presenseStrs")
             if strs_to_check and r and r.text:
                 all_strs_in_response = True
@@ -168,12 +146,14 @@ async def detect_known_engine(
                 print(f"Detected engine {engine_name} for site {url_mainpage}")

-                usernames_to_check = SUPPOSED_USERNAMES
-                supposed_username = extract_username_dialog(url_exists)
+                usernames_to_check = self.settings.supposed_usernames
+                supposed_username = self.extract_username_dialog(url_exists)
                 if supposed_username:
                     usernames_to_check = [supposed_username] + usernames_to_check

-                add_fields = generate_additional_fields_dialog(engine, url_exists)
+                add_fields = self.generate_additional_fields_dialog(
+                    engine, url_exists
+                )

                 for u in usernames_to_check:
                     site_data = {
@@ -184,18 +164,21 @@ async def detect_known_engine(
                         "usernameUnclaimed": "noonewouldeverusethis7",
                         **add_fields,
                     }

-                    logger.info(site_data)
+                    self.logger.info(site_data)

-                    maigret_site = MaigretSite(url_mainpage.split("/")[-1], site_data)
-                    maigret_site.update_from_engine(db.engines_dict[engine_name])
+                    maigret_site = MaigretSite(
+                        url_mainpage.split("/")[-1], site_data
+                    )
+                    maigret_site.update_from_engine(
+                        self.db.engines_dict[engine_name]
+                    )
                     sites.append(maigret_site)

                 return sites

         return []

-def extract_username_dialog(url):
+    def extract_username_dialog(self, url):
         url_parts = url.rstrip("/").split("/")
         supposed_username = url_parts[-1].strip('@')
         entered_username = input(
@@ -203,10 +186,9 @@ def extract_username_dialog(url):
         )
         return entered_username if entered_username else supposed_username

-async def check_features_manually(
-    db, url_exists, url_mainpage, cookie_file, logger, redirects=False
-):
+    async def check_features_manually(
+        self, url_exists, url_mainpage, cookie_file, redirects=False
+    ):
         custom_headers = {}
         while True:
             header_key = input(
@@ -217,41 +199,44 @@ async def check_features_manually(
             header_value = input('Header value: ')
             custom_headers[header_key.strip()] = header_value.strip()

-    supposed_username = extract_username_dialog(url_exists)
+        supposed_username = self.extract_username_dialog(url_exists)
         non_exist_username = "noonewouldeverusethis7"

         url_user = url_exists.replace(supposed_username, "{username}")
         url_not_exists = url_exists.replace(supposed_username, non_exist_username)

-    headers = dict(HEADERS)
+        headers = dict(self.HEADERS)
         headers.update(custom_headers)

         # cookies
         cookie_dict = None
         if cookie_file:
-            logger.info(f'Use {cookie_file} for cookies')
+            self.logger.info(f'Use {cookie_file} for cookies')
             cookie_jar = import_aiohttp_cookies(cookie_file)
             cookie_dict = {c.key: c.value for c in cookie_jar}

         exists_resp = requests.get(
             url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
         )
-    logger.debug(url_exists)
-    logger.debug(exists_resp.status_code)
-    logger.debug(exists_resp.text)
+        self.logger.debug(url_exists)
+        self.logger.debug(exists_resp.status_code)
+        self.logger.debug(exists_resp.text)

-    non_exists_resp = requests.get(
-        url_not_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
-    )
+        non_exists_resp = requests.get(
+            url_not_exists,
+            cookies=cookie_dict,
+            headers=headers,
+            allow_redirects=redirects,
+        )
-    logger.debug(url_not_exists)
-    logger.debug(non_exists_resp.status_code)
-    logger.debug(non_exists_resp.text)
+        self.logger.debug(url_not_exists)
+        self.logger.debug(non_exists_resp.status_code)
+        self.logger.debug(non_exists_resp.text)

         a = exists_resp.text
         b = non_exists_resp.text

-    tokens_a = set(re.split(f'[{SEPARATORS}]', a))
-    tokens_b = set(re.split(f'[{SEPARATORS}]', b))
+        tokens_a = set(re.split(f'[{self.SEPARATORS}]', a))
+        tokens_b = set(re.split(f'[{self.SEPARATORS}]', b))

         a_minus_b = tokens_a.difference(tokens_b)
         b_minus_a = tokens_b.difference(tokens_a)
@@ -260,11 +245,15 @@ async def check_features_manually(
             print("The pages for existing and non-existing account are the same!")

         top_features_count = int(
-        input(f"Specify count of features to extract [default {TOP_FEATURES}]: ")
-        or TOP_FEATURES
+            input(
+                f"Specify count of features to extract [default {self.TOP_FEATURES}]: "
+            )
+            or self.TOP_FEATURES
         )

-    presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[
+        match_fun = get_match_ratio(self.settings.presence_strings)
+
+        presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[
             :top_features_count
         ]
@@ -274,10 +263,12 @@ async def check_features_manually(
         if features:
             presence_list = list(map(str.strip, features.split(",")))

-    absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[
+        absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[
             :top_features_count
         ]

-    print("Detected text features of non-existing account: " + ", ".join(absence_list))
+        print(
+            "Detected text features of non-existing account: " + ", ".join(absence_list)
+        )

         features = input("If features was not detected correctly, write it manually: ")
         if features:
@@ -293,20 +284,21 @@ async def check_features_manually(
             "checkType": "message",
         }

-    if headers != HEADERS:
+        if headers != self.HEADERS:
             site_data['headers'] = headers

         site = MaigretSite(url_mainpage.split("/")[-1], site_data)
         return site

-async def submit_dialog(db, url_exists, cookie_file, logger):
-    domain_raw = URL_RE.sub("", url_exists).strip().strip("/")
+    async def dialog(self, url_exists, cookie_file):
+        domain_raw = self.URL_RE.sub("", url_exists).strip().strip("/")
         domain_raw = domain_raw.split("/")[0]
-    logger.info('Domain is %s', domain_raw)
+        self.logger.info('Domain is %s', domain_raw)

         # check for existence
-    matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
+        matched_sites = list(
+            filter(lambda x: domain_raw in x.url_main + x.url, self.db.sites)
+        )

         if matched_sites:
             print(
@@ -326,24 +318,24 @@ async def submit_dialog(db, url_exists, cookie_file, logger):
         if input("Do you want to continue? [yN] ").lower() in "n":
             return False

-    url_mainpage = extract_mainpage_url(url_exists)
+        url_mainpage = self.extract_mainpage_url(url_exists)

         print('Detecting site engine, please wait...')
         sites = []
         try:
-        sites = await detect_known_engine(db, url_exists, url_mainpage, logger)
+            sites = await self.detect_known_engine(url_exists, url_mainpage)
         except KeyboardInterrupt:
             print('Engine detect process is interrupted.')

         if not sites:
             print("Unable to detect site engine, lets generate checking features")
             sites = [
-            await check_features_manually(
-                db, url_exists, url_mainpage, cookie_file, logger
-            )
+                await self.check_features_manually(
+                    url_exists, url_mainpage, cookie_file
+                )
             ]

-    logger.debug(sites[0].__dict__)
+        self.logger.debug(sites[0].__dict__)

         sem = asyncio.Semaphore(1)
@@ -352,7 +344,7 @@ async def submit_dialog(db, url_exists, cookie_file, logger):
         chosen_site = None
         for s in sites:
             chosen_site = s
-        result = await site_self_check(s, logger, sem, db)
+            result = await self.site_self_check(s, sem)
             if not result["disabled"]:
                 found = True
                 break
@@ -377,13 +369,13 @@ async def submit_dialog(db, url_exists, cookie_file, logger):
         chosen_site.name = input("Change site name if you want: ") or chosen_site.name
         chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))

-    rank = get_alexa_rank(chosen_site.url_main)
+        rank = Submitter.get_alexa_rank(chosen_site.url_main)
         if rank:
             print(f'New alexa rank: {rank}')
             chosen_site.alexa_rank = rank

-    logger.debug(chosen_site.json)
+        self.logger.debug(chosen_site.json)
         site_data = chosen_site.strip_engine_data()
-    logger.debug(site_data.json)
+        self.logger.debug(site_data.json)

-    db.update_site(site_data)
+        self.db.update_site(site_data)
         return True
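Taken together, the module-level dialog functions collapse into one stateful object: the database, settings, and logger are bound once in __init__, and dialog() only needs the URL and cookie file. A minimal driving sketch (illustrative, assuming the bundled resource paths; dialog() is interactive and prompts on stdin):

import asyncio
import logging

from maigret.settings import Settings
from maigret.sites import MaigretDatabase
from maigret.submit import Submitter

async def submit_new_site(url):
    db = MaigretDatabase().load_from_file("resources/data.json")
    settings = Settings("resources/settings.json")
    submitter = Submitter(db=db, settings=settings, logger=logging.getLogger("maigret"))
    # dialog() walks engine detection / manual feature extraction and self-check.
    if await submitter.dialog(url, None):  # second argument is an optional cookie file
        db.save_to_file("resources/data.json")

asyncio.run(submit_new_site("https://example.com/user/alice"))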
+16
@@ -1,4 +1,5 @@
 import ast
+import difflib
 import re
 import random
 from typing import Any
@@ -95,3 +96,18 @@ def get_dict_ascii_tree(items, prepend="", new_line=True):

 def get_random_user_agent():
     return random.choice(DEFAULT_USER_AGENTS)
+
+
+def get_match_ratio(base_strs: list):
+    def get_match_inner(s: str):
+        return round(
+            max(
+                [
+                    difflib.SequenceMatcher(a=s.lower(), b=s2.lower()).ratio()
+                    for s2 in base_strs
+                ]
+            ),
+            2,
+        )
+
+    return get_match_inner
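get_match_ratio is now a closure factory: it binds the base strings once and returns a scorer, which submit.py reuses as a sort key over page tokens. An illustrative sketch (values are examples, not from this commit):

from maigret.utils import get_match_ratio

match = get_match_ratio(["username", "profile", "e-mail"])
print(match("Username"))  # 1.0: comparison lowercases both sides

# Rank candidate tokens by similarity to profile-related strings,
# the same way check_features_manually sorts a_minus_b / b_minus_a.
tokens = ["footer", "profile-card", "user_name_box"]
print(sorted(tokens, key=match, reverse=True))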
+3 -2
@@ -1,15 +1,16 @@
 """Maigret data test functions"""
 from maigret.utils import is_country_tag
-from maigret.sites import SUPPORTED_TAGS


 def test_tags_validity(default_db):
     unknown_tags = set()
+    tags = default_db._tags
+
     for site in default_db.sites:
         for tag in filter(lambda x: not is_country_tag(x), site.tags):
-            if tag not in SUPPORTED_TAGS:
+            if tag not in tags:
                 unknown_tags.add(tag)

     assert unknown_tags == set()
+24
@@ -1,5 +1,6 @@
 """Maigret Database test functions"""
 from maigret.sites import MaigretDatabase, MaigretSite
+from maigret.utils import URLMatcher

 EXAMPLE_DB = {
     'engines': {
@@ -179,3 +180,26 @@ def test_ranked_sites_dict_id_type():
     assert len(db.ranked_sites_dict()) == 2
     assert len(db.ranked_sites_dict(id_type='username')) == 2
     assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
+
+
+def test_get_url_template():
+    site = MaigretSite(
+        "test",
+        {
+            "urlMain": "https://ya.ru/",
+            "url": "{urlMain}{urlSubpath}/members/?username={username}",
+        },
+    )
+    assert (
+        site.get_url_template()
+        == "{urlMain}{urlSubpath}/members/?username={username} (no engine)"
+    )
+
+    site = MaigretSite(
+        "test",
+        {
+            "urlMain": "https://ya.ru/",
+            "url": "https://{username}.ya.ru",
+        },
+    )
+    assert site.get_url_template() == "SUBDOMAIN"
+7
@@ -8,6 +8,7 @@ from maigret.utils import (
     enrich_link_str,
     URLMatcher,
     get_dict_ascii_tree,
+    get_match_ratio,
 )
@@ -136,3 +137,9 @@ def test_get_dict_ascii_tree():
 instagram_username: Street.Reality.Photography
 twitter_username: Alexaimephotogr"""
     )
+
+
+def test_get_match_ratio():
+    fun = get_match_ratio(["test", "maigret", "username"])
+
+    assert fun("test") == 1