Mirror of https://github.com/soxoj/maigret.git, synced 2026-05-07 06:24:35 +00:00
Merge pull request #169 from soxoj/submit-mode-refactoring
Refactoring of submit module, some fixes
@@ -25,7 +25,7 @@ format:
 pull:
 	git stash
 	git checkout main
-	git pull origin head
+	git pull origin main
 	git stash pop
 
 clean:
+10 -4
@@ -36,9 +36,10 @@ from .report import (
     sort_report_by_data_points,
 )
 from .sites import MaigretDatabase
-from .submit import submit_dialog
+from .submit import Submitter
 from .types import QueryResultWrapper
 from .utils import get_dict_ascii_tree
+from .settings import Settings
 
 
 def notify_about_errors(search_results: QueryResultWrapper, query_notify):
@@ -496,6 +497,12 @@ async def main():
     if args.tags:
         args.tags = list(set(str(args.tags).split(',')))
 
+    settings = Settings(
+        os.path.join(
+            os.path.dirname(os.path.realpath(__file__)), "resources/settings.json"
+        )
+    )
+
     if args.db_file is None:
         args.db_file = os.path.join(
             os.path.dirname(os.path.realpath(__file__)), "resources/data.json"
@@ -526,9 +533,8 @@ async def main():
     site_data = get_top_sites_for_id(args.id_type)
 
     if args.new_site_to_submit:
-        is_submitted = await submit_dialog(
-            db, args.new_site_to_submit, args.cookie_file, logger
-        )
+        submitter = Submitter(db=db, logger=logger, settings=settings)
+        is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
         if is_submitted:
             db.save_to_file(args.db_file)
 
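Taken together, these hunks replace the free submit_dialog() function with a Submitter object carrying the database, the logger, and the new Settings. A minimal sketch of the resulting flow; the example URL is illustrative, and load_from_file/save_to_file are the helpers main() uses elsewhere (treat their exact names as assumptions here):

import asyncio
import logging
import os

from maigret.settings import Settings
from maigret.sites import MaigretDatabase
from maigret.submit import Submitter


async def demo():
    base = os.path.dirname(os.path.realpath(__file__))
    # Same resource paths that main() builds above.
    settings = Settings(os.path.join(base, "resources/settings.json"))
    db = MaigretDatabase().load_from_file(os.path.join(base, "resources/data.json"))

    submitter = Submitter(db=db, logger=logging.getLogger("maigret"), settings=settings)
    # Interactive dialog; True means a working site entry was constructed.
    if await submitter.dialog("https://example.com/u/alice", cookie_file=None):
        db.save_to_file(os.path.join(base, "resources/data.json"))


asyncio.run(demo())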
@@ -13036,7 +13036,7 @@
             "us"
         ],
         "headers": {
-            "authorization": "Bearer BQCypIuUtz7zDFov8xN86mj1BelLf7Apf9WBaC5yYfNkmGe4r7Hz4Awp6dqPuCAP9K9F5yYtjbyZX_vlr4I"
+            "authorization": "Bearer BQAkHoH1XLhjIl6oh6r9YzH3kHC1OZg3UXgLiz39FzqRFh_xQrFaVrZcU-esM-t87B6Hqdc4L1HBgukKnWE"
         },
         "errors": {
             "Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
@@ -13990,7 +13990,8 @@
             "us"
         ],
         "errors": {
-            "Website unavailable": "Site error"
+            "Website unavailable": "Site error",
+            "is currently offline": "Site error"
         },
         "checkType": "message",
         "absenceStrs": [
@@ -14462,7 +14463,7 @@
             "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
             "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
             "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
-            "x-guest-token": "1400174453577900043"
+            "x-guest-token": "1403829602053771266"
         },
         "errors": {
             "Bad guest token": "x-guest-token update required"
@@ -14869,7 +14870,7 @@
             "video"
         ],
         "headers": {
-            "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjI2NjcxMjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.V4VVbLzNwPU21rNP5moSxrPcPw--C7_Qz9VHgcJc1CA"
+            "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM1MzQ5NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.5T8_p_q9zXOHXI2FT_XtMhsZUJMtPgCIaqwVF2u4aZI"
         },
         "activation": {
             "url": "https://vimeo.com/_rv/viewer",
@@ -28457,5 +28458,63 @@
                 ]
             }
         }
-    }
+    },
+    "tags": [
+        "gaming",
+        "coding",
+        "photo",
+        "music",
+        "blog",
+        "finance",
+        "freelance",
+        "dating",
+        "tech",
+        "forum",
+        "porn",
+        "erotic",
+        "webcam",
+        "video",
+        "movies",
+        "hacking",
+        "art",
+        "discussion",
+        "sharing",
+        "writing",
+        "wiki",
+        "business",
+        "shopping",
+        "sport",
+        "books",
+        "news",
+        "documents",
+        "travel",
+        "maps",
+        "hobby",
+        "apps",
+        "classified",
+        "career",
+        "geosocial",
+        "streaming",
+        "education",
+        "networking",
+        "torrent",
+        "science",
+        "medicine",
+        "reading",
+        "stock",
+        "messaging",
+        "trading",
+        "links",
+        "fashion",
+        "tasks",
+        "military",
+        "auto",
+        "gambling",
+        "cybercriminal",
+        "review",
+        "bookmarks",
+        "design",
+        "tor",
+        "i2p"
+    ]
 }
@@ -0,0 +1,17 @@
+{
+    "presence_strings": [
+        "username",
+        "not found",
+        "пользователь",
+        "profile",
+        "lastname",
+        "firstname",
+        "biography",
+        "birthday",
+        "репутация",
+        "информация",
+        "e-mail"
+    ],
+    "supposed_usernames": [
+        "alex", "god", "admin", "red", "blue", "john"]
+}
@@ -0,0 +1,29 @@
+import json
+
+
+class Settings:
+    presence_strings: list
+    supposed_usernames: list
+
+    def __init__(self, filename):
+        data = {}
+
+        try:
+            with open(filename, "r", encoding="utf-8") as file:
+                try:
+                    data = json.load(file)
+                except Exception as error:
+                    raise ValueError(
+                        f"Problem with parsing json contents of "
+                        f"settings file '{filename}': {str(error)}."
+                    )
+        except FileNotFoundError as error:
+            raise FileNotFoundError(
+                f"Problem while attempting to access settings file '{filename}'."
+            ) from error
+
+        self.__dict__.update(data)
+
+    @property
+    def json(self):
+        return self.__dict__
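Since Settings copies the parsed JSON straight into __dict__, every top-level key of the settings file becomes an instance attribute, and the json property hands the same dict back. A quick sketch (the relative path is an assumption about the working directory):

from maigret.settings import Settings

settings = Settings("maigret/resources/settings.json")
print(settings.presence_strings[:2])  # ['username', 'not found']
print(settings.supposed_usernames)    # ['alex', 'god', 'admin', 'red', 'blue', 'john']
print(sorted(settings.json))          # ['presence_strings', 'supposed_usernames']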
+11 -66
@@ -9,66 +9,6 @@ import requests
 
 from .utils import CaseConverter, URLMatcher, is_country_tag
 
-# TODO: move to data.json
-SUPPORTED_TAGS = [
-    "gaming",
-    "coding",
-    "photo",
-    "music",
-    "blog",
-    "finance",
-    "freelance",
-    "dating",
-    "tech",
-    "forum",
-    "porn",
-    "erotic",
-    "webcam",
-    "video",
-    "movies",
-    "hacking",
-    "art",
-    "discussion",
-    "sharing",
-    "writing",
-    "wiki",
-    "business",
-    "shopping",
-    "sport",
-    "books",
-    "news",
-    "documents",
-    "travel",
-    "maps",
-    "hobby",
-    "apps",
-    "classified",
-    "career",
-    "geosocial",
-    "streaming",
-    "education",
-    "networking",
-    "torrent",
-    "science",
-    "medicine",
-    "reading",
-    "stock",
-    "messaging",
-    "trading",
-    "links",
-    "fashion",
-    "tasks",
-    "military",
-    "auto",
-    "gambling",
-    "cybercriminal",
-    "review",
-    "bookmarks",
-    "design",
-    "tor",
-    "i2p",
-]
-
 
 class MaigretEngine:
     site: Dict[str, Any] = {}
@@ -204,12 +144,12 @@ class MaigretSite:
         errors.update(self.errors)
         return errors
 
-    def get_url_type(self) -> str:
+    def get_url_template(self) -> str:
         url = URLMatcher.extract_main_part(self.url)
         if url.startswith("{username}"):
             url = "SUBDOMAIN"
         elif url == "":
-            url = f"{self.url} ({self.engine})"
+            url = f"{self.url} ({self.engine or 'no engine'})"
         else:
             parts = url.split("/")
             url = "/" + "/".join(parts[1:])
@@ -273,8 +213,9 @@ class MaigretSite:
 
 class MaigretDatabase:
     def __init__(self):
-        self._sites = []
-        self._engines = []
+        self._tags: list = []
+        self._sites: list = []
+        self._engines: list = []
 
     @property
     def sites(self):
@@ -354,6 +295,7 @@ class MaigretDatabase:
         db_data = {
            "sites": {site.name: site.strip_engine_data().json for site in self._sites},
            "engines": {engine.name: engine.json for engine in self._engines},
+           "tags": self._tags,
         }
 
         json_data = json.dumps(db_data, indent=4)
@@ -367,6 +309,9 @@ class MaigretDatabase:
         # Add all of site information from the json file to internal site list.
         site_data = json_data.get("sites", {})
         engines_data = json_data.get("engines", {})
+        tags = json_data.get("tags", [])
+
+        self._tags += tags
 
         for engine_name in engines_data:
             self._engines.append(MaigretEngine(engine_name, engines_data[engine_name]))
@@ -469,7 +414,7 @@ class MaigretDatabase:
             if site.disabled:
                 disabled_count += 1
 
-            url_type = site.get_url_type()
+            url_type = site.get_url_template()
             urls[url_type] = urls.get(url_type, 0) + 1
 
             if not site.tags:
@@ -488,7 +433,7 @@ class MaigretDatabase:
         output += "Top tags:\n"
         for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
             mark = ""
-            if tag not in SUPPORTED_TAGS:
+            if tag not in self._tags:
                 mark = " (non-standard)"
             output += f"{count}\t{tag}{mark}\n"
 
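With the hardcoded SUPPORTED_TAGS gone, the canonical tag list now travels in the "tags" key of data.json and accumulates in MaigretDatabase._tags on load (see the json_data.get("tags", []) hunk above). A toy round-trip; the load_from_json method name is an assumption about the loader these hunks live in, and the inline database is invented:

from maigret.sites import MaigretDatabase

toy_db = {
    "engines": {},
    "sites": {},
    "tags": ["gaming", "coding"],
}

db = MaigretDatabase().load_from_json(toy_db)  # hypothetical loader name
assert db._tags == ["gaming", "coding"]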
+352 -360
@@ -1,5 +1,4 @@
 import asyncio
-import difflib
 import re
 from typing import List
 import xml.etree.ElementTree as ET
@@ -8,382 +7,375 @@ import requests
 from .activation import import_aiohttp_cookies
 from .checking import maigret
 from .result import QueryStatus
+from .settings import Settings
 from .sites import MaigretDatabase, MaigretSite, MaigretEngine
-from .utils import get_random_user_agent
+from .utils import get_random_user_agent, get_match_ratio
 
 
-DESIRED_STRINGS = [
-    "username",
-    "not found",
-    "пользователь",
-    "profile",
-    "lastname",
-    "firstname",
-    "biography",
-    "birthday",
-    "репутация",
-    "информация",
-    "e-mail",
-]
-
-SUPPOSED_USERNAMES = ["alex", "god", "admin", "red", "blue", "john"]
-
-HEADERS = {
-    "User-Agent": get_random_user_agent(),
-}
-
-SEPARATORS = "\"'"
-
-RATIO = 0.6
-TOP_FEATURES = 5
-URL_RE = re.compile(r"https?://(www\.)?")
-
-
-def get_match_ratio(x):
-    return round(
-        max(
-            [difflib.SequenceMatcher(a=x.lower(), b=y).ratio() for y in DESIRED_STRINGS]
-        ),
-        2,
-    )
-
-
-def get_alexa_rank(site_url_main):
-    url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
-    xml_data = requests.get(url).text
-    root = ET.fromstring(xml_data)
-    alexa_rank = 0
-
-    try:
-        alexa_rank = int(root.find('.//REACH').attrib['RANK'])
-    except Exception:
-        pass
-
-    return alexa_rank
-
-
-def extract_mainpage_url(url):
-    return "/".join(url.split("/", 3)[:3])
-
-
-async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
-    changes = {
-        "disabled": False,
-    }
-
-    check_data = [
-        (site.username_claimed, QueryStatus.CLAIMED),
-        (site.username_unclaimed, QueryStatus.AVAILABLE),
-    ]
-
-    logger.info(f"Checking {site.name}...")
-
-    for username, status in check_data:
-        results_dict = await maigret(
-            username=username,
-            site_dict={site.name: site},
-            logger=logger,
-            timeout=30,
-            id_type=site.type,
-            forced=True,
-            no_progressbar=True,
-        )
-
-        # don't disable entries with other ids types
-        # TODO: make normal checking
-        if site.name not in results_dict:
-            logger.info(results_dict)
-            changes["disabled"] = True
-            continue
-
-        result = results_dict[site.name]["status"]
-
-        site_status = result.status
-
-        if site_status != status:
-            if site_status == QueryStatus.UNKNOWN:
-                msgs = site.absence_strs
-                etype = site.check_type
-                logger.warning(
-                    "Error while searching '%s' in %s: %s, %s, check type %s",
-                    username,
-                    site.name,
-                    result.context,
-                    msgs,
-                    etype,
-                )
-                # don't disable in case of available username
-                if status == QueryStatus.CLAIMED:
-                    changes["disabled"] = True
-            elif status == QueryStatus.CLAIMED:
-                logger.warning(
-                    f"Not found `{username}` in {site.name}, must be claimed"
-                )
-                logger.info(results_dict[site.name])
-                changes["disabled"] = True
-            else:
-                logger.warning(f"Found `{username}` in {site.name}, must be available")
-                logger.info(results_dict[site.name])
-                changes["disabled"] = True
-
-    logger.info(f"Site {site.name} checking is finished")
-
-    return changes
-
-
-def generate_additional_fields_dialog(engine: MaigretEngine, dialog):
-    fields = {}
-    if 'urlSubpath' in engine.site.get('url', ''):
-        msg = (
-            'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
-            'Enter in manually if it exists: '
-        )
-        subpath = input(msg).strip('/')
-        if subpath:
-            fields['urlSubpath'] = f'/{subpath}'
-    return fields
-
-
-async def detect_known_engine(
-    db, url_exists, url_mainpage, logger
-) -> List[MaigretSite]:
-    try:
-        r = requests.get(url_mainpage)
-        logger.debug(r.text)
-    except Exception as e:
-        logger.warning(e)
-        print("Some error while checking main page")
-        return []
-
-    for engine in db.engines:
-        strs_to_check = engine.__dict__.get("presenseStrs")
-        if strs_to_check and r and r.text:
-            all_strs_in_response = True
-            for s in strs_to_check:
-                if s not in r.text:
-                    all_strs_in_response = False
-            sites = []
-            if all_strs_in_response:
-                engine_name = engine.__dict__.get("name")
-
-                print(f"Detected engine {engine_name} for site {url_mainpage}")
-
-                usernames_to_check = SUPPOSED_USERNAMES
-                supposed_username = extract_username_dialog(url_exists)
-                if supposed_username:
-                    usernames_to_check = [supposed_username] + usernames_to_check
-
-                add_fields = generate_additional_fields_dialog(engine, url_exists)
-
-                for u in usernames_to_check:
-                    site_data = {
-                        "urlMain": url_mainpage,
-                        "name": url_mainpage.split("//")[1],
-                        "engine": engine_name,
-                        "usernameClaimed": u,
-                        "usernameUnclaimed": "noonewouldeverusethis7",
-                        **add_fields,
-                    }
-                    logger.info(site_data)
-
-                    maigret_site = MaigretSite(url_mainpage.split("/")[-1], site_data)
-                    maigret_site.update_from_engine(db.engines_dict[engine_name])
-                    sites.append(maigret_site)
-
-                return sites
-
-    return []
-
-
-def extract_username_dialog(url):
-    url_parts = url.rstrip("/").split("/")
-    supposed_username = url_parts[-1].strip('@')
-    entered_username = input(
-        f'Is "{supposed_username}" a valid username? If not, write it manually: '
-    )
-    return entered_username if entered_username else supposed_username
-
-
-async def check_features_manually(
-    db, url_exists, url_mainpage, cookie_file, logger, redirects=False
-):
-    custom_headers = {}
-    while True:
-        header_key = input(
-            'Specify custom header if you need or just press Enter to skip. Header name: '
-        )
-        if not header_key:
-            break
-        header_value = input('Header value: ')
-        custom_headers[header_key.strip()] = header_value.strip()
-
-    supposed_username = extract_username_dialog(url_exists)
-    non_exist_username = "noonewouldeverusethis7"
-
-    url_user = url_exists.replace(supposed_username, "{username}")
-    url_not_exists = url_exists.replace(supposed_username, non_exist_username)
-
-    headers = dict(HEADERS)
-    headers.update(custom_headers)
-
-    # cookies
-    cookie_dict = None
-    if cookie_file:
-        logger.info(f'Use {cookie_file} for cookies')
-        cookie_jar = import_aiohttp_cookies(cookie_file)
-        cookie_dict = {c.key: c.value for c in cookie_jar}
-
-    exists_resp = requests.get(
-        url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
-    )
-    logger.debug(url_exists)
-    logger.debug(exists_resp.status_code)
-    logger.debug(exists_resp.text)
-
-    non_exists_resp = requests.get(
-        url_not_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
-    )
-    logger.debug(url_not_exists)
-    logger.debug(non_exists_resp.status_code)
-    logger.debug(non_exists_resp.text)
-
-    a = exists_resp.text
-    b = non_exists_resp.text
-
-    tokens_a = set(re.split(f'[{SEPARATORS}]', a))
-    tokens_b = set(re.split(f'[{SEPARATORS}]', b))
-
-    a_minus_b = tokens_a.difference(tokens_b)
-    b_minus_a = tokens_b.difference(tokens_a)
-
-    if len(a_minus_b) == len(b_minus_a) == 0:
-        print("The pages for existing and non-existing account are the same!")
-
-    top_features_count = int(
-        input(f"Specify count of features to extract [default {TOP_FEATURES}]: ")
-        or TOP_FEATURES
-    )
-
-    presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[
-        :top_features_count
-    ]
-
-    print("Detected text features of existing account: " + ", ".join(presence_list))
-    features = input("If features was not detected correctly, write it manually: ")
-
-    if features:
-        presence_list = list(map(str.strip, features.split(",")))
-
-    absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[
-        :top_features_count
-    ]
-    print("Detected text features of non-existing account: " + ", ".join(absence_list))
-    features = input("If features was not detected correctly, write it manually: ")
-
-    if features:
-        absence_list = list(map(str.strip, features.split(",")))
-
-    site_data = {
-        "absenceStrs": absence_list,
-        "presenseStrs": presence_list,
-        "url": url_user,
-        "urlMain": url_mainpage,
-        "usernameClaimed": supposed_username,
-        "usernameUnclaimed": non_exist_username,
-        "checkType": "message",
-    }
-
-    if headers != HEADERS:
-        site_data['headers'] = headers
-
-    site = MaigretSite(url_mainpage.split("/")[-1], site_data)
-    return site
-
-
-async def submit_dialog(db, url_exists, cookie_file, logger):
-    domain_raw = URL_RE.sub("", url_exists).strip().strip("/")
-    domain_raw = domain_raw.split("/")[0]
-    logger.info('Domain is %s', domain_raw)
-
-    # check for existence
-    matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
-
-    if matched_sites:
-        print(
-            f'Sites with domain "{domain_raw}" already exists in the Maigret database!'
-        )
-        status = lambda s: "(disabled)" if s.disabled else ""
-        url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
-        print(
-            "\n".join(
-                [
-                    f"{site.name} {status(site)}{url_block(site)}"
-                    for site in matched_sites
-                ]
-            )
-        )
-
-        if input("Do you want to continue? [yN] ").lower() in "n":
-            return False
-
-    url_mainpage = extract_mainpage_url(url_exists)
-
-    print('Detecting site engine, please wait...')
-    sites = []
-    try:
-        sites = await detect_known_engine(db, url_exists, url_mainpage, logger)
-    except KeyboardInterrupt:
-        print('Engine detect process is interrupted.')
-
-    if not sites:
-        print("Unable to detect site engine, lets generate checking features")
-        sites = [
-            await check_features_manually(
-                db, url_exists, url_mainpage, cookie_file, logger
-            )
-        ]
-
-    logger.debug(sites[0].__dict__)
-
-    sem = asyncio.Semaphore(1)
-
-    print("Checking, please wait...")
-    found = False
-    chosen_site = None
-    for s in sites:
-        chosen_site = s
-        result = await site_self_check(s, logger, sem, db)
-        if not result["disabled"]:
-            found = True
-            break
-
-    if not found:
-        print(
-            f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
-        )
-        print(
-            "Try to run this mode again and increase features count or choose others."
-        )
-        return False
-    else:
-        if (
-            input(
-                f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
-            )
-            .lower()
-            .strip("y")
-        ):
-            return False
-
-    chosen_site.name = input("Change site name if you want: ") or chosen_site.name
-    chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
-    rank = get_alexa_rank(chosen_site.url_main)
-    if rank:
-        print(f'New alexa rank: {rank}')
-        chosen_site.alexa_rank = rank
-
-    logger.debug(chosen_site.json)
-    site_data = chosen_site.strip_engine_data()
-    logger.debug(site_data.json)
-    db.update_site(site_data)
-    return True
+class Submitter:
+    HEADERS = {
+        "User-Agent": get_random_user_agent(),
+    }
+
+    SEPARATORS = "\"'"
+
+    RATIO = 0.6
+    TOP_FEATURES = 5
+    URL_RE = re.compile(r"https?://(www\.)?")
+
+    def __init__(self, db: MaigretDatabase, settings: Settings, logger):
+        self.settings = settings
+        self.db = db
+        self.logger = logger
+
+    @staticmethod
+    def get_alexa_rank(site_url_main):
+        url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
+        xml_data = requests.get(url).text
+        root = ET.fromstring(xml_data)
+        alexa_rank = 0
+
+        try:
+            alexa_rank = int(root.find('.//REACH').attrib['RANK'])
+        except Exception:
+            pass
+
+        return alexa_rank
+
+    @staticmethod
+    def extract_mainpage_url(url):
+        return "/".join(url.split("/", 3)[:3])
+
+    async def site_self_check(self, site, semaphore, silent=False):
+        changes = {
+            "disabled": False,
+        }
+
+        check_data = [
+            (site.username_claimed, QueryStatus.CLAIMED),
+            (site.username_unclaimed, QueryStatus.AVAILABLE),
+        ]
+
+        self.logger.info(f"Checking {site.name}...")
+
+        for username, status in check_data:
+            results_dict = await maigret(
+                username=username,
+                site_dict={site.name: site},
+                logger=self.logger,
+                timeout=30,
+                id_type=site.type,
+                forced=True,
+                no_progressbar=True,
+            )
+
+            # don't disable entries with other ids types
+            # TODO: make normal checking
+            if site.name not in results_dict:
+                self.logger.info(results_dict)
+                changes["disabled"] = True
+                continue
+
+            result = results_dict[site.name]["status"]
+
+            site_status = result.status
+
+            if site_status != status:
+                if site_status == QueryStatus.UNKNOWN:
+                    msgs = site.absence_strs
+                    etype = site.check_type
+                    self.logger.warning(
+                        "Error while searching '%s' in %s: %s, %s, check type %s",
+                        username,
+                        site.name,
+                        result.context,
+                        msgs,
+                        etype,
+                    )
+                    # don't disable in case of available username
+                    if status == QueryStatus.CLAIMED:
+                        changes["disabled"] = True
+                elif status == QueryStatus.CLAIMED:
+                    self.logger.warning(
+                        f"Not found `{username}` in {site.name}, must be claimed"
+                    )
+                    self.logger.info(results_dict[site.name])
+                    changes["disabled"] = True
+                else:
+                    self.logger.warning(
+                        f"Found `{username}` in {site.name}, must be available"
+                    )
+                    self.logger.info(results_dict[site.name])
+                    changes["disabled"] = True
+
+        self.logger.info(f"Site {site.name} checking is finished")
+
+        return changes
+
+    def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog):
+        fields = {}
+        if 'urlSubpath' in engine.site.get('url', ''):
+            msg = (
+                'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
+                'Enter in manually if it exists: '
+            )
+            subpath = input(msg).strip('/')
+            if subpath:
+                fields['urlSubpath'] = f'/{subpath}'
+        return fields
+
+    async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]:
+        try:
+            r = requests.get(url_mainpage)
+            self.logger.debug(r.text)
+        except Exception as e:
+            self.logger.warning(e)
+            print("Some error while checking main page")
+            return []
+
+        for engine in self.db.engines:
+            strs_to_check = engine.__dict__.get("presenseStrs")
+            if strs_to_check and r and r.text:
+                all_strs_in_response = True
+                for s in strs_to_check:
+                    if s not in r.text:
+                        all_strs_in_response = False
+                sites = []
+                if all_strs_in_response:
+                    engine_name = engine.__dict__.get("name")
+
+                    print(f"Detected engine {engine_name} for site {url_mainpage}")
+
+                    usernames_to_check = self.settings.supposed_usernames
+                    supposed_username = self.extract_username_dialog(url_exists)
+                    if supposed_username:
+                        usernames_to_check = [supposed_username] + usernames_to_check
+
+                    add_fields = self.generate_additional_fields_dialog(
+                        engine, url_exists
+                    )
+
+                    for u in usernames_to_check:
+                        site_data = {
+                            "urlMain": url_mainpage,
+                            "name": url_mainpage.split("//")[1],
+                            "engine": engine_name,
+                            "usernameClaimed": u,
+                            "usernameUnclaimed": "noonewouldeverusethis7",
+                            **add_fields,
+                        }
+                        self.logger.info(site_data)
+
+                        maigret_site = MaigretSite(
+                            url_mainpage.split("/")[-1], site_data
+                        )
+                        maigret_site.update_from_engine(
+                            self.db.engines_dict[engine_name]
+                        )
+                        sites.append(maigret_site)
+
+                    return sites
+
+        return []
+
+    def extract_username_dialog(self, url):
+        url_parts = url.rstrip("/").split("/")
+        supposed_username = url_parts[-1].strip('@')
+        entered_username = input(
+            f'Is "{supposed_username}" a valid username? If not, write it manually: '
+        )
+        return entered_username if entered_username else supposed_username
+
+    async def check_features_manually(
+        self, url_exists, url_mainpage, cookie_file, redirects=False
+    ):
+        custom_headers = {}
+        while True:
+            header_key = input(
+                'Specify custom header if you need or just press Enter to skip. Header name: '
+            )
+            if not header_key:
+                break
+            header_value = input('Header value: ')
+            custom_headers[header_key.strip()] = header_value.strip()
+
+        supposed_username = self.extract_username_dialog(url_exists)
+        non_exist_username = "noonewouldeverusethis7"
+
+        url_user = url_exists.replace(supposed_username, "{username}")
+        url_not_exists = url_exists.replace(supposed_username, non_exist_username)
+
+        headers = dict(self.HEADERS)
+        headers.update(custom_headers)
+
+        # cookies
+        cookie_dict = None
+        if cookie_file:
+            self.logger.info(f'Use {cookie_file} for cookies')
+            cookie_jar = import_aiohttp_cookies(cookie_file)
+            cookie_dict = {c.key: c.value for c in cookie_jar}
+
+        exists_resp = requests.get(
+            url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
+        )
+        self.logger.debug(url_exists)
+        self.logger.debug(exists_resp.status_code)
+        self.logger.debug(exists_resp.text)
+
+        non_exists_resp = requests.get(
+            url_not_exists,
+            cookies=cookie_dict,
+            headers=headers,
+            allow_redirects=redirects,
+        )
+        self.logger.debug(url_not_exists)
+        self.logger.debug(non_exists_resp.status_code)
+        self.logger.debug(non_exists_resp.text)
+
+        a = exists_resp.text
+        b = non_exists_resp.text
+
+        tokens_a = set(re.split(f'[{self.SEPARATORS}]', a))
+        tokens_b = set(re.split(f'[{self.SEPARATORS}]', b))
+
+        a_minus_b = tokens_a.difference(tokens_b)
+        b_minus_a = tokens_b.difference(tokens_a)
+
+        if len(a_minus_b) == len(b_minus_a) == 0:
+            print("The pages for existing and non-existing account are the same!")
+
+        top_features_count = int(
+            input(
+                f"Specify count of features to extract [default {self.TOP_FEATURES}]: "
+            )
+            or self.TOP_FEATURES
+        )
+
+        match_fun = get_match_ratio(self.settings.presence_strings)
+
+        presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[
+            :top_features_count
+        ]
+
+        print("Detected text features of existing account: " + ", ".join(presence_list))
+        features = input("If features was not detected correctly, write it manually: ")
+
+        if features:
+            presence_list = list(map(str.strip, features.split(",")))
+
+        absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[
+            :top_features_count
+        ]
+        print(
+            "Detected text features of non-existing account: " + ", ".join(absence_list)
+        )
+        features = input("If features was not detected correctly, write it manually: ")
+
+        if features:
+            absence_list = list(map(str.strip, features.split(",")))
+
+        site_data = {
+            "absenceStrs": absence_list,
+            "presenseStrs": presence_list,
+            "url": url_user,
+            "urlMain": url_mainpage,
+            "usernameClaimed": supposed_username,
+            "usernameUnclaimed": non_exist_username,
+            "checkType": "message",
+        }
+
+        if headers != self.HEADERS:
+            site_data['headers'] = headers
+
+        site = MaigretSite(url_mainpage.split("/")[-1], site_data)
+        return site
+
+    async def dialog(self, url_exists, cookie_file):
+        domain_raw = self.URL_RE.sub("", url_exists).strip().strip("/")
+        domain_raw = domain_raw.split("/")[0]
+        self.logger.info('Domain is %s', domain_raw)
+
+        # check for existence
+        matched_sites = list(
+            filter(lambda x: domain_raw in x.url_main + x.url, self.db.sites)
+        )
+
+        if matched_sites:
+            print(
+                f'Sites with domain "{domain_raw}" already exists in the Maigret database!'
+            )
+            status = lambda s: "(disabled)" if s.disabled else ""
+            url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
+            print(
+                "\n".join(
+                    [
+                        f"{site.name} {status(site)}{url_block(site)}"
+                        for site in matched_sites
+                    ]
+                )
+            )
+
+            if input("Do you want to continue? [yN] ").lower() in "n":
+                return False
+
+        url_mainpage = self.extract_mainpage_url(url_exists)
+
+        print('Detecting site engine, please wait...')
+        sites = []
+        try:
+            sites = await self.detect_known_engine(url_exists, url_mainpage)
+        except KeyboardInterrupt:
+            print('Engine detect process is interrupted.')
+
+        if not sites:
+            print("Unable to detect site engine, lets generate checking features")
+            sites = [
+                await self.check_features_manually(
+                    url_exists, url_mainpage, cookie_file
+                )
+            ]
+
+        self.logger.debug(sites[0].__dict__)
+
+        sem = asyncio.Semaphore(1)
+
+        print("Checking, please wait...")
+        found = False
+        chosen_site = None
+        for s in sites:
+            chosen_site = s
+            result = await self.site_self_check(s, sem)
+            if not result["disabled"]:
+                found = True
+                break
+
+        if not found:
+            print(
+                f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
+            )
+            print(
+                "Try to run this mode again and increase features count or choose others."
+            )
+            return False
+        else:
+            if (
+                input(
+                    f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
+                )
+                .lower()
+                .strip("y")
+            ):
+                return False
+
+        chosen_site.name = input("Change site name if you want: ") or chosen_site.name
+        chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
+        rank = Submitter.get_alexa_rank(chosen_site.url_main)
+        if rank:
+            print(f'New alexa rank: {rank}')
+            chosen_site.alexa_rank = rank
+
+        self.logger.debug(chosen_site.json)
+        site_data = chosen_site.strip_engine_data()
+        self.logger.debug(site_data.json)
+        self.db.update_site(site_data)
+        return True
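The feature-extraction core of check_features_manually survives the move into the class unchanged: split the "account exists" and "account missing" pages into tokens on quote characters, diff the token sets, and rank what remains by similarity to the configured presence strings. Stripped of the interactive I/O, the technique looks roughly like this (the two HTML snippets are invented):

import re

from maigret.utils import get_match_ratio

SEPARATORS = "\"'"
page_exists = '<div class="profile">username: alice</div>'
page_missing = '<div class="error">user not found</div>'

# Tokenize both pages on single and double quotes.
tokens_a = set(re.split(f'[{SEPARATORS}]', page_exists))
tokens_b = set(re.split(f'[{SEPARATORS}]', page_missing))

# Scorer built from the presence strings (here a small hand-picked subset).
match_fun = get_match_ratio(["username", "not found", "profile"])

# Tokens unique to the "exists" page, most profile-like first.
presence_list = sorted(tokens_a - tokens_b, key=match_fun, reverse=True)[:5]
# Tokens unique to the "missing" page become absence markers.
absence_list = sorted(tokens_b - tokens_a, key=match_fun, reverse=True)[:5]

print(presence_list, absence_list)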
@@ -1,4 +1,5 @@
 import ast
+import difflib
 import re
 import random
 from typing import Any
@@ -95,3 +96,18 @@ def get_dict_ascii_tree(items, prepend="", new_line=True):
 
 def get_random_user_agent():
     return random.choice(DEFAULT_USER_AGENTS)
+
+
+def get_match_ratio(base_strs: list):
+    def get_match_inner(s: str):
+        return round(
+            max(
+                [
+                    difflib.SequenceMatcher(a=s.lower(), b=s2.lower()).ratio()
+                    for s2 in base_strs
+                ]
+            ),
+            2,
+        )
+
+    return get_match_inner
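get_match_ratio is now a closure factory: it captures the base strings once and hands back a scorer reusable as a sort key, which is how the Submitter calls it with settings.presence_strings instead of the old hardcoded DESIRED_STRINGS. For example:

from maigret.utils import get_match_ratio

score = get_match_ratio(["username", "profile"])
print(score("Username"))  # 1.0, comparison is case-insensitive
print(score("user"))      # 0.67, best ratio is against "username"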
+3 -2
@@ -1,15 +1,16 @@
 """Maigret data test functions"""
 
 from maigret.utils import is_country_tag
-from maigret.sites import SUPPORTED_TAGS
 
 
 def test_tags_validity(default_db):
     unknown_tags = set()
 
+    tags = default_db._tags
+
     for site in default_db.sites:
         for tag in filter(lambda x: not is_country_tag(x), site.tags):
-            if tag not in SUPPORTED_TAGS:
+            if tag not in tags:
                 unknown_tags.add(tag)
 
     assert unknown_tags == set()
@@ -1,5 +1,6 @@
 """Maigret Database test functions"""
 from maigret.sites import MaigretDatabase, MaigretSite
+from maigret.utils import URLMatcher
 
 EXAMPLE_DB = {
     'engines': {
@@ -179,3 +180,26 @@ def test_ranked_sites_dict_id_type():
     assert len(db.ranked_sites_dict()) == 2
     assert len(db.ranked_sites_dict(id_type='username')) == 2
     assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
+
+
+def test_get_url_template():
+    site = MaigretSite(
+        "test",
+        {
+            "urlMain": "https://ya.ru/",
+            "url": "{urlMain}{urlSubpath}/members/?username={username}",
+        },
+    )
+    assert (
+        site.get_url_template()
+        == "{urlMain}{urlSubpath}/members/?username={username} (no engine)"
+    )
+
+    site = MaigretSite(
+        "test",
+        {
+            "urlMain": "https://ya.ru/",
+            "url": "https://{username}.ya.ru",
+        },
+    )
+    assert site.get_url_template() == "SUBDOMAIN"
@@ -8,6 +8,7 @@ from maigret.utils import (
     enrich_link_str,
     URLMatcher,
     get_dict_ascii_tree,
+    get_match_ratio,
 )
 
 
@@ -136,3 +137,9 @@ def test_get_dict_ascii_tree():
 ┣╸instagram_username: Street.Reality.Photography
 ┗╸twitter_username: Alexaimephotogr"""
     )
+
+
+def test_get_match_ratio():
+    fun = get_match_ratio(["test", "maigret", "username"])
+
+    assert fun("test") == 1