mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-06 22:19:01 +00:00
Refactoring of submit module, some fixes
This commit is contained in:
@@ -25,7 +25,7 @@ format:
|
||||
pull:
|
||||
git stash
|
||||
git checkout main
|
||||
git pull origin head
|
||||
git pull origin main
|
||||
git stash pop
|
||||
|
||||
clean:
|
||||
|
||||
+10
-4
@@ -36,9 +36,10 @@ from .report import (
|
||||
sort_report_by_data_points,
|
||||
)
|
||||
from .sites import MaigretDatabase
|
||||
from .submit import submit_dialog
|
||||
from .submit import Submitter
|
||||
from .types import QueryResultWrapper
|
||||
from .utils import get_dict_ascii_tree
|
||||
from .settings import Settings
|
||||
|
||||
|
||||
def notify_about_errors(search_results: QueryResultWrapper, query_notify):
|
||||
@@ -496,6 +497,12 @@ async def main():
|
||||
if args.tags:
|
||||
args.tags = list(set(str(args.tags).split(',')))
|
||||
|
||||
settings = Settings(
|
||||
os.path.join(
|
||||
os.path.dirname(os.path.realpath(__file__)), "resources/settings.json"
|
||||
)
|
||||
)
|
||||
|
||||
if args.db_file is None:
|
||||
args.db_file = os.path.join(
|
||||
os.path.dirname(os.path.realpath(__file__)), "resources/data.json"
|
||||
@@ -526,9 +533,8 @@ async def main():
|
||||
site_data = get_top_sites_for_id(args.id_type)
|
||||
|
||||
if args.new_site_to_submit:
|
||||
is_submitted = await submit_dialog(
|
||||
db, args.new_site_to_submit, args.cookie_file, logger
|
||||
)
|
||||
submitter = Submitter(db=db, logger=logger, settings=settings)
|
||||
is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
|
||||
if is_submitted:
|
||||
db.save_to_file(args.db_file)
|
||||
|
||||
|
||||
@@ -13036,7 +13036,7 @@
|
||||
"us"
|
||||
],
|
||||
"headers": {
|
||||
"authorization": "Bearer BQCypIuUtz7zDFov8xN86mj1BelLf7Apf9WBaC5yYfNkmGe4r7Hz4Awp6dqPuCAP9K9F5yYtjbyZX_vlr4I"
|
||||
"authorization": "Bearer BQAkHoH1XLhjIl6oh6r9YzH3kHC1OZg3UXgLiz39FzqRFh_xQrFaVrZcU-esM-t87B6Hqdc4L1HBgukKnWE"
|
||||
},
|
||||
"errors": {
|
||||
"Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
|
||||
@@ -13990,7 +13990,8 @@
|
||||
"us"
|
||||
],
|
||||
"errors": {
|
||||
"Website unavailable": "Site error"
|
||||
"Website unavailable": "Site error",
|
||||
"is currently offline": "Site error"
|
||||
},
|
||||
"checkType": "message",
|
||||
"absenceStrs": [
|
||||
@@ -14462,7 +14463,7 @@
|
||||
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
|
||||
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
|
||||
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
|
||||
"x-guest-token": "1400174453577900043"
|
||||
"x-guest-token": "1403829602053771266"
|
||||
},
|
||||
"errors": {
|
||||
"Bad guest token": "x-guest-token update required"
|
||||
@@ -14869,7 +14870,7 @@
|
||||
"video"
|
||||
],
|
||||
"headers": {
|
||||
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjI2NjcxMjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.V4VVbLzNwPU21rNP5moSxrPcPw--C7_Qz9VHgcJc1CA"
|
||||
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM1MzQ5NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.5T8_p_q9zXOHXI2FT_XtMhsZUJMtPgCIaqwVF2u4aZI"
|
||||
},
|
||||
"activation": {
|
||||
"url": "https://vimeo.com/_rv/viewer",
|
||||
@@ -28457,5 +28458,63 @@
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"tags": [
|
||||
"gaming",
|
||||
"coding",
|
||||
"photo",
|
||||
"music",
|
||||
"blog",
|
||||
"finance",
|
||||
"freelance",
|
||||
"dating",
|
||||
"tech",
|
||||
"forum",
|
||||
"porn",
|
||||
"erotic",
|
||||
"webcam",
|
||||
"video",
|
||||
"movies",
|
||||
"hacking",
|
||||
"art",
|
||||
"discussion",
|
||||
"sharing",
|
||||
"writing",
|
||||
"wiki",
|
||||
"business",
|
||||
"shopping",
|
||||
"sport",
|
||||
"books",
|
||||
"news",
|
||||
"documents",
|
||||
"travel",
|
||||
"maps",
|
||||
"hobby",
|
||||
"apps",
|
||||
"classified",
|
||||
"career",
|
||||
"geosocial",
|
||||
"streaming",
|
||||
"education",
|
||||
"networking",
|
||||
"torrent",
|
||||
"science",
|
||||
"medicine",
|
||||
"reading",
|
||||
"stock",
|
||||
"messaging",
|
||||
"trading",
|
||||
"links",
|
||||
"fashion",
|
||||
"tasks",
|
||||
"military",
|
||||
"auto",
|
||||
"gambling",
|
||||
"cybercriminal",
|
||||
"review",
|
||||
"bookmarks",
|
||||
"design",
|
||||
"tor",
|
||||
"i2p"
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,17 @@
|
||||
{
|
||||
"presence_strings": [
|
||||
"username",
|
||||
"not found",
|
||||
"пользователь",
|
||||
"profile",
|
||||
"lastname",
|
||||
"firstname",
|
||||
"biography",
|
||||
"birthday",
|
||||
"репутация",
|
||||
"информация",
|
||||
"e-mail"
|
||||
],
|
||||
"supposed_usernames": [
|
||||
"alex", "god", "admin", "red", "blue", "john"]
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
import json
|
||||
|
||||
|
||||
class Settings:
|
||||
presence_strings: list
|
||||
supposed_usernames: list
|
||||
|
||||
def __init__(self, filename):
|
||||
data = {}
|
||||
|
||||
try:
|
||||
with open(filename, "r", encoding="utf-8") as file:
|
||||
try:
|
||||
data = json.load(file)
|
||||
except Exception as error:
|
||||
raise ValueError(
|
||||
f"Problem with parsing json contents of "
|
||||
f"settings file '{filename}': {str(error)}."
|
||||
)
|
||||
except FileNotFoundError as error:
|
||||
raise FileNotFoundError(
|
||||
f"Problem while attempting to access settings file '{filename}'."
|
||||
) from error
|
||||
|
||||
self.__dict__.update(data)
|
||||
|
||||
@property
|
||||
def json(self):
|
||||
return self.__dict__
|
||||
+11
-66
@@ -9,66 +9,6 @@ import requests
|
||||
|
||||
from .utils import CaseConverter, URLMatcher, is_country_tag
|
||||
|
||||
# TODO: move to data.json
|
||||
SUPPORTED_TAGS = [
|
||||
"gaming",
|
||||
"coding",
|
||||
"photo",
|
||||
"music",
|
||||
"blog",
|
||||
"finance",
|
||||
"freelance",
|
||||
"dating",
|
||||
"tech",
|
||||
"forum",
|
||||
"porn",
|
||||
"erotic",
|
||||
"webcam",
|
||||
"video",
|
||||
"movies",
|
||||
"hacking",
|
||||
"art",
|
||||
"discussion",
|
||||
"sharing",
|
||||
"writing",
|
||||
"wiki",
|
||||
"business",
|
||||
"shopping",
|
||||
"sport",
|
||||
"books",
|
||||
"news",
|
||||
"documents",
|
||||
"travel",
|
||||
"maps",
|
||||
"hobby",
|
||||
"apps",
|
||||
"classified",
|
||||
"career",
|
||||
"geosocial",
|
||||
"streaming",
|
||||
"education",
|
||||
"networking",
|
||||
"torrent",
|
||||
"science",
|
||||
"medicine",
|
||||
"reading",
|
||||
"stock",
|
||||
"messaging",
|
||||
"trading",
|
||||
"links",
|
||||
"fashion",
|
||||
"tasks",
|
||||
"military",
|
||||
"auto",
|
||||
"gambling",
|
||||
"cybercriminal",
|
||||
"review",
|
||||
"bookmarks",
|
||||
"design",
|
||||
"tor",
|
||||
"i2p",
|
||||
]
|
||||
|
||||
|
||||
class MaigretEngine:
|
||||
site: Dict[str, Any] = {}
|
||||
@@ -204,12 +144,12 @@ class MaigretSite:
|
||||
errors.update(self.errors)
|
||||
return errors
|
||||
|
||||
def get_url_type(self) -> str:
|
||||
def get_url_template(self) -> str:
|
||||
url = URLMatcher.extract_main_part(self.url)
|
||||
if url.startswith("{username}"):
|
||||
url = "SUBDOMAIN"
|
||||
elif url == "":
|
||||
url = f"{self.url} ({self.engine})"
|
||||
url = f"{self.url} ({self.engine or 'no engine'})"
|
||||
else:
|
||||
parts = url.split("/")
|
||||
url = "/" + "/".join(parts[1:])
|
||||
@@ -273,8 +213,9 @@ class MaigretSite:
|
||||
|
||||
class MaigretDatabase:
|
||||
def __init__(self):
|
||||
self._sites = []
|
||||
self._engines = []
|
||||
self._tags: list = []
|
||||
self._sites: list = []
|
||||
self._engines: list = []
|
||||
|
||||
@property
|
||||
def sites(self):
|
||||
@@ -354,6 +295,7 @@ class MaigretDatabase:
|
||||
db_data = {
|
||||
"sites": {site.name: site.strip_engine_data().json for site in self._sites},
|
||||
"engines": {engine.name: engine.json for engine in self._engines},
|
||||
"tags": self._tags,
|
||||
}
|
||||
|
||||
json_data = json.dumps(db_data, indent=4)
|
||||
@@ -367,6 +309,9 @@ class MaigretDatabase:
|
||||
# Add all of site information from the json file to internal site list.
|
||||
site_data = json_data.get("sites", {})
|
||||
engines_data = json_data.get("engines", {})
|
||||
tags = json_data.get("tags", [])
|
||||
|
||||
self._tags += tags
|
||||
|
||||
for engine_name in engines_data:
|
||||
self._engines.append(MaigretEngine(engine_name, engines_data[engine_name]))
|
||||
@@ -469,7 +414,7 @@ class MaigretDatabase:
|
||||
if site.disabled:
|
||||
disabled_count += 1
|
||||
|
||||
url_type = site.get_url_type()
|
||||
url_type = site.get_url_template()
|
||||
urls[url_type] = urls.get(url_type, 0) + 1
|
||||
|
||||
if not site.tags:
|
||||
@@ -488,7 +433,7 @@ class MaigretDatabase:
|
||||
output += "Top tags:\n"
|
||||
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
|
||||
mark = ""
|
||||
if tag not in SUPPORTED_TAGS:
|
||||
if tag not in self._tags:
|
||||
mark = " (non-standard)"
|
||||
output += f"{count}\t{tag}{mark}\n"
|
||||
|
||||
|
||||
+352
-360
@@ -1,5 +1,4 @@
|
||||
import asyncio
|
||||
import difflib
|
||||
import re
|
||||
from typing import List
|
||||
import xml.etree.ElementTree as ET
|
||||
@@ -8,382 +7,375 @@ import requests
|
||||
from .activation import import_aiohttp_cookies
|
||||
from .checking import maigret
|
||||
from .result import QueryStatus
|
||||
from .settings import Settings
|
||||
from .sites import MaigretDatabase, MaigretSite, MaigretEngine
|
||||
from .utils import get_random_user_agent
|
||||
from .utils import get_random_user_agent, get_match_ratio
|
||||
|
||||
|
||||
DESIRED_STRINGS = [
|
||||
"username",
|
||||
"not found",
|
||||
"пользователь",
|
||||
"profile",
|
||||
"lastname",
|
||||
"firstname",
|
||||
"biography",
|
||||
"birthday",
|
||||
"репутация",
|
||||
"информация",
|
||||
"e-mail",
|
||||
]
|
||||
|
||||
SUPPOSED_USERNAMES = ["alex", "god", "admin", "red", "blue", "john"]
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": get_random_user_agent(),
|
||||
}
|
||||
|
||||
SEPARATORS = "\"'"
|
||||
|
||||
RATIO = 0.6
|
||||
TOP_FEATURES = 5
|
||||
URL_RE = re.compile(r"https?://(www\.)?")
|
||||
|
||||
|
||||
def get_match_ratio(x):
|
||||
return round(
|
||||
max(
|
||||
[difflib.SequenceMatcher(a=x.lower(), b=y).ratio() for y in DESIRED_STRINGS]
|
||||
),
|
||||
2,
|
||||
)
|
||||
|
||||
|
||||
def get_alexa_rank(site_url_main):
|
||||
url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
|
||||
xml_data = requests.get(url).text
|
||||
root = ET.fromstring(xml_data)
|
||||
alexa_rank = 0
|
||||
|
||||
try:
|
||||
alexa_rank = int(root.find('.//REACH').attrib['RANK'])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return alexa_rank
|
||||
|
||||
|
||||
def extract_mainpage_url(url):
|
||||
return "/".join(url.split("/", 3)[:3])
|
||||
|
||||
|
||||
async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
|
||||
changes = {
|
||||
"disabled": False,
|
||||
class Submitter:
|
||||
HEADERS = {
|
||||
"User-Agent": get_random_user_agent(),
|
||||
}
|
||||
|
||||
check_data = [
|
||||
(site.username_claimed, QueryStatus.CLAIMED),
|
||||
(site.username_unclaimed, QueryStatus.AVAILABLE),
|
||||
]
|
||||
SEPARATORS = "\"'"
|
||||
|
||||
logger.info(f"Checking {site.name}...")
|
||||
RATIO = 0.6
|
||||
TOP_FEATURES = 5
|
||||
URL_RE = re.compile(r"https?://(www\.)?")
|
||||
|
||||
for username, status in check_data:
|
||||
results_dict = await maigret(
|
||||
username=username,
|
||||
site_dict={site.name: site},
|
||||
logger=logger,
|
||||
timeout=30,
|
||||
id_type=site.type,
|
||||
forced=True,
|
||||
no_progressbar=True,
|
||||
)
|
||||
def __init__(self, db: MaigretDatabase, settings: Settings, logger):
|
||||
self.settings = settings
|
||||
self.db = db
|
||||
self.logger = logger
|
||||
|
||||
# don't disable entries with other ids types
|
||||
# TODO: make normal checking
|
||||
if site.name not in results_dict:
|
||||
logger.info(results_dict)
|
||||
changes["disabled"] = True
|
||||
continue
|
||||
@staticmethod
|
||||
def get_alexa_rank(site_url_main):
|
||||
url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
|
||||
xml_data = requests.get(url).text
|
||||
root = ET.fromstring(xml_data)
|
||||
alexa_rank = 0
|
||||
|
||||
result = results_dict[site.name]["status"]
|
||||
try:
|
||||
alexa_rank = int(root.find('.//REACH').attrib['RANK'])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
site_status = result.status
|
||||
return alexa_rank
|
||||
|
||||
if site_status != status:
|
||||
if site_status == QueryStatus.UNKNOWN:
|
||||
msgs = site.absence_strs
|
||||
etype = site.check_type
|
||||
logger.warning(
|
||||
"Error while searching '%s' in %s: %s, %s, check type %s",
|
||||
username,
|
||||
site.name,
|
||||
result.context,
|
||||
msgs,
|
||||
etype,
|
||||
)
|
||||
# don't disable in case of available username
|
||||
if status == QueryStatus.CLAIMED:
|
||||
changes["disabled"] = True
|
||||
elif status == QueryStatus.CLAIMED:
|
||||
logger.warning(
|
||||
f"Not found `{username}` in {site.name}, must be claimed"
|
||||
)
|
||||
logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
else:
|
||||
logger.warning(f"Found `{username}` in {site.name}, must be available")
|
||||
logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
@staticmethod
|
||||
def extract_mainpage_url(url):
|
||||
return "/".join(url.split("/", 3)[:3])
|
||||
|
||||
logger.info(f"Site {site.name} checking is finished")
|
||||
async def site_self_check(self, site, semaphore, silent=False):
|
||||
changes = {
|
||||
"disabled": False,
|
||||
}
|
||||
|
||||
return changes
|
||||
|
||||
|
||||
def generate_additional_fields_dialog(engine: MaigretEngine, dialog):
|
||||
fields = {}
|
||||
if 'urlSubpath' in engine.site.get('url', ''):
|
||||
msg = (
|
||||
'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
|
||||
'Enter in manually if it exists: '
|
||||
)
|
||||
subpath = input(msg).strip('/')
|
||||
if subpath:
|
||||
fields['urlSubpath'] = f'/{subpath}'
|
||||
return fields
|
||||
|
||||
|
||||
async def detect_known_engine(
|
||||
db, url_exists, url_mainpage, logger
|
||||
) -> List[MaigretSite]:
|
||||
try:
|
||||
r = requests.get(url_mainpage)
|
||||
logger.debug(r.text)
|
||||
except Exception as e:
|
||||
logger.warning(e)
|
||||
print("Some error while checking main page")
|
||||
return []
|
||||
|
||||
for engine in db.engines:
|
||||
strs_to_check = engine.__dict__.get("presenseStrs")
|
||||
if strs_to_check and r and r.text:
|
||||
all_strs_in_response = True
|
||||
for s in strs_to_check:
|
||||
if s not in r.text:
|
||||
all_strs_in_response = False
|
||||
sites = []
|
||||
if all_strs_in_response:
|
||||
engine_name = engine.__dict__.get("name")
|
||||
|
||||
print(f"Detected engine {engine_name} for site {url_mainpage}")
|
||||
|
||||
usernames_to_check = SUPPOSED_USERNAMES
|
||||
supposed_username = extract_username_dialog(url_exists)
|
||||
if supposed_username:
|
||||
usernames_to_check = [supposed_username] + usernames_to_check
|
||||
|
||||
add_fields = generate_additional_fields_dialog(engine, url_exists)
|
||||
|
||||
for u in usernames_to_check:
|
||||
site_data = {
|
||||
"urlMain": url_mainpage,
|
||||
"name": url_mainpage.split("//")[1],
|
||||
"engine": engine_name,
|
||||
"usernameClaimed": u,
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
**add_fields,
|
||||
}
|
||||
logger.info(site_data)
|
||||
|
||||
maigret_site = MaigretSite(url_mainpage.split("/")[-1], site_data)
|
||||
maigret_site.update_from_engine(db.engines_dict[engine_name])
|
||||
sites.append(maigret_site)
|
||||
|
||||
return sites
|
||||
|
||||
return []
|
||||
|
||||
|
||||
def extract_username_dialog(url):
|
||||
url_parts = url.rstrip("/").split("/")
|
||||
supposed_username = url_parts[-1].strip('@')
|
||||
entered_username = input(
|
||||
f'Is "{supposed_username}" a valid username? If not, write it manually: '
|
||||
)
|
||||
return entered_username if entered_username else supposed_username
|
||||
|
||||
|
||||
async def check_features_manually(
|
||||
db, url_exists, url_mainpage, cookie_file, logger, redirects=False
|
||||
):
|
||||
custom_headers = {}
|
||||
while True:
|
||||
header_key = input(
|
||||
'Specify custom header if you need or just press Enter to skip. Header name: '
|
||||
)
|
||||
if not header_key:
|
||||
break
|
||||
header_value = input('Header value: ')
|
||||
custom_headers[header_key.strip()] = header_value.strip()
|
||||
|
||||
supposed_username = extract_username_dialog(url_exists)
|
||||
non_exist_username = "noonewouldeverusethis7"
|
||||
|
||||
url_user = url_exists.replace(supposed_username, "{username}")
|
||||
url_not_exists = url_exists.replace(supposed_username, non_exist_username)
|
||||
|
||||
headers = dict(HEADERS)
|
||||
headers.update(custom_headers)
|
||||
|
||||
# cookies
|
||||
cookie_dict = None
|
||||
if cookie_file:
|
||||
logger.info(f'Use {cookie_file} for cookies')
|
||||
cookie_jar = import_aiohttp_cookies(cookie_file)
|
||||
cookie_dict = {c.key: c.value for c in cookie_jar}
|
||||
|
||||
exists_resp = requests.get(
|
||||
url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
|
||||
)
|
||||
logger.debug(url_exists)
|
||||
logger.debug(exists_resp.status_code)
|
||||
logger.debug(exists_resp.text)
|
||||
|
||||
non_exists_resp = requests.get(
|
||||
url_not_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
|
||||
)
|
||||
logger.debug(url_not_exists)
|
||||
logger.debug(non_exists_resp.status_code)
|
||||
logger.debug(non_exists_resp.text)
|
||||
|
||||
a = exists_resp.text
|
||||
b = non_exists_resp.text
|
||||
|
||||
tokens_a = set(re.split(f'[{SEPARATORS}]', a))
|
||||
tokens_b = set(re.split(f'[{SEPARATORS}]', b))
|
||||
|
||||
a_minus_b = tokens_a.difference(tokens_b)
|
||||
b_minus_a = tokens_b.difference(tokens_a)
|
||||
|
||||
if len(a_minus_b) == len(b_minus_a) == 0:
|
||||
print("The pages for existing and non-existing account are the same!")
|
||||
|
||||
top_features_count = int(
|
||||
input(f"Specify count of features to extract [default {TOP_FEATURES}]: ")
|
||||
or TOP_FEATURES
|
||||
)
|
||||
|
||||
presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[
|
||||
:top_features_count
|
||||
]
|
||||
|
||||
print("Detected text features of existing account: " + ", ".join(presence_list))
|
||||
features = input("If features was not detected correctly, write it manually: ")
|
||||
|
||||
if features:
|
||||
presence_list = list(map(str.strip, features.split(",")))
|
||||
|
||||
absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[
|
||||
:top_features_count
|
||||
]
|
||||
print("Detected text features of non-existing account: " + ", ".join(absence_list))
|
||||
features = input("If features was not detected correctly, write it manually: ")
|
||||
|
||||
if features:
|
||||
absence_list = list(map(str.strip, features.split(",")))
|
||||
|
||||
site_data = {
|
||||
"absenceStrs": absence_list,
|
||||
"presenseStrs": presence_list,
|
||||
"url": url_user,
|
||||
"urlMain": url_mainpage,
|
||||
"usernameClaimed": supposed_username,
|
||||
"usernameUnclaimed": non_exist_username,
|
||||
"checkType": "message",
|
||||
}
|
||||
|
||||
if headers != HEADERS:
|
||||
site_data['headers'] = headers
|
||||
|
||||
site = MaigretSite(url_mainpage.split("/")[-1], site_data)
|
||||
return site
|
||||
|
||||
|
||||
async def submit_dialog(db, url_exists, cookie_file, logger):
|
||||
domain_raw = URL_RE.sub("", url_exists).strip().strip("/")
|
||||
domain_raw = domain_raw.split("/")[0]
|
||||
logger.info('Domain is %s', domain_raw)
|
||||
|
||||
# check for existence
|
||||
matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
|
||||
|
||||
if matched_sites:
|
||||
print(
|
||||
f'Sites with domain "{domain_raw}" already exists in the Maigret database!'
|
||||
)
|
||||
status = lambda s: "(disabled)" if s.disabled else ""
|
||||
url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
|
||||
print(
|
||||
"\n".join(
|
||||
[
|
||||
f"{site.name} {status(site)}{url_block(site)}"
|
||||
for site in matched_sites
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
if input("Do you want to continue? [yN] ").lower() in "n":
|
||||
return False
|
||||
|
||||
url_mainpage = extract_mainpage_url(url_exists)
|
||||
|
||||
print('Detecting site engine, please wait...')
|
||||
sites = []
|
||||
try:
|
||||
sites = await detect_known_engine(db, url_exists, url_mainpage, logger)
|
||||
except KeyboardInterrupt:
|
||||
print('Engine detect process is interrupted.')
|
||||
|
||||
if not sites:
|
||||
print("Unable to detect site engine, lets generate checking features")
|
||||
sites = [
|
||||
await check_features_manually(
|
||||
db, url_exists, url_mainpage, cookie_file, logger
|
||||
)
|
||||
check_data = [
|
||||
(site.username_claimed, QueryStatus.CLAIMED),
|
||||
(site.username_unclaimed, QueryStatus.AVAILABLE),
|
||||
]
|
||||
|
||||
logger.debug(sites[0].__dict__)
|
||||
self.logger.info(f"Checking {site.name}...")
|
||||
|
||||
sem = asyncio.Semaphore(1)
|
||||
|
||||
print("Checking, please wait...")
|
||||
found = False
|
||||
chosen_site = None
|
||||
for s in sites:
|
||||
chosen_site = s
|
||||
result = await site_self_check(s, logger, sem, db)
|
||||
if not result["disabled"]:
|
||||
found = True
|
||||
break
|
||||
|
||||
if not found:
|
||||
print(
|
||||
f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
|
||||
)
|
||||
print(
|
||||
"Try to run this mode again and increase features count or choose others."
|
||||
)
|
||||
return False
|
||||
else:
|
||||
if (
|
||||
input(
|
||||
f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
|
||||
for username, status in check_data:
|
||||
results_dict = await maigret(
|
||||
username=username,
|
||||
site_dict={site.name: site},
|
||||
logger=self.logger,
|
||||
timeout=30,
|
||||
id_type=site.type,
|
||||
forced=True,
|
||||
no_progressbar=True,
|
||||
)
|
||||
|
||||
# don't disable entries with other ids types
|
||||
# TODO: make normal checking
|
||||
if site.name not in results_dict:
|
||||
self.logger.info(results_dict)
|
||||
changes["disabled"] = True
|
||||
continue
|
||||
|
||||
result = results_dict[site.name]["status"]
|
||||
|
||||
site_status = result.status
|
||||
|
||||
if site_status != status:
|
||||
if site_status == QueryStatus.UNKNOWN:
|
||||
msgs = site.absence_strs
|
||||
etype = site.check_type
|
||||
self.logger.warning(
|
||||
"Error while searching '%s' in %s: %s, %s, check type %s",
|
||||
username,
|
||||
site.name,
|
||||
result.context,
|
||||
msgs,
|
||||
etype,
|
||||
)
|
||||
# don't disable in case of available username
|
||||
if status == QueryStatus.CLAIMED:
|
||||
changes["disabled"] = True
|
||||
elif status == QueryStatus.CLAIMED:
|
||||
self.logger.warning(
|
||||
f"Not found `{username}` in {site.name}, must be claimed"
|
||||
)
|
||||
self.logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
else:
|
||||
self.logger.warning(
|
||||
f"Found `{username}` in {site.name}, must be available"
|
||||
)
|
||||
self.logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
|
||||
self.logger.info(f"Site {site.name} checking is finished")
|
||||
|
||||
return changes
|
||||
|
||||
def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog):
|
||||
fields = {}
|
||||
if 'urlSubpath' in engine.site.get('url', ''):
|
||||
msg = (
|
||||
'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
|
||||
'Enter in manually if it exists: '
|
||||
)
|
||||
subpath = input(msg).strip('/')
|
||||
if subpath:
|
||||
fields['urlSubpath'] = f'/{subpath}'
|
||||
return fields
|
||||
|
||||
async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]:
|
||||
try:
|
||||
r = requests.get(url_mainpage)
|
||||
self.logger.debug(r.text)
|
||||
except Exception as e:
|
||||
self.logger.warning(e)
|
||||
print("Some error while checking main page")
|
||||
return []
|
||||
|
||||
for engine in self.db.engines:
|
||||
strs_to_check = engine.__dict__.get("presenseStrs")
|
||||
if strs_to_check and r and r.text:
|
||||
all_strs_in_response = True
|
||||
for s in strs_to_check:
|
||||
if s not in r.text:
|
||||
all_strs_in_response = False
|
||||
sites = []
|
||||
if all_strs_in_response:
|
||||
engine_name = engine.__dict__.get("name")
|
||||
|
||||
print(f"Detected engine {engine_name} for site {url_mainpage}")
|
||||
|
||||
usernames_to_check = self.settings.supposed_usernames
|
||||
supposed_username = self.extract_username_dialog(url_exists)
|
||||
if supposed_username:
|
||||
usernames_to_check = [supposed_username] + usernames_to_check
|
||||
|
||||
add_fields = self.generate_additional_fields_dialog(
|
||||
engine, url_exists
|
||||
)
|
||||
|
||||
for u in usernames_to_check:
|
||||
site_data = {
|
||||
"urlMain": url_mainpage,
|
||||
"name": url_mainpage.split("//")[1],
|
||||
"engine": engine_name,
|
||||
"usernameClaimed": u,
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
**add_fields,
|
||||
}
|
||||
self.logger.info(site_data)
|
||||
|
||||
maigret_site = MaigretSite(
|
||||
url_mainpage.split("/")[-1], site_data
|
||||
)
|
||||
maigret_site.update_from_engine(
|
||||
self.db.engines_dict[engine_name]
|
||||
)
|
||||
sites.append(maigret_site)
|
||||
|
||||
return sites
|
||||
|
||||
return []
|
||||
|
||||
def extract_username_dialog(self, url):
|
||||
url_parts = url.rstrip("/").split("/")
|
||||
supposed_username = url_parts[-1].strip('@')
|
||||
entered_username = input(
|
||||
f'Is "{supposed_username}" a valid username? If not, write it manually: '
|
||||
)
|
||||
return entered_username if entered_username else supposed_username
|
||||
|
||||
async def check_features_manually(
|
||||
self, url_exists, url_mainpage, cookie_file, redirects=False
|
||||
):
|
||||
custom_headers = {}
|
||||
while True:
|
||||
header_key = input(
|
||||
'Specify custom header if you need or just press Enter to skip. Header name: '
|
||||
)
|
||||
if not header_key:
|
||||
break
|
||||
header_value = input('Header value: ')
|
||||
custom_headers[header_key.strip()] = header_value.strip()
|
||||
|
||||
supposed_username = self.extract_username_dialog(url_exists)
|
||||
non_exist_username = "noonewouldeverusethis7"
|
||||
|
||||
url_user = url_exists.replace(supposed_username, "{username}")
|
||||
url_not_exists = url_exists.replace(supposed_username, non_exist_username)
|
||||
|
||||
headers = dict(self.HEADERS)
|
||||
headers.update(custom_headers)
|
||||
|
||||
# cookies
|
||||
cookie_dict = None
|
||||
if cookie_file:
|
||||
self.logger.info(f'Use {cookie_file} for cookies')
|
||||
cookie_jar = import_aiohttp_cookies(cookie_file)
|
||||
cookie_dict = {c.key: c.value for c in cookie_jar}
|
||||
|
||||
exists_resp = requests.get(
|
||||
url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
|
||||
)
|
||||
self.logger.debug(url_exists)
|
||||
self.logger.debug(exists_resp.status_code)
|
||||
self.logger.debug(exists_resp.text)
|
||||
|
||||
non_exists_resp = requests.get(
|
||||
url_not_exists,
|
||||
cookies=cookie_dict,
|
||||
headers=headers,
|
||||
allow_redirects=redirects,
|
||||
)
|
||||
self.logger.debug(url_not_exists)
|
||||
self.logger.debug(non_exists_resp.status_code)
|
||||
self.logger.debug(non_exists_resp.text)
|
||||
|
||||
a = exists_resp.text
|
||||
b = non_exists_resp.text
|
||||
|
||||
tokens_a = set(re.split(f'[{self.SEPARATORS}]', a))
|
||||
tokens_b = set(re.split(f'[{self.SEPARATORS}]', b))
|
||||
|
||||
a_minus_b = tokens_a.difference(tokens_b)
|
||||
b_minus_a = tokens_b.difference(tokens_a)
|
||||
|
||||
if len(a_minus_b) == len(b_minus_a) == 0:
|
||||
print("The pages for existing and non-existing account are the same!")
|
||||
|
||||
top_features_count = int(
|
||||
input(
|
||||
f"Specify count of features to extract [default {self.TOP_FEATURES}]: "
|
||||
)
|
||||
or self.TOP_FEATURES
|
||||
)
|
||||
|
||||
match_fun = get_match_ratio(self.settings.presence_strings)
|
||||
|
||||
presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[
|
||||
:top_features_count
|
||||
]
|
||||
|
||||
print("Detected text features of existing account: " + ", ".join(presence_list))
|
||||
features = input("If features was not detected correctly, write it manually: ")
|
||||
|
||||
if features:
|
||||
presence_list = list(map(str.strip, features.split(",")))
|
||||
|
||||
absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[
|
||||
:top_features_count
|
||||
]
|
||||
print(
|
||||
"Detected text features of non-existing account: " + ", ".join(absence_list)
|
||||
)
|
||||
features = input("If features was not detected correctly, write it manually: ")
|
||||
|
||||
if features:
|
||||
absence_list = list(map(str.strip, features.split(",")))
|
||||
|
||||
site_data = {
|
||||
"absenceStrs": absence_list,
|
||||
"presenseStrs": presence_list,
|
||||
"url": url_user,
|
||||
"urlMain": url_mainpage,
|
||||
"usernameClaimed": supposed_username,
|
||||
"usernameUnclaimed": non_exist_username,
|
||||
"checkType": "message",
|
||||
}
|
||||
|
||||
if headers != self.HEADERS:
|
||||
site_data['headers'] = headers
|
||||
|
||||
site = MaigretSite(url_mainpage.split("/")[-1], site_data)
|
||||
return site
|
||||
|
||||
async def dialog(self, url_exists, cookie_file):
|
||||
domain_raw = self.URL_RE.sub("", url_exists).strip().strip("/")
|
||||
domain_raw = domain_raw.split("/")[0]
|
||||
self.logger.info('Domain is %s', domain_raw)
|
||||
|
||||
# check for existence
|
||||
matched_sites = list(
|
||||
filter(lambda x: domain_raw in x.url_main + x.url, self.db.sites)
|
||||
)
|
||||
|
||||
if matched_sites:
|
||||
print(
|
||||
f'Sites with domain "{domain_raw}" already exists in the Maigret database!'
|
||||
)
|
||||
status = lambda s: "(disabled)" if s.disabled else ""
|
||||
url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
|
||||
print(
|
||||
"\n".join(
|
||||
[
|
||||
f"{site.name} {status(site)}{url_block(site)}"
|
||||
for site in matched_sites
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
if input("Do you want to continue? [yN] ").lower() in "n":
|
||||
return False
|
||||
|
||||
url_mainpage = self.extract_mainpage_url(url_exists)
|
||||
|
||||
print('Detecting site engine, please wait...')
|
||||
sites = []
|
||||
try:
|
||||
sites = await self.detect_known_engine(url_exists, url_mainpage)
|
||||
except KeyboardInterrupt:
|
||||
print('Engine detect process is interrupted.')
|
||||
|
||||
if not sites:
|
||||
print("Unable to detect site engine, lets generate checking features")
|
||||
sites = [
|
||||
await self.check_features_manually(
|
||||
url_exists, url_mainpage, cookie_file
|
||||
)
|
||||
]
|
||||
|
||||
self.logger.debug(sites[0].__dict__)
|
||||
|
||||
sem = asyncio.Semaphore(1)
|
||||
|
||||
print("Checking, please wait...")
|
||||
found = False
|
||||
chosen_site = None
|
||||
for s in sites:
|
||||
chosen_site = s
|
||||
result = await self.site_self_check(s, sem)
|
||||
if not result["disabled"]:
|
||||
found = True
|
||||
break
|
||||
|
||||
if not found:
|
||||
print(
|
||||
f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
|
||||
)
|
||||
print(
|
||||
"Try to run this mode again and increase features count or choose others."
|
||||
)
|
||||
.lower()
|
||||
.strip("y")
|
||||
):
|
||||
return False
|
||||
else:
|
||||
if (
|
||||
input(
|
||||
f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
|
||||
)
|
||||
.lower()
|
||||
.strip("y")
|
||||
):
|
||||
return False
|
||||
|
||||
chosen_site.name = input("Change site name if you want: ") or chosen_site.name
|
||||
chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
|
||||
rank = get_alexa_rank(chosen_site.url_main)
|
||||
if rank:
|
||||
print(f'New alexa rank: {rank}')
|
||||
chosen_site.alexa_rank = rank
|
||||
chosen_site.name = input("Change site name if you want: ") or chosen_site.name
|
||||
chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
|
||||
rank = Submitter.get_alexa_rank(chosen_site.url_main)
|
||||
if rank:
|
||||
print(f'New alexa rank: {rank}')
|
||||
chosen_site.alexa_rank = rank
|
||||
|
||||
logger.debug(chosen_site.json)
|
||||
site_data = chosen_site.strip_engine_data()
|
||||
logger.debug(site_data.json)
|
||||
db.update_site(site_data)
|
||||
return True
|
||||
self.logger.debug(chosen_site.json)
|
||||
site_data = chosen_site.strip_engine_data()
|
||||
self.logger.debug(site_data.json)
|
||||
self.db.update_site(site_data)
|
||||
return True
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import ast
|
||||
import difflib
|
||||
import re
|
||||
import random
|
||||
from typing import Any
|
||||
@@ -95,3 +96,18 @@ def get_dict_ascii_tree(items, prepend="", new_line=True):
|
||||
|
||||
def get_random_user_agent():
|
||||
return random.choice(DEFAULT_USER_AGENTS)
|
||||
|
||||
|
||||
def get_match_ratio(base_strs: list):
|
||||
def get_match_inner(s: str):
|
||||
return round(
|
||||
max(
|
||||
[
|
||||
difflib.SequenceMatcher(a=s.lower(), b=s2.lower()).ratio()
|
||||
for s2 in base_strs
|
||||
]
|
||||
),
|
||||
2,
|
||||
)
|
||||
|
||||
return get_match_inner
|
||||
|
||||
+3
-2
@@ -1,15 +1,16 @@
|
||||
"""Maigret data test functions"""
|
||||
|
||||
from maigret.utils import is_country_tag
|
||||
from maigret.sites import SUPPORTED_TAGS
|
||||
|
||||
|
||||
def test_tags_validity(default_db):
|
||||
unknown_tags = set()
|
||||
|
||||
tags = default_db._tags
|
||||
|
||||
for site in default_db.sites:
|
||||
for tag in filter(lambda x: not is_country_tag(x), site.tags):
|
||||
if tag not in SUPPORTED_TAGS:
|
||||
if tag not in tags:
|
||||
unknown_tags.add(tag)
|
||||
|
||||
assert unknown_tags == set()
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
"""Maigret Database test functions"""
|
||||
from maigret.sites import MaigretDatabase, MaigretSite
|
||||
from maigret.utils import URLMatcher
|
||||
|
||||
EXAMPLE_DB = {
|
||||
'engines': {
|
||||
@@ -179,3 +180,26 @@ def test_ranked_sites_dict_id_type():
|
||||
assert len(db.ranked_sites_dict()) == 2
|
||||
assert len(db.ranked_sites_dict(id_type='username')) == 2
|
||||
assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
|
||||
|
||||
|
||||
def test_get_url_template():
|
||||
site = MaigretSite(
|
||||
"test",
|
||||
{
|
||||
"urlMain": "https://ya.ru/",
|
||||
"url": "{urlMain}{urlSubpath}/members/?username={username}",
|
||||
},
|
||||
)
|
||||
assert (
|
||||
site.get_url_template()
|
||||
== "{urlMain}{urlSubpath}/members/?username={username} (no engine)"
|
||||
)
|
||||
|
||||
site = MaigretSite(
|
||||
"test",
|
||||
{
|
||||
"urlMain": "https://ya.ru/",
|
||||
"url": "https://{username}.ya.ru",
|
||||
},
|
||||
)
|
||||
assert site.get_url_template() == "SUBDOMAIN"
|
||||
|
||||
@@ -8,6 +8,7 @@ from maigret.utils import (
|
||||
enrich_link_str,
|
||||
URLMatcher,
|
||||
get_dict_ascii_tree,
|
||||
get_match_ratio,
|
||||
)
|
||||
|
||||
|
||||
@@ -136,3 +137,9 @@ def test_get_dict_ascii_tree():
|
||||
┣╸instagram_username: Street.Reality.Photography
|
||||
┗╸twitter_username: Alexaimephotogr"""
|
||||
)
|
||||
|
||||
|
||||
def test_get_match_ratio():
|
||||
fun = get_match_ratio(["test", "maigret", "username"])
|
||||
|
||||
assert fun("test") == 1
|
||||
|
||||
Reference in New Issue
Block a user