mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-06 22:19:01 +00:00
f8ab484cd2
futher -> further
660 lines
23 KiB
Python
660 lines
23 KiB
Python
import asyncio
|
|
import json
|
|
import re
|
|
import os
|
|
import logging
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
from aiohttp import ClientSession, TCPConnector
|
|
from aiohttp_socks import ProxyConnector
|
|
import cloudscraper
|
|
from colorama import Fore, Style
|
|
|
|
from .activation import import_aiohttp_cookies
|
|
from .result import MaigretCheckResult
|
|
from .settings import Settings
|
|
from .sites import MaigretDatabase, MaigretEngine, MaigretSite
|
|
from .utils import get_random_user_agent
|
|
from .checking import site_self_check
|
|
from .utils import get_match_ratio, generate_random_username
|
|
|
|
|
|
class CloudflareSession:
    """Small awaitable facade over a blocking ``cloudscraper`` scraper.

    The object plays both roles at once: it is the "session" that performs
    the request and the "response" whose body and status are read afterwards.
    """

    def __init__(self):
        self.scraper = cloudscraper.create_scraper()

    async def get(self, *args, **kwargs):
        """Perform a GET through the scraper and remember body and status.

        All arguments are forwarded to ``scraper.get`` unchanged.
        Returns this object itself so it can be used as the response.
        """
        # Yield to the event loop once; the request itself is synchronous.
        await asyncio.sleep(0)
        response = self.scraper.get(*args, **kwargs)
        self.last_text = response.text
        self.status = response.status_code
        return self

    def status_code(self):
        """Return the HTTP status stored by the most recent ``get`` call."""
        return self.status

    async def text(self):
        """Return the body stored by the most recent ``get`` call."""
        await asyncio.sleep(0)
        return self.last_text

    async def close(self):
        """Do nothing; present so the object can be closed like a session."""
        pass
|
|
|
|
|
|
class Submitter:
    """Interactive "submit mode" helper that adds or updates sites in the Maigret database."""

    # Default request headers; the User-Agent is chosen once, when the class
    # body is evaluated, and is then shared by every instance.
    HEADERS = {
        "User-Agent": get_random_user_agent(),
    }

    # Characters placed inside a regex character class to split an HTML
    # response into candidate feature tokens (double quote, single quote, newline).
    SEPARATORS = "\"'\n"

    # NOTE(review): RATIO is not referenced by the code visible in this file.
    RATIO = 0.6
    # Maximum number of presence/absence strings kept after ranking.
    TOP_FEATURES = 5
    # Matches the URL scheme and an optional "www." prefix so they can be stripped.
    URL_RE = re.compile(r"https?://(www\.)?")
|
|
|
|
def __init__(self, db: MaigretDatabase, settings: Settings, logger, args):
    """Keep references to the collaborators and open the shared HTTP session.

    :param db: sites database that is queried and updated by the dialogs.
    :param settings: application settings object.
    :param logger: logger used for diagnostics.
    :param args: parsed command-line arguments; ``proxy`` and
        ``cookie_file`` are read here.
    """
    self.db = db
    self.settings = settings
    self.logger = logger
    self.args = args

    # Load cookies only when a file was given and actually exists;
    # a missing file is reported and the session runs without cookies.
    cookie_jar = None
    cookie_file = args.cookie_file
    if cookie_file:
        if os.path.exists(cookie_file):
            cookie_jar = import_aiohttp_cookies(cookie_file)
        else:
            logger.error(f"Cookie file {cookie_file} does not exist!")

    # Route traffic through the proxy when one is configured.
    proxy = self.args.proxy
    if proxy:
        connector = ProxyConnector.from_url(proxy)
    else:
        connector = TCPConnector(ssl=False)
    connector.verify_ssl = False

    self.session = ClientSession(
        connector=connector, trust_env=True, cookie_jar=cookie_jar
    )
|
|
|
|
async def close(self):
    """Close the HTTP session created in ``__init__``."""
    session = self.session
    await session.close()
|
|
|
|
@staticmethod
def get_alexa_rank(site_url_main):
    """Return the Alexa rank reported for ``site_url_main``.

    This is a best-effort lookup: ``0`` is returned whenever the rank cannot
    be obtained (network failure, timeout, malformed XML, or a response
    without a ``REACH`` element carrying a numeric ``RANK`` attribute).

    :param site_url_main: main URL of the site to look up.
    :return: the rank as an ``int``, or ``0`` when it is unknown.
    """
    import requests
    import xml.etree.ElementTree as ElementTree

    url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"

    try:
        # A timeout keeps the lookup from hanging on an unresponsive endpoint.
        xml_data = requests.get(url, timeout=10).text
        root = ElementTree.fromstring(xml_data)
    except (requests.RequestException, ElementTree.ParseError):
        return 0

    reach = root.find('.//REACH')
    if reach is None:
        return 0

    try:
        return int(reach.attrib['RANK'])
    except (KeyError, ValueError):
        # Attribute missing or not a number: treat the rank as unknown.
        return 0
|
|
|
|
@staticmethod
def extract_mainpage_url(url):
    """Return ``url`` cut just before its third ``/``.

    For an absolute URL this is the scheme plus host, e.g.
    ``https://example.com/users/alice`` -> ``https://example.com``.
    """
    leading_parts = url.split("/", 3)[:3]
    return "/".join(leading_parts)
|
|
|
|
async def site_self_check(self, site, semaphore, silent=False):
    """Run the generic self-check from ``checking.py`` for ``site``.

    :param site: site entry to verify.
    :param semaphore: semaphore passed through to the checker.
    :param silent: passed through to the checker.
    :return: whatever the generic checker returns (callers read the
        ``"disabled"`` key of the result).
    """
    check_options = {
        "site": site,
        "logger": self.logger,
        "semaphore": semaphore,
        "db": self.db,
        "silent": silent,
        "proxy": self.args.proxy,
        "cookies": self.args.cookie_file,
        # Errors must not be skipped in submit mode: both false positives
        # and true negatives have to be checked.
        "skip_errors": False,
    }
    return await site_self_check(**check_options)
|
|
|
|
def generate_additional_fields_dialog(self, engine: "MaigretEngine", dialog):
    """Ask the user for extra site fields required by the detected engine.

    Only the ``urlSubpath`` placeholder is handled: when the engine's URL
    template contains it, the user is asked for the subpath.

    :param engine: detected engine; its ``site`` mapping is inspected.
    :param dialog: not used by this method.
    :return: dict of additional site fields, empty when nothing is needed
        or the user entered nothing.
    """
    if 'urlSubpath' not in engine.site.get('url', ''):
        return {}

    msg = (
        'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
        'Enter in manually if it exists: '
    )
    subpath = input(msg).strip('/')
    if not subpath:
        return {}
    return {'urlSubpath': f'/{subpath}'}
|
|
|
|
async def detect_known_engine(
    self, url_exists, url_mainpage, session, follow_redirects, headers
) -> "Tuple[List[MaigretSite], str]":
    """Try to recognise a known site engine from the page at ``url_exists``.

    The page is downloaded once and compared with the ``presenseStrs`` of
    every engine in the database. The first engine whose strings are all
    present in the response wins; for it, one candidate site is built per
    username to check.

    :param url_exists: URL of an existing account page.
    :param url_mainpage: main page URL of the site, used for the site data.
    :param session: HTTP session to use; ``self.session`` when falsy.
    :param follow_redirects: whether the request should follow redirects.
    :param headers: request headers.
    :return: ``(sites, response_text)``; ``sites`` is empty when no engine
        matched.
    """
    session = session or self.session
    resp_text, _ = await self.get_html_response_to_compare(
        url_exists, session, follow_redirects, headers
    )

    for engine in self.db.engines:
        strs_to_check = engine.__dict__.get("presenseStrs")
        if not (strs_to_check and resp_text):
            continue
        # Every marker string must be present; stop at the first miss.
        if not all(s in resp_text for s in strs_to_check):
            continue

        engine_name = engine.__dict__.get("name")

        print(f"Detected engine {engine_name} for site {url_mainpage}")

        usernames_to_check = self.settings.supposed_usernames
        supposed_username = self.extract_username_dialog(url_exists)
        if supposed_username:
            usernames_to_check = [supposed_username] + usernames_to_check

        add_fields = self.generate_additional_fields_dialog(engine, url_exists)

        # Host part of the main page URL; also works when the URL has no
        # scheme separator ("//") instead of raising IndexError.
        host_name = url_mainpage.split("//", 1)[-1].split("/")[0]

        sites = []
        for u in usernames_to_check:
            site_data = {
                "urlMain": url_mainpage,
                "name": host_name,
                "engine": engine_name,
                "usernameClaimed": u,
                "usernameUnclaimed": "noonewouldeverusethis7",
                **add_fields,
            }
            self.logger.info(site_data)

            maigret_site = MaigretSite(url_mainpage.split("/")[-1], site_data)
            maigret_site.update_from_engine(self.db.engines_dict[engine_name])
            sites.append(maigret_site)

        return sites, resp_text

    return [], resp_text
|
|
|
|
@staticmethod
def extract_username_dialog(url):
    """Guess the username from ``url`` and let the user confirm or replace it.

    The guess is the last path segment of the URL with trailing slashes
    removed and surrounding ``@`` characters stripped.

    :param url: URL of an account page.
    :return: the text typed by the user, or the guess when nothing was typed.
    """
    last_segment = url.rstrip("/").split("/")[-1]
    supposed_username = last_segment.strip('@')
    entered_username = input(
        f"{Fore.GREEN}[?] Is \"{supposed_username}\" a valid username? If not, write it manually: {Style.RESET_ALL}"
    )
    return entered_username or supposed_username
|
|
|
|
@staticmethod
async def get_html_response_to_compare(
    url: str, session: "ClientSession" = None, redirects=False, headers: Dict = None
):
    """Download ``url`` and return its decoded body together with the status.

    Decoding is attempted as UTF-8 first, then as Latin-1; if both raise
    ``UnicodeDecodeError`` the body is decoded with ``errors='ignore'``.

    :param url: address to request.
    :param session: session object providing ``get``.
    :param redirects: passed as ``allow_redirects``.
    :param headers: request headers.
    :return: ``(html_text, http_status)``.
    """
    request = session.get(url, allow_redirects=redirects, headers=headers)
    async with request as response:
        # Walk through the preferred encodings in order; the ``else`` branch
        # runs only when none of them could decode the body.
        for encoding in ('utf-8', 'latin1'):
            try:
                html_response = await response.text(encoding=encoding)
            except UnicodeDecodeError:
                continue
            break
        else:
            html_response = await response.text(errors='ignore')
        return html_response, response.status
|
|
|
|
async def check_features_manually(
    self,
    username: str,
    url_exists: str,
    cookie_filename="",  # TODO: use cookies
    session: "ClientSession" = None,
    follow_redirects=False,
    headers: dict = None,
) -> Tuple[Optional[List[str]], Optional[List[str]], str, str]:
    """Derive presence/absence marker strings by comparing two pages.

    The page of the existing account (``url_exists``) is compared with the
    page obtained by substituting a random username into the same URL.
    Tokens that occur only in the first response become presence
    candidates, tokens that occur only in the second one become absence
    candidates; both lists are ranked and cut to ``TOP_FEATURES`` items.

    :param username: username contained in ``url_exists``.
    :param url_exists: URL of an existing account page.
    :param cookie_filename: currently unused.
    :param session: HTTP session; ``self.session`` when falsy.
    :param follow_redirects: whether requests should follow redirects.
    :param headers: request headers.
    :return: ``(presence_list, absence_list, status, random_username)``.
        ``status`` is ``"Found"`` on success; otherwise both lists are
        ``None`` and ``status`` carries the failure reason.
    """
    random_username = generate_random_username()
    # NOTE(review): the whole URL is lowercased before the substitution,
    # which also changes the case of the rest of the path.
    url_of_non_existing_account = url_exists.lower().replace(
        username.lower(), random_username
    )

    try:
        session = session or self.session
        first_html_response, first_status = await self.get_html_response_to_compare(
            url_exists, session, follow_redirects, headers
        )
        second_html_response, second_status = (
            await self.get_html_response_to_compare(
                url_of_non_existing_account, session, follow_redirects, headers
            )
        )
        # NOTE(review): this closes the session even when it was supplied
        # by the caller or is the shared ``self.session``.
        await session.close()
    except Exception as e:
        self.logger.error(
            f"Error while getting HTTP response for username {username}: {e}",
            exc_info=True,
        )
        return None, None, str(e), random_username

    self.logger.info(f"URL with existing account: {url_exists}")
    self.logger.info(
        f"HTTP response status for URL with existing account: {first_status}"
    )
    self.logger.info(
        f"HTTP response length URL with existing account: {len(first_html_response)}"
    )
    self.logger.debug(first_html_response)

    # This line previously claimed to log the *existing* account URL.
    self.logger.info(
        f"URL with non-existing account: {url_of_non_existing_account}"
    )
    self.logger.info(
        f"HTTP response status for URL with non-existing account: {second_status}"
    )
    self.logger.info(
        f"HTTP response length URL with non-existing account: {len(second_html_response)}"
    )
    self.logger.debug(second_html_response)

    # TODO: filter by errors, move to dialog function
    if (
        "/cdn-cgi/challenge-platform" in first_html_response
        or "\t\t\t\tnow: " in first_html_response
        or "Sorry, you have been blocked" in first_html_response
    ):
        self.logger.info("Cloudflare detected, skipping")
        return None, None, "Cloudflare detected, skipping", random_username

    # Split both pages into tokens on quotes and newlines.
    splitter = f'[{self.SEPARATORS}]'
    tokens_a = set(re.split(splitter, first_html_response))
    tokens_b = set(re.split(splitter, second_html_response))

    a_minus_b = [t.strip('\\') for t in tokens_a.difference(tokens_b)]
    b_minus_a = [t.strip('\\') for t in tokens_b.difference(tokens_a)]

    # Filter out strings containing usernames
    a_minus_b = [s for s in a_minus_b if username.lower() not in s.lower()]
    b_minus_a = [s for s in b_minus_a if random_username.lower() not in s.lower()]

    # Integers and decimals of any length ("7", "42", "12.5").
    number_re = re.compile(r'\d+(\.\d+)?')
    whitelisted_numbers = ('200', '404', '403')

    def filter_tokens(token: str, html_response: str) -> bool:
        """Keep a token only if it can serve as a distinguishing feature."""
        is_in_html = token in html_response
        is_long_str = len(token) >= 50
        is_number = number_re.fullmatch(token) is not None
        is_whitelisted_number = token in whitelisted_numbers

        return not (
            is_in_html or is_long_str or (is_number and not is_whitelisted_number)
        )

    # A token is compared against the *other* page: after stripping
    # backslashes it may turn out to be present there as well.
    a_minus_b = [t for t in a_minus_b if filter_tokens(t, second_html_response)]
    b_minus_a = [t for t in b_minus_a if filter_tokens(t, first_html_response)]

    if not a_minus_b and not b_minus_a:
        return (
            None,
            None,
            "HTTP responses for pages with existing and non-existing accounts are the same",
            random_username,
        )

    match_fun = get_match_ratio(self.settings.presence_strings)

    presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[
        : self.TOP_FEATURES
    ]
    absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[
        : self.TOP_FEATURES
    ]

    self.logger.info(f"Detected presence features: {presence_list}")
    self.logger.info(f"Detected absence features: {absence_list}")

    return presence_list, absence_list, "Found", random_username
|
|
|
|
async def add_site(self, site):
    """Self-check ``site``, let the user edit its fields, then store it.

    :param site: site entry to add; its attributes are edited in place.
    :return: ``{"valid": True}`` when the site was stored in the database,
        otherwise ``{"valid": False, "reason": ...}`` where the reason is
        ``"checks_failed"``, ``"manual block"`` or ``"remove"``.
    """
    sem = asyncio.Semaphore(1)
    print(
        f"{Fore.BLUE}{Style.BRIGHT}[*] Adding site {site.name}, let's check it...{Style.RESET_ALL}"
    )

    result = await self.site_self_check(site, sem)
    if result["disabled"]:
        print(f"Checks failed for {site.name}, please, verify them manually.")
        return {
            "valid": False,
            "reason": "checks_failed",
        }

    editable_fields = {
        '1': 'name',
        '2': 'tags',
        '3': 'url',
        '4': 'url_main',
        '5': 'username_claimed',
        '6': 'username_unclaimed',
        '7': 'presense_strs',
        '8': 'absence_strs',
    }
    # Fields whose value is entered as a comma-separated list.
    list_fields = ('tags', 'presense_strs', 'absence_strs')

    while True:
        print("\nAvailable fields to edit:")
        for num, field in editable_fields.items():
            current_value = getattr(site, field)
            print(f"{num}. {field} (current: {current_value})")

        print("0. finish editing")
        print("10. reject and block domain")
        print("11. invalid params, remove")

        choice = input("\nSelect field number to edit (0-8, 10, 11): ").strip()

        if choice == '0':
            break

        if choice == '10':
            return {
                "valid": False,
                "reason": "manual block",
            }

        if choice == '11':
            return {
                "valid": False,
                "reason": "remove",
            }

        if choice not in editable_fields:
            continue

        field = editable_fields[choice]
        current_value = getattr(site, field)
        new_value = input(
            f"Enter new value for {field} (current: {current_value}): "
        ).strip()

        if field in list_fields:
            # Drop empty items so that blank input does not turn into
            # [''] and overwrite the current value.
            new_value = [
                item for item in map(str.strip, new_value.split(',')) if item
            ]

        if new_value:
            setattr(site, field, new_value)
            print(f"Updated {field} to: {new_value}")

    self.logger.info(site.json)
    self.db.update_site(site)
    return {
        "valid": True,
    }
|
|
|
|
async def dialog(self, url_exists, cookie_file):
    """
    An implementation of the submit mode:
    - User provides a URL of an existing social media account
    - Maigret tries to detect the site engine and understand how to check
      for account presence with HTTP responses analysis
    - If detection succeeds, Maigret generates a new site entry/replace old one in the database

    :param url_exists: URL of an account page that is known to exist.
    :param cookie_file: cookie file name passed on to ``check_features_manually``.
    :return: ``True`` when the flow ran to the end and the site was updated
        in the database, ``False`` when the user aborted or a check failed.
    """

    def is_yes(answer: str) -> bool:
        """Interpret a yes/no answer; only answers starting with "y" mean yes."""
        return answer.strip().lower().startswith("y")

    old_site = None
    additional_options_enabled = self.logger.level in (
        logging.DEBUG,
        logging.WARNING,
    )

    domain_raw = self.URL_RE.sub("", url_exists).strip().strip("/")
    domain_raw = domain_raw.split("/")[0]
    self.logger.info('Domain is %s', domain_raw)

    # check for existence
    matched_sites = [
        s for s in self.db.sites if domain_raw in s.url_main + s.url
    ]

    if matched_sites:
        # TODO: update the existing site
        print(
            f"{Fore.YELLOW}[!] Sites with domain \"{domain_raw}\" already exists in the Maigret database!{Style.RESET_ALL}"
        )

        listing = []
        for site in matched_sites:
            state = "(disabled)" if site.disabled else ""
            listing.append(f"{site.name} {state}\n\t{site.url_main}\n\t{site.url}")
        print("\n".join(listing))

        # Default answer is "no": continue only on an explicit yes.
        if not is_yes(
            input(f"{Fore.GREEN}[?] Do you want to continue? [yN] {Style.RESET_ALL}")
        ):
            return False

        site_names = [site.name for site in matched_sites]
        site_name = (
            input(
                f"{Fore.GREEN}[?] Which site do you want to update in case of success? 1st by default. [{', '.join(site_names)}] {Style.RESET_ALL}"
            ).strip()
            or matched_sites[0].name
        )
        old_site = next(
            (site for site in matched_sites if site.name == site_name), None
        )
        if old_site is None:
            # An unknown name was typed: fall back to the documented default
            # instead of failing on ``None`` below.
            old_site = matched_sites[0]
        print(
            f'{Fore.GREEN}[+] We will update site "{old_site.name}" in case of success.{Style.RESET_ALL}'
        )

    # Check if the site check is ordinary or not
    if old_site and (old_site.url_probe or old_site.activation):
        proceed = input(
            f"{Fore.RED}[!] The site check depends on activation / probing mechanism! Consider to update it manually. Continue? [yN]{Style.RESET_ALL}"
        )
        if not is_yes(proceed):
            return False

    # TODO: urlProbe support
    # TODO: activation support

    url_mainpage = self.extract_mainpage_url(url_exists)

    # headers update
    custom_headers = dict(self.HEADERS)
    while additional_options_enabled:
        header_key = input(
            f'{Fore.GREEN}[?] Specify custom header if you need or just press Enter to skip. Header name: {Style.RESET_ALL}'
        )
        if not header_key:
            break
        header_value = input(f'{Fore.GREEN}[?] Header value: {Style.RESET_ALL}')
        custom_headers[header_key.strip()] = header_value.strip()

    # redirects settings update
    redirects = False
    if additional_options_enabled:
        redirects = is_yes(
            input(
                f'{Fore.GREEN}[?] Should we do redirects automatically? [yN] {Style.RESET_ALL}'
            )
        )

    print('Detecting site engine, please wait...')
    sites = []
    text = None
    try:
        # The second argument is the main page URL: it becomes the
        # ``urlMain`` and the name of every detected site.
        sites, text = await self.detect_known_engine(
            url_exists,
            url_mainpage,
            session=None,
            follow_redirects=redirects,
            headers=custom_headers,
        )
    except KeyboardInterrupt:
        print('Engine detect process is interrupted.')

    # ``text`` stays None when the detection was interrupted.
    if text and 'cloudflare' in text.lower():
        print(
            'Cloudflare protection detected. I will use cloudscraper for further work'
        )
        # self.session = CloudflareSession()

    if not sites:
        print("Unable to detect site engine, lets generate checking features")

        supposed_username = self.extract_username_dialog(url_exists)
        self.logger.info(f"Supposed username: {supposed_username}")

        # TODO: pass status_codes
        # check it here and suggest to enable / auto-enable redirects
        presence_list, absence_list, status, non_exist_username = (
            await self.check_features_manually(
                username=supposed_username,
                url_exists=url_exists,
                cookie_filename=cookie_file,
                follow_redirects=redirects,
                headers=custom_headers,
            )
        )

        if status != "Found":
            print(
                f"{Fore.RED}[!] The check for site failed! Reason: {status}{Style.RESET_ALL}"
            )
            return False

        site_data = {
            "absenceStrs": absence_list,
            "presenseStrs": presence_list,
            "url": url_exists.replace(supposed_username, '{username}'),
            "urlMain": url_mainpage,
            "usernameClaimed": supposed_username,
            "usernameUnclaimed": non_exist_username,
            "headers": custom_headers,
            "checkType": "message",
        }
        self.logger.info(json.dumps(site_data, indent=4))

        site = MaigretSite(url_mainpage.split("/")[-1], site_data)
        sites.append(site)

    self.logger.debug(sites[0].__dict__)

    sem = asyncio.Semaphore(1)

    print(f"{Fore.GREEN}[*] Checking, please wait...{Style.RESET_ALL}")
    found = False
    chosen_site = None
    # Take the first candidate that passes the self-check.
    for s in sites:
        chosen_site = s
        result = await self.site_self_check(s, sem)
        if not result["disabled"]:
            found = True
            break

    if not found:
        print(
            f"{Fore.RED}[!] The check for site '{chosen_site.name}' failed!{Style.RESET_ALL}"
        )
        print(
            "Try to run this mode again and increase features count or choose others."
        )
        self.logger.debug(json.dumps(chosen_site.json))
        return False

    # Default answer is "yes": an empty answer or anything starting with
    # "y" (including "yes") saves the site.
    save_answer = (
        input(
            f"{Fore.GREEN}[?] Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] {Style.RESET_ALL}"
        )
        .strip()
        .lower()
    )
    if save_answer and not save_answer.startswith("y"):
        return False

    if self.args.verbose:
        self.logger.info(
            "Verbose mode is enabled, additional settings are available"
        )
        source = input(
            f"{Fore.GREEN}[?] Name the source site if it is mirror: {Style.RESET_ALL}"
        )
        if source:
            chosen_site.source = source

    default_site_name = old_site.name if old_site else chosen_site.name
    new_name = (
        input(
            f"{Fore.GREEN}[?] Change site name if you want [{default_site_name}]: {Style.RESET_ALL}"
        )
        or default_site_name
    )
    if new_name != default_site_name:
        self.logger.info(f"New site name is {new_name}")
        chosen_site.name = new_name

    default_tags_str = ""
    if old_site:
        default_tags_str = f' [{", ".join(old_site.tags)}]'

    new_tags = input(f"{Fore.GREEN}[?] Site tags{default_tags_str}: {Style.RESET_ALL}")
    if new_tags:
        chosen_site.tags = list(map(str.strip, new_tags.split(',')))
    else:
        chosen_site.tags = []
    self.logger.info(f"Site tags are: {', '.join(chosen_site.tags)}")
    # rank = Submitter.get_alexa_rank(chosen_site.url_main)
    # if rank:
    #     print(f'New alexa rank: {rank}')
    #     chosen_site.alexa_rank = rank

    self.logger.info(chosen_site.json)
    site_data = chosen_site.strip_engine_data()
    self.logger.info(site_data.json)

    if old_site:
        # Update old site with new values and log changes
        fields_to_check = {
            'url': 'URL',
            'url_main': 'Main URL',
            'username_claimed': 'Username claimed',
            'username_unclaimed': 'Username unclaimed',
            'check_type': 'Check type',
            'presense_strs': 'Presence strings',
            'absence_strs': 'Absence strings',
            'tags': 'Tags',
            'source': 'Source',
            'headers': 'Headers',
        }

        for field, display_name in fields_to_check.items():
            old_value = getattr(old_site, field)
            new_value = getattr(site_data, field)
            # Keep the old tags when the user did not enter new ones.
            if field == 'tags' and not new_tags:
                continue
            if str(old_value) != str(new_value):
                print(
                    f"{Fore.YELLOW}[*] '{display_name}' updated: {Fore.RED}{old_value} {Fore.YELLOW}to {Fore.GREEN}{new_value}{Style.RESET_ALL}"
                )
                old_site.__dict__[field] = new_value

    # update the site
    final_site = old_site if old_site else site_data
    self.db.update_site(final_site)

    # save the db in file
    # The same ``db_file`` attribute is used for the comparison, the message
    # and the save call.
    if self.args.db_file != self.settings.sites_db_path:
        print(
            f"{Fore.GREEN}[+] Maigret DB is saved to {self.args.db_file}.{Style.RESET_ALL}"
        )
        self.db.save_to_file(self.args.db_file)

    return True
|