#!/usr/bin/env python3
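"""Detect site engines for new URLs and add working sites to the Maigret database.

For every URL in URLS_FILE the script fetches the main page, matches it against
the engine signatures from the sites database, builds a site entry for each
detected engine, verifies the entry with known-claimed and known-unclaimed
usernames, and saves entries that pass both checks back to the base JSON file.

Usage sketch (the script path is an assumption; arguments match the parser below):

    python3 utils/import_sites.py --base maigret/resources/data.json urls.txt
"""
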
import asyncio
import json
import logging
import random
import re
import sys
from argparse import ArgumentParser, RawDescriptionHelpFormatter

import requests
import tqdm.asyncio
from mock import Mock

from maigret.maigret import *  # noqa: F403 -- provides the maigret() search coroutine
from maigret.result import QueryStatus
from maigret.sites import MaigretDatabase, MaigretSite

# Strips the scheme and an optional "www." prefix from a URL.
URL_RE = re.compile(r"https?://(www\.)?")
# Per-site check timeout, seconds.
TIMEOUT = 200


async def maigret_check(site, site_data, username, status, logger):
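    """Search for `username` on a single site and compare the resulting status.

    Returns `site_data` if the status matches the expected `status`,
    and False otherwise (including unknown/error statuses).
    """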
    query_notify = Mock()
    logger.debug(f'Checking {site}...')

    results = await maigret(
        username,
        {site: site_data},
        logger,
        query_notify,
        timeout=TIMEOUT,
        forced=True,
        no_progressbar=True,
    )

    if results[site]['status'].status != status:
        if results[site]['status'].status == QueryStatus.UNKNOWN:
            context = results[site]['status'].context
            logger.debug(
                f'Error while searching {username} in {site}, must be claimed. Context: {context}'
            )
            return False

        if status == QueryStatus.CLAIMED:
            logger.debug(f'Not found {username} in {site}, must be claimed')
        else:
            logger.debug(f'Found {username} in {site}, must be available')
        logger.debug(results[site])
        return False

    return site_data


async def check_and_add_maigret_site(site_data, semaphore, logger, ok_usernames, bad_usernames):
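    """Verify a candidate site entry and save it to the database on success.

    A site is accepted only if some known username is reported as claimed
    and a nonsense username as available. Relies on the module-level `db`,
    `args`, and `ok_sites` set up in the __main__ block below.
    """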
    async with semaphore:
        sitename = site_data.name
        positive = False
        negative = False

        # The site must report a known-existing username as claimed...
        for ok_username in ok_usernames:
            site_data.username_claimed = ok_username
            status = QueryStatus.CLAIMED
            if await maigret_check(sitename, site_data, ok_username, status, logger):
                positive = True
                break

        # ...and a nonsense username as available.
        for bad_username in bad_usernames:
            site_data.username_unclaimed = bad_username
            status = QueryStatus.AVAILABLE
            if await maigret_check(sitename, site_data, bad_username, status, logger):
                negative = True
                break

        if positive and negative:
            site_data = site_data.strip_engine_data()

            db.update_site(site_data)
            print(site_data.json)
            try:
                db.save_to_file(args.base_file)
            except Exception as e:
                logger.error(e, exc_info=True)
            print(f'Saved new site {sitename}...')
            ok_sites.append(site_data)


if __name__ == '__main__':
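    # Parse CLI arguments, configure logging, then detect and check new sites.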
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")
    parser.add_argument("--add-engine", dest="add_engine",
                        help="Additional engine to check.")
    parser.add_argument("--only-engine", dest="only_engine",
                        help="Use only this engine from the detected ones.")
    parser.add_argument("--check", action="store_true",
                        help="Only detect and print new sites, without checking and saving them.")
    parser.add_argument("--random", action="store_true", default=False,
                        help="Shuffle the list of URLs.")
    parser.add_argument("--top", type=int, default=10000,
                        help="Take only this many records from the URLs file.")
    parser.add_argument("--filter", type=str, default="",
                        help="Substring to filter input URLs.")
    parser.add_argument("--username", type=str,
                        help="Preferred username to check with first.")
    parser.add_argument(
        "--info",
        "-vv",
        action="store_true",
        dest="info",
        default=False,
        help="Display service information.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        dest="verbose",
        default=False,
        help="Display extra information and metrics.",
    )
    parser.add_argument(
        "-d",
        "--debug",
        "-vvv",
        action="store_true",
        dest="debug",
        default=False,
        help="Save debugging information and site responses in debug.txt.",
    )
    parser.add_argument("urls_file",
                        metavar="URLS_FILE",
                        action="store",
                        help="File with base site URLs.")

    args = parser.parse_args()

    log_level = logging.ERROR
    if args.debug:
        log_level = logging.DEBUG
    elif args.info:
        log_level = logging.INFO
    elif args.verbose:
        log_level = logging.WARNING

    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level,
    )
    logger = logging.getLogger('engines-check')
    logger.setLevel(log_level)

    db = MaigretDatabase()
    sites_subset = db.load_from_file(args.base_file).sites
    sites = {site.name: site for site in sites_subset}
    engines = db.engines

    # TODO: usernames extractors
    ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john']
    if args.username:
        ok_usernames = [args.username] + ok_usernames

    bad_usernames = ['noonewouldeverusethis7']

    with open(args.urls_file, 'r') as urls_file:
        urls = urls_file.read().splitlines()
        if args.random:
            random.shuffle(urls)
        urls = urls[:args.top]

    # Serialized database, used below for a crude substring check of
    # whether a domain is already known.
    raw_maigret_data = json.dumps({site.name: site.json for site in sites_subset})

    new_sites = []
    for site in tqdm.asyncio.tqdm(urls):
        site_lowercase = site.lower()

        domain_raw = URL_RE.sub('', site_lowercase).strip().strip('/')
        domain_raw = domain_raw.split('/')[0]

        if args.filter and args.filter not in domain_raw:
            logger.debug('Site %s skipped due to filtering by "%s"', domain_raw, args.filter)
            continue

        if domain_raw in raw_maigret_data:
            logger.debug(f'Site {domain_raw} already exists in the Maigret database!')
            continue

        if '"' in domain_raw:
            logger.debug(f'Invalid site {domain_raw}')
            continue

        # Scheme plus host: the first three segments of "scheme://host/...".
        main_page_url = '/'.join(site.split('/', 3)[:3])

        site_data = {
            'url': site,
            'urlMain': main_page_url,
            'name': domain_raw,
        }

        try:
            r = requests.get(main_page_url, timeout=5)
        except requests.RequestException:
            r = None

        detected_engines = []

        for e in engines:
            # NB: the "presenseStrs" key spelling follows the Maigret data schema.
            strs_to_check = e.__dict__.get('presenseStrs')
            if strs_to_check and r and r.text:
                # An engine matches only if all of its signature strings
                # are present on the main page.
                all_strs_in_response = True
                for s in strs_to_check:
                    if s not in r.text:
                        all_strs_in_response = False
                        break
                if all_strs_in_response:
                    engine_name = e.__dict__.get('name')
                    detected_engines.append(engine_name)
                    logger.info(f'Detected engine {engine_name} for site {main_page_url}')

        if args.only_engine and args.only_engine in detected_engines:
            detected_engines = [args.only_engine]
        elif not detected_engines and args.add_engine:
            logger.debug('Could not detect any engine, applying default engine %s...', args.add_engine)
            detected_engines = [args.add_engine]

        def create_site_from_engine(sitename, data, e):
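            """Build a MaigretSite named `sitename` from `data` with engine `e` presets."""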
            site = MaigretSite(sitename, data)
            site.update_from_engine(db.engines_dict[e])
            site.engine = e
            return site

        for engine_name in detected_engines:
            site = create_site_from_engine(domain_raw, site_data, engine_name)
            new_sites.append(site)
            logger.debug(site.json)

            # if engine_name == "phpBB":
            #     site_data_with_subpath = dict(site_data)
            #     site_data_with_subpath["urlSubpath"] = "/forum"
            #     site = create_site_from_engine(domain_raw, site_data_with_subpath, engine_name)
            #     new_sites.append(site)

    print(f'Found {len(new_sites)}/{len(urls)} new sites')

    if args.check:
        for s in new_sites:
            print(s.url_main)
        sys.exit(0)

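    # Check new sites concurrently, at most 20 at a time; each successful
    # check is saved to the base file inside check_and_add_maigret_site.
    # Note: the ensure_future + run_until_complete pattern relies on the
    # legacy implicit event loop API (pre-Python 3.10 style).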
    sem = asyncio.Semaphore(20)
    loop = asyncio.get_event_loop()

    ok_sites = []
    tasks = []
    for site in new_sites:
        check_coro = check_and_add_maigret_site(site, sem, logger, ok_usernames, bad_usernames)
        future = asyncio.ensure_future(check_coro)
        tasks.append(future)

    for f in tqdm.asyncio.tqdm.as_completed(tasks, timeout=TIMEOUT):
        try:
            loop.run_until_complete(f)
        except asyncio.exceptions.TimeoutError:
            pass

    print(f'Found and saved {len(ok_sites)} sites!')