mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-07 14:34:33 +00:00
Added some scripts (#355)
This commit is contained in:
Executable
+152
@@ -0,0 +1,152 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Maigret: Supported Site Listing with Alexa ranking and country tags
|
||||
This module generates the listing of supported sites in file `SITES.md`
|
||||
and pretty prints file with sites data.
|
||||
"""
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import requests
|
||||
import logging
|
||||
import threading
|
||||
import xml.etree.ElementTree as ET
|
||||
from datetime import datetime
|
||||
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||
|
||||
import tqdm.asyncio
|
||||
|
||||
from maigret.maigret import get_response, site_self_check
|
||||
from maigret.sites import MaigretSite, MaigretDatabase, MaigretEngine
|
||||
from maigret.utils import CaseConverter
|
||||
|
||||
|
||||
async def check_engine_of_site(site_name, sites_with_engines, future, engine_name, semaphore, logger):
    """Detect whether a site's main page appears to run on a given engine.

    Awaits the pending HTTP request *future* (concurrency bounded by
    *semaphore*) and searches the returned HTML for the engine marker
    string. On a match, *site_name* is appended to the shared
    *sites_with_engines* list as a side effect.

    Args:
        site_name: Name of the site being probed.
        sites_with_engines: Shared list accumulating names of detected sites.
        future: Pending aiohttp request for the site's main page.
        engine_name: Marker substring identifying the engine in page HTML.
        semaphore: asyncio.Semaphore limiting concurrent requests.
        logger: Logger forwarded to get_response.

    Returns:
        True if the engine marker was found in the page, False otherwise.
    """
    async with semaphore:
        response = await get_response(request_future=future,
                                      site_name=site_name,
                                      logger=logger)

        # get_response yields (html, status, error text, exception text);
        # only the HTML body is needed for marker detection.
        html_text, _status_code, _error_text, _exception_text = response

        if html_text and engine_name in html_text:
            sites_with_engines.append(site_name)
            return True
    return False
|
||||
|
||||
|
||||
if __name__ == '__main__':
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
                            )
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")

    parser.add_argument('--engine', '-e', help='check only selected engine', type=str)

    args = parser.parse_args()

    log_level = logging.INFO
    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level
    )
    logger = logging.getLogger('engines-check')
    logger.setLevel(log_level)

    # Load the sites database twice: once through MaigretDatabase (object
    # model used for updates) and once as raw JSON (to read engine specs).
    db = MaigretDatabase()
    sites_subset = db.load_from_file(args.base_file).sites
    sites = {site.name: site for site in sites_subset}

    with open(args.base_file, "r", encoding="utf-8") as data_file:
        sites_info = json.load(data_file)
        engines = sites_info['engines']

    for engine_name, engine_data in engines.items():
        # Optionally restrict the run to a single engine from the CLI.
        if args.engine and args.engine != engine_name:
            continue

        # 'presenseStrs' is the key used in data.json (spelling preserved);
        # without these markers the engine cannot be auto-detected.
        if 'presenseStrs' not in engine_data:
            print(f'No features to automatically detect sites on engine {engine_name}')
            continue

        engine_obj = MaigretEngine(engine_name, engine_data)

        # Setup connections for checking both engine and usernames.
        # ssl=False already disables certificate verification; the former
        # `connector.verify_ssl = False` attribute poke was redundant and
        # relies on a deprecated aiohttp interface, so it has been removed.
        connector = aiohttp.TCPConnector(ssl=False)
        session = aiohttp.ClientSession(connector=connector)

        # At most 100 simultaneous requests per engine pass.
        sem = asyncio.Semaphore(100)
        loop = asyncio.get_event_loop()
        tasks = []

        # Check sites without an engine to see if they look like sites
        # on this engine.
        new_engine_sites = []
        for site_name, site_data in sites.items():
            if site_data.engine:
                continue

            future = session.get(url=site_data.url_main,
                                 allow_redirects=True,
                                 timeout=10,
                                 )

            check_engine_coro = check_engine_of_site(site_name, new_engine_sites, future, engine_name, sem, logger)
            future = asyncio.ensure_future(check_engine_coro)
            tasks.append(future)

        # Progress bar: drive each detection task to completion as it
        # finishes.
        for f in tqdm.asyncio.tqdm.as_completed(tasks):
            loop.run_until_complete(f)

        print(f'Total detected {len(new_engine_sites)} sites on engine {engine_name}')
        # Dict with newly found engine sites.
        new_sites = {site_name: sites[site_name] for site_name in new_engine_sites}

        # Update sites objects from the engine template.
        for site_name, site in new_sites.items():
            site.request_future = None
            site.engine = engine_name
            site.update_from_engine(engine_obj)

        async def update_site_data(site_name, site_data, all_sites, logger, no_progressbar):
            """Run a username self-check for one site and merge the results."""
            updates = await site_self_check(site_name, site_data, logger, no_progressbar)
            all_sites[site_name].update(updates)

        tasks = []
        # NOTE(review): the gather-based variant below was abandoned in
        # favor of sequential checks; kept for reference.
        # for new_site_name, new_site_data in new_sites.items():
        #     coro = update_site_data(new_site_name, new_site_data, new_sites, logger)
        #     future = asyncio.ensure_future(coro)
        #     tasks.append(future)

        # asyncio.gather(*tasks)
        for new_site_name, new_site_data in new_sites.items():
            coro = update_site_data(new_site_name, new_site_data, new_sites, logger, no_progressbar=True)
            loop.run_until_complete(coro)

        updated_sites_count = 0

        for s in new_sites:
            site = new_sites[s]
            # Drop the non-serializable request future before saving.
            site.request_future = None

            if site.disabled:
                print(f'{site.name} failed username checking of engine {engine_name}')
                continue

            site = site.strip_engine_data()

            db.update_site(site)
            updated_sites_count += 1
            # Save after every site on purpose: an interrupted run keeps
            # all results confirmed so far.
            db.save_to_file(args.base_file)

            print(f'Site "{s}": ' + json.dumps(site.json, indent=4))

        print(f'Updated total {updated_sites_count} sites!')
        print(f'Checking all sites on engine {engine_name}')

        # Close the per-engine HTTP session before moving on.
        loop.run_until_complete(session.close())

    print("\nFinished updating supported site listing!")
|
||||
Reference in New Issue
Block a user