#!/usr/bin/env python3
"""Maigret: detect sites that run on a known engine.

For every engine in the sites data file that declares `presenseStrs`, this
script requests the main page of each site that has no engine assigned,
marks the sites whose HTML mentions the engine name, applies the engine
settings to them, runs the username self-check and saves the sites that
passed back to the data file.
"""
import asyncio
import json
import logging
from argparse import ArgumentParser, RawDescriptionHelpFormatter

import aiohttp
from alive_progress import alive_bar

# NOTE(review): `site_self_check` is assumed to live next to `get_response`
# in `maigret.maigret` -- confirm against the installed Maigret version.
from maigret.maigret import get_response, site_self_check
from maigret.sites import MaigretDatabase, MaigretEngine


async def check_engine_of_site(site_name, sites_with_engines, future, engine_name, semaphore, logger):
    """Fetch one site and record it if its page mentions the engine.

    Args:
        site_name: Name of the site being probed.
        sites_with_engines: List mutated in place; `site_name` is appended
            to it when the engine is detected.
        future: Pending request handed to `get_response` as `request_future`.
        engine_name: Substring searched for in the returned HTML.
        semaphore: Async context manager that bounds concurrent requests.
        logger: Logger forwarded to `get_response`.

    Returns:
        True when the response has HTML text containing `engine_name`,
        False otherwise.
    """
    async with semaphore:
        response = await get_response(request_future=future, site_name=site_name, logger=logger)

        # get_response yields a 4-tuple; only the page text is needed here.
        html_text, _status_code, _error_text, _exception_text = response

        # NOTE(review): detection matches the engine *name*; `presenseStrs`
        # only gates whether an engine is processed at all -- confirm intent.
        if html_text and engine_name in html_text:
            sites_with_engines.append(site_name)
            return True
    return False


async def update_site_data(site_name, site_data, all_sites, logger, no_progressbar):
    """Run the username self-check for one site and merge the result.

    Args:
        site_name: Key of the site inside `all_sites`.
        site_data: Site object passed to `site_self_check`.
        all_sites: Mapping of site name to site object; the entry for
            `site_name` is updated in place with the self-check result.
        logger: Logger forwarded to `site_self_check`.
        no_progressbar: Forwarded to `site_self_check`.
    """
    updates = await site_self_check(site_name, site_data, logger, no_progressbar)
    all_sites[site_name].update(updates)


async def _detect_engine_sites(engine_name, sites, session, logger):
    """Return names of engine-less sites whose main page mentions the engine."""
    semaphore = asyncio.Semaphore(100)
    request_timeout = aiohttp.ClientTimeout(total=10)
    detected = []
    tasks = []

    # check sites without engine if they look like sites on this engine
    for site_name, site_data in sites.items():
        if site_data.engine:
            continue

        request = session.get(
            url=site_data.url_main,
            allow_redirects=True,
            timeout=request_timeout,
        )
        check_coro = check_engine_of_site(site_name, detected, request, engine_name, semaphore, logger)
        tasks.append(asyncio.ensure_future(check_coro))

    # progress bar
    with alive_bar(len(tasks), title='Checking sites') as progress:
        for finished in asyncio.as_completed(tasks):
            await finished
            progress()

    return detected


async def _process_engine(engine_name, engine_data, sites, db, base_file, logger):
    """Detect, self-check and save the sites that belong to one engine."""
    engine_obj = MaigretEngine(engine_name, engine_data)

    # setup connections for checking both engine and usernames
    connector = aiohttp.TCPConnector(ssl=False)
    connector.verify_ssl = False

    # `async with` closes the session even when a check raises.
    async with aiohttp.ClientSession(connector=connector) as session:
        new_engine_sites = await _detect_engine_sites(engine_name, sites, session, logger)
        print(f'Total detected {len(new_engine_sites)} sites on engine {engine_name}')

        # dict with new found engine sites
        new_sites = {site_name: sites[site_name] for site_name in new_engine_sites}

        # update sites obj from engine
        for site in new_sites.values():
            site.request_future = None
            site.engine = engine_name
            site.update_from_engine(engine_obj)

        # Self-checks run one after another, not concurrently.
        for new_site_name, new_site_data in new_sites.items():
            await update_site_data(new_site_name, new_site_data, new_sites, logger, no_progressbar=True)

        updated_sites_count = 0
        for s, site in new_sites.items():
            site.request_future = None

            if site.disabled:
                print(f'{site.name} failed username checking of engine {engine_name}')
                continue

            site = site.strip_engine_data()

            db.update_site(site)
            updated_sites_count += 1
            # Saved after every site so that progress survives a later failure.
            db.save_to_file(base_file)

            print(f'Site "{s}": ' + json.dumps(site.json, indent=4))

        print(f'Updated total {updated_sites_count} sites!')
        print(f'Checking all sites on engine {engine_name}')


def _parse_args():
    """Parse the command-line options of the script."""
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")
    parser.add_argument('--engine', '-e', help='check only selected engine', type=str)
    return parser.parse_args()


async def main():
    """Check every selected engine against the sites that have no engine yet."""
    args = _parse_args()

    log_level = logging.INFO
    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s  %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level
    )
    logger = logging.getLogger('engines-check')
    logger.setLevel(log_level)

    db = MaigretDatabase()
    sites_subset = db.load_from_file(args.base_file).sites
    sites = {site.name: site for site in sites_subset}

    # The raw JSON is read as well to get the engine definitions.
    with open(args.base_file, "r", encoding="utf-8") as data_file:
        sites_info = json.load(data_file)
        engines = sites_info['engines']

    for engine_name, engine_data in engines.items():
        if args.engine and args.engine != engine_name:
            continue

        if 'presenseStrs' not in engine_data:
            print(f'No features to automatically detect sites on engine {engine_name}')
            continue

        await _process_engine(engine_name, engine_data, sites, db, args.base_file, logger)

    print("\nFinished updating supported site listing!")


if __name__ == '__main__':
    asyncio.run(main())