Parallel execution optimization (#1897)

* Connection failure fix: removed futures, added semaphores

* Additional fixes

* Replaced tqdm with alive_progress, poetry update

* Self-check mode fix, test fixes

* Site check fixes (#1896)

* Fixed incorrect site names, added method to compare sites
This commit is contained in:
Soxoj
2024-11-26 13:55:12 +01:00
committed by GitHub
parent b370bc4c44
commit 324c118530
10 changed files with 1301 additions and 1134 deletions
+6 -14
View File
@@ -3,23 +3,13 @@
This module generates the listing of supported sites in file `SITES.md`
and pretty prints file with sites data.
"""
import aiohttp
import asyncio
import json
import sys
import requests
import logging
import threading
import xml.etree.ElementTree as ET
from datetime import datetime
from argparse import ArgumentParser, RawDescriptionHelpFormatter
import tqdm.asyncio
from maigret.maigret import get_response, site_self_check
from maigret.sites import MaigretSite, MaigretDatabase, MaigretEngine
from maigret.utils import CaseConverter
from maigret.maigret import get_response
from maigret.sites import MaigretDatabase, MaigretEngine
async def check_engine_of_site(site_name, sites_with_engines, future, engine_name, semaphore, logger):
async with semaphore:
@@ -98,8 +88,10 @@ if __name__ == '__main__':
tasks.append(future)
# progress bar
for f in tqdm.asyncio.tqdm.as_completed(tasks):
loop.run_until_complete(f)
with alive_progress(len(tasks), title='Checking sites') as progress:
for f in asyncio.as_completed(tasks):
loop.run_until_complete(f)
progress()
print(f'Total detected {len(new_engine_sites)} sites on engine {engine_name}')
# dict with new found engine sites
+5 -3
View File
@@ -3,7 +3,7 @@ import json
import random
import re
import tqdm.asyncio
import alive_progress
from mock import Mock
import requests
@@ -181,7 +181,7 @@ if __name__ == '__main__':
raw_maigret_data = json.dumps({site.name: site.json for site in sites_subset})
new_sites = []
for site in tqdm.asyncio.tqdm(urls):
for site in alive_progress.alive_it(urls):
site_lowercase = site.lower()
domain_raw = URL_RE.sub('', site_lowercase).strip().strip('/')
@@ -271,7 +271,9 @@ if __name__ == '__main__':
future = asyncio.ensure_future(check_coro)
tasks.append(future)
for f in tqdm.asyncio.tqdm.as_completed(tasks, timeout=TIMEOUT):
with alive_progress(len(tasks), title='Checking sites') as progress:
for f in asyncio.as_completed(tasks):
progress()
try:
loop.run_until_complete(f)
except asyncio.exceptions.TimeoutError:
+1 -1
View File
@@ -137,7 +137,7 @@ Rank data fetched from Alexa by domains.
site_file.write(f'1. {favicon} [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n')
db.update_site(site)
site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc)} UTC)\n')
site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()} UTC)\n')
db.save_to_file(args.base_file)
statistics_text = db.get_db_stats(is_markdown=True)