maigret/utils/update_site_data.py
Soxoj a7338e97f3 Make xhtml2pdf optional, fix install on Linux without libcairo (#2659)
* Make xhtml2pdf optional, fix install on Linux without libcairo

Move xhtml2pdf to the new [pdf] extra so default `pip install maigret`
no longer pulls pycairo (which has no Linux/macOS wheels and breaks the
build without libcairo2-dev). save_pdf_report now raises a clear
RuntimeError pointing to `pip install 'maigret[pdf]'`, and the CLI
turns it into a friendly warning instead of a crash. Adds tests
covering the missing-extra path, plus per-OS install docs. A sketch of the
guard pattern follows this commit message.

Fix for #2657, #2534

* Make arabic-reshaper and python-bidi optional; idempotent update of db_meta.json and sites.md

* Regenerated poetry.lock

* Update CI workflow to cover minimal installation without PDF deps
2026-05-15 14:33:55 +02:00
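
A minimal sketch of the guard pattern the first bullet describes (the real save_pdf_report signature and module layout in maigret may differ; only the lazy import and the RuntimeError hint are taken from the commit text):

def save_pdf_report(html: str, output_path: str) -> None:
    try:
        # xhtml2pdf is only present after: pip install 'maigret[pdf]'
        from xhtml2pdf import pisa
    except ImportError as err:
        raise RuntimeError(
            "PDF reports need optional dependencies: pip install 'maigret[pdf]'"
        ) from err
    with open(output_path, "wb") as out:
        pisa.CreatePDF(html, dest=out)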


#!/usr/bin/env python3
"""Maigret: Supported Site Listing with Alexa ranking and country tags
This module generates the listing of supported sites in file `SITES.md`
and pretty prints file with sites data.
"""
import csv
import io
import logging
import os
import re
import socket
import sys
import threading
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from datetime import datetime, timezone
from urllib.parse import urlparse

import requests

from maigret.maigret import MaigretDatabase
from utils.generate_db_meta import write_meta_if_changed
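# Contract of write_meta_if_changed() as relied on in main() below:
# write_meta_if_changed(data_path, meta_path, min_version=..., data_url=...)
# returns a (meta, written) tuple, where meta carries at least 'sites_count',
# and skips the write when only the generation timestamp would change.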
SITES_MD_DATE_RE = re.compile(r'\nThe list was updated at \(\d{4}-\d{2}-\d{2}\)\n')
SITES_MD_DATE_PLACEHOLDER = '\nThe list was updated at (DATE)\n'
def sites_md_payload_equals(a: str, b: str) -> bool:
"""Compare two sites.md bodies ignoring the volatile 'updated at' date."""
return SITES_MD_DATE_RE.sub(SITES_MD_DATE_PLACEHOLDER, a) == SITES_MD_DATE_RE.sub(SITES_MD_DATE_PLACEHOLDER, b)
def write_sites_md_if_changed(content: str, path: str) -> bool:
"""Write `content` to `path` only if it differs from the existing file
by something other than the 'updated at' date. Returns True if a write
happened. Keeps the precommit hook from rewriting the file when the
site database itself hasn't moved.
"""
if os.path.exists(path):
try:
with open(path, "r", encoding="utf-8") as f:
existing = f.read()
except OSError:
existing = None
if existing is not None and sites_md_payload_equals(existing, content):
return False
with open(path, "w", encoding="utf-8") as f:
f.write(content)
return True
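# Example: a regeneration run that only bumps the '(YYYY-MM-DD)' date is a
# no-op here: write_sites_md_if_changed(same_body_new_date, "sites.md")
# returns False and leaves the file untouched, keeping pre-commit idempotent.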
RANKS = {str(i): str(i) for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500]}
RANKS.update({
'1000': '1K',
'5000': '5K',
'10000': '10K',
'100000': '100K',
'10000000': '10M',
'50000000': '50M',
'100000000': '100M',
})
def fetch_majestic_million():
print("Fetching Majestic Million CSV (this may take a few seconds)...")
ranks = {}
url = "https://downloads.majestic.com/majestic_million.csv"
    try:
        # A timeout keeps the script from hanging on a stalled download;
        # `.text` buffers the whole body anyway, so stream=True buys nothing here.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        csv_file = io.StringIO(response.text)
reader = csv.reader(csv_file)
next(reader) # skip headers
for row in reader:
if not row or len(row) < 3:
continue
rank = int(row[0])
domain = row[2].lower()
ranks[domain] = rank
except Exception as e:
logging.error(f"Error fetching Majestic Million: {e}")
print(f"Loaded {len(ranks)} domains from Majestic Million.")
return ranks
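# A lower-memory variant (illustrative sketch, not called anywhere below):
# stream and parse the CSV line by line instead of buffering the entire
# response body in memory via `response.text`.
def fetch_majestic_million_streaming(timeout=60):
    ranks = {}
    url = "https://downloads.majestic.com/majestic_million.csv"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            # Fall back to UTF-8 so iter_lines can decode when no charset is sent.
            response.encoding = response.encoding or "utf-8"
            reader = csv.reader(response.iter_lines(decode_unicode=True))
            next(reader, None)  # skip the header row
            for row in reader:
                if len(row) >= 3:
                    ranks[row[2].lower()] = int(row[0])
    except Exception as e:
        logging.error(f"Error fetching Majestic Million: {e}")
    return ranks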
def get_base_domain(url):
try:
netloc = urlparse(url).netloc
if netloc.startswith('www.'):
netloc = netloc[4:]
return netloc.lower()
except Exception:
return ""
def check_dns(domain, timeout=5):
"""Check if a domain resolves via DNS. Returns True if it resolves."""
try:
socket.setdefaulttimeout(timeout)
socket.getaddrinfo(domain, None)
return True
except (socket.gaierror, socket.timeout, OSError):
return False
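# NOTE on check_dns(): socket.setdefaulttimeout() does not reliably bound
# getaddrinfo(), because name resolution goes through the OS resolver with its
# own timeouts. The `timeout` argument is therefore best-effort; failures are
# still caught by the except clause once the resolver gives up.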
def check_sites_dns(sites):
"""Check DNS resolution for all sites. Returns a set of site names that failed."""
SKIP_TLDS = ('.onion', '.i2p')
domains = {}
for site in sites:
domain = get_base_domain(site.url_main)
if domain and not any(domain.endswith(tld) for tld in SKIP_TLDS):
domains.setdefault(domain, []).append(site)
failed_sites = set()
results = {}
def resolve(domain):
results[domain] = check_dns(domain)
threads = []
for domain in domains:
t = threading.Thread(target=resolve, args=(domain,))
threads.append(t)
t.start()
for t in threads:
t.join()
for domain, resolved in results.items():
if not resolved:
for site in domains[domain]:
failed_sites.add(site.name)
logging.warning(f"DNS resolution failed for {domain}")
return failed_sites
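# The per-domain thread spawn above can create thousands of threads for a full
# database. A bounded variant (illustrative sketch, not wired in) using a
# worker pool:
def check_sites_dns_pooled(sites, max_workers=32):
    from concurrent.futures import ThreadPoolExecutor  # local import: sketch only
    SKIP_TLDS = ('.onion', '.i2p')
    domains = {}
    for site in sites:
        domain = get_base_domain(site.url_main)
        if domain and not any(domain.endswith(tld) for tld in SKIP_TLDS):
            domains.setdefault(domain, []).append(site)
    failed_sites = set()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # pool.map preserves input order, so results line up with the keys.
        for domain, resolved in zip(domains, pool.map(check_dns, domains)):
            if not resolved:
                failed_sites.update(s.name for s in domains[domain])
                logging.warning(f"DNS resolution failed for {domain}")
    return failed_sites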
def get_step_rank(rank):
    def get_readable_rank(r):
        return RANKS[str(r)]
    valid_step_ranks = sorted(map(int, RANKS.keys()))
    if rank == 0 or rank == sys.maxsize:
        return get_readable_rank(valid_step_ranks[-1])
    # Clamp to the largest step so a rank above every bucket cannot raise
    # an IndexError from an empty filter result.
    higher_steps = [step for step in valid_step_ranks if step >= rank]
    return get_readable_rank(higher_steps[0] if higher_steps else valid_step_ranks[-1])
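# Examples, given the RANKS table above:
#   get_step_rank(3)     -> '3'
#   get_step_rank(1234)  -> '5K'   (first step >= the rank)
#   get_step_rank(0)     -> '100M' (unranked sites fall into the largest bucket)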
def main():
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")
    parser.add_argument('--with-rank', help='update site ranks from the Majestic Million dataset',
                        action='store_true')
    parser.add_argument('--empty-only', help='update only sites without a rank', action='store_true')
    parser.add_argument('--exclude-engine', help='do not update sites that use a certain engine',
                        action="append", dest="exclude_engine_list", default=[])
    parser.add_argument('--dns-check', help='disable sites whose domains do not resolve via DNS',
                        action='store_true')
    args = parser.parse_args()
db = MaigretDatabase()
sites_subset = db.load_from_file(args.base_file).sites
print(f"\nUpdating supported sites list (don't worry, it's needed)...")
site_file = io.StringIO()
site_file.write(f"""
## List of supported sites (search methods): total {len(sites_subset)}\n
Rank data is taken from the Majestic Million dataset, matched by domain.
""")
    if args.dns_check:
        print("Checking DNS resolution for all site domains...")
        failed = check_sites_dns(sites_subset)
        disabled_count = 0
        for site in sites_subset:
            if site.name in failed:
                if not site.disabled:
                    site.disabled = True
                    disabled_count += 1
                    print(f"  Disabled {site.name}: DNS does not resolve ({get_base_domain(site.url_main)})")
            elif site.disabled:
                # A resolving domain is not enough to re-enable a site here:
                # we cannot tell whether it was disabled for a DNS failure or
                # for some other reason, so previously disabled sites stay off.
                pass
        print(f"DNS check complete: {disabled_count} site(s) disabled, {len(failed)} domain(s) unresolvable.")
    if args.with_rank:
        majestic_ranks = fetch_majestic_million()
        for site in sites_subset:
            if args.empty_only and site.alexa_rank < sys.maxsize:
                continue
            if args.exclude_engine_list and site.engine in args.exclude_engine_list:
                continue
            domain = get_base_domain(site.url_main)
            # Unmatched domains get sys.maxsize so they sort after ranked sites.
            site.alexa_rank = majestic_ranks.get(domain, sys.maxsize)
        print("Successfully updated ranks from the Majestic Million dataset.")
    sites_full_list = [(s, int(s.alexa_rank)) for s in sites_subset]
    sites_full_list.sort(key=lambda x: x[1])
    # Move unranked (rank 0) sites to the end. A counted rotation avoids
    # spinning forever in the edge case where every site has rank zero.
    zero_ranked = sum(1 for _, rank in sites_full_list if rank == 0)
    for _ in range(zero_ranked):
        sites_full_list.append(sites_full_list.pop(0))
    for site, rank in sites_full_list:
        url_main = site.url_main
        valid_rank = get_step_rank(rank)
        # Sort a copy of the tags for display instead of mutating site.tags.
        tags = ', ' + ', '.join(sorted(site.tags)) if site.tags else ''
note = ''
if site.disabled:
note = ', search is disabled'
favicon = f"![](https://www.google.com/s2/favicons?domain={url_main})"
site_file.write(f'1. {favicon} [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n')
db.update_site(site)
site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n')
db.save_to_file(args.base_file)
statistics_text = db.get_db_stats(is_markdown=True)
site_file.write('## Statistics\n\n')
site_file.write(statistics_text)
sites_md_written = write_sites_md_if_changed(site_file.getvalue(), "sites.md")
if not sites_md_written:
print("sites.md unchanged, skipping write")
# Regenerate db_meta.json to stay in sync with data.json — also a no-op
# if only the timestamp would change.
try:
meta_path = os.path.join(os.path.dirname(args.base_file), "db_meta.json")
meta, meta_written = write_meta_if_changed(
args.base_file,
meta_path,
min_version="0.5.0",
data_url="https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json",
)
if meta_written:
print(f"Updated {meta_path} ({meta['sites_count']} sites)")
else:
print(f"{meta_path} unchanged, skipping write")
except Exception as e:
print(f"Warning: could not regenerate db_meta.json: {e}")
print("Finished updating supported site listing!")
if __name__ == '__main__':
main()
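
# Typical invocations (assuming the script is run from the repository root,
# so the default --base path and the `utils` import both resolve):
#   python3 utils/update_site_data.py --with-rank
#   python3 utils/update_site_data.py --with-rank --empty-only --dns-check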