Make xhtml2pdf optional, fix install on Linux without libcairo (#2659)

* Make xhtml2pdf optional, fix install on Linux without libcairo

Move xhtml2pdf to the new [pdf] extra so default `pip install maigret`
no longer pulls pycairo (which has no Linux/macOS wheels and breaks the
build without libcairo2-dev). save_pdf_report now raises a clear
RuntimeError pointing to `pip install 'maigret[pdf]'`, and the CLI
turns it into a friendly warning instead of a crash. Adds tests
covering the missing-extra path, plus per-OS install docs.

Fix for #2657, #2534

* Make arabic-reshaper and python-bidi optional; idempotent update of db_meta.json and sites.md

* Regenerated poetry.lock

* Update CI workflow to cover minimal installation without PDF deps
This commit is contained in:
Soxoj
2026-05-15 14:33:55 +02:00
committed by GitHub
parent bf84125f3a
commit a7338e97f3
13 changed files with 749 additions and 155 deletions
+116 -83
View File
@@ -3,6 +3,9 @@
This module generates the listing of supported sites in file `SITES.md`
and pretty prints file with sites data.
"""
import io
import os
import re
import sys
import socket
import requests
@@ -13,6 +16,35 @@ from datetime import datetime, timezone
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from maigret.maigret import MaigretDatabase
from utils.generate_db_meta import write_meta_if_changed
# Matches the volatile "updated at (YYYY-MM-DD)" footer line that is appended
# to sites.md on every regeneration; comparisons must ignore it.
SITES_MD_DATE_RE = re.compile(r'\nThe list was updated at \(\d{4}-\d{2}-\d{2}\)\n')
# Stable stand-in substituted for the dated footer before comparing bodies.
SITES_MD_DATE_PLACEHOLDER = '\nThe list was updated at (DATE)\n'


def sites_md_payload_equals(a: str, b: str) -> bool:
    """Compare two sites.md bodies ignoring the volatile 'updated at' date."""
    return SITES_MD_DATE_RE.sub(SITES_MD_DATE_PLACEHOLDER, a) == SITES_MD_DATE_RE.sub(SITES_MD_DATE_PLACEHOLDER, b)


def write_sites_md_if_changed(content: str, path: str) -> bool:
    """Write `content` to `path` only if it differs from the existing file
    by something other than the 'updated at' date.

    Keeps the precommit hook from rewriting the file when the site
    database itself hasn't moved.

    Args:
        content: Full markdown body to persist.
        path: Destination file path.

    Returns:
        True if the file was (re)written, False if it was left untouched.
    """
    # EAFP: read the current file directly instead of the original
    # os.path.exists() pre-check, which was racy (TOCTOU) and cost an
    # extra stat. A missing or unreadable file (FileNotFoundError is a
    # subclass of OSError) simply means "write it", same as before.
    try:
        with open(path, "r", encoding="utf-8") as f:
            existing = f.read()
    except OSError:
        existing = None

    if existing is not None and sites_md_payload_equals(existing, content):
        return False

    with open(path, "w", encoding="utf-8") as f:
        f.write(content)
    return True
RANKS = {str(i):str(i) for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500]}
RANKS.update({
@@ -142,104 +174,105 @@ def main():
print(f"\nUpdating supported sites list (don't worry, it's needed)...")
with open("sites.md", "w") as site_file:
site_file.write(f"""
site_file = io.StringIO()
site_file.write(f"""
## List of supported sites (search methods): total {len(sites_subset)}\n
Rank data fetched from Majestic Million by domains.
""")
if args.dns_check:
print("Checking DNS resolution for all site domains...")
failed = check_sites_dns(sites_subset)
disabled_count = 0
re_enabled_count = 0
for site in sites_subset:
if site.name in failed:
if not site.disabled:
site.disabled = True
disabled_count += 1
print(f" Disabled {site.name}: DNS does not resolve ({get_base_domain(site.url_main)})")
else:
if site.disabled:
# Re-enable previously disabled site if DNS now resolves
# (only if it was likely disabled due to DNS failure)
pass
print(f"DNS check complete: {disabled_count} site(s) disabled, {len(failed)} domain(s) unresolvable.")
majestic_ranks = {}
if args.with_rank:
majestic_ranks = fetch_majestic_million()
if args.dns_check:
print("Checking DNS resolution for all site domains...")
failed = check_sites_dns(sites_subset)
disabled_count = 0
re_enabled_count = 0
for site in sites_subset:
if not args.with_rank:
break
if site.alexa_rank < sys.maxsize and args.empty_only:
continue
if args.exclude_engine_list and site.engine in args.exclude_engine_list:
continue
domain = get_base_domain(site.url_main)
if domain in majestic_ranks:
site.alexa_rank = majestic_ranks[domain]
if site.name in failed:
if not site.disabled:
site.disabled = True
disabled_count += 1
print(f" Disabled {site.name}: DNS does not resolve ({get_base_domain(site.url_main)})")
else:
site.alexa_rank = sys.maxsize
# In memory matching complete, no threads to join
if args.with_rank:
print("Successfully updated ranks matching Majestic Million dataset.")
if site.disabled:
# Re-enable previously disabled site if DNS now resolves
# (only if it was likely disabled due to DNS failure)
pass
print(f"DNS check complete: {disabled_count} site(s) disabled, {len(failed)} domain(s) unresolvable.")
sites_full_list = [(s, int(s.alexa_rank)) for s in sites_subset]
majestic_ranks = {}
if args.with_rank:
majestic_ranks = fetch_majestic_million()
sites_full_list.sort(reverse=False, key=lambda x: x[1])
for site in sites_subset:
if not args.with_rank:
break
while sites_full_list[0][1] == 0:
site = sites_full_list.pop(0)
sites_full_list.append(site)
if site.alexa_rank < sys.maxsize and args.empty_only:
continue
if args.exclude_engine_list and site.engine in args.exclude_engine_list:
continue
for num, site_tuple in enumerate(sites_full_list):
site, rank = site_tuple
url_main = site.url_main
valid_rank = get_step_rank(rank)
all_tags = site.tags
all_tags.sort()
tags = ', ' + ', '.join(all_tags) if all_tags else ''
note = ''
if site.disabled:
note = ', search is disabled'
domain = get_base_domain(site.url_main)
favicon = f"![](https://www.google.com/s2/favicons?domain={url_main})"
site_file.write(f'1. {favicon} [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n')
db.update_site(site)
if domain in majestic_ranks:
site.alexa_rank = majestic_ranks[domain]
else:
site.alexa_rank = sys.maxsize
site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n')
db.save_to_file(args.base_file)
# In memory matching complete, no threads to join
if args.with_rank:
print("Successfully updated ranks matching Majestic Million dataset.")
# Regenerate db_meta.json to stay in sync with data.json
try:
import hashlib, json, os
db_data_raw = open(args.base_file, 'rb').read()
db_data_parsed = json.loads(db_data_raw)
meta = {
"version": 1,
"updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
"sites_count": len(db_data_parsed.get("sites", {})),
"min_maigret_version": "0.5.0",
"data_sha256": hashlib.sha256(db_data_raw).hexdigest(),
"data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json",
}
meta_path = os.path.join(os.path.dirname(args.base_file), "db_meta.json")
with open(meta_path, "w", encoding="utf-8") as mf:
json.dump(meta, mf, indent=4, ensure_ascii=False)
sites_full_list = [(s, int(s.alexa_rank)) for s in sites_subset]
sites_full_list.sort(reverse=False, key=lambda x: x[1])
while sites_full_list[0][1] == 0:
site = sites_full_list.pop(0)
sites_full_list.append(site)
for num, site_tuple in enumerate(sites_full_list):
site, rank = site_tuple
url_main = site.url_main
valid_rank = get_step_rank(rank)
all_tags = site.tags
all_tags.sort()
tags = ', ' + ', '.join(all_tags) if all_tags else ''
note = ''
if site.disabled:
note = ', search is disabled'
favicon = f"![](https://www.google.com/s2/favicons?domain={url_main})"
site_file.write(f'1. {favicon} [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n')
db.update_site(site)
site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n')
db.save_to_file(args.base_file)
statistics_text = db.get_db_stats(is_markdown=True)
site_file.write('## Statistics\n\n')
site_file.write(statistics_text)
sites_md_written = write_sites_md_if_changed(site_file.getvalue(), "sites.md")
if not sites_md_written:
print("sites.md unchanged, skipping write")
# Regenerate db_meta.json to stay in sync with data.json — also a no-op
# if only the timestamp would change.
try:
meta_path = os.path.join(os.path.dirname(args.base_file), "db_meta.json")
meta, meta_written = write_meta_if_changed(
args.base_file,
meta_path,
min_version="0.5.0",
data_url="https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json",
)
if meta_written:
print(f"Updated {meta_path} ({meta['sites_count']} sites)")
except Exception as e:
print(f"Warning: could not regenerate db_meta.json: {e}")
statistics_text = db.get_db_stats(is_markdown=True)
site_file.write('## Statistics\n\n')
site_file.write(statistics_text)
else:
print(f"{meta_path} unchanged, skipping write")
except Exception as e:
print(f"Warning: could not regenerate db_meta.json: {e}")
print("Finished updating supported site listing!")