Make xhtml2pdf optional, fix install on Linux without libcairo (#2659)

* Make xhtml2pdf optional, fix install on Linux without libcairo

Move xhtml2pdf to the new [pdf] extra so default `pip install maigret`
no longer pulls pycairo (which has no Linux/macOS wheels and breaks the
build without libcairo2-dev). save_pdf_report now raises a clear
RuntimeError pointing to `pip install 'maigret[pdf]'`, and the CLI
turns it into a friendly warning instead of a crash. Adds tests
covering the missing-extra path, plus per-OS install docs.

Fix for #2657, #2534

* Make arabic-reshaper and python-bidi optional; idempotent update of db_meta.json and sites.md

* Regenerated poetry.lock

* Update CI workflow to cover minimal installation without PDF deps
This commit is contained in:
Soxoj
2026-05-15 14:33:55 +02:00
committed by GitHub
parent bf84125f3a
commit a7338e97f3
13 changed files with 749 additions and 155 deletions
+223
View File
@@ -0,0 +1,223 @@
"""Tests for the 'don't rewrite files unless content actually changed' logic
in utils.generate_db_meta and utils.update_site_data. The point is to keep
sites.md and db_meta.json untouched when only the embedded timestamp/date
would change — so a precommit hook doesn't end up staging a no-op diff
every time someone runs the updater.
"""
import json
from datetime import datetime, timezone
from utils.generate_db_meta import (
build_meta,
meta_payload_equals,
write_meta_if_changed,
)
from utils.update_site_data import (
sites_md_payload_equals,
write_sites_md_if_changed,
)
# ---------------------------------------------------------------------------
# generate_db_meta
# ---------------------------------------------------------------------------
def _write_data_json(path, sites):
    """Serialize *sites* under a top-level "sites" key to *path* as JSON."""
    payload = {"sites": sites}
    with open(path, "w", encoding="utf-8") as fh:
        fh.write(json.dumps(payload))
def test_meta_payload_equals_ignores_timestamp():
    """Two payloads differing only in 'updated_at' must compare as equal."""
    base = {"sites_count": 10, "data_sha256": "abc"}
    older = dict(base, updated_at="2026-01-01T00:00:00Z")
    newer = dict(base, updated_at="2027-12-31T23:59:59Z")
    assert meta_payload_equals(older, newer)
def test_meta_payload_equals_detects_real_change():
    """A change in any non-timestamp field must not be treated as equal."""
    stamp = "2026-01-01T00:00:00Z"
    before = {"sites_count": 10, "data_sha256": "abc", "updated_at": stamp}
    after = {"sites_count": 11, "data_sha256": "abc", "updated_at": stamp}
    assert not meta_payload_equals(before, after)
def test_write_meta_creates_file_when_missing(tmp_path):
    """First run must create db_meta.json and report it as written."""
    data_file = tmp_path / "data.json"
    meta_file = tmp_path / "db_meta.json"
    _write_data_json(data_file, {"GitHub": {}})
    meta, written = write_meta_if_changed(
        str(data_file), str(meta_file), "0.6.0", "https://example/data.json"
    )
    assert written is True
    assert meta_file.exists()
    stored = json.loads(meta_file.read_text())
    assert stored["sites_count"] == 1
    assert stored["updated_at"] == meta["updated_at"]
def test_write_meta_skips_when_only_timestamp_would_change(tmp_path):
    """Identical data with a newer clock must leave the file untouched."""
    data_file = tmp_path / "data.json"
    meta_file = tmp_path / "db_meta.json"
    _write_data_json(data_file, {"GitHub": {}})
    # Seed the meta file using an old, fixed clock.
    seeded_at = datetime(2026, 1, 1, tzinfo=timezone.utc)
    _, first_write = write_meta_if_changed(
        str(data_file),
        str(meta_file),
        "0.6.0",
        "https://example/data.json",
        now=seeded_at,
    )
    assert first_write is True
    before = meta_file.read_bytes()
    # Re-run with a later `now` but unchanged data.json — expect a no-op.
    later = datetime(2027, 6, 15, tzinfo=timezone.utc)
    _, second_write = write_meta_if_changed(
        str(data_file),
        str(meta_file),
        "0.6.0",
        "https://example/data.json",
        now=later,
    )
    assert second_write is False
    # Byte-for-byte identical, so the OLD timestamp is still on disk.
    assert meta_file.read_bytes() == before
    assert json.loads(meta_file.read_text())["updated_at"] == "2026-01-01T00:00:00Z"
def test_write_meta_writes_when_data_sha256_changes(tmp_path):
    """A genuine data.json change (new site) must trigger a rewrite."""
    data_file = tmp_path / "data.json"
    meta_file = tmp_path / "db_meta.json"
    _write_data_json(data_file, {"GitHub": {}})
    write_meta_if_changed(
        str(data_file),
        str(meta_file),
        "0.6.0",
        "https://example/data.json",
        now=datetime(2026, 1, 1, tzinfo=timezone.utc),
    )
    # Adding a site moves both the sha256 and the sites_count.
    _write_data_json(data_file, {"GitHub": {}, "GitLab": {}})
    rewrite_at = datetime(2027, 6, 15, tzinfo=timezone.utc)
    _, written = write_meta_if_changed(
        str(data_file),
        str(meta_file),
        "0.6.0",
        "https://example/data.json",
        now=rewrite_at,
    )
    assert written is True
    stored = json.loads(meta_file.read_text())
    assert stored["sites_count"] == 2
    assert stored["updated_at"] == "2027-06-15T00:00:00Z"
def test_write_meta_writes_when_min_version_changes(tmp_path):
    """Bumping min_maigret_version alone must force a rewrite."""
    data_file = tmp_path / "data.json"
    meta_file = tmp_path / "db_meta.json"
    _write_data_json(data_file, {"GitHub": {}})
    write_meta_if_changed(
        str(data_file),
        str(meta_file),
        "0.5.0",
        "https://example/data.json",
        now=datetime(2026, 1, 1, tzinfo=timezone.utc),
    )
    _, written = write_meta_if_changed(
        str(data_file),
        str(meta_file),
        "0.6.0",  # bumped
        "https://example/data.json",
        now=datetime(2026, 1, 2, tzinfo=timezone.utc),
    )
    assert written is True
    stored = json.loads(meta_file.read_text())
    assert stored["min_maigret_version"] == "0.6.0"
def test_write_meta_writes_when_existing_file_is_corrupt(tmp_path):
    """An unparseable meta file must be rewritten rather than compared."""
    data_file = tmp_path / "data.json"
    meta_file = tmp_path / "db_meta.json"
    _write_data_json(data_file, {"GitHub": {}})
    meta_file.write_text("this is not valid json")
    _, written = write_meta_if_changed(
        str(data_file), str(meta_file), "0.6.0", "https://example/data.json"
    )
    assert written is True
    json.loads(meta_file.read_text())  # must be valid JSON again
def test_build_meta_uses_provided_now(tmp_path):
    """The injected `now` must drive the 'updated_at' field verbatim."""
    data_file = tmp_path / "data.json"
    _write_data_json(data_file, {"GitHub": {}})
    frozen = datetime(2030, 7, 4, 12, 0, 0, tzinfo=timezone.utc)
    result = build_meta(
        str(data_file), "0.6.0", "https://example/data.json", now=frozen
    )
    assert result["updated_at"] == "2030-07-04T12:00:00Z"
# ---------------------------------------------------------------------------
# update_site_data
# ---------------------------------------------------------------------------
# Minimal sites.md fixture with a single `{date}` placeholder, mirroring the
# generated layout: header, rank note, site list, update date, stats footer.
_SITES_MD_TEMPLATE = "".join(
    (
        "## List of supported sites (search methods): total 1\n\n",
        "Rank data fetched from Majestic Million by domains.\n\n",
        "1. [GitHub](https://github.com/)*: top 100*\n",
        "\nThe list was updated at ({date})\n",
        "## Statistics\n\n",
        "Some stats.\n",
    )
)
def test_sites_md_payload_equals_ignores_date():
    """Rendering the same body with two different dates must compare equal."""
    first = _SITES_MD_TEMPLATE.format(date="2026-01-01")
    second = _SITES_MD_TEMPLATE.format(date="2027-12-31")
    assert sites_md_payload_equals(first, second)
def test_sites_md_payload_equals_detects_body_change():
    """A site-name change in the body must be detected as a real diff."""
    original = _SITES_MD_TEMPLATE.format(date="2026-01-01")
    edited = original.replace("GitHub", "GitLab")
    assert not sites_md_payload_equals(original, edited)
def test_write_sites_md_creates_file_when_missing(tmp_path):
    """First run must create sites.md with the exact rendered content."""
    destination = tmp_path / "sites.md"
    rendered = _SITES_MD_TEMPLATE.format(date="2026-05-15")
    assert write_sites_md_if_changed(rendered, str(destination)) is True
    assert destination.read_text() == rendered
def test_write_sites_md_skips_when_only_date_would_change(tmp_path):
    """A date-only difference must not rewrite the file on disk."""
    destination = tmp_path / "sites.md"
    original = _SITES_MD_TEMPLATE.format(date="2026-01-01")
    destination.write_text(original)
    # Same body, newer date: the updater must treat it as unchanged.
    refreshed = _SITES_MD_TEMPLATE.format(date="2027-12-31")
    assert write_sites_md_if_changed(refreshed, str(destination)) is False
    # The OLD date is still there — the file was never touched.
    assert destination.read_text() == original
def test_write_sites_md_writes_when_body_changes(tmp_path):
    """A body change (site renamed) must be written through to disk."""
    destination = tmp_path / "sites.md"
    destination.write_text(_SITES_MD_TEMPLATE.format(date="2026-01-01"))
    renamed = _SITES_MD_TEMPLATE.format(date="2026-01-01").replace(
        "GitHub", "GitLab"
    )
    assert write_sites_md_if_changed(renamed, str(destination)) is True
    final = destination.read_text()
    assert "GitLab" in final
    assert "GitHub" not in final
+70
View File
@@ -3,6 +3,9 @@
import copy
import json
import os
import subprocess
import sys
import textwrap
import pytest
from io import StringIO
@@ -442,6 +445,73 @@ def test_pdf_report():
assert os.path.exists(report_name)
def test_save_pdf_report_raises_helpful_error_without_xhtml2pdf(
    monkeypatch, tmp_path
):
    """A missing [pdf] extra must surface a RuntimeError naming the fix."""
    # A None entry in sys.modules makes a later `import` raise ImportError,
    # emulating an environment where the optional 'pdf' extra was never
    # installed — without touching the real test environment.
    monkeypatch.setitem(sys.modules, 'xhtml2pdf', None)
    monkeypatch.setitem(sys.modules, 'xhtml2pdf.pisa', None)
    report_context = generate_report_context(TEST)
    pdf_path = tmp_path / "report.pdf"
    with pytest.raises(RuntimeError) as err:
        save_pdf_report(str(pdf_path), report_context)
    message = str(err.value)
    assert "maigret[pdf]" in message
    assert "pip install" in message
    assert not pdf_path.exists()
def test_xhtml2pdf_is_not_module_level_dependency():
    """Fail if `xhtml2pdf`/`pisa` ever become top-level imports again."""
    # Hoisting `import xhtml2pdf` / `from xhtml2pdf import pisa` to the top
    # of maigret/report.py would force the optional extra on every user.
    import maigret.report as report_module
    exported = vars(report_module)
    assert 'xhtml2pdf' not in exported
    assert 'pisa' not in exported
def test_import_maigret_without_pdf_extras():
    """A fresh interpreter with every [pdf] package blocked still imports maigret."""
    # Blocking the modules *before* maigret loads mirrors what a user who ran
    # a plain `pip install maigret` (no [pdf] extra) would experience: the
    # package, the report module, and save_pdf_report must import cleanly.
    probe = textwrap.dedent(
        """
        import sys
        for name in (
            'xhtml2pdf', 'xhtml2pdf.pisa',
            'arabic_reshaper',
            'bidi', 'bidi.algorithm',
        ):
            sys.modules[name] = None
        import maigret
        import maigret.report
        from maigret.report import save_pdf_report
        assert callable(save_pdf_report)
        print("OK")
        """
    )
    completed = subprocess.run(
        [sys.executable, "-c", probe],
        capture_output=True,
        text=True,
    )
    assert completed.returncode == 0, (
        f"stdout={completed.stdout!r} stderr={completed.stderr!r}"
    )
    assert "OK" in completed.stdout
def test_text_report():
context = generate_report_context(TEST)
report_text = get_plaintext_report(context)