mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-16 03:15:40 +00:00
Make xhtml2pdf optional, fix install on Linux without libcairo (#2659)
* Make xhtml2pdf optional, fix install on Linux without libcairo. Move xhtml2pdf to the new [pdf] extra so default `pip install maigret` no longer pulls pycairo (which has no Linux/macOS wheels and breaks the build without libcairo2-dev). save_pdf_report now raises a clear RuntimeError pointing to `pip install 'maigret[pdf]'`, and the CLI turns it into a friendly warning instead of a crash. Adds tests covering the missing-extra path, plus per-OS install docs. Fix for #2657, #2534 * Make arabic-reshaper and python-bidi optional; idempotent update of db_meta.json and sites.md * Regenerated poetry.lock * Update CI workflow to cover minimal installation without PDF deps
This commit is contained in:
@@ -0,0 +1,223 @@
|
||||
"""Tests for the 'don't rewrite files unless content actually changed' logic
|
||||
in utils.generate_db_meta and utils.update_site_data. The point is to keep
|
||||
sites.md and db_meta.json untouched when only the embedded timestamp/date
|
||||
would change — so a precommit hook doesn't end up staging a no-op diff
|
||||
every time someone runs the updater.
|
||||
"""
|
||||
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from utils.generate_db_meta import (
|
||||
build_meta,
|
||||
meta_payload_equals,
|
||||
write_meta_if_changed,
|
||||
)
|
||||
from utils.update_site_data import (
|
||||
sites_md_payload_equals,
|
||||
write_sites_md_if_changed,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# generate_db_meta
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _write_data_json(path, sites):
    """Write ``{"sites": sites}`` to ``path`` as UTF-8 encoded JSON."""
    payload = {"sites": sites}
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(payload))
|
||||
|
||||
|
||||
def test_meta_payload_equals_ignores_timestamp():
    """Dicts that differ only in ``updated_at`` are reported as equal."""
    shared_fields = {"sites_count": 10, "data_sha256": "abc"}
    earlier = {**shared_fields, "updated_at": "2026-01-01T00:00:00Z"}
    later = {**shared_fields, "updated_at": "2027-12-31T23:59:59Z"}
    assert meta_payload_equals(earlier, later)
|
||||
|
||||
|
||||
def test_meta_payload_equals_detects_real_change():
    """A different ``sites_count`` must make the comparison return falsy."""
    stamp = "2026-01-01T00:00:00Z"
    before = {"sites_count": 10, "data_sha256": "abc", "updated_at": stamp}
    after = {"sites_count": 11, "data_sha256": "abc", "updated_at": stamp}
    assert not meta_payload_equals(before, after)
|
||||
|
||||
|
||||
def test_write_meta_creates_file_when_missing(tmp_path):
    """With no db_meta.json on disk, the call writes one and reports it."""
    source_file = tmp_path / "data.json"
    meta_file = tmp_path / "db_meta.json"
    _write_data_json(source_file, {"GitHub": {}})

    returned_meta, was_written = write_meta_if_changed(
        str(source_file),
        str(meta_file),
        "0.6.0",
        "https://example/data.json",
    )

    assert was_written is True
    assert meta_file.exists()

    # The file on disk must agree with the metadata handed back to the caller.
    stored = json.loads(meta_file.read_text())
    assert stored["sites_count"] == 1
    assert stored["updated_at"] == returned_meta["updated_at"]
|
||||
|
||||
|
||||
def test_write_meta_skips_when_only_timestamp_would_change(tmp_path):
    """A rerun over an unchanged data.json must leave db_meta.json alone."""
    source_file = tmp_path / "data.json"
    meta_file = tmp_path / "db_meta.json"
    _write_data_json(source_file, {"GitHub": {}})
    call_args = (
        str(source_file),
        str(meta_file),
        "0.6.0",
        "https://example/data.json",
    )

    # Seed the metadata file using an old timestamp.
    seed_time = datetime(2026, 1, 1, tzinfo=timezone.utc)
    _, first_written = write_meta_if_changed(*call_args, now=seed_time)
    assert first_written is True
    bytes_after_seed = meta_file.read_bytes()

    # Same data.json, later `now`: nothing should be rewritten.
    later_time = datetime(2027, 6, 15, tzinfo=timezone.utc)
    _, second_written = write_meta_if_changed(*call_args, now=later_time)
    assert second_written is False

    # Byte-identical file, so it still carries the seed timestamp.
    assert meta_file.read_bytes() == bytes_after_seed
    stored = json.loads(meta_file.read_text())
    assert stored["updated_at"] == "2026-01-01T00:00:00Z"
|
||||
|
||||
|
||||
def test_write_meta_writes_when_data_sha256_changes(tmp_path):
    """Changing data.json between runs forces db_meta.json to be rewritten."""
    source_file = tmp_path / "data.json"
    meta_file = tmp_path / "db_meta.json"
    min_version = "0.6.0"
    data_url = "https://example/data.json"

    _write_data_json(source_file, {"GitHub": {}})
    first_run_time = datetime(2026, 1, 1, tzinfo=timezone.utc)
    write_meta_if_changed(
        str(source_file),
        str(meta_file),
        min_version,
        data_url,
        now=first_run_time,
    )

    # Add a second site so the content of data.json really differs.
    _write_data_json(source_file, {"GitHub": {}, "GitLab": {}})
    second_run_time = datetime(2027, 6, 15, tzinfo=timezone.utc)
    _, was_written = write_meta_if_changed(
        str(source_file),
        str(meta_file),
        min_version,
        data_url,
        now=second_run_time,
    )

    assert was_written is True
    stored = json.loads(meta_file.read_text())
    assert stored["sites_count"] == 2
    assert stored["updated_at"] == "2027-06-15T00:00:00Z"
|
||||
|
||||
|
||||
def test_write_meta_writes_when_min_version_changes(tmp_path):
    """Bumping the minimum version alone is enough to trigger a rewrite."""
    source_file = tmp_path / "data.json"
    meta_file = tmp_path / "db_meta.json"
    data_url = "https://example/data.json"
    _write_data_json(source_file, {"GitHub": {}})

    # First run records the older minimum version.
    write_meta_if_changed(
        str(source_file),
        str(meta_file),
        "0.5.0",
        data_url,
        now=datetime(2026, 1, 1, tzinfo=timezone.utc),
    )

    # Second run: identical data.json, bumped minimum version.
    bumped_version = "0.6.0"
    _, was_written = write_meta_if_changed(
        str(source_file),
        str(meta_file),
        bumped_version,
        data_url,
        now=datetime(2026, 1, 2, tzinfo=timezone.utc),
    )

    assert was_written is True
    stored = json.loads(meta_file.read_text())
    assert stored["min_maigret_version"] == "0.6.0"
|
||||
|
||||
|
||||
def test_write_meta_writes_when_existing_file_is_corrupt(tmp_path):
    """An unparseable db_meta.json is overwritten with valid JSON."""
    source_file = tmp_path / "data.json"
    meta_file = tmp_path / "db_meta.json"
    _write_data_json(source_file, {"GitHub": {}})
    meta_file.write_text("this is not valid json")

    _, was_written = write_meta_if_changed(
        str(source_file),
        str(meta_file),
        "0.6.0",
        "https://example/data.json",
    )

    assert was_written is True
    # json.loads raises if the rewritten file is still not parseable.
    json.loads(meta_file.read_text())
|
||||
|
||||
|
||||
def test_build_meta_uses_provided_now(tmp_path):
    """The ``now`` argument is what ends up in the ``updated_at`` field."""
    source_file = tmp_path / "data.json"
    _write_data_json(source_file, {"GitHub": {}})
    injected_now = datetime(2030, 7, 4, 12, 0, 0, tzinfo=timezone.utc)

    result = build_meta(
        str(source_file),
        "0.6.0",
        "https://example/data.json",
        now=injected_now,
    )

    assert result["updated_at"] == "2030-07-04T12:00:00Z"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# update_site_data
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
# Minimal sites.md fixture; `{date}` is the only placeholder and is filled in
# with str.format by the tests below. The trailing empty item supplies the
# final newline.
_SITES_MD_TEMPLATE = "\n".join(
    [
        "## List of supported sites (search methods): total 1",
        "",
        "Rank data fetched from Majestic Million by domains.",
        "",
        "1. [GitHub](https://github.com/)*: top 100*",
        "",
        "The list was updated at ({date})",
        "## Statistics",
        "",
        "Some stats.",
        "",
    ]
)
|
||||
|
||||
|
||||
def test_sites_md_payload_equals_ignores_date():
    """Documents that differ only in the embedded date compare equal."""
    older_doc, newer_doc = (
        _SITES_MD_TEMPLATE.format(date=stamp)
        for stamp in ("2026-01-01", "2027-12-31")
    )
    assert sites_md_payload_equals(older_doc, newer_doc)
|
||||
|
||||
|
||||
def test_sites_md_payload_equals_detects_body_change():
    """Renaming a site in the body must make the comparison return falsy."""
    original_doc = _SITES_MD_TEMPLATE.format(date="2026-01-01")
    edited_doc = original_doc.replace("GitHub", "GitLab")
    assert not sites_md_payload_equals(original_doc, edited_doc)
|
||||
|
||||
|
||||
def test_write_sites_md_creates_file_when_missing(tmp_path):
    """With no sites.md on disk, the content is written verbatim."""
    sites_file = tmp_path / "sites.md"
    rendered = _SITES_MD_TEMPLATE.format(date="2026-05-15")

    was_written = write_sites_md_if_changed(rendered, str(sites_file))

    assert was_written is True
    assert sites_file.read_text() == rendered
|
||||
|
||||
|
||||
def test_write_sites_md_skips_when_only_date_would_change(tmp_path):
    """A date-only difference must not cause sites.md to be rewritten."""
    sites_file = tmp_path / "sites.md"
    existing_doc = _SITES_MD_TEMPLATE.format(date="2026-01-01")
    sites_file.write_text(existing_doc)

    # Same body as what is on disk; only the date differs.
    redated_doc = _SITES_MD_TEMPLATE.format(date="2027-12-31")
    was_written = write_sites_md_if_changed(redated_doc, str(sites_file))

    assert was_written is False
    # The file still holds the seeded text, old date included.
    assert sites_file.read_text() == existing_doc
|
||||
|
||||
|
||||
def test_write_sites_md_writes_when_body_changes(tmp_path):
    """A body edit is written out even though the date is unchanged."""
    sites_file = tmp_path / "sites.md"
    existing_doc = _SITES_MD_TEMPLATE.format(date="2026-01-01")
    sites_file.write_text(existing_doc)

    # Same date as the seeded file, different site name in the body.
    edited_doc = existing_doc.replace("GitHub", "GitLab")
    was_written = write_sites_md_if_changed(edited_doc, str(sites_file))

    assert was_written is True
    on_disk = sites_file.read_text()
    assert "GitLab" in on_disk
    assert "GitHub" not in on_disk
|
||||
@@ -3,6 +3,9 @@
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
import textwrap
|
||||
import pytest
|
||||
from io import StringIO
|
||||
|
||||
@@ -442,6 +445,73 @@ def test_pdf_report():
|
||||
assert os.path.exists(report_name)
|
||||
|
||||
|
||||
def test_save_pdf_report_raises_helpful_error_without_xhtml2pdf(
    monkeypatch, tmp_path
):
    """save_pdf_report raises a RuntimeError naming the missing extra."""
    # A None entry in sys.modules makes a later `import` of that name raise
    # ImportError, which imitates an environment lacking the optional 'pdf'
    # extra while xhtml2pdf stays installed for the other tests.
    for blocked in ('xhtml2pdf', 'xhtml2pdf.pisa'):
        monkeypatch.setitem(sys.modules, blocked, None)

    report_context = generate_report_context(TEST)
    pdf_path = tmp_path / "report.pdf"

    with pytest.raises(RuntimeError) as raised:
        save_pdf_report(str(pdf_path), report_context)

    message = str(raised.value)
    for expected_fragment in ("maigret[pdf]", "pip install"):
        assert expected_fragment in message
    assert not pdf_path.exists()
|
||||
|
||||
|
||||
def test_xhtml2pdf_is_not_module_level_dependency():
    """maigret.report must not bind xhtml2pdf or pisa as module globals.

    Regression guard: hoisting `import xhtml2pdf` or
    `from xhtml2pdf import pisa` to the top of maigret/report.py would make
    the optional extra mandatory for every Maigret user.
    """
    import maigret.report as report_module

    bound_names = vars(report_module)
    for forbidden in ('xhtml2pdf', 'pisa'):
        assert forbidden not in bound_names
|
||||
|
||||
|
||||
def test_import_maigret_without_pdf_extras():
    """Importing maigret succeeds when the [pdf] extra packages are blocked.

    A fresh interpreter maps each optional PDF package to ``None`` in
    ``sys.modules`` before any maigret module is loaded, then imports the
    package, the report module and ``save_pdf_report``. This mirrors what a
    plain `pip install maigret` (without [pdf]) would give a user.
    """
    script = textwrap.dedent(
        """
        import sys
        for name in (
            'xhtml2pdf', 'xhtml2pdf.pisa',
            'arabic_reshaper',
            'bidi', 'bidi.algorithm',
        ):
            sys.modules[name] = None

        import maigret
        import maigret.report
        from maigret.report import save_pdf_report

        assert callable(save_pdf_report)
        print("OK")
        """
    )
    command = [sys.executable, "-c", script]
    completed = subprocess.run(command, capture_output=True, text=True)

    # Include both streams in the failure message to make a broken import
    # in the child process diagnosable from the test output.
    assert completed.returncode == 0, (
        f"stdout={completed.stdout!r} stderr={completed.stderr!r}"
    )
    assert "OK" in completed.stdout
|
||||
|
||||
|
||||
def test_text_report():
|
||||
context = generate_report_context(TEST)
|
||||
report_text = get_plaintext_report(context)
|
||||
|
||||
Reference in New Issue
Block a user