From a7338e97f30d4bd09a527d827214c9c74db55311 Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Fri, 15 May 2026 14:33:55 +0200 Subject: [PATCH] Make xhtml2pdf optional, fix install on Linux without libcairo (#2659) * Make xhtml2pdf optional, fix install on Linux without libcairo Move xhtml2pdf to the new [pdf] extra so default `pip install maigret` no longer pulls pycairo (which has no Linux/macOS wheels and breaks the build without libcairo2-dev). save_pdf_report now raises a clear RuntimeError pointing to `pip install 'maigret[pdf]'`, and the CLI turns it into a friendly warning instead of a crash. Adds tests covering the missing-extra path, plus per-OS install docs. Fix for #2657, #2534 * Make arabic-reshaper and python-bidi optional; idempotent update of db_meta.json and sites.md * Regenerated poetry.lock * Update CI workflow to cover minimal installation without PDF deps --- .github/workflows/python-package.yml | 31 +++- .github/workflows/update-site-data.yml | 4 - README.md | 2 + docs/source/installation.rst | 144 ++++++++++++++++ maigret/maigret.py | 10 +- maigret/report.py | 16 +- maigret/resources/db_meta.json | 2 +- poetry.lock | 92 ++++++---- pyproject.toml | 20 ++- tests/test_idempotent_writes.py | 223 +++++++++++++++++++++++++ tests/test_report.py | 70 ++++++++ utils/generate_db_meta.py | 91 +++++++--- utils/update_site_data.py | 199 +++++++++++++--------- 13 files changed, 749 insertions(+), 155 deletions(-) create mode 100644 tests/test_idempotent_writes.py diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index ec4df6a..6c2fa32 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -46,4 +46,33 @@ jobs: uses: actions/upload-artifact@v4 with: name: htmlcov-${{ strategy.job-index }} - path: htmlcov \ No newline at end of file + path: htmlcov + + minimal-install: + # Verify a fresh `pip install maigret` succeeds and the test suite + # 
passes WITHOUT the optional [pdf] extra and WITHOUT system cairo. + # Catches regressions where core code accidentally grows a hard + # dependency on xhtml2pdf / pycairo / libcairo2. + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install Maigret without [pdf] extra (no libcairo on host) + run: | + python -m pip install --upgrade pip + pip install . + pip install pytest pytest-asyncio pytest-rerunfailures pytest-httpserver + + - name: Smoke-check the install + run: | + python -c "import maigret; from maigret.report import save_pdf_report; print('import OK')" + maigret --version + + - name: Run tests without [pdf] extra + run: pytest --reruns 3 --reruns-delay 5 tests \ No newline at end of file diff --git a/.github/workflows/update-site-data.yml b/.github/workflows/update-site-data.yml index 3f0d24c..aa1732e 100644 --- a/.github/workflows/update-site-data.yml +++ b/.github/workflows/update-site-data.yml @@ -18,10 +18,6 @@ jobs: ref: main fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. - - name: Install system dependencies - run: | - sudo apt-get update && sudo apt-get install -y libcairo2-dev - - name: Build application run: | pip3 install . diff --git a/README.md b/README.md index 0c4a248..dff917e 100644 --- a/README.md +++ b/README.md @@ -173,6 +173,8 @@ docker build --target web -t maigret-web . # Web UI image Build errors? See the [troubleshooting guide](https://maigret.readthedocs.io/en/latest/installation.html#troubleshooting). +PDF reports (`--pdf`) are an optional extra — install with `pip install 'maigret[pdf]'`. They need system-level graphics libraries on Linux/macOS; see the [PDF reports section](https://maigret.readthedocs.io/en/latest/installation.html#optional-pdf-reports-maigret-pdf) for per-OS install steps. 
+ ## Usage ### Examples diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 43f4dbc..3671449 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -58,6 +58,17 @@ Maigret ships with a bundled site database. After installation from PyPI (or any # usage maigret username +PDF report support is shipped as an **optional extra** because it relies on +system-level graphics libraries that pip cannot install for you. If you plan to +use ``--pdf``, install Maigret with the ``pdf`` extra: + +.. code-block:: bash + + pip3 install 'maigret[pdf]' + +See :ref:`pdf-extra` below for the full background on why PDF support is +optional and how to fix the most common build errors. + Development version (GitHub) ---------------------------- @@ -126,6 +137,139 @@ After installing the system dependencies, retry the maigret installation. If you continue to have issues, consider using Docker instead, which includes all necessary dependencies. +.. _pdf-extra: + +Optional: PDF reports (``maigret[pdf]``) +---------------------------------------- + +The ``--pdf`` report format is shipped as an optional extra. To enable it: + +.. code-block:: bash + + pip3 install 'maigret[pdf]' + +If PDF support is not installed and you pass ``--pdf``, Maigret prints a +warning and continues without crashing — every other output format +(``--html``, ``--json``, ``--csv``, ``--txt``, ``--xmind``, ``--graph``) +keeps working. + +Why is PDF optional? +~~~~~~~~~~~~~~~~~~~~ + +Maigret renders PDFs by converting an HTML template, and that conversion +pipeline ultimately depends on the ``cairo`` graphics library through a +chain of Python packages roughly shaped like:: + + maigret[pdf] → xhtml2pdf → svglib → rlPyCairo → pycairo → libcairo2 (system) + +The bottom of that chain is a C library — ``libcairo2`` — that has to exist +on the host *before* pip can build the Python bindings. 
The Python binding +package (``pycairo``) currently ships **only Windows wheels** on PyPI; on +Linux and macOS pip falls back to building from source, and the build fails +the moment ``pkg-config`` cannot find ``cairo``. The error looks like:: + + ../cairo/meson.build:31:12: ERROR: Dependency "cairo" not found (tried pkg-config) + note: This error originates from a subprocess, and is likely not a problem with pip. + error: metadata-generation-failed + +Pulling this whole chain for every Maigret install just so the much smaller +group of users who actually want PDFs can have them is a poor trade — so +``xhtml2pdf`` is gated behind the ``pdf`` extra. + +Two more packages — ``arabic-reshaper`` and ``python-bidi`` — are bundled +into the same extra. Maigret core never imports them; they are only used +by ``xhtml2pdf`` to shape Arabic glyphs and lay out right-to-left text in +PDFs. ``python-bidi`` v0.5+ is also a Rust binding, so on niche platforms +without a published wheel it would otherwise pull in a Cargo build for +users who never asked for PDF support. + +Installing the system prerequisites +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Install the cairo headers, ``pkg-config``, and a working C toolchain +*before* running ``pip install 'maigret[pdf]'``. + +**Debian / Ubuntu / Linux Mint / Kali:** + +.. code-block:: bash + + sudo apt update + sudo apt install -y libcairo2-dev pkg-config python3-dev build-essential + pip3 install --upgrade pip setuptools wheel + pip3 install 'maigret[pdf]' + +**Fedora / RHEL / CentOS:** + +.. code-block:: bash + + sudo dnf install -y cairo-devel pkgconfig python3-devel gcc + pip3 install 'maigret[pdf]' + +**Arch Linux:** + +.. code-block:: bash + + sudo pacman -S cairo pkgconf base-devel + pip3 install 'maigret[pdf]' + +**Alpine Linux:** + +.. code-block:: bash + + sudo apk add cairo-dev pkgconf python3-dev build-base + pip3 install 'maigret[pdf]' + +**macOS (Homebrew):** + +.. 
code-block:: bash + + brew install cairo pkg-config + pip3 install --upgrade pip setuptools wheel + pip3 install 'maigret[pdf]' + +**Windows:** + +No system packages are needed — ``pycairo`` ships prebuilt wheels for +Windows. Just run: + +.. code-block:: bash + + pip install 'maigret[pdf]' + +**Google Cloud Shell / Colab / Replit / generic CI:** + +These environments behave like Debian/Ubuntu — install the same +``libcairo2-dev pkg-config python3-dev build-essential`` packages before +``pip install 'maigret[pdf]'``. If you do not control the base image and +cannot ``apt install``, skip the extra and use ``--html`` reports instead; +HTML reports contain the same data and open in any browser. + +``maigret: command not found`` after install +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If pip prints warnings like:: + + WARNING: The scripts maigret and update_sitesmd are installed in + '/home/<user>/.local/bin' which is not on PATH. + +…and ``maigret --version`` then fails with ``command not found``, your +``--user`` install put the entry-point script in a directory the shell does +not search. Add it to ``PATH``: + +.. code-block:: bash + + echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.bashrc + source ~/.bashrc + +Or install into a virtual environment, where the entry point lands in the +venv's ``bin/`` automatically: + +.. 
code-block:: bash + + python3 -m venv ~/.venvs/maigret + source ~/.venvs/maigret/bin/activate + pip install 'maigret[pdf]' # or just `pip install maigret` + Optional: Cloudflare bypass solver ---------------------------------- diff --git a/maigret/maigret.py b/maigret/maigret.py index d4a65bf..eeb4b81 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -908,8 +908,14 @@ async def main(): if args.pdf: username = username.replace('/', '_') filename = report_filepath_tpl.format(username=username, postfix='.pdf') - save_pdf_report(filename, report_context) - query_notify.warning(f'PDF report on all usernames saved in {filename}') + try: + save_pdf_report(filename, report_context) + except RuntimeError as e: + query_notify.warning(str(e)) + else: + query_notify.warning( + f'PDF report on all usernames saved in {filename}' + ) if args.md: username = username.replace('/', '_') diff --git a/maigret/report.py b/maigret/report.py index 3f79305..c79eef5 100644 --- a/maigret/report.py +++ b/maigret/report.py @@ -78,13 +78,23 @@ def save_html_report(filename: str, context: dict): f.write(filled_template) +PDF_EXTRA_HINT = ( + "PDF reports require the optional 'pdf' extra. " + "Install it with: pip install 'maigret[pdf]'" +) + + def save_pdf_report(filename: str, context: dict): + # Imported lazily so that users without the optional 'pdf' extra + # can still import maigret.report and use other report formats. 
+ try: + from xhtml2pdf import pisa # type: ignore[import-untyped] + except ImportError as e: + raise RuntimeError(PDF_EXTRA_HINT) from e + template, css = generate_report_template(is_pdf=True) filled_template = template.render(**context) - # moved here to speed up the launch of Maigret - from xhtml2pdf import pisa # type: ignore[import-untyped] - with open(filename, "w+b") as f: pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css) diff --git a/maigret/resources/db_meta.json b/maigret/resources/db_meta.json index bc1ebcd..510a8f4 100644 --- a/maigret/resources/db_meta.json +++ b/maigret/resources/db_meta.json @@ -1,6 +1,6 @@ { "version": 1, - "updated_at": "2026-05-13T10:39:40Z", + "updated_at": "2026-05-15T12:30:52Z", "sites_count": 3154, "min_maigret_version": "0.6.0", "data_sha256": "f86d77a18bcd1d353933b64d99953634ce5e2966860f25bacd5e3de5659fb8a7", diff --git a/poetry.lock b/poetry.lock index c1dbe27..afab42b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.3.3 and should not be changed by hand. 
[[package]] name = "about-time" @@ -236,11 +236,12 @@ version = "3.0.1" description = "Reconstruct Arabic sentences to be used in applications that do not support Arabic" optional = false python-versions = ">=3.10" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "arabic_reshaper-3.0.1-py3-none-any.whl", hash = "sha256:41c5adc2420f85758eada7e880251c4b6a2adbd83377bd27e5d4eba71f648bc7"}, {file = "arabic_reshaper-3.0.1.tar.gz", hash = "sha256:a0d9b2a9fa29b5f2c1d705f407adf6ca4242405b9cac0e5cc09e6c4f3f8fb68c"}, ] +markers = {main = "extra == \"pdf\""} [package.extras] with-fonttools = ["fonttools (>=4.0)"] @@ -269,11 +270,12 @@ version = "1.5.1" description = "Fast ASN.1 parser and serializer with definitions for private keys, public keys, certificates, CRL, OCSP, CMS, PKCS#3, PKCS#7, PKCS#8, PKCS#12, PKCS#5, X.509 and TSP" optional = false python-versions = "*" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "asn1crypto-1.5.1-py2.py3-none-any.whl", hash = "sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67"}, {file = "asn1crypto-1.5.1.tar.gz", hash = "sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c"}, ] +markers = {main = "extra == \"pdf\""} [[package]] name = "ast-serialize" @@ -463,7 +465,7 @@ version = "2026.4.22" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "certifi-2026.4.22-py3-none-any.whl", hash = "sha256:3cb2210c8f88ba2318d29b0388d1023c8492ff72ecdde4ebdaddbb13a31b1c4a"}, {file = "certifi-2026.4.22.tar.gz", hash = "sha256:8d455352a37b71bf76a79caa83a3d6c25afee4a385d632127b6afb3963f1c580"}, @@ -475,7 +477,7 @@ version = "2.0.0" description = "Foreign Function Interface for Python calling C code." 
optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44"}, {file = "cffi-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49"}, @@ -562,6 +564,7 @@ files = [ {file = "cffi-2.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:b882b3df248017dba09d6b16defe9b5c407fe32fc7c65a9c69798e6175601be9"}, {file = "cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529"}, ] +markers = {dev = "platform_python_implementation != \"PyPy\""} [package.dependencies] pycparser = {version = "*", markers = "implementation_name != \"PyPy\""} @@ -770,6 +773,7 @@ files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +markers = {dev = "sys_platform == \"win32\" or platform_system == \"Windows\""} [[package]] name = "coverage" @@ -899,7 +903,7 @@ version = "46.0.7" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." 
optional = false python-versions = "!=3.9.0,!=3.9.1,>=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "cryptography-46.0.7-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:ea42cbe97209df307fdc3b155f1b6fa2577c0defa8f1f7d3be7d31d189108ad4"}, {file = "cryptography-46.0.7-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b36a4695e29fe69215d75960b22577197aca3f7a25b9cf9d165dcfe9d80bc325"}, @@ -951,6 +955,7 @@ files = [ {file = "cryptography-46.0.7-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:258514877e15963bd43b558917bc9f54cf7cf866c38aa576ebf47a77ddbc43a4"}, {file = "cryptography-46.0.7.tar.gz", hash = "sha256:e4cfd68c5f3e0bfdad0d38e023239b96a2fe84146481852dffbcca442c245aa5"}, ] +markers = {main = "extra == \"pdf\""} [package.dependencies] cffi = {version = ">=2.0.0", markers = "python_full_version >= \"3.9.0\" and platform_python_implementation != \"PyPy\""} @@ -972,11 +977,12 @@ version = "0.7.0" description = "CSS selectors for Python ElementTree" optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "cssselect2-0.7.0-py3-none-any.whl", hash = "sha256:fd23a65bfd444595913f02fc71f6b286c29261e354c41d722ca7a261a49b5969"}, {file = "cssselect2-0.7.0.tar.gz", hash = "sha256:1ccd984dab89fc68955043aca4e1b03e0cf29cad9880f6e28e3ba7a74b14aa5a"}, ] +markers = {main = "extra == \"pdf\""} [package.dependencies] tinycss2 = "*" @@ -1119,7 +1125,7 @@ version = "2.5.1" description = "Freetype python bindings" optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "freetype-py-2.5.1.zip", hash = "sha256:cfe2686a174d0dd3d71a9d8ee9bf6a2c23f5872385cf8ce9f24af83d076e2fbd"}, {file = "freetype_py-2.5.1-py3-none-macosx_10_9_universal2.whl", hash = "sha256:d01ded2557694f06aa0413f3400c0c0b2b5ebcaabeef7aaf3d756be44f51e90b"}, @@ -1129,6 +1135,7 @@ files = [ {file = "freetype_py-2.5.1-py3-none-musllinux_1_1_x86_64.whl", hash = 
"sha256:3c1aefc4f0d5b7425f014daccc5fdc7c6f914fb7d6a695cc684f1c09cd8c1660"}, {file = "freetype_py-2.5.1-py3-none-win_amd64.whl", hash = "sha256:0b7f8e0342779f65ca13ef8bc103938366fecade23e6bb37cb671c2b8ad7f124"}, ] +markers = {main = "extra == \"pdf\""} [[package]] name = "frozenlist" @@ -1284,7 +1291,7 @@ version = "1.1" description = "HTML parser based on the WHATWG HTML specification" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "html5lib-1.1-py2.py3-none-any.whl", hash = "sha256:0d78f8fde1c230e99fe37986a60526d7049ed4bf8a9fadbad5f00e22e58e041d"}, {file = "html5lib-1.1.tar.gz", hash = "sha256:b2e5b40261e20f354d198eae92afc10d750afb487ed5e50f9c4eaf07c184146f"}, @@ -1306,7 +1313,7 @@ version = "3.15" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "idna-3.15-py3-none-any.whl", hash = "sha256:048adeaf8c2d788c40fee287673ccaa74c24ffd8dcf09ffa555a2fbb59f10ac8"}, {file = "idna-3.15.tar.gz", hash = "sha256:ca962446ea538f7092a95e057da437618e886f4d349216d2b1e294abfdb65fdc"}, @@ -1542,7 +1549,7 @@ version = "6.1.0" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "lxml-6.1.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:41dcc4c7b10484257cbd6c37b83ddb26df2b0e5aff5ac00d095689015af868ec"}, {file = "lxml-6.1.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a31286dbb5e74c8e9a5344465b77ab4c5bd511a253b355b5ca2fae7e579fafec"}, @@ -2133,11 +2140,12 @@ version = "1.3.0" description = "TLS (SSL) sockets, key generation, encryption, decryption, signing, verification and KDFs using the OS crypto libraries. Does not require a compiler, and relies on the OS for patching. 
Works on Windows, OS X and Linux/BSD." optional = false python-versions = "*" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "oscrypto-1.3.0-py2.py3-none-any.whl", hash = "sha256:2b2f1d2d42ec152ca90ccb5682f3e051fb55986e1b170ebde472b133713e7085"}, {file = "oscrypto-1.3.0.tar.gz", hash = "sha256:6f5fef59cb5b3708321db7cca56aed8ad7e662853351e7991fcf60ec606d47a4"}, ] +markers = {main = "extra == \"pdf\""} [package.dependencies] asn1crypto = ">=1.5.1" @@ -2482,7 +2490,7 @@ version = "1.29.0" description = "Python interface for cairo" optional = false python-versions = ">=3.10" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "pycairo-1.29.0-cp310-cp310-win32.whl", hash = "sha256:96c67e6caba72afd285c2372806a0175b1aa2f4537aa88fb4d9802d726effcd1"}, {file = "pycairo-1.29.0-cp310-cp310-win_amd64.whl", hash = "sha256:65bddd944aee9f7d7d72821b1c87e97593856617c2820a78d589d66aa8afbd08"}, @@ -2503,6 +2511,7 @@ files = [ {file = "pycairo-1.29.0-cp314-cp314t-win_arm64.whl", hash = "sha256:caba0837a4b40d47c8dfb0f24cccc12c7831e3dd450837f2a356c75f21ce5a15"}, {file = "pycairo-1.29.0.tar.gz", hash = "sha256:f3f7fde97325cae80224c09f12564ef58d0d0f655da0e3b040f5807bd5bd3142"}, ] +markers = {main = "extra == \"pdf\""} [[package]] name = "pycares" @@ -2638,12 +2647,12 @@ version = "2.22" description = "C parser in Python" optional = false python-versions = ">=3.8" -groups = ["main"] -markers = "implementation_name != \"PyPy\"" +groups = ["main", "dev"] files = [ {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, ] +markers = {main = "implementation_name != \"PyPy\"", dev = "platform_python_implementation != \"PyPy\" and implementation_name != \"PyPy\""} [[package]] name = "pyflakes" @@ -2678,11 +2687,12 @@ version = "0.25.3" description = "Tools for stamping and signing PDF 
files" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "pyHanko-0.25.3-py3-none-any.whl", hash = "sha256:d66ec499f057191df100f322c2fd22949057a9b0d981f4e75bc077c1a817497f"}, {file = "pyhanko-0.25.3.tar.gz", hash = "sha256:e879fd44e20f4b7726e75c62e8c7b0c41ea41f8fa5bda626bc7d206ae3d30dec"}, ] +markers = {main = "extra == \"pdf\""} [package.dependencies] asn1crypto = ">=1.5.1" @@ -2714,11 +2724,12 @@ version = "0.26.5" description = "Validates X.509 certificates and paths; forked from wbond/certvalidator" optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "pyhanko_certvalidator-0.26.5-py3-none-any.whl", hash = "sha256:86a56df420bfb273ba881826b76245a53b2bd039fea7a7826231dbe76d761a8a"}, {file = "pyhanko_certvalidator-0.26.5.tar.gz", hash = "sha256:800f5a7744d23870a5203cb38007689902c79c44e7374dab0c9b02e1b1a89bd4"}, ] +markers = {main = "extra == \"pdf\""} [package.dependencies] asn1crypto = ">=1.5.1" @@ -2753,11 +2764,12 @@ version = "6.10.2" description = "A pure-python PDF library capable of splitting, merging, cropping, and transforming PDF files" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "pypdf-6.10.2-py3-none-any.whl", hash = "sha256:aa53be9826655b51c96741e5d7983ca224d898ac0a77896e64636810517624aa"}, {file = "pypdf-6.10.2.tar.gz", hash = "sha256:7d09ce108eff6bf67465d461b6ef352dcb8d84f7a91befc02f904455c6eea11d"}, ] +markers = {main = "extra == \"pdf\""} [package.dependencies] typing_extensions = {version = ">=4.0", markers = "python_version < \"3.11\""} @@ -2904,7 +2916,7 @@ version = "0.6.10" description = "Python Bidi layout wrapping the Rust crate unicode-bidi" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "python_bidi-0.6.10-cp310-cp310-macosx_10_12_x86_64.whl", hash = 
"sha256:327e570f10443995d3697e8096bc337970dfc32cd5339759fa4e87093cf5cdf9"}, {file = "python_bidi-0.6.10-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fc012f8738e21462b8b173278ef9278a822373a64f558ac1bfa36eceb56296df"}, @@ -3046,6 +3058,7 @@ files = [ {file = "python_bidi-0.6.10-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:9545c3cd8238a79ab7e0ff7b27326bef3439001207984ea47fa3be31551d364e"}, {file = "python_bidi-0.6.10.tar.gz", hash = "sha256:a7853e894f723675489ac49aa4b52dc8eac87d7a67b5940631c8c9d2aab46f90"}, ] +markers = {main = "extra == \"pdf\""} [package.extras] dev = ["nox", "pytest"] @@ -3164,7 +3177,7 @@ version = "6.0.2" description = "YAML parser and emitter for Python" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, @@ -3220,6 +3233,7 @@ files = [ {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] +markers = {main = "extra == \"pdf\""} [[package]] name = "qrcode" @@ -3227,11 +3241,12 @@ version = "8.0" description = "QR Code image generator" optional = false python-versions = "<4.0,>=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "qrcode-8.0-py3-none-any.whl", hash = "sha256:9fc05f03305ad27a709eb742cf3097fa19e6f6f93bb9e2f039c0979190f6f1b1"}, {file = "qrcode-8.0.tar.gz", hash = "sha256:025ce2b150f7fe4296d116ee9bad455a6643ab4f6e7dce541613a4758cbce347"}, ] +markers = {main = "extra == \"pdf\""} [package.dependencies] colorama = {version = "*", markers = "sys_platform == \"win32\""} @@ -3270,7 +3285,7 
@@ version = "2.34.1" description = "Python HTTP for Humans." optional = false python-versions = ">=3.10" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "requests-2.34.1-py3-none-any.whl", hash = "sha256:bf38a3ff993960d3dd819c08862c40b3c703306eb7c744fcd9f4ddbb95b548f0"}, {file = "requests-2.34.1.tar.gz", hash = "sha256:0fc5669f2b69704449fe1552360bd2a73a54512dfd03e65529157f1513322beb"}, @@ -3344,11 +3359,12 @@ version = "0.4.0" description = "Plugin backend renderer for reportlab.graphics.renderPM" optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "rlpycairo-0.4.0-py3-none-any.whl", hash = "sha256:3ce83825d5761c03bc3571c7db12a336ad51417e63189e3512d11b8922576aa9"}, {file = "rlpycairo-0.4.0.tar.gz", hash = "sha256:07c2c3c47828e83d9c09657a54ecbcd1a97aac9dc199780234456d3473faadc7"}, ] +markers = {main = "extra == \"pdf\""} [package.dependencies] freetype-py = ">=2.3" @@ -3360,7 +3376,7 @@ version = "1.17.0" description = "Python 2 and 3 compatibility utilities" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, @@ -3432,11 +3448,12 @@ version = "1.6.0" description = "A pure-Python library for reading and converting SVG" optional = false python-versions = ">=3.9" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "svglib-1.6.0-py3-none-any.whl", hash = "sha256:9aea8e2e81cbbf9c844460e4c7dc90e0a06aea7983bc201975ccd279d7b2d194"}, {file = "svglib-1.6.0.tar.gz", hash = "sha256:4c38a274a744ef0d1677f55d5d62fc0fb798819f813e52872a796e615741733d"}, ] +markers = {main = "extra == \"pdf\""} [package.dependencies] cssselect2 = ">=0.2.0" @@ -3454,11 +3471,12 @@ version = "1.4.0" description = "A 
tiny CSS parser" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "tinycss2-1.4.0-py3-none-any.whl", hash = "sha256:3a49cf47b7675da0b15d0c6e1df8df4ebd96e9394bb905a5775adb0d884c5289"}, {file = "tinycss2-1.4.0.tar.gz", hash = "sha256:10c0972f6fc0fbee87c3edb76549357415e94548c1ae10ebccdea16fb404a9b7"}, ] +markers = {main = "extra == \"pdf\""} [package.dependencies] webencodings = ">=0.4" @@ -3584,12 +3602,12 @@ version = "2024.2" description = "Provider of IANA time zone data" optional = false python-versions = ">=2" -groups = ["main"] -markers = "platform_system == \"Windows\"" +groups = ["main", "dev"] files = [ {file = "tzdata-2024.2-py2.py3-none-any.whl", hash = "sha256:a48093786cdcde33cad18c2555e8532f34422074448fbc874186f0abd79565cd"}, {file = "tzdata-2024.2.tar.gz", hash = "sha256:7d85cc416e9382e69095b7bdf4afd9e3880418a2413feec7069d533d6b4e31cc"}, ] +markers = {main = "extra == \"pdf\" and platform_system == \"Windows\"", dev = "platform_system == \"Windows\""} [[package]] name = "tzlocal" @@ -3597,11 +3615,12 @@ version = "5.2" description = "tzinfo object for the local timezone" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "tzlocal-5.2-py3-none-any.whl", hash = "sha256:49816ef2fe65ea8ac19d19aa7a1ae0551c834303d5014c6d5a62e4cbda8047b8"}, {file = "tzlocal-5.2.tar.gz", hash = "sha256:8d399205578f1a9342816409cc1e46a93ebd5755e39ea2d85334bea911bf0e6e"}, ] +markers = {main = "extra == \"pdf\""} [package.dependencies] tzdata = {version = "*", markers = "platform_system == \"Windows\""} @@ -3615,11 +3634,12 @@ version = "4.0.3" description = "URI parsing, classification and composition" optional = false python-versions = ">=3.7" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "uritools-4.0.3-py3-none-any.whl", hash = "sha256:bae297d090e69a0451130ffba6f2f1c9477244aa0a5543d66aed2d9f77d0dd9c"}, {file = "uritools-4.0.3.tar.gz", hash = 
"sha256:ee06a182a9c849464ce9d5fa917539aacc8edd2a4924d1b7aabeeecabcae3bc2"}, ] +markers = {main = "extra == \"pdf\""} [[package]] name = "urllib3" @@ -3627,7 +3647,7 @@ version = "2.7.0" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.10" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897"}, {file = "urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c"}, @@ -3657,7 +3677,7 @@ version = "0.5.1" description = "Character encoding aliases for legacy web content" optional = false python-versions = "*" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, @@ -3687,11 +3707,12 @@ version = "0.2.17" description = "PDF generator using HTML and CSS" optional = false python-versions = ">=3.8" -groups = ["main"] +groups = ["main", "dev"] files = [ {file = "xhtml2pdf-0.2.17-py3-none-any.whl", hash = "sha256:61a7ecac829fed518f7dbcb916e9d56bea6e521e02e54644b3d0ca33f0658315"}, {file = "xhtml2pdf-0.2.17.tar.gz", hash = "sha256:09ddbc31aa0e38a16f2f3cb73be89af5f7c968c17a564afdd685d280e39c526d"}, ] +markers = {main = "extra == \"pdf\""} [package.dependencies] arabic-reshaper = ">=3.0.0" @@ -3866,7 +3887,10 @@ idna = ">=2.0" multidict = ">=4.0" propcache = ">=0.2.1" +[extras] +pdf = ["arabic-reshaper", "python-bidi", "xhtml2pdf"] + [metadata] lock-version = "2.1" python-versions = "^3.10" -content-hash = "7a178b95a83821789c83e5b693cc4701a1b99988e7147504ff2b7b34b8065d9b" +content-hash = "eeae363c45c18085321a7c0cbdb7835713a0ca4256aebc7c4abe984ad855c8a8" diff --git a/pyproject.toml 
b/pyproject.toml index 6d24669..35e03fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -39,7 +39,7 @@ python = "^3.10" aiodns = ">=3,<5" aiohttp = "^3.12.14" aiohttp-socks = ">=0.10.1,<0.12.0" -arabic-reshaper = "^3.0.0" +arabic-reshaper = {version = "^3.0.0", optional = true} async-timeout = "^5.0.1" attrs = ">=25.3,<27.0" certifi = ">=2025.6.15,<2027.0.0" @@ -57,7 +57,7 @@ multidict = "^6.6.3" pycountry = ">=24.6.1,<27.0.0" PyPDF2 = "^3.0.1" PySocks = "^1.7.1" -python-bidi = "^0.6.3" +python-bidi = {version = "^0.6.3", optional = true} requests = "^2.32.4" requests-futures = "^1.0.2" requests-toolbelt = "^1.0.0" @@ -69,7 +69,7 @@ torrequest = "^0.1.0" alive_progress = "^3.2.0" typing-extensions = "^4.14.1" webencodings = "^0.5.1" -xhtml2pdf = "^0.2.11" +xhtml2pdf = {version = "^0.2.11", optional = true} XMind = "^1.2.0" yarl = "^1.20.1" networkx = "^2.6.3" @@ -82,6 +82,17 @@ platformdirs = "^4.3.8" curl-cffi = ">=0.14,<1.0" +[tool.poetry.extras] +# Install PDF support with: pip install 'maigret[pdf]' +# Skipped by default because the underlying `pycairo` has no Linux/macOS +# wheels on PyPI and requires system libcairo + pkg-config to build. +# arabic-reshaper and python-bidi are pulled in too — they're only used +# by xhtml2pdf (RTL text shaping in PDFs), nothing in maigret core touches +# them, and python-bidi v0.5+ is a Rust binding that can need cargo on +# niche platforms. 
+pdf = ["xhtml2pdf", "arabic-reshaper", "python-bidi"] + + [tool.poetry.group.dev.dependencies] # How to add a new dev dependency: poetry add black --group dev # Install dev dependencies with: poetry install --with dev @@ -92,6 +103,9 @@ pytest-cov = ">=6,<8" pytest-httpserver = "^1.0.0" pytest-rerunfailures = ">=15.1,<17.0" reportlab = "^4.4.3" +xhtml2pdf = "^0.2.11" +arabic-reshaper = "^3.0.0" +python-bidi = "^0.6.3" mypy = ">=1.14.1,<3.0.0" tuna = "^0.5.11" coverage = "^7.9.2" diff --git a/tests/test_idempotent_writes.py b/tests/test_idempotent_writes.py new file mode 100644 index 0000000..95ebc5e --- /dev/null +++ b/tests/test_idempotent_writes.py @@ -0,0 +1,223 @@ +"""Tests for the 'don't rewrite files unless content actually changed' logic +in utils.generate_db_meta and utils.update_site_data. The point is to keep +sites.md and db_meta.json untouched when only the embedded timestamp/date +would change — so a precommit hook doesn't end up staging a no-op diff +every time someone runs the updater. 
+""" + +import json +from datetime import datetime, timezone + +from utils.generate_db_meta import ( + build_meta, + meta_payload_equals, + write_meta_if_changed, +) +from utils.update_site_data import ( + sites_md_payload_equals, + write_sites_md_if_changed, +) + + +# --------------------------------------------------------------------------- +# generate_db_meta +# --------------------------------------------------------------------------- + + +def _write_data_json(path, sites): + with open(path, "w", encoding="utf-8") as f: + json.dump({"sites": sites}, f) + + +def test_meta_payload_equals_ignores_timestamp(): + a = {"sites_count": 10, "data_sha256": "abc", "updated_at": "2026-01-01T00:00:00Z"} + b = {"sites_count": 10, "data_sha256": "abc", "updated_at": "2027-12-31T23:59:59Z"} + assert meta_payload_equals(a, b) + + +def test_meta_payload_equals_detects_real_change(): + a = {"sites_count": 10, "data_sha256": "abc", "updated_at": "2026-01-01T00:00:00Z"} + b = {"sites_count": 11, "data_sha256": "abc", "updated_at": "2026-01-01T00:00:00Z"} + assert not meta_payload_equals(a, b) + + +def test_write_meta_creates_file_when_missing(tmp_path): + data_path = tmp_path / "data.json" + meta_path = tmp_path / "db_meta.json" + _write_data_json(data_path, {"GitHub": {}}) + + meta, written = write_meta_if_changed( + str(data_path), str(meta_path), "0.6.0", "https://example/data.json" + ) + + assert written is True + assert meta_path.exists() + on_disk = json.loads(meta_path.read_text()) + assert on_disk["sites_count"] == 1 + assert on_disk["updated_at"] == meta["updated_at"] + + +def test_write_meta_skips_when_only_timestamp_would_change(tmp_path): + data_path = tmp_path / "data.json" + meta_path = tmp_path / "db_meta.json" + _write_data_json(data_path, {"GitHub": {}}) + + # First write seeds the file with an old timestamp. 
+ old = datetime(2026, 1, 1, tzinfo=timezone.utc) + _, written_first = write_meta_if_changed( + str(data_path), str(meta_path), "0.6.0", "https://example/data.json", now=old + ) + assert written_first is True + seeded_bytes = meta_path.read_bytes() + + # Second call with a NEW `now` but identical data.json — must be a no-op. + new = datetime(2027, 6, 15, tzinfo=timezone.utc) + _, written_second = write_meta_if_changed( + str(data_path), str(meta_path), "0.6.0", "https://example/data.json", now=new + ) + assert written_second is False + # File on disk is byte-for-byte the same — including the OLD timestamp. + assert meta_path.read_bytes() == seeded_bytes + on_disk = json.loads(meta_path.read_text()) + assert on_disk["updated_at"] == "2026-01-01T00:00:00Z" + + +def test_write_meta_writes_when_data_sha256_changes(tmp_path): + data_path = tmp_path / "data.json" + meta_path = tmp_path / "db_meta.json" + + _write_data_json(data_path, {"GitHub": {}}) + write_meta_if_changed( + str(data_path), + str(meta_path), + "0.6.0", + "https://example/data.json", + now=datetime(2026, 1, 1, tzinfo=timezone.utc), + ) + + # Real change to data.json — sha256 + sites_count both move. 
+ _write_data_json(data_path, {"GitHub": {}, "GitLab": {}}) + new_now = datetime(2027, 6, 15, tzinfo=timezone.utc) + meta, written = write_meta_if_changed( + str(data_path), str(meta_path), "0.6.0", "https://example/data.json", now=new_now + ) + + assert written is True + on_disk = json.loads(meta_path.read_text()) + assert on_disk["sites_count"] == 2 + assert on_disk["updated_at"] == "2027-06-15T00:00:00Z" + + +def test_write_meta_writes_when_min_version_changes(tmp_path): + data_path = tmp_path / "data.json" + meta_path = tmp_path / "db_meta.json" + _write_data_json(data_path, {"GitHub": {}}) + + write_meta_if_changed( + str(data_path), + str(meta_path), + "0.5.0", + "https://example/data.json", + now=datetime(2026, 1, 1, tzinfo=timezone.utc), + ) + + _, written = write_meta_if_changed( + str(data_path), + str(meta_path), + "0.6.0", # bumped + "https://example/data.json", + now=datetime(2026, 1, 2, tzinfo=timezone.utc), + ) + + assert written is True + on_disk = json.loads(meta_path.read_text()) + assert on_disk["min_maigret_version"] == "0.6.0" + + +def test_write_meta_writes_when_existing_file_is_corrupt(tmp_path): + data_path = tmp_path / "data.json" + meta_path = tmp_path / "db_meta.json" + _write_data_json(data_path, {"GitHub": {}}) + meta_path.write_text("this is not valid json") + + _, written = write_meta_if_changed( + str(data_path), str(meta_path), "0.6.0", "https://example/data.json" + ) + + assert written is True + json.loads(meta_path.read_text()) # now parseable + + +def test_build_meta_uses_provided_now(tmp_path): + data_path = tmp_path / "data.json" + _write_data_json(data_path, {"GitHub": {}}) + fixed = datetime(2030, 7, 4, 12, 0, 0, tzinfo=timezone.utc) + + meta = build_meta(str(data_path), "0.6.0", "https://example/data.json", now=fixed) + + assert meta["updated_at"] == "2030-07-04T12:00:00Z" + + +# --------------------------------------------------------------------------- +# update_site_data +# 
--------------------------------------------------------------------------- + + +_SITES_MD_TEMPLATE = ( + "## List of supported sites (search methods): total 1\n\n" + "Rank data fetched from Majestic Million by domains.\n\n" + "1. [GitHub](https://github.com/)*: top 100*\n" + "\nThe list was updated at ({date})\n" + "## Statistics\n\n" + "Some stats.\n" +) + + +def test_sites_md_payload_equals_ignores_date(): + a = _SITES_MD_TEMPLATE.format(date="2026-01-01") + b = _SITES_MD_TEMPLATE.format(date="2027-12-31") + assert sites_md_payload_equals(a, b) + + +def test_sites_md_payload_equals_detects_body_change(): + a = _SITES_MD_TEMPLATE.format(date="2026-01-01") + b = a.replace("GitHub", "GitLab") + assert not sites_md_payload_equals(a, b) + + +def test_write_sites_md_creates_file_when_missing(tmp_path): + target = tmp_path / "sites.md" + content = _SITES_MD_TEMPLATE.format(date="2026-05-15") + + written = write_sites_md_if_changed(content, str(target)) + + assert written is True + assert target.read_text() == content + + +def test_write_sites_md_skips_when_only_date_would_change(tmp_path): + target = tmp_path / "sites.md" + seeded = _SITES_MD_TEMPLATE.format(date="2026-01-01") + target.write_text(seeded) + + # New content has a different date but identical body. + new_content = _SITES_MD_TEMPLATE.format(date="2027-12-31") + written = write_sites_md_if_changed(new_content, str(target)) + + assert written is False + # File untouched, including the OLD date. 
+ assert target.read_text() == seeded + + +def test_write_sites_md_writes_when_body_changes(tmp_path): + target = tmp_path / "sites.md" + target.write_text(_SITES_MD_TEMPLATE.format(date="2026-01-01")) + + new_content = _SITES_MD_TEMPLATE.format(date="2026-01-01").replace( + "GitHub", "GitLab" + ) + written = write_sites_md_if_changed(new_content, str(target)) + + assert written is True + assert "GitLab" in target.read_text() + assert "GitHub" not in target.read_text() diff --git a/tests/test_report.py b/tests/test_report.py index 8e12408..22f1677 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -3,6 +3,9 @@ import copy import json import os +import subprocess +import sys +import textwrap import pytest from io import StringIO @@ -442,6 +445,73 @@ def test_pdf_report(): assert os.path.exists(report_name) +def test_save_pdf_report_raises_helpful_error_without_xhtml2pdf( + monkeypatch, tmp_path +): + # Setting an entry to None makes a subsequent `import` raise ImportError — + # this simulates the optional 'pdf' extra not being installed without + # actually uninstalling xhtml2pdf from the test environment. + monkeypatch.setitem(sys.modules, 'xhtml2pdf', None) + monkeypatch.setitem(sys.modules, 'xhtml2pdf.pisa', None) + + context = generate_report_context(TEST) + target = tmp_path / "report.pdf" + + with pytest.raises(RuntimeError) as excinfo: + save_pdf_report(str(target), context) + + msg = str(excinfo.value) + assert "maigret[pdf]" in msg + assert "pip install" in msg + assert not target.exists() + + +def test_xhtml2pdf_is_not_module_level_dependency(): + # Guard against a regression where someone hoists `import xhtml2pdf` / + # `from xhtml2pdf import pisa` to the top of maigret/report.py — that + # would force every Maigret user to install the optional extra. 
+ import maigret.report as report_module + + module_globals = vars(report_module) + assert 'xhtml2pdf' not in module_globals + assert 'pisa' not in module_globals + + +def test_import_maigret_without_pdf_extras(): + # End-to-end check: spawn a fresh interpreter with every package in the + # [pdf] extra blocked before any maigret module is loaded, and confirm + # the package, the report module, and save_pdf_report itself all import + # cleanly. Mirrors what a user who ran `pip install maigret` (without + # [pdf]) would experience. + code = textwrap.dedent( + """ + import sys + for name in ( + 'xhtml2pdf', 'xhtml2pdf.pisa', + 'arabic_reshaper', + 'bidi', 'bidi.algorithm', + ): + sys.modules[name] = None + + import maigret + import maigret.report + from maigret.report import save_pdf_report + + assert callable(save_pdf_report) + print("OK") + """ + ) + result = subprocess.run( + [sys.executable, "-c", code], + capture_output=True, + text=True, + ) + assert result.returncode == 0, ( + f"stdout={result.stdout!r} stderr={result.stderr!r}" + ) + assert "OK" in result.stdout + + def test_text_report(): context = generate_report_context(TEST) report_text = get_plaintext_report(context) diff --git a/utils/generate_db_meta.py b/utils/generate_db_meta.py index 33a99ba..e839b5b 100644 --- a/utils/generate_db_meta.py +++ b/utils/generate_db_meta.py @@ -4,14 +4,16 @@ import argparse import hashlib import json import os.path as path -import sys from datetime import datetime, timezone +from typing import Optional, Tuple RESOURCES_DIR = path.join(path.dirname(path.dirname(path.abspath(__file__))), "maigret", "resources") DATA_JSON_PATH = path.join(RESOURCES_DIR, "data.json") META_JSON_PATH = path.join(RESOURCES_DIR, "db_meta.json") DEFAULT_DATA_URL = "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json" +_TIMESTAMP_KEY = "updated_at" + def get_current_version(): version_file = path.join(path.dirname(path.dirname(path.abspath(__file__))), "maigret", 
"__version__.py") @@ -22,6 +24,62 @@ def get_current_version(): return "0.0.0" +def build_meta(data_path: str, min_version: str, data_url: str, now: Optional[datetime] = None) -> dict: + """Build a db_meta dict for the given data.json. Does not touch the filesystem.""" + with open(data_path, "rb") as f: + raw = f.read() + data = json.loads(raw) + ts = (now or datetime.now(timezone.utc)).strftime("%Y-%m-%dT%H:%M:%SZ") + return { + "version": 1, + _TIMESTAMP_KEY: ts, + "sites_count": len(data.get("sites", {})), + "min_maigret_version": min_version, + "data_sha256": hashlib.sha256(raw).hexdigest(), + "data_url": data_url, + } + + +def meta_payload_equals(a: dict, b: dict) -> bool: + """Compare two db_meta dicts ignoring the volatile 'updated_at' field.""" + a_clean = {k: v for k, v in a.items() if k != _TIMESTAMP_KEY} + b_clean = {k: v for k, v in b.items() if k != _TIMESTAMP_KEY} + return a_clean == b_clean + + +def _read_meta(meta_path: str) -> Optional[dict]: + try: + with open(meta_path, "r", encoding="utf-8") as f: + return json.load(f) + except (OSError, ValueError): + return None + + +def write_meta_if_changed( + data_path: str, + meta_path: str, + min_version: str, + data_url: str, + now: Optional[datetime] = None, +) -> Tuple[dict, bool]: + """Generate db_meta.json next to data.json. Skip the write entirely if + the only thing that would change is `updated_at` — keeps the file (and + git/precommit hooks) quiet when the underlying site database hasn't + actually moved. + + Returns the meta dict that *would* be written and a bool indicating + whether a write happened. 
+ """ + new_meta = build_meta(data_path, min_version, data_url, now=now) + existing = _read_meta(meta_path) + if existing is not None and meta_payload_equals(existing, new_meta): + return existing, False + + with open(meta_path, "w", encoding="utf-8") as f: + json.dump(new_meta, f, indent=4, ensure_ascii=False) + return new_meta, True + + def main(): parser = argparse.ArgumentParser(description="Generate db_meta.json from data.json") parser.add_argument("--min-version", default=None, help="Minimum compatible maigret version (default: current version)") @@ -29,30 +87,15 @@ def main(): args = parser.parse_args() min_version = args.min_version or get_current_version() + meta, written = write_meta_if_changed(DATA_JSON_PATH, META_JSON_PATH, min_version, args.data_url) - with open(DATA_JSON_PATH, "rb") as f: - raw = f.read() - sha256 = hashlib.sha256(raw).hexdigest() - - data = json.loads(raw) - sites_count = len(data.get("sites", {})) - - meta = { - "version": 1, - "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), - "sites_count": sites_count, - "min_maigret_version": min_version, - "data_sha256": sha256, - "data_url": args.data_url, - } - - with open(META_JSON_PATH, "w", encoding="utf-8") as f: - json.dump(meta, f, indent=4, ensure_ascii=False) - - print(f"Generated {META_JSON_PATH}") - print(f" sites: {sites_count}") - print(f" sha256: {sha256[:16]}...") - print(f" min_version: {min_version}") + if written: + print(f"Generated {META_JSON_PATH}") + else: + print(f"Skipped {META_JSON_PATH}: nothing changed except timestamp") + print(f" sites: {meta['sites_count']}") + print(f" sha256: {meta['data_sha256'][:16]}...") + print(f" min_version: {meta['min_maigret_version']}") if __name__ == "__main__": diff --git a/utils/update_site_data.py b/utils/update_site_data.py index 96af9d6..5d7ecf8 100755 --- a/utils/update_site_data.py +++ b/utils/update_site_data.py @@ -3,6 +3,9 @@ This module generates the listing of supported sites in file `SITES.md` and 
pretty prints file with sites data. """ +import io +import os +import re import sys import socket import requests @@ -13,6 +16,35 @@ from datetime import datetime, timezone from argparse import ArgumentParser, RawDescriptionHelpFormatter from maigret.maigret import MaigretDatabase +from utils.generate_db_meta import write_meta_if_changed + +SITES_MD_DATE_RE = re.compile(r'\nThe list was updated at \(\d{4}-\d{2}-\d{2}\)\n') +SITES_MD_DATE_PLACEHOLDER = '\nThe list was updated at (DATE)\n' + + +def sites_md_payload_equals(a: str, b: str) -> bool: + """Compare two sites.md bodies ignoring the volatile 'updated at' date.""" + return SITES_MD_DATE_RE.sub(SITES_MD_DATE_PLACEHOLDER, a) == SITES_MD_DATE_RE.sub(SITES_MD_DATE_PLACEHOLDER, b) + + +def write_sites_md_if_changed(content: str, path: str) -> bool: + """Write `content` to `path` only if it differs from the existing file + by something other than the 'updated at' date. Returns True if a write + happened. Keeps the precommit hook from rewriting the file when the + site database itself hasn't moved. + """ + if os.path.exists(path): + try: + with open(path, "r", encoding="utf-8") as f: + existing = f.read() + except OSError: + existing = None + if existing is not None and sites_md_payload_equals(existing, content): + return False + + with open(path, "w", encoding="utf-8") as f: + f.write(content) + return True RANKS = {str(i):str(i) for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500]} RANKS.update({ @@ -142,104 +174,105 @@ def main(): print(f"\nUpdating supported sites list (don't worry, it's needed)...") - with open("sites.md", "w") as site_file: - site_file.write(f""" + site_file = io.StringIO() + site_file.write(f""" ## List of supported sites (search methods): total {len(sites_subset)}\n Rank data fetched from Majestic Million by domains. 
""") - if args.dns_check: - print("Checking DNS resolution for all site domains...") - failed = check_sites_dns(sites_subset) - disabled_count = 0 - re_enabled_count = 0 - for site in sites_subset: - if site.name in failed: - if not site.disabled: - site.disabled = True - disabled_count += 1 - print(f" Disabled {site.name}: DNS does not resolve ({get_base_domain(site.url_main)})") - else: - if site.disabled: - # Re-enable previously disabled site if DNS now resolves - # (only if it was likely disabled due to DNS failure) - pass - print(f"DNS check complete: {disabled_count} site(s) disabled, {len(failed)} domain(s) unresolvable.") - - majestic_ranks = {} - if args.with_rank: - majestic_ranks = fetch_majestic_million() - + if args.dns_check: + print("Checking DNS resolution for all site domains...") + failed = check_sites_dns(sites_subset) + disabled_count = 0 + re_enabled_count = 0 for site in sites_subset: - if not args.with_rank: - break - - if site.alexa_rank < sys.maxsize and args.empty_only: - continue - if args.exclude_engine_list and site.engine in args.exclude_engine_list: - continue - - domain = get_base_domain(site.url_main) - - if domain in majestic_ranks: - site.alexa_rank = majestic_ranks[domain] + if site.name in failed: + if not site.disabled: + site.disabled = True + disabled_count += 1 + print(f" Disabled {site.name}: DNS does not resolve ({get_base_domain(site.url_main)})") else: - site.alexa_rank = sys.maxsize - - # In memory matching complete, no threads to join - if args.with_rank: - print("Successfully updated ranks matching Majestic Million dataset.") + if site.disabled: + # Re-enable previously disabled site if DNS now resolves + # (only if it was likely disabled due to DNS failure) + pass + print(f"DNS check complete: {disabled_count} site(s) disabled, {len(failed)} domain(s) unresolvable.") - sites_full_list = [(s, int(s.alexa_rank)) for s in sites_subset] + majestic_ranks = {} + if args.with_rank: + majestic_ranks = 
fetch_majestic_million() - sites_full_list.sort(reverse=False, key=lambda x: x[1]) + for site in sites_subset: + if not args.with_rank: + break - while sites_full_list[0][1] == 0: - site = sites_full_list.pop(0) - sites_full_list.append(site) + if site.alexa_rank < sys.maxsize and args.empty_only: + continue + if args.exclude_engine_list and site.engine in args.exclude_engine_list: + continue - for num, site_tuple in enumerate(sites_full_list): - site, rank = site_tuple - url_main = site.url_main - valid_rank = get_step_rank(rank) - all_tags = site.tags - all_tags.sort() - tags = ', ' + ', '.join(all_tags) if all_tags else '' - note = '' - if site.disabled: - note = ', search is disabled' + domain = get_base_domain(site.url_main) - favicon = f"![](https://www.google.com/s2/favicons?domain={url_main})" - site_file.write(f'1. {favicon} [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n') - db.update_site(site) + if domain in majestic_ranks: + site.alexa_rank = majestic_ranks[domain] + else: + site.alexa_rank = sys.maxsize - site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n') - db.save_to_file(args.base_file) + # In memory matching complete, no threads to join + if args.with_rank: + print("Successfully updated ranks matching Majestic Million dataset.") - # Regenerate db_meta.json to stay in sync with data.json - try: - import hashlib, json, os - db_data_raw = open(args.base_file, 'rb').read() - db_data_parsed = json.loads(db_data_raw) - meta = { - "version": 1, - "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), - "sites_count": len(db_data_parsed.get("sites", {})), - "min_maigret_version": "0.5.0", - "data_sha256": hashlib.sha256(db_data_raw).hexdigest(), - "data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json", - } - meta_path = os.path.join(os.path.dirname(args.base_file), "db_meta.json") - with open(meta_path, "w", encoding="utf-8") as mf: - json.dump(meta, mf, 
indent=4, ensure_ascii=False) + sites_full_list = [(s, int(s.alexa_rank)) for s in sites_subset] + + sites_full_list.sort(reverse=False, key=lambda x: x[1]) + + while sites_full_list[0][1] == 0: + site = sites_full_list.pop(0) + sites_full_list.append(site) + + for num, site_tuple in enumerate(sites_full_list): + site, rank = site_tuple + url_main = site.url_main + valid_rank = get_step_rank(rank) + all_tags = site.tags + all_tags.sort() + tags = ', ' + ', '.join(all_tags) if all_tags else '' + note = '' + if site.disabled: + note = ', search is disabled' + + favicon = f"![](https://www.google.com/s2/favicons?domain={url_main})" + site_file.write(f'1. {favicon} [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n') + db.update_site(site) + + site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n') + db.save_to_file(args.base_file) + + statistics_text = db.get_db_stats(is_markdown=True) + site_file.write('## Statistics\n\n') + site_file.write(statistics_text) + + sites_md_written = write_sites_md_if_changed(site_file.getvalue(), "sites.md") + if not sites_md_written: + print("sites.md unchanged, skipping write") + + # Regenerate db_meta.json to stay in sync with data.json — also a no-op + # if only the timestamp would change. 
+ try: + meta_path = os.path.join(os.path.dirname(args.base_file), "db_meta.json") + meta, meta_written = write_meta_if_changed( + args.base_file, + meta_path, + min_version="0.5.0", + data_url="https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json", + ) + if meta_written: print(f"Updated {meta_path} ({meta['sites_count']} sites)") - except Exception as e: - print(f"Warning: could not regenerate db_meta.json: {e}") - - statistics_text = db.get_db_stats(is_markdown=True) - site_file.write('## Statistics\n\n') - site_file.write(statistics_text) + else: + print(f"{meta_path} unchanged, skipping write") + except Exception as e: + print(f"Warning: could not regenerate db_meta.json: {e}") print("Finished updating supported site listing!")