Mirror of https://github.com/soxoj/maigret.git (synced 2026-05-16 03:15:40 +00:00)
Make xhtml2pdf optional, fix install on Linux without libcairo (#2659)

* Make xhtml2pdf optional, fix install on Linux without libcairo

  Move xhtml2pdf to the new [pdf] extra so a default `pip install maigret` no longer pulls in pycairo (which has no Linux/macOS wheels and breaks the build without libcairo2-dev). save_pdf_report now raises a clear RuntimeError pointing to `pip install 'maigret[pdf]'`, and the CLI turns it into a friendly warning instead of a crash. Adds tests covering the missing-extra path, plus per-OS install docs. Fixes #2657, #2534

* Make arabic-reshaper and python-bidi optional; idempotent updates of db_meta.json and sites.md

* Regenerated poetry.lock

* Update CI workflow to cover minimal installation without PDF deps
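The packaging change itself is not part of the hunks mirrored below (they cover the db_meta.json and sites.md generators), but the guard the message describes follows the usual optional-import pattern. A minimal sketch, assuming maigret keeps the save_pdf_report name from the message; the exact module layout here is an assumption:

```python
# Sketch only: the real maigret code may be structured differently.
try:
    from xhtml2pdf import pisa  # present only with `pip install 'maigret[pdf]'`
except ImportError:
    pisa = None


def save_pdf_report(html_text: str, filename: str) -> None:
    """Render an HTML report to PDF, or fail with an actionable message."""
    if pisa is None:
        raise RuntimeError(
            "PDF report generation requires the optional 'pdf' extra: "
            "pip install 'maigret[pdf]'"
        )
    with open(filename, "wb") as f:
        pisa.CreatePDF(html_text, dest=f)
```

The CLI then catches the RuntimeError and prints it as a warning, so a default install degrades gracefully instead of crashing.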
utils/generate_db_meta.py (+67 -24)
```diff
@@ -4,14 +4,16 @@ import argparse
 import hashlib
 import json
 import os.path as path
 import sys
 from datetime import datetime, timezone
+from typing import Optional, Tuple
 
 RESOURCES_DIR = path.join(path.dirname(path.dirname(path.abspath(__file__))), "maigret", "resources")
 DATA_JSON_PATH = path.join(RESOURCES_DIR, "data.json")
 META_JSON_PATH = path.join(RESOURCES_DIR, "db_meta.json")
 DEFAULT_DATA_URL = "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
 
+_TIMESTAMP_KEY = "updated_at"
+
 
 def get_current_version():
     version_file = path.join(path.dirname(path.dirname(path.abspath(__file__))), "maigret", "__version__.py")
@@ -22,6 +24,62 @@ def get_current_version():
     return "0.0.0"
 
 
+def build_meta(data_path: str, min_version: str, data_url: str, now: Optional[datetime] = None) -> dict:
+    """Build a db_meta dict for the given data.json. Does not touch the filesystem."""
+    with open(data_path, "rb") as f:
+        raw = f.read()
+    data = json.loads(raw)
+    ts = (now or datetime.now(timezone.utc)).strftime("%Y-%m-%dT%H:%M:%SZ")
+    return {
+        "version": 1,
+        _TIMESTAMP_KEY: ts,
+        "sites_count": len(data.get("sites", {})),
+        "min_maigret_version": min_version,
+        "data_sha256": hashlib.sha256(raw).hexdigest(),
+        "data_url": data_url,
+    }
+
+
+def meta_payload_equals(a: dict, b: dict) -> bool:
+    """Compare two db_meta dicts ignoring the volatile 'updated_at' field."""
+    a_clean = {k: v for k, v in a.items() if k != _TIMESTAMP_KEY}
+    b_clean = {k: v for k, v in b.items() if k != _TIMESTAMP_KEY}
+    return a_clean == b_clean
+
+
+def _read_meta(meta_path: str) -> Optional[dict]:
+    try:
+        with open(meta_path, "r", encoding="utf-8") as f:
+            return json.load(f)
+    except (OSError, ValueError):
+        return None
+
+
+def write_meta_if_changed(
+    data_path: str,
+    meta_path: str,
+    min_version: str,
+    data_url: str,
+    now: Optional[datetime] = None,
+) -> Tuple[dict, bool]:
+    """Generate db_meta.json next to data.json. Skip the write entirely if
+    the only thing that would change is `updated_at` — keeps the file (and
+    git/precommit hooks) quiet when the underlying site database hasn't
+    actually moved.
+
+    Returns the meta dict that *would* be written and a bool indicating
+    whether a write happened.
+    """
+    new_meta = build_meta(data_path, min_version, data_url, now=now)
+    existing = _read_meta(meta_path)
+    if existing is not None and meta_payload_equals(existing, new_meta):
+        return existing, False
+
+    with open(meta_path, "w", encoding="utf-8") as f:
+        json.dump(new_meta, f, indent=4, ensure_ascii=False)
+    return new_meta, True
+
+
 def main():
     parser = argparse.ArgumentParser(description="Generate db_meta.json from data.json")
     parser.add_argument("--min-version", default=None, help="Minimum compatible maigret version (default: current version)")
@@ -29,30 +87,15 @@ def main():
     args = parser.parse_args()
 
     min_version = args.min_version or get_current_version()
+    meta, written = write_meta_if_changed(DATA_JSON_PATH, META_JSON_PATH, min_version, args.data_url)
 
-    with open(DATA_JSON_PATH, "rb") as f:
-        raw = f.read()
-    sha256 = hashlib.sha256(raw).hexdigest()
-
-    data = json.loads(raw)
-    sites_count = len(data.get("sites", {}))
-
-    meta = {
-        "version": 1,
-        "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
-        "sites_count": sites_count,
-        "min_maigret_version": min_version,
-        "data_sha256": sha256,
-        "data_url": args.data_url,
-    }
-
-    with open(META_JSON_PATH, "w", encoding="utf-8") as f:
-        json.dump(meta, f, indent=4, ensure_ascii=False)
-
-    print(f"Generated {META_JSON_PATH}")
-    print(f"  sites: {sites_count}")
-    print(f"  sha256: {sha256[:16]}...")
-    print(f"  min_version: {min_version}")
+    if written:
+        print(f"Generated {META_JSON_PATH}")
+    else:
+        print(f"Skipped {META_JSON_PATH}: nothing changed except timestamp")
+    print(f"  sites: {meta['sites_count']}")
+    print(f"  sha256: {meta['data_sha256'][:16]}...")
+    print(f"  min_version: {meta['min_maigret_version']}")
 
 
 if __name__ == "__main__":
```
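The new helper makes the generator safe to run repeatedly. A quick illustration of the contract, using the module path that the sites generator below imports (the min_version value here is arbitrary):

```python
# Run the idempotent writer twice against the same data.json.
from utils.generate_db_meta import (
    DATA_JSON_PATH,
    DEFAULT_DATA_URL,
    META_JSON_PATH,
    write_meta_if_changed,
)

meta, written = write_meta_if_changed(
    DATA_JSON_PATH, META_JSON_PATH, min_version="0.5.0", data_url=DEFAULT_DATA_URL
)

# A second run would only bump `updated_at`, so nothing is written and
# git/pre-commit hooks stay quiet.
meta2, written2 = write_meta_if_changed(
    DATA_JSON_PATH, META_JSON_PATH, min_version="0.5.0", data_url=DEFAULT_DATA_URL
)
assert written2 is False
assert meta2 == meta
```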
Supported-sites listing generator (+116 -83)
```diff
@@ -3,6 +3,9 @@
 This module generates the listing of supported sites in file `SITES.md`
 and pretty prints file with sites data.
 """
+import io
+import os
+import re
 import sys
 import socket
 import requests
@@ -13,6 +16,35 @@ from datetime import datetime, timezone
 from argparse import ArgumentParser, RawDescriptionHelpFormatter
 
 from maigret.maigret import MaigretDatabase
+from utils.generate_db_meta import write_meta_if_changed
+
+SITES_MD_DATE_RE = re.compile(r'\nThe list was updated at \(\d{4}-\d{2}-\d{2}\)\n')
+SITES_MD_DATE_PLACEHOLDER = '\nThe list was updated at (DATE)\n'
+
+
+def sites_md_payload_equals(a: str, b: str) -> bool:
+    """Compare two sites.md bodies ignoring the volatile 'updated at' date."""
+    return SITES_MD_DATE_RE.sub(SITES_MD_DATE_PLACEHOLDER, a) == SITES_MD_DATE_RE.sub(SITES_MD_DATE_PLACEHOLDER, b)
+
+
+def write_sites_md_if_changed(content: str, path: str) -> bool:
+    """Write `content` to `path` only if it differs from the existing file
+    by something other than the 'updated at' date. Returns True if a write
+    happened. Keeps the precommit hook from rewriting the file when the
+    site database itself hasn't moved.
+    """
+    if os.path.exists(path):
+        try:
+            with open(path, "r", encoding="utf-8") as f:
+                existing = f.read()
+        except OSError:
+            existing = None
+        if existing is not None and sites_md_payload_equals(existing, content):
+            return False
+
+    with open(path, "w", encoding="utf-8") as f:
+        f.write(content)
+    return True
 
 RANKS = {str(i):str(i) for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500]}
 RANKS.update({
```
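Concretely, both sides of the comparison are normalized by substituting the date line with a placeholder, so a rerun that only moves the date forward compares equal and skips the write. A tiny self-contained illustration with made-up bodies:

```python
import re

# Same regex and placeholder as in the diff above.
SITES_MD_DATE_RE = re.compile(r'\nThe list was updated at \(\d{4}-\d{2}-\d{2}\)\n')
SITES_MD_DATE_PLACEHOLDER = '\nThe list was updated at (DATE)\n'

def normalize(body: str) -> str:
    return SITES_MD_DATE_RE.sub(SITES_MD_DATE_PLACEHOLDER, body)

old_body = "## List of supported sites\n\nThe list was updated at (2024-11-30)\n"
new_body = "## List of supported sites\n\nThe list was updated at (2025-05-16)\n"

# Same payload, different date: normalizes equal, so no rewrite happens.
assert normalize(old_body) == normalize(new_body)

# A genuine change (here, a different heading) still differs after normalization.
changed = "## Supported sites v2\n\nThe list was updated at (2025-05-16)\n"
assert normalize(old_body) != normalize(changed)
```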
```diff
@@ -142,104 +174,105 @@ def main():
 
     print(f"\nUpdating supported sites list (don't worry, it's needed)...")
 
-    with open("sites.md", "w") as site_file:
-        site_file.write(f"""
+    site_file = io.StringIO()
+    site_file.write(f"""
 ## List of supported sites (search methods): total {len(sites_subset)}\n
 Rank data fetched from Majestic Million by domains.
 
 """)
 
-        if args.dns_check:
-            print("Checking DNS resolution for all site domains...")
-            failed = check_sites_dns(sites_subset)
-            disabled_count = 0
-            re_enabled_count = 0
-            for site in sites_subset:
-                if site.name in failed:
-                    if not site.disabled:
-                        site.disabled = True
-                        disabled_count += 1
-                        print(f"  Disabled {site.name}: DNS does not resolve ({get_base_domain(site.url_main)})")
-                else:
-                    if site.disabled:
-                        # Re-enable previously disabled site if DNS now resolves
-                        # (only if it was likely disabled due to DNS failure)
-                        pass
-            print(f"DNS check complete: {disabled_count} site(s) disabled, {len(failed)} domain(s) unresolvable.")
+    if args.dns_check:
+        print("Checking DNS resolution for all site domains...")
+        failed = check_sites_dns(sites_subset)
+        disabled_count = 0
+        re_enabled_count = 0
+        for site in sites_subset:
+            if site.name in failed:
+                if not site.disabled:
+                    site.disabled = True
+                    disabled_count += 1
+                    print(f"  Disabled {site.name}: DNS does not resolve ({get_base_domain(site.url_main)})")
+            else:
+                if site.disabled:
+                    # Re-enable previously disabled site if DNS now resolves
+                    # (only if it was likely disabled due to DNS failure)
+                    pass
+        print(f"DNS check complete: {disabled_count} site(s) disabled, {len(failed)} domain(s) unresolvable.")
 
-        majestic_ranks = {}
-        if args.with_rank:
-            majestic_ranks = fetch_majestic_million()
+    majestic_ranks = {}
+    if args.with_rank:
+        majestic_ranks = fetch_majestic_million()
 
-        for site in sites_subset:
-            if not args.with_rank:
-                break
+    for site in sites_subset:
+        if not args.with_rank:
+            break
 
-            if site.alexa_rank < sys.maxsize and args.empty_only:
-                continue
-            if args.exclude_engine_list and site.engine in args.exclude_engine_list:
-                continue
+        if site.alexa_rank < sys.maxsize and args.empty_only:
+            continue
+        if args.exclude_engine_list and site.engine in args.exclude_engine_list:
+            continue
 
-            domain = get_base_domain(site.url_main)
+        domain = get_base_domain(site.url_main)
 
-            if domain in majestic_ranks:
-                site.alexa_rank = majestic_ranks[domain]
-            else:
-                site.alexa_rank = sys.maxsize
+        if domain in majestic_ranks:
+            site.alexa_rank = majestic_ranks[domain]
+        else:
+            site.alexa_rank = sys.maxsize
 
-        # In memory matching complete, no threads to join
-        if args.with_rank:
-            print("Successfully updated ranks matching Majestic Million dataset.")
+    # In memory matching complete, no threads to join
+    if args.with_rank:
+        print("Successfully updated ranks matching Majestic Million dataset.")
 
-        sites_full_list = [(s, int(s.alexa_rank)) for s in sites_subset]
+    sites_full_list = [(s, int(s.alexa_rank)) for s in sites_subset]
 
-        sites_full_list.sort(reverse=False, key=lambda x: x[1])
+    sites_full_list.sort(reverse=False, key=lambda x: x[1])
 
-        while sites_full_list[0][1] == 0:
-            site = sites_full_list.pop(0)
-            sites_full_list.append(site)
+    while sites_full_list[0][1] == 0:
+        site = sites_full_list.pop(0)
+        sites_full_list.append(site)
 
-        for num, site_tuple in enumerate(sites_full_list):
-            site, rank = site_tuple
-            url_main = site.url_main
-            valid_rank = get_step_rank(rank)
-            all_tags = site.tags
-            all_tags.sort()
-            tags = ', ' + ', '.join(all_tags) if all_tags else ''
-            note = ''
-            if site.disabled:
-                note = ', search is disabled'
+    for num, site_tuple in enumerate(sites_full_list):
+        site, rank = site_tuple
+        url_main = site.url_main
+        valid_rank = get_step_rank(rank)
+        all_tags = site.tags
+        all_tags.sort()
+        tags = ', ' + ', '.join(all_tags) if all_tags else ''
+        note = ''
+        if site.disabled:
+            note = ', search is disabled'
 
-            favicon = f""
-            site_file.write(f'1. {favicon} [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n')
-            db.update_site(site)
+        favicon = f""
+        site_file.write(f'1. {favicon} [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n')
+        db.update_site(site)
 
-        site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n')
-        db.save_to_file(args.base_file)
+    site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n')
+    db.save_to_file(args.base_file)
 
-        # Regenerate db_meta.json to stay in sync with data.json
-        try:
-            import hashlib, json, os
-            db_data_raw = open(args.base_file, 'rb').read()
-            db_data_parsed = json.loads(db_data_raw)
-            meta = {
-                "version": 1,
-                "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
-                "sites_count": len(db_data_parsed.get("sites", {})),
-                "min_maigret_version": "0.5.0",
-                "data_sha256": hashlib.sha256(db_data_raw).hexdigest(),
-                "data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json",
-            }
-            meta_path = os.path.join(os.path.dirname(args.base_file), "db_meta.json")
-            with open(meta_path, "w", encoding="utf-8") as mf:
-                json.dump(meta, mf, indent=4, ensure_ascii=False)
-        except Exception as e:
-            print(f"Warning: could not regenerate db_meta.json: {e}")
+    statistics_text = db.get_db_stats(is_markdown=True)
+    site_file.write('## Statistics\n\n')
+    site_file.write(statistics_text)
 
-        statistics_text = db.get_db_stats(is_markdown=True)
-        site_file.write('## Statistics\n\n')
-        site_file.write(statistics_text)
+    sites_md_written = write_sites_md_if_changed(site_file.getvalue(), "sites.md")
+    if not sites_md_written:
+        print("sites.md unchanged, skipping write")
+
+    # Regenerate db_meta.json to stay in sync with data.json — also a no-op
+    # if only the timestamp would change.
+    try:
+        meta_path = os.path.join(os.path.dirname(args.base_file), "db_meta.json")
+        meta, meta_written = write_meta_if_changed(
+            args.base_file,
+            meta_path,
+            min_version="0.5.0",
+            data_url="https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json",
+        )
+        if meta_written:
+            print(f"Updated {meta_path} ({meta['sites_count']} sites)")
+        else:
+            print(f"{meta_path} unchanged, skipping write")
+    except Exception as e:
+        print(f"Warning: could not regenerate db_meta.json: {e}")
 
     print("Finished updating supported site listing!")
```
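The shape of the reworked main() in miniature: render the whole listing into an in-memory buffer first, then let the comparison helpers decide whether anything touches disk. A condensed sketch; the site names and URLs are made up, and write_sites_md_if_changed is the new helper from the diff above:

```python
import io
from datetime import datetime, timezone

# Stand-ins for the script's real site list.
sites = [("GitHub", "https://github.com"), ("GitLab", "https://gitlab.com")]

buf = io.StringIO()
buf.write(f"## List of supported sites (search methods): total {len(sites)}\n")
for name, url in sites:
    buf.write(f"1. [{name}]({url})\n")
buf.write(f"\nThe list was updated at ({datetime.now(timezone.utc).date()})\n")

# Only the final comparison decides whether sites.md is rewritten.
if write_sites_md_if_changed(buf.getvalue(), "sites.md"):
    print("sites.md updated")
else:
    print("sites.md unchanged, skipping write")
```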