From 269d50eedcbfed22f8e7b3290ceb65b5ffb330cb Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Sat, 4 Apr 2026 18:00:50 +0200 Subject: [PATCH] DB update mechanism (#2458) * Database update mechanism --- .githooks/pre-commit | 6 +- .github/workflows/update-site-data.yml | 3 + docs/source/settings.rst | 74 ++++++ maigret/db_updater.py | 330 +++++++++++++++++++++++++ maigret/maigret.py | 42 +++- maigret/resources/data.json | 1 - maigret/resources/db_meta.json | 8 + maigret/resources/settings.json | 5 +- maigret/settings.py | 3 + tests/test_cli.py | 2 + tests/test_db_updater.py | 233 +++++++++++++++++ utils/generate_db_meta.py | 59 +++++ utils/update_site_data.py | 7 + 13 files changed, 766 insertions(+), 7 deletions(-) create mode 100644 maigret/db_updater.py create mode 100644 maigret/resources/db_meta.json create mode 100644 tests/test_db_updater.py create mode 100644 utils/generate_db_meta.py diff --git a/.githooks/pre-commit b/.githooks/pre-commit index 207c24a..7a9bbe1 100755 --- a/.githooks/pre-commit +++ b/.githooks/pre-commit @@ -1,3 +1,7 @@ #!/bin/sh echo 'Activating update_sitesmd hook script...' -poetry run update_sitesmd \ No newline at end of file +poetry run update_sitesmd + +echo 'Regenerating db_meta.json...' +python3 utils/generate_db_meta.py +git add maigret/resources/db_meta.json \ No newline at end of file diff --git a/.github/workflows/update-site-data.yml b/.github/workflows/update-site-data.yml index 1b5b2ea..3f0d24c 100644 --- a/.github/workflows/update-site-data.yml +++ b/.github/workflows/update-site-data.yml @@ -27,6 +27,9 @@ jobs: pip3 install . python3 ./utils/update_site_data.py --empty-only + - name: Regenerate db_meta.json + run: python3 utils/generate_db_meta.py + - name: Remove ambiguous main tag run: git tag -d main || true diff --git a/docs/source/settings.rst b/docs/source/settings.rst index ade3a2a..dc36000 100644 --- a/docs/source/settings.rst +++ b/docs/source/settings.rst @@ -27,3 +27,77 @@ Missing any of these files is not an error. If the next settings file contains already known option, this option will be rewrited. So it is possible to make custom configuration for different users and directories. + +.. _database-auto-update: + +Database auto-update +-------------------- + +Maigret ships with a bundled site database, but it gets outdated between releases. To keep the database current, Maigret automatically checks for updates on startup. + +**How it works:** + +1. On startup, Maigret checks if more than 24 hours have passed since the last update check. +2. If so, it fetches a lightweight metadata file (~200 bytes) from GitHub to see if a newer database is available. +3. If a newer, compatible database exists, Maigret downloads it to ``~/.maigret/data.json`` and uses it instead of the bundled copy. +4. If the download fails or the new database is incompatible with your Maigret version, the bundled database is used as a fallback. + +The downloaded database has **higher priority** than the bundled one — it replaces, not overlays. + +**Status messages** are printed only when an action occurs: + +.. code-block:: text + + [*] DB auto-update: checking for updates... + [+] DB auto-update: database updated successfully (3180 sites) + [*] DB auto-update: database is up to date (3157 sites) + [!] DB auto-update: latest database requires maigret >= 0.6.0, you have 0.5.0 + +**Forcing an update:** + +Use the ``--force-update`` flag to check for updates immediately, ignoring the check interval: + +.. code-block:: console + + maigret username --force-update + +The update happens at startup, then the search continues normally with the freshly downloaded database. + +**Disabling auto-update:** + +Use the ``--no-autoupdate`` flag to skip the update check entirely: + +.. code-block:: console + + maigret username --no-autoupdate + +Or set it permanently in ``~/.maigret/settings.json``: + +.. code-block:: json + + { + "no_autoupdate": true + } + +This is recommended for **Docker containers**, **CI pipelines**, and **air-gapped environments**. + +**Configuration options** (in ``settings.json``): + +.. list-table:: + :header-rows: 1 + :widths: 35 15 50 + + * - Setting + - Default + - Description + * - ``no_autoupdate`` + - ``false`` + - Disable auto-update entirely + * - ``autoupdate_check_interval_hours`` + - ``24`` + - How often to check for updates (in hours) + * - ``db_update_meta_url`` + - GitHub raw URL + - URL of the metadata file (for custom mirrors) + +**Using a custom database** with ``--db`` always skips auto-update — you are explicitly choosing your data source. diff --git a/maigret/db_updater.py b/maigret/db_updater.py new file mode 100644 index 0000000..dc8e6bc --- /dev/null +++ b/maigret/db_updater.py @@ -0,0 +1,330 @@ +""" +Database auto-update logic for maigret. + +Checks a lightweight meta file to determine if a newer site database is available, +downloads it if compatible, and caches it locally in ~/.maigret/. +""" + +import hashlib +import json +import logging +import os +import os.path as path +import tempfile +from datetime import datetime, timezone +from typing import Optional + +import requests +from colorama import Fore, Style + +from .__version__ import __version__ + +logger = logging.getLogger("maigret") + +_use_color = True + + +def _print_info(msg: str) -> None: + text = f"[*] {msg}" + if _use_color: + print(Style.BRIGHT + Fore.GREEN + text + Style.RESET_ALL) + else: + print(text) + + +def _print_success(msg: str) -> None: + text = f"[+] {msg}" + if _use_color: + print(Style.BRIGHT + Fore.GREEN + text + Style.RESET_ALL) + else: + print(text) + + +def _print_warning(msg: str) -> None: + text = f"[!] {msg}" + if _use_color: + print(Style.BRIGHT + Fore.YELLOW + text + Style.RESET_ALL) + else: + print(text) + + +DEFAULT_META_URL = ( + "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json" +) +DEFAULT_CHECK_INTERVAL_HOURS = 24 +MAIGRET_HOME = path.expanduser("~/.maigret") +CACHED_DB_PATH = path.join(MAIGRET_HOME, "data.json") +STATE_PATH = path.join(MAIGRET_HOME, "autoupdate_state.json") +BUNDLED_DB_PATH = path.join(path.dirname(path.realpath(__file__)), "resources", "data.json") + + +def _parse_version(version_str: str) -> tuple: + """Parse a version string like '0.5.0' into a comparable tuple (0, 5, 0).""" + try: + return tuple(int(x) for x in version_str.strip().split(".")) + except (ValueError, AttributeError): + return (0, 0, 0) + + +def _ensure_maigret_home() -> None: + os.makedirs(MAIGRET_HOME, exist_ok=True) + + +def _load_state() -> dict: + try: + with open(STATE_PATH, "r", encoding="utf-8") as f: + return json.load(f) + except (FileNotFoundError, json.JSONDecodeError, OSError): + return {} + + +def _save_state(state: dict) -> None: + _ensure_maigret_home() + tmp_path = STATE_PATH + ".tmp" + try: + with open(tmp_path, "w", encoding="utf-8") as f: + json.dump(state, f, indent=2, ensure_ascii=False) + os.replace(tmp_path, STATE_PATH) + except OSError: + try: + os.unlink(tmp_path) + except OSError: + pass + + +def _needs_check(state: dict, interval_hours: int) -> bool: + last_check = state.get("last_check_at") + if not last_check: + return True + try: + last_dt = datetime.fromisoformat(last_check.replace("Z", "+00:00")) + elapsed = (datetime.now(timezone.utc) - last_dt).total_seconds() / 3600 + return elapsed >= interval_hours + except (ValueError, TypeError): + return True + + +def _fetch_meta(meta_url: str, timeout: int = 10) -> Optional[dict]: + try: + response = requests.get(meta_url, timeout=timeout) + if response.status_code == 200: + return response.json() + except Exception: + pass + return None + + +def _is_version_compatible(meta: dict) -> bool: + min_ver = meta.get("min_maigret_version", "0.0.0") + return _parse_version(__version__) >= _parse_version(min_ver) + + +def _is_update_available(meta: dict, state: dict) -> bool: + if not path.isfile(CACHED_DB_PATH): + return True + remote_date = meta.get("updated_at", "") + cached_date = state.get("last_meta", {}).get("updated_at", "") + return remote_date > cached_date + + +def _download_and_verify(data_url: str, expected_sha256: str, timeout: int = 60) -> Optional[str]: + _ensure_maigret_home() + tmp_fd, tmp_path = tempfile.mkstemp(dir=MAIGRET_HOME, suffix=".json") + try: + response = requests.get(data_url, timeout=timeout) + if response.status_code != 200: + return None + + content = response.content + actual_sha256 = hashlib.sha256(content).hexdigest() + if actual_sha256 != expected_sha256: + _print_warning("DB auto-update: SHA-256 mismatch, download rejected") + return None + + # Validate JSON structure + data = json.loads(content) + if not all(k in data for k in ("sites", "engines", "tags")): + _print_warning("DB auto-update: invalid database structure") + return None + + os.write(tmp_fd, content) + os.close(tmp_fd) + tmp_fd = None + os.replace(tmp_path, CACHED_DB_PATH) + return CACHED_DB_PATH + except Exception: + return None + finally: + if tmp_fd is not None: + os.close(tmp_fd) + try: + os.unlink(tmp_path) + except OSError: + pass + + +def _best_local() -> str: + """Return cached DB if it exists and is valid, otherwise bundled.""" + if path.isfile(CACHED_DB_PATH): + try: + with open(CACHED_DB_PATH, "r", encoding="utf-8") as f: + data = json.load(f) + if "sites" in data: + return CACHED_DB_PATH + except (json.JSONDecodeError, OSError): + pass + return BUNDLED_DB_PATH + + +def _now_iso() -> str: + return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + + +def resolve_db_path( + db_file_arg: str, + no_autoupdate: bool = False, + meta_url: str = DEFAULT_META_URL, + check_interval_hours: int = DEFAULT_CHECK_INTERVAL_HOURS, + color: bool = True, +) -> str: + """ + Determine which database file to use, potentially downloading an update. + + Returns the path to the database file that should be loaded. + """ + global _use_color + _use_color = color + + default_db_name = "resources/data.json" + + # User specified a custom DB — skip auto-update + is_url = db_file_arg.startswith("http://") or db_file_arg.startswith("https://") + is_default = db_file_arg == default_db_name + if is_url: + return db_file_arg + if not is_default: + return path.join(path.dirname(path.realpath(__file__)), db_file_arg) + + # Auto-update disabled + if no_autoupdate: + return _best_local() + + # Check interval + _ensure_maigret_home() + state = _load_state() + if not _needs_check(state, check_interval_hours): + return _best_local() + + # Time to check + _print_info("DB auto-update: checking for updates...") + meta = _fetch_meta(meta_url) + if meta is None: + _print_warning("DB auto-update: could not reach update server, using local database") + state["last_check_at"] = _now_iso() + _save_state(state) + return _best_local() + + # Version compatibility + if not _is_version_compatible(meta): + min_ver = meta.get("min_maigret_version", "?") + _print_warning( + f"DB auto-update: latest database requires maigret >= {min_ver}, " + f"you have {__version__}. Please upgrade with: pip install -U maigret" + ) + state["last_check_at"] = _now_iso() + _save_state(state) + return _best_local() + + # Check if update available + if not _is_update_available(meta, state): + sites_count = meta.get("sites_count", "?") + _print_info(f"DB auto-update: database is up to date ({sites_count} sites)") + state["last_check_at"] = _now_iso() + state["last_meta"] = meta + _save_state(state) + return _best_local() + + # Download update + new_count = meta.get("sites_count", "?") + old_count = state.get("last_meta", {}).get("sites_count") + if old_count: + _print_info(f"DB auto-update: downloading updated database ({new_count} sites, was {old_count})...") + else: + _print_info(f"DB auto-update: downloading database ({new_count} sites)...") + + data_url = meta.get("data_url", "") + expected_sha = meta.get("data_sha256", "") + result = _download_and_verify(data_url, expected_sha) + + if result is None: + _print_warning("DB auto-update: download failed, using local database") + state["last_check_at"] = _now_iso() + _save_state(state) + return _best_local() + + _print_success(f"DB auto-update: database updated successfully ({new_count} sites)") + state["last_check_at"] = _now_iso() + state["last_meta"] = meta + state["cached_db_sha256"] = expected_sha + _save_state(state) + return CACHED_DB_PATH + + +def force_update( + meta_url: str = DEFAULT_META_URL, + color: bool = True, +) -> bool: + """ + Force check for database updates and download if available. + + Returns True if database was updated, False otherwise. + """ + global _use_color + _use_color = color + + _ensure_maigret_home() + + _print_info("DB update: checking for updates...") + meta = _fetch_meta(meta_url) + if meta is None: + _print_warning("DB update: could not reach update server") + return False + + if not _is_version_compatible(meta): + min_ver = meta.get("min_maigret_version", "?") + _print_warning( + f"DB update: latest database requires maigret >= {min_ver}, " + f"you have {__version__}. Please upgrade with: pip install -U maigret" + ) + return False + + state = _load_state() + new_count = meta.get("sites_count", "?") + old_count = state.get("last_meta", {}).get("sites_count") + + if not _is_update_available(meta, state): + _print_info(f"DB update: database is already up to date ({new_count} sites)") + state["last_check_at"] = _now_iso() + state["last_meta"] = meta + _save_state(state) + return False + + if old_count: + _print_info(f"DB update: downloading updated database ({new_count} sites, was {old_count})...") + else: + _print_info(f"DB update: downloading database ({new_count} sites)...") + + data_url = meta.get("data_url", "") + expected_sha = meta.get("data_sha256", "") + result = _download_and_verify(data_url, expected_sha) + + if result is None: + _print_warning("DB update: download failed") + return False + + _print_success(f"DB update: database updated successfully ({new_count} sites)") + state["last_check_at"] = _now_iso() + state["last_meta"] = meta + state["cached_db_sha256"] = expected_sha + _save_state(state) + return True diff --git a/maigret/maigret.py b/maigret/maigret.py index dc1235e..3a191bf 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -201,6 +201,20 @@ def setup_arguments_parser(settings: Settings): default=settings.sites_db_path, help="Load Maigret database from a JSON file or HTTP web resource.", ) + parser.add_argument( + "--no-autoupdate", + action="store_true", + dest="no_autoupdate", + default=settings.no_autoupdate, + help="Disable automatic database updates on startup.", + ) + parser.add_argument( + "--force-update", + action="store_true", + dest="force_update", + default=False, + help="Force check for database updates and download if available.", + ) parser.add_argument( "--cookies-jar-file", metavar="COOKIE_FILE", @@ -543,9 +557,21 @@ async def main(): else: args.exclude_tags = [] - db_file = args.db_file \ - if (args.db_file.startswith("http://") or args.db_file.startswith("https://")) \ - else path.join(path.dirname(path.realpath(__file__)), args.db_file) + from .db_updater import resolve_db_path, force_update, BUNDLED_DB_PATH + + if args.force_update: + force_update( + meta_url=settings.db_update_meta_url, + color=not args.no_color, + ) + + db_file = resolve_db_path( + db_file_arg=args.db_file, + no_autoupdate=args.no_autoupdate or args.force_update, + meta_url=settings.db_update_meta_url, + check_interval_hours=settings.autoupdate_check_interval_hours, + color=not args.no_color, + ) if args.top_sites == 0 or args.all_sites: args.top_sites = sys.maxsize @@ -560,7 +586,15 @@ async def main(): ) # Create object with all information about sites we are aware of. - db = MaigretDatabase().load_from_path(db_file) + try: + db = MaigretDatabase().load_from_path(db_file) + except Exception as e: + logger.warning(f"Failed to load database from {db_file}: {e}") + if db_file != BUNDLED_DB_PATH: + logger.warning("Falling back to bundled database") + db = MaigretDatabase().load_from_path(BUNDLED_DB_PATH) + else: + raise get_top_sites_for_id = lambda x: db.ranked_sites_dict( top=args.top_sites, tags=args.tags, diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 2a10a23..72dce22 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -29262,7 +29262,6 @@ "usernameClaimed": "alex", "usernameUnclaimed": "noonewouldeverusethis7" }, - "izmailonline.com": { "tags": [ "ua" diff --git a/maigret/resources/db_meta.json b/maigret/resources/db_meta.json new file mode 100644 index 0000000..2d1d487 --- /dev/null +++ b/maigret/resources/db_meta.json @@ -0,0 +1,8 @@ +{ + "version": 1, + "updated_at": "2026-04-04T15:54:23Z", + "sites_count": 3157, + "min_maigret_version": "0.5.0", + "data_sha256": "880a56363cf5d71e13ca389330388fbc4796bff50d6e207a056112c4a5606f83", + "data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json" +} \ No newline at end of file diff --git a/maigret/resources/settings.json b/maigret/resources/settings.json index 82f90f7..d17ec6c 100644 --- a/maigret/resources/settings.json +++ b/maigret/resources/settings.json @@ -54,5 +54,8 @@ "graph_report": false, "pdf_report": false, "html_report": false, - "web_interface_port": 5000 + "web_interface_port": 5000, + "no_autoupdate": false, + "db_update_meta_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json", + "autoupdate_check_interval_hours": 24 } \ No newline at end of file diff --git a/maigret/settings.py b/maigret/settings.py index 5c927fa..d37a7a3 100644 --- a/maigret/settings.py +++ b/maigret/settings.py @@ -43,6 +43,9 @@ class Settings: html_report: bool graph_report: bool web_interface_port: int + no_autoupdate: bool + db_update_meta_url: str + autoupdate_check_interval_hours: int # submit mode settings presence_strings: list diff --git a/tests/test_cli.py b/tests/test_cli.py index 1c69525..3d58936 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -48,6 +48,8 @@ DEFAULT_ARGS: Dict[str, Any] = { 'web': None, 'with_domains': False, 'xmind': False, + 'no_autoupdate': False, + 'force_update': False, } diff --git a/tests/test_db_updater.py b/tests/test_db_updater.py new file mode 100644 index 0000000..a5cf8d1 --- /dev/null +++ b/tests/test_db_updater.py @@ -0,0 +1,233 @@ +"""Tests for the database auto-update system.""" + +import json +import os +import hashlib +from datetime import datetime, timezone, timedelta +from unittest.mock import patch, MagicMock + +import pytest + +from maigret.db_updater import ( + _parse_version, + _needs_check, + _is_version_compatible, + _is_update_available, + _load_state, + _save_state, + _best_local, + _now_iso, + resolve_db_path, + force_update, + CACHED_DB_PATH, + BUNDLED_DB_PATH, + STATE_PATH, + MAIGRET_HOME, +) + + +def test_parse_version(): + assert _parse_version("0.5.0") == (0, 5, 0) + assert _parse_version("1.2.3") == (1, 2, 3) + assert _parse_version("bad") == (0, 0, 0) + assert _parse_version("") == (0, 0, 0) + + +def test_needs_check_no_state(): + assert _needs_check({}, 24) is True + + +def test_needs_check_recent(): + state = {"last_check_at": _now_iso()} + assert _needs_check(state, 24) is False + + +def test_needs_check_expired(): + old_time = (datetime.now(timezone.utc) - timedelta(hours=25)).strftime("%Y-%m-%dT%H:%M:%SZ") + state = {"last_check_at": old_time} + assert _needs_check(state, 24) is True + + +def test_needs_check_corrupt(): + state = {"last_check_at": "not-a-date"} + assert _needs_check(state, 24) is True + + +def test_version_compatible(): + with patch("maigret.db_updater.__version__", "0.5.0"): + assert _is_version_compatible({"min_maigret_version": "0.5.0"}) is True + assert _is_version_compatible({"min_maigret_version": "0.4.0"}) is True + assert _is_version_compatible({"min_maigret_version": "0.6.0"}) is False + assert _is_version_compatible({}) is True # missing field = compatible + + +def test_update_available_no_cache(tmp_path): + with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "nonexistent.json")): + assert _is_update_available({"updated_at": "2026-01-01T00:00:00Z"}, {}) is True + + +def test_update_available_newer(tmp_path): + cache = tmp_path / "data.json" + cache.write_text("{}") + with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)): + state = {"last_meta": {"updated_at": "2026-01-01T00:00:00Z"}} + meta = {"updated_at": "2026-02-01T00:00:00Z"} + assert _is_update_available(meta, state) is True + + +def test_update_available_same(tmp_path): + cache = tmp_path / "data.json" + cache.write_text("{}") + with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)): + state = {"last_meta": {"updated_at": "2026-01-01T00:00:00Z"}} + meta = {"updated_at": "2026-01-01T00:00:00Z"} + assert _is_update_available(meta, state) is False + + +def test_load_state_missing(tmp_path): + with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "missing.json")): + assert _load_state() == {} + + +def test_load_state_corrupt(tmp_path): + corrupt = tmp_path / "state.json" + corrupt.write_text("not json{{{") + with patch("maigret.db_updater.STATE_PATH", str(corrupt)): + assert _load_state() == {} + + +def test_save_and_load_state(tmp_path): + state_file = tmp_path / "state.json" + with patch("maigret.db_updater.STATE_PATH", str(state_file)): + with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)): + _save_state({"last_check_at": "2026-01-01T00:00:00Z"}) + loaded = _load_state() + assert loaded["last_check_at"] == "2026-01-01T00:00:00Z" + + +def test_best_local_with_valid_cache(tmp_path): + cache = tmp_path / "data.json" + cache.write_text('{"sites": {}, "engines": {}, "tags": []}') + with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)): + assert _best_local() == str(cache) + + +def test_best_local_with_corrupt_cache(tmp_path): + cache = tmp_path / "data.json" + cache.write_text("not json") + with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)): + assert _best_local() == BUNDLED_DB_PATH + + +def test_best_local_no_cache(tmp_path): + with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")): + assert _best_local() == BUNDLED_DB_PATH + + +def test_resolve_db_path_custom_url(): + result = resolve_db_path("https://example.com/db.json") + assert result == "https://example.com/db.json" + + +def test_resolve_db_path_custom_file(): + result = resolve_db_path("custom/path.json") + assert result.endswith("custom/path.json") + + +def test_resolve_db_path_no_autoupdate(tmp_path): + with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")): + result = resolve_db_path("resources/data.json", no_autoupdate=True) + assert result == BUNDLED_DB_PATH + + +def test_resolve_db_path_no_autoupdate_with_cache(tmp_path): + cache = tmp_path / "data.json" + cache.write_text('{"sites": {}, "engines": {}, "tags": []}') + with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)): + result = resolve_db_path("resources/data.json", no_autoupdate=True) + assert result == str(cache) + + +@patch("maigret.db_updater._fetch_meta") +def test_resolve_db_path_network_failure(mock_fetch, tmp_path): + mock_fetch.return_value = None + with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)): + with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")): + with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")): + result = resolve_db_path("resources/data.json") + assert result == BUNDLED_DB_PATH + + +# --- force_update tests --- + + +@patch("maigret.db_updater._fetch_meta") +def test_force_update_network_failure(mock_fetch, tmp_path): + mock_fetch.return_value = None + with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)): + with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")): + assert force_update() is False + + +@patch("maigret.db_updater._fetch_meta") +def test_force_update_incompatible_version(mock_fetch, tmp_path): + mock_fetch.return_value = {"min_maigret_version": "99.0.0", "sites_count": 100} + with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)): + with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")): + assert force_update() is False + + +@patch("maigret.db_updater._download_and_verify") +@patch("maigret.db_updater._fetch_meta") +def test_force_update_success(mock_fetch, mock_download, tmp_path): + mock_fetch.return_value = { + "min_maigret_version": "0.1.0", + "sites_count": 3200, + "updated_at": "2099-01-01T00:00:00Z", + "data_url": "https://example.com/data.json", + "data_sha256": "abc123", + } + mock_download.return_value = str(tmp_path / "data.json") + with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)): + with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")): + with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")): + assert force_update() is True + state = _load_state() + assert state["last_meta"]["sites_count"] == 3200 + + +@patch("maigret.db_updater._fetch_meta") +def test_force_update_already_up_to_date(mock_fetch, tmp_path): + cache = tmp_path / "data.json" + cache.write_text('{"sites": {}, "engines": {}, "tags": []}') + state_file = tmp_path / "state.json" + state_file.write_text(json.dumps({ + "last_check_at": _now_iso(), + "last_meta": {"updated_at": "2026-01-01T00:00:00Z", "sites_count": 3000}, + })) + mock_fetch.return_value = { + "min_maigret_version": "0.1.0", + "sites_count": 3000, + "updated_at": "2026-01-01T00:00:00Z", + } + with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)): + with patch("maigret.db_updater.STATE_PATH", str(state_file)): + with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)): + assert force_update() is False + + +@patch("maigret.db_updater._download_and_verify") +@patch("maigret.db_updater._fetch_meta") +def test_force_update_download_fails(mock_fetch, mock_download, tmp_path): + mock_fetch.return_value = { + "min_maigret_version": "0.1.0", + "sites_count": 3200, + "updated_at": "2099-01-01T00:00:00Z", + "data_url": "https://example.com/data.json", + "data_sha256": "abc123", + } + mock_download.return_value = None + with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)): + with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")): + with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")): + assert force_update() is False diff --git a/utils/generate_db_meta.py b/utils/generate_db_meta.py new file mode 100644 index 0000000..33a99ba --- /dev/null +++ b/utils/generate_db_meta.py @@ -0,0 +1,59 @@ +"""Generate db_meta.json from data.json for the auto-update system.""" + +import argparse +import hashlib +import json +import os.path as path +import sys +from datetime import datetime, timezone + +RESOURCES_DIR = path.join(path.dirname(path.dirname(path.abspath(__file__))), "maigret", "resources") +DATA_JSON_PATH = path.join(RESOURCES_DIR, "data.json") +META_JSON_PATH = path.join(RESOURCES_DIR, "db_meta.json") +DEFAULT_DATA_URL = "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json" + + +def get_current_version(): + version_file = path.join(path.dirname(path.dirname(path.abspath(__file__))), "maigret", "__version__.py") + with open(version_file) as f: + for line in f: + if line.startswith("__version__"): + return line.split("=")[1].strip().strip("'\"") + return "0.0.0" + + +def main(): + parser = argparse.ArgumentParser(description="Generate db_meta.json from data.json") + parser.add_argument("--min-version", default=None, help="Minimum compatible maigret version (default: current version)") + parser.add_argument("--data-url", default=DEFAULT_DATA_URL, help="URL where data.json can be downloaded") + args = parser.parse_args() + + min_version = args.min_version or get_current_version() + + with open(DATA_JSON_PATH, "rb") as f: + raw = f.read() + sha256 = hashlib.sha256(raw).hexdigest() + + data = json.loads(raw) + sites_count = len(data.get("sites", {})) + + meta = { + "version": 1, + "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"), + "sites_count": sites_count, + "min_maigret_version": min_version, + "data_sha256": sha256, + "data_url": args.data_url, + } + + with open(META_JSON_PATH, "w", encoding="utf-8") as f: + json.dump(meta, f, indent=4, ensure_ascii=False) + + print(f"Generated {META_JSON_PATH}") + print(f" sites: {sites_count}") + print(f" sha256: {sha256[:16]}...") + print(f" min_version: {min_version}") + + +if __name__ == "__main__": + main() diff --git a/utils/update_site_data.py b/utils/update_site_data.py index 3e4f820..7e8b9c2 100755 --- a/utils/update_site_data.py +++ b/utils/update_site_data.py @@ -217,6 +217,13 @@ Rank data fetched from Majestic Million by domains. site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n') db.save_to_file(args.base_file) + # Regenerate db_meta.json to stay in sync with data.json + try: + from generate_db_meta import main as generate_meta + generate_meta() + except Exception as e: + print(f"Warning: could not regenerate db_meta.json: {e}") + statistics_text = db.get_db_stats(is_markdown=True) site_file.write('## Statistics\n\n') site_file.write(statistics_text)