DB update mechanism (#2458)

* Database update mechanism
This commit is contained in:
Soxoj
2026-04-04 18:00:50 +02:00
committed by GitHub
parent e8f4318e5d
commit 269d50eedc
13 changed files with 766 additions and 7 deletions
+5 -1
View File
@@ -1,3 +1,7 @@
#!/bin/sh
echo 'Activating update_sitesmd hook script...'
poetry run update_sitesmd
poetry run update_sitesmd
echo 'Regenerating db_meta.json...'
python3 utils/generate_db_meta.py
git add maigret/resources/db_meta.json
+3
View File
@@ -27,6 +27,9 @@ jobs:
pip3 install .
python3 ./utils/update_site_data.py --empty-only
- name: Regenerate db_meta.json
run: python3 utils/generate_db_meta.py
- name: Remove ambiguous main tag
run: git tag -d main || true
+74
View File
@@ -27,3 +27,77 @@ Missing any of these files is not an error.
If the next settings file contains already known option,
this option will be rewrited. So it is possible to make
custom configuration for different users and directories.
.. _database-auto-update:
Database auto-update
--------------------
Maigret ships with a bundled site database, but it gets outdated between releases. To keep the database current, Maigret automatically checks for updates on startup.
**How it works:**
1. On startup, Maigret checks if more than 24 hours have passed since the last update check.
2. If so, it fetches a lightweight metadata file (~200 bytes) from GitHub to see if a newer database is available.
3. If a newer, compatible database exists, Maigret downloads it to ``~/.maigret/data.json`` and uses it instead of the bundled copy.
4. If the download fails or the new database is incompatible with your Maigret version, the bundled database is used as a fallback.
The downloaded database has **higher priority** than the bundled one — it replaces, not overlays.
**Status messages** are printed only when an action occurs:
.. code-block:: text
[*] DB auto-update: checking for updates...
[+] DB auto-update: database updated successfully (3180 sites)
[*] DB auto-update: database is up to date (3157 sites)
[!] DB auto-update: latest database requires maigret >= 0.6.0, you have 0.5.0
**Forcing an update:**
Use the ``--force-update`` flag to check for updates immediately, ignoring the check interval:
.. code-block:: console
maigret username --force-update
The update happens at startup, then the search continues normally with the freshly downloaded database.
**Disabling auto-update:**
Use the ``--no-autoupdate`` flag to skip the update check entirely:
.. code-block:: console
maigret username --no-autoupdate
Or set it permanently in ``~/.maigret/settings.json``:
.. code-block:: json
{
"no_autoupdate": true
}
This is recommended for **Docker containers**, **CI pipelines**, and **air-gapped environments**.
**Configuration options** (in ``settings.json``):
.. list-table::
:header-rows: 1
:widths: 35 15 50
* - Setting
- Default
- Description
* - ``no_autoupdate``
- ``false``
- Disable auto-update entirely
* - ``autoupdate_check_interval_hours``
- ``24``
- How often to check for updates (in hours)
* - ``db_update_meta_url``
- GitHub raw URL
- URL of the metadata file (for custom mirrors)
**Using a custom database** with ``--db`` always skips auto-update — you are explicitly choosing your data source.
+330
View File
@@ -0,0 +1,330 @@
"""
Database auto-update logic for maigret.
Checks a lightweight meta file to determine if a newer site database is available,
downloads it if compatible, and caches it locally in ~/.maigret/.
"""
import hashlib
import json
import logging
import os
import os.path as path
import tempfile
from datetime import datetime, timezone
from typing import Optional
import requests
from colorama import Fore, Style
from .__version__ import __version__
logger = logging.getLogger("maigret")
_use_color = True
def _print_info(msg: str) -> None:
text = f"[*] {msg}"
if _use_color:
print(Style.BRIGHT + Fore.GREEN + text + Style.RESET_ALL)
else:
print(text)
def _print_success(msg: str) -> None:
text = f"[+] {msg}"
if _use_color:
print(Style.BRIGHT + Fore.GREEN + text + Style.RESET_ALL)
else:
print(text)
def _print_warning(msg: str) -> None:
text = f"[!] {msg}"
if _use_color:
print(Style.BRIGHT + Fore.YELLOW + text + Style.RESET_ALL)
else:
print(text)
DEFAULT_META_URL = (
"https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json"
)
DEFAULT_CHECK_INTERVAL_HOURS = 24
MAIGRET_HOME = path.expanduser("~/.maigret")
CACHED_DB_PATH = path.join(MAIGRET_HOME, "data.json")
STATE_PATH = path.join(MAIGRET_HOME, "autoupdate_state.json")
BUNDLED_DB_PATH = path.join(path.dirname(path.realpath(__file__)), "resources", "data.json")
def _parse_version(version_str: str) -> tuple:
"""Parse a version string like '0.5.0' into a comparable tuple (0, 5, 0)."""
try:
return tuple(int(x) for x in version_str.strip().split("."))
except (ValueError, AttributeError):
return (0, 0, 0)
def _ensure_maigret_home() -> None:
os.makedirs(MAIGRET_HOME, exist_ok=True)
def _load_state() -> dict:
try:
with open(STATE_PATH, "r", encoding="utf-8") as f:
return json.load(f)
except (FileNotFoundError, json.JSONDecodeError, OSError):
return {}
def _save_state(state: dict) -> None:
_ensure_maigret_home()
tmp_path = STATE_PATH + ".tmp"
try:
with open(tmp_path, "w", encoding="utf-8") as f:
json.dump(state, f, indent=2, ensure_ascii=False)
os.replace(tmp_path, STATE_PATH)
except OSError:
try:
os.unlink(tmp_path)
except OSError:
pass
def _needs_check(state: dict, interval_hours: int) -> bool:
last_check = state.get("last_check_at")
if not last_check:
return True
try:
last_dt = datetime.fromisoformat(last_check.replace("Z", "+00:00"))
elapsed = (datetime.now(timezone.utc) - last_dt).total_seconds() / 3600
return elapsed >= interval_hours
except (ValueError, TypeError):
return True
def _fetch_meta(meta_url: str, timeout: int = 10) -> Optional[dict]:
try:
response = requests.get(meta_url, timeout=timeout)
if response.status_code == 200:
return response.json()
except Exception:
pass
return None
def _is_version_compatible(meta: dict) -> bool:
min_ver = meta.get("min_maigret_version", "0.0.0")
return _parse_version(__version__) >= _parse_version(min_ver)
def _is_update_available(meta: dict, state: dict) -> bool:
if not path.isfile(CACHED_DB_PATH):
return True
remote_date = meta.get("updated_at", "")
cached_date = state.get("last_meta", {}).get("updated_at", "")
return remote_date > cached_date
def _download_and_verify(data_url: str, expected_sha256: str, timeout: int = 60) -> Optional[str]:
_ensure_maigret_home()
tmp_fd, tmp_path = tempfile.mkstemp(dir=MAIGRET_HOME, suffix=".json")
try:
response = requests.get(data_url, timeout=timeout)
if response.status_code != 200:
return None
content = response.content
actual_sha256 = hashlib.sha256(content).hexdigest()
if actual_sha256 != expected_sha256:
_print_warning("DB auto-update: SHA-256 mismatch, download rejected")
return None
# Validate JSON structure
data = json.loads(content)
if not all(k in data for k in ("sites", "engines", "tags")):
_print_warning("DB auto-update: invalid database structure")
return None
os.write(tmp_fd, content)
os.close(tmp_fd)
tmp_fd = None
os.replace(tmp_path, CACHED_DB_PATH)
return CACHED_DB_PATH
except Exception:
return None
finally:
if tmp_fd is not None:
os.close(tmp_fd)
try:
os.unlink(tmp_path)
except OSError:
pass
def _best_local() -> str:
"""Return cached DB if it exists and is valid, otherwise bundled."""
if path.isfile(CACHED_DB_PATH):
try:
with open(CACHED_DB_PATH, "r", encoding="utf-8") as f:
data = json.load(f)
if "sites" in data:
return CACHED_DB_PATH
except (json.JSONDecodeError, OSError):
pass
return BUNDLED_DB_PATH
def _now_iso() -> str:
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
def resolve_db_path(
db_file_arg: str,
no_autoupdate: bool = False,
meta_url: str = DEFAULT_META_URL,
check_interval_hours: int = DEFAULT_CHECK_INTERVAL_HOURS,
color: bool = True,
) -> str:
"""
Determine which database file to use, potentially downloading an update.
Returns the path to the database file that should be loaded.
"""
global _use_color
_use_color = color
default_db_name = "resources/data.json"
# User specified a custom DB — skip auto-update
is_url = db_file_arg.startswith("http://") or db_file_arg.startswith("https://")
is_default = db_file_arg == default_db_name
if is_url:
return db_file_arg
if not is_default:
return path.join(path.dirname(path.realpath(__file__)), db_file_arg)
# Auto-update disabled
if no_autoupdate:
return _best_local()
# Check interval
_ensure_maigret_home()
state = _load_state()
if not _needs_check(state, check_interval_hours):
return _best_local()
# Time to check
_print_info("DB auto-update: checking for updates...")
meta = _fetch_meta(meta_url)
if meta is None:
_print_warning("DB auto-update: could not reach update server, using local database")
state["last_check_at"] = _now_iso()
_save_state(state)
return _best_local()
# Version compatibility
if not _is_version_compatible(meta):
min_ver = meta.get("min_maigret_version", "?")
_print_warning(
f"DB auto-update: latest database requires maigret >= {min_ver}, "
f"you have {__version__}. Please upgrade with: pip install -U maigret"
)
state["last_check_at"] = _now_iso()
_save_state(state)
return _best_local()
# Check if update available
if not _is_update_available(meta, state):
sites_count = meta.get("sites_count", "?")
_print_info(f"DB auto-update: database is up to date ({sites_count} sites)")
state["last_check_at"] = _now_iso()
state["last_meta"] = meta
_save_state(state)
return _best_local()
# Download update
new_count = meta.get("sites_count", "?")
old_count = state.get("last_meta", {}).get("sites_count")
if old_count:
_print_info(f"DB auto-update: downloading updated database ({new_count} sites, was {old_count})...")
else:
_print_info(f"DB auto-update: downloading database ({new_count} sites)...")
data_url = meta.get("data_url", "")
expected_sha = meta.get("data_sha256", "")
result = _download_and_verify(data_url, expected_sha)
if result is None:
_print_warning("DB auto-update: download failed, using local database")
state["last_check_at"] = _now_iso()
_save_state(state)
return _best_local()
_print_success(f"DB auto-update: database updated successfully ({new_count} sites)")
state["last_check_at"] = _now_iso()
state["last_meta"] = meta
state["cached_db_sha256"] = expected_sha
_save_state(state)
return CACHED_DB_PATH
def force_update(
meta_url: str = DEFAULT_META_URL,
color: bool = True,
) -> bool:
"""
Force check for database updates and download if available.
Returns True if database was updated, False otherwise.
"""
global _use_color
_use_color = color
_ensure_maigret_home()
_print_info("DB update: checking for updates...")
meta = _fetch_meta(meta_url)
if meta is None:
_print_warning("DB update: could not reach update server")
return False
if not _is_version_compatible(meta):
min_ver = meta.get("min_maigret_version", "?")
_print_warning(
f"DB update: latest database requires maigret >= {min_ver}, "
f"you have {__version__}. Please upgrade with: pip install -U maigret"
)
return False
state = _load_state()
new_count = meta.get("sites_count", "?")
old_count = state.get("last_meta", {}).get("sites_count")
if not _is_update_available(meta, state):
_print_info(f"DB update: database is already up to date ({new_count} sites)")
state["last_check_at"] = _now_iso()
state["last_meta"] = meta
_save_state(state)
return False
if old_count:
_print_info(f"DB update: downloading updated database ({new_count} sites, was {old_count})...")
else:
_print_info(f"DB update: downloading database ({new_count} sites)...")
data_url = meta.get("data_url", "")
expected_sha = meta.get("data_sha256", "")
result = _download_and_verify(data_url, expected_sha)
if result is None:
_print_warning("DB update: download failed")
return False
_print_success(f"DB update: database updated successfully ({new_count} sites)")
state["last_check_at"] = _now_iso()
state["last_meta"] = meta
state["cached_db_sha256"] = expected_sha
_save_state(state)
return True
+38 -4
View File
@@ -201,6 +201,20 @@ def setup_arguments_parser(settings: Settings):
default=settings.sites_db_path,
help="Load Maigret database from a JSON file or HTTP web resource.",
)
parser.add_argument(
"--no-autoupdate",
action="store_true",
dest="no_autoupdate",
default=settings.no_autoupdate,
help="Disable automatic database updates on startup.",
)
parser.add_argument(
"--force-update",
action="store_true",
dest="force_update",
default=False,
help="Force check for database updates and download if available.",
)
parser.add_argument(
"--cookies-jar-file",
metavar="COOKIE_FILE",
@@ -543,9 +557,21 @@ async def main():
else:
args.exclude_tags = []
db_file = args.db_file \
if (args.db_file.startswith("http://") or args.db_file.startswith("https://")) \
else path.join(path.dirname(path.realpath(__file__)), args.db_file)
from .db_updater import resolve_db_path, force_update, BUNDLED_DB_PATH
if args.force_update:
force_update(
meta_url=settings.db_update_meta_url,
color=not args.no_color,
)
db_file = resolve_db_path(
db_file_arg=args.db_file,
no_autoupdate=args.no_autoupdate or args.force_update,
meta_url=settings.db_update_meta_url,
check_interval_hours=settings.autoupdate_check_interval_hours,
color=not args.no_color,
)
if args.top_sites == 0 or args.all_sites:
args.top_sites = sys.maxsize
@@ -560,7 +586,15 @@ async def main():
)
# Create object with all information about sites we are aware of.
db = MaigretDatabase().load_from_path(db_file)
try:
db = MaigretDatabase().load_from_path(db_file)
except Exception as e:
logger.warning(f"Failed to load database from {db_file}: {e}")
if db_file != BUNDLED_DB_PATH:
logger.warning("Falling back to bundled database")
db = MaigretDatabase().load_from_path(BUNDLED_DB_PATH)
else:
raise
get_top_sites_for_id = lambda x: db.ranked_sites_dict(
top=args.top_sites,
tags=args.tags,
-1
View File
@@ -29262,7 +29262,6 @@
"usernameClaimed": "alex",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"izmailonline.com": {
"tags": [
"ua"
+8
View File
@@ -0,0 +1,8 @@
{
"version": 1,
"updated_at": "2026-04-04T15:54:23Z",
"sites_count": 3157,
"min_maigret_version": "0.5.0",
"data_sha256": "880a56363cf5d71e13ca389330388fbc4796bff50d6e207a056112c4a5606f83",
"data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
}
+4 -1
View File
@@ -54,5 +54,8 @@
"graph_report": false,
"pdf_report": false,
"html_report": false,
"web_interface_port": 5000
"web_interface_port": 5000,
"no_autoupdate": false,
"db_update_meta_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json",
"autoupdate_check_interval_hours": 24
}
+3
View File
@@ -43,6 +43,9 @@ class Settings:
html_report: bool
graph_report: bool
web_interface_port: int
no_autoupdate: bool
db_update_meta_url: str
autoupdate_check_interval_hours: int
# submit mode settings
presence_strings: list
+2
View File
@@ -48,6 +48,8 @@ DEFAULT_ARGS: Dict[str, Any] = {
'web': None,
'with_domains': False,
'xmind': False,
'no_autoupdate': False,
'force_update': False,
}
+233
View File
@@ -0,0 +1,233 @@
"""Tests for the database auto-update system."""
import json
import os
import hashlib
from datetime import datetime, timezone, timedelta
from unittest.mock import patch, MagicMock
import pytest
from maigret.db_updater import (
_parse_version,
_needs_check,
_is_version_compatible,
_is_update_available,
_load_state,
_save_state,
_best_local,
_now_iso,
resolve_db_path,
force_update,
CACHED_DB_PATH,
BUNDLED_DB_PATH,
STATE_PATH,
MAIGRET_HOME,
)
def test_parse_version():
assert _parse_version("0.5.0") == (0, 5, 0)
assert _parse_version("1.2.3") == (1, 2, 3)
assert _parse_version("bad") == (0, 0, 0)
assert _parse_version("") == (0, 0, 0)
def test_needs_check_no_state():
assert _needs_check({}, 24) is True
def test_needs_check_recent():
state = {"last_check_at": _now_iso()}
assert _needs_check(state, 24) is False
def test_needs_check_expired():
old_time = (datetime.now(timezone.utc) - timedelta(hours=25)).strftime("%Y-%m-%dT%H:%M:%SZ")
state = {"last_check_at": old_time}
assert _needs_check(state, 24) is True
def test_needs_check_corrupt():
state = {"last_check_at": "not-a-date"}
assert _needs_check(state, 24) is True
def test_version_compatible():
with patch("maigret.db_updater.__version__", "0.5.0"):
assert _is_version_compatible({"min_maigret_version": "0.5.0"}) is True
assert _is_version_compatible({"min_maigret_version": "0.4.0"}) is True
assert _is_version_compatible({"min_maigret_version": "0.6.0"}) is False
assert _is_version_compatible({}) is True # missing field = compatible
def test_update_available_no_cache(tmp_path):
with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "nonexistent.json")):
assert _is_update_available({"updated_at": "2026-01-01T00:00:00Z"}, {}) is True
def test_update_available_newer(tmp_path):
cache = tmp_path / "data.json"
cache.write_text("{}")
with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
state = {"last_meta": {"updated_at": "2026-01-01T00:00:00Z"}}
meta = {"updated_at": "2026-02-01T00:00:00Z"}
assert _is_update_available(meta, state) is True
def test_update_available_same(tmp_path):
cache = tmp_path / "data.json"
cache.write_text("{}")
with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
state = {"last_meta": {"updated_at": "2026-01-01T00:00:00Z"}}
meta = {"updated_at": "2026-01-01T00:00:00Z"}
assert _is_update_available(meta, state) is False
def test_load_state_missing(tmp_path):
with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "missing.json")):
assert _load_state() == {}
def test_load_state_corrupt(tmp_path):
corrupt = tmp_path / "state.json"
corrupt.write_text("not json{{{")
with patch("maigret.db_updater.STATE_PATH", str(corrupt)):
assert _load_state() == {}
def test_save_and_load_state(tmp_path):
state_file = tmp_path / "state.json"
with patch("maigret.db_updater.STATE_PATH", str(state_file)):
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
_save_state({"last_check_at": "2026-01-01T00:00:00Z"})
loaded = _load_state()
assert loaded["last_check_at"] == "2026-01-01T00:00:00Z"
def test_best_local_with_valid_cache(tmp_path):
cache = tmp_path / "data.json"
cache.write_text('{"sites": {}, "engines": {}, "tags": []}')
with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
assert _best_local() == str(cache)
def test_best_local_with_corrupt_cache(tmp_path):
cache = tmp_path / "data.json"
cache.write_text("not json")
with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
assert _best_local() == BUNDLED_DB_PATH
def test_best_local_no_cache(tmp_path):
with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
assert _best_local() == BUNDLED_DB_PATH
def test_resolve_db_path_custom_url():
result = resolve_db_path("https://example.com/db.json")
assert result == "https://example.com/db.json"
def test_resolve_db_path_custom_file():
result = resolve_db_path("custom/path.json")
assert result.endswith("custom/path.json")
def test_resolve_db_path_no_autoupdate(tmp_path):
with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
result = resolve_db_path("resources/data.json", no_autoupdate=True)
assert result == BUNDLED_DB_PATH
def test_resolve_db_path_no_autoupdate_with_cache(tmp_path):
cache = tmp_path / "data.json"
cache.write_text('{"sites": {}, "engines": {}, "tags": []}')
with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
result = resolve_db_path("resources/data.json", no_autoupdate=True)
assert result == str(cache)
@patch("maigret.db_updater._fetch_meta")
def test_resolve_db_path_network_failure(mock_fetch, tmp_path):
mock_fetch.return_value = None
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
result = resolve_db_path("resources/data.json")
assert result == BUNDLED_DB_PATH
# --- force_update tests ---
@patch("maigret.db_updater._fetch_meta")
def test_force_update_network_failure(mock_fetch, tmp_path):
mock_fetch.return_value = None
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
assert force_update() is False
@patch("maigret.db_updater._fetch_meta")
def test_force_update_incompatible_version(mock_fetch, tmp_path):
mock_fetch.return_value = {"min_maigret_version": "99.0.0", "sites_count": 100}
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
assert force_update() is False
@patch("maigret.db_updater._download_and_verify")
@patch("maigret.db_updater._fetch_meta")
def test_force_update_success(mock_fetch, mock_download, tmp_path):
mock_fetch.return_value = {
"min_maigret_version": "0.1.0",
"sites_count": 3200,
"updated_at": "2099-01-01T00:00:00Z",
"data_url": "https://example.com/data.json",
"data_sha256": "abc123",
}
mock_download.return_value = str(tmp_path / "data.json")
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
assert force_update() is True
state = _load_state()
assert state["last_meta"]["sites_count"] == 3200
@patch("maigret.db_updater._fetch_meta")
def test_force_update_already_up_to_date(mock_fetch, tmp_path):
cache = tmp_path / "data.json"
cache.write_text('{"sites": {}, "engines": {}, "tags": []}')
state_file = tmp_path / "state.json"
state_file.write_text(json.dumps({
"last_check_at": _now_iso(),
"last_meta": {"updated_at": "2026-01-01T00:00:00Z", "sites_count": 3000},
}))
mock_fetch.return_value = {
"min_maigret_version": "0.1.0",
"sites_count": 3000,
"updated_at": "2026-01-01T00:00:00Z",
}
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
with patch("maigret.db_updater.STATE_PATH", str(state_file)):
with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
assert force_update() is False
@patch("maigret.db_updater._download_and_verify")
@patch("maigret.db_updater._fetch_meta")
def test_force_update_download_fails(mock_fetch, mock_download, tmp_path):
mock_fetch.return_value = {
"min_maigret_version": "0.1.0",
"sites_count": 3200,
"updated_at": "2099-01-01T00:00:00Z",
"data_url": "https://example.com/data.json",
"data_sha256": "abc123",
}
mock_download.return_value = None
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
assert force_update() is False
+59
View File
@@ -0,0 +1,59 @@
"""Generate db_meta.json from data.json for the auto-update system."""
import argparse
import hashlib
import json
import os.path as path
import sys
from datetime import datetime, timezone
RESOURCES_DIR = path.join(path.dirname(path.dirname(path.abspath(__file__))), "maigret", "resources")
DATA_JSON_PATH = path.join(RESOURCES_DIR, "data.json")
META_JSON_PATH = path.join(RESOURCES_DIR, "db_meta.json")
DEFAULT_DATA_URL = "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
def get_current_version():
version_file = path.join(path.dirname(path.dirname(path.abspath(__file__))), "maigret", "__version__.py")
with open(version_file) as f:
for line in f:
if line.startswith("__version__"):
return line.split("=")[1].strip().strip("'\"")
return "0.0.0"
def main():
parser = argparse.ArgumentParser(description="Generate db_meta.json from data.json")
parser.add_argument("--min-version", default=None, help="Minimum compatible maigret version (default: current version)")
parser.add_argument("--data-url", default=DEFAULT_DATA_URL, help="URL where data.json can be downloaded")
args = parser.parse_args()
min_version = args.min_version or get_current_version()
with open(DATA_JSON_PATH, "rb") as f:
raw = f.read()
sha256 = hashlib.sha256(raw).hexdigest()
data = json.loads(raw)
sites_count = len(data.get("sites", {}))
meta = {
"version": 1,
"updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
"sites_count": sites_count,
"min_maigret_version": min_version,
"data_sha256": sha256,
"data_url": args.data_url,
}
with open(META_JSON_PATH, "w", encoding="utf-8") as f:
json.dump(meta, f, indent=4, ensure_ascii=False)
print(f"Generated {META_JSON_PATH}")
print(f" sites: {sites_count}")
print(f" sha256: {sha256[:16]}...")
print(f" min_version: {min_version}")
if __name__ == "__main__":
main()
+7
View File
@@ -217,6 +217,13 @@ Rank data fetched from Majestic Million by domains.
site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n')
db.save_to_file(args.base_file)
# Regenerate db_meta.json to stay in sync with data.json
try:
from generate_db_meta import main as generate_meta
generate_meta()
except Exception as e:
print(f"Warning: could not regenerate db_meta.json: {e}")
statistics_text = db.get_db_stats(is_markdown=True)
site_file.write('## Statistics\n\n')
site_file.write(statistics_text)