Extra db flag

This commit is contained in:
Soxoj
2026-04-23 17:02:47 +02:00
parent 5e1cc45c17
commit ca54db6fb7
7 changed files with 309 additions and 6 deletions
+44
View File
@@ -84,6 +84,9 @@ ids. Useful for repeated scanning with found known irrelevant usernames.
``--db`` - Load the Maigret database from a local JSON file or from a URL
that serves a valid JSON file. See :ref:`custom-database` below.
``--extra-db`` - Load an **additional** sites database on top of
``--db`` (overlay). Repeatable. See :ref:`extra-database` below.
``--no-autoupdate`` - Disable the automatic database update check that
runs at startup. The currently cached (or bundled) database is used
as-is.
@@ -139,6 +142,47 @@ disabled and all sites scanned, looks like::
--db LLM/maigret_private_db.json \
--no-autoupdate -a
.. _extra-database:
Overlaying additional databases (``--extra-db``)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
``--extra-db FILE`` loads an additional sites database **on top of**
``--db``, rather than replacing it. The flag is repeatable, so multiple
extras can be layered in one invocation::
python3 -m maigret username \
--extra-db private_sites.json \
--extra-db team_sites.json -a
Each extra accepts the same three forms as ``--db`` (HTTP(S) URL,
absolute or cwd-relative local path, or module-relative path).
**Merge semantics.** Sites, engines and tags are merged into the main
database. On duplicate names, **last wins**: a site or engine defined
later (either in a subsequent ``--extra-db`` or in an ``--extra-db``
that re-defines a name from ``--db``) overrides the earlier definition.
Tag lists are deduplicated while preserving first-seen order.
**Auto-update.** Extras are never auto-updated — they are read exactly
as provided, regardless of ``--no-autoupdate`` / ``--force-update``.
**Save behaviour.** While any ``--extra-db`` is active, Maigret **skips
every database save** — including the implicit end-of-run save, the
``--self-check --auto-disable`` save, and the ``--submit`` save. This
prevents silently writing merged (main + extras) content back into the
main ``--db`` file. If you need to persist edits, run Maigret again
without ``--extra-db``. You will see a warning at startup::
[!] Database modifications will NOT be persisted while --extra-db is active.
**Missing or unreadable extra.** Maigret exits with a non-zero status —
extras are opt-in, so a silent skip would hide configuration errors.
**Not supported with** ``--web``. The web UI reloads its own database
from the main ``--db`` path, so extras would be invisible. Passing both
exits with an error.
Reports
-------
+63 -4
View File
@@ -202,6 +202,17 @@ def setup_arguments_parser(settings: Settings):
default=settings.sites_db_path,
help="Load Maigret database from a JSON file or HTTP web resource.",
)
parser.add_argument(
"--extra-db",
metavar="EXTRA_DB_FILE",
dest="extra_db_files",
action="append",
default=[],
help="Load an additional sites database on top of --db. Repeatable. "
"Accepts a local path (absolute or cwd-relative) or HTTP(S) URL. "
"Never auto-updated. Changes from --self-check / --submit are NOT "
"persisted when any --extra-db is loaded.",
)
parser.add_argument(
"--no-autoupdate",
action="store_true",
@@ -614,6 +625,46 @@ async def main():
)
else:
raise
for extra_arg in args.extra_db_files:
try:
extra_path = resolve_db_path(
db_file_arg=extra_arg,
no_autoupdate=True,
meta_url=settings.db_update_meta_url,
check_interval_hours=settings.autoupdate_check_interval_hours,
color=not args.no_color,
)
except FileNotFoundError as e:
logger.error(f"--extra-db: {e}")
sys.exit(2)
before = len(db.sites)
try:
db.load_extra_from_path(extra_path)
except Exception as e:
logger.error(f"Failed to load extra database from {extra_path}: {e}")
sys.exit(2)
query_notify.success(
f'Loaded extra database: {extra_path} '
f'(+{len(db.sites) - before} new, {len(db.sites)} total sites)'
)
if args.extra_db_files:
query_notify.warning(
'Database modifications will NOT be persisted while --extra-db is active.'
)
def save_db_if_safe(reason: str) -> bool:
    """Write the database to ``db_file`` unless any ``--extra-db`` was given.

    ``reason`` is a short label included in the warning that is logged when
    the save is skipped. Returns True when the file was written and False
    when the save was skipped.
    """
    extras_active = bool(args.extra_db_files)
    if not extras_active:
        db.save_to_file(db_file)
        return True

    # Saving now would write the merged (main + extras) content to db_file.
    skip_notice = (
        f"Skipping database save ({reason}): --extra-db is active; "
        "modifications are in-memory only."
    )
    logger.warning(skip_notice)
    return False
get_top_sites_for_id = lambda x: db.ranked_sites_dict(
top=args.top_sites,
tags=args.tags,
@@ -629,7 +680,7 @@ async def main():
submitter = Submitter(db=db, logger=logger, settings=settings, args=args)
is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
if is_submitted:
db.save_to_file(db_file)
save_db_if_safe("post-submit")
await submitter.close()
# Database self-checking
@@ -663,8 +714,8 @@ async def main():
'y',
'',
):
db.save_to_file(db_file)
print('Database was successfully updated.')
if save_db_if_safe("post-self-check"):
print('Database was successfully updated.')
else:
print('Updates will be applied only for current search session.')
@@ -687,6 +738,14 @@ async def main():
# Web interface
if args.web is not None:
if args.extra_db_files:
logger.error(
'--web is not compatible with --extra-db: the web UI reloads '
'the database from --db only, so extras would be silently '
'ignored. Remove --extra-db or use the CLI mode.'
)
sys.exit(2)
from maigret.web.app import app
app.config["MAIGRET_DB_FILE"] = db_file
@@ -873,7 +932,7 @@ async def main():
print(text_report)
# update database
db.save_to_file(db_file)
save_db_if_safe("end-of-run")
def run():
+1 -1
View File
@@ -1,6 +1,6 @@
{
"version": 1,
"updated_at": "2026-04-22T16:15:02Z",
"updated_at": "2026-04-23T15:02:48Z",
"sites_count": 3142,
"min_maigret_version": "0.6.0",
"data_sha256": "1e1ed6da2aa9db0f34171f61a044c20bbd1ed53a0430dec4a9ce8f8543655d1a",
+9
View File
@@ -516,6 +516,15 @@ class MaigretDatabase:
else:
return self.load_from_file(path)
def load_extra_from_path(self, path: str) -> "MaigretDatabase":
    """Merge an additional database on top of this one.

    The database at ``path`` is loaded through ``load_from_path`` and the
    resulting lists are then deduplicated by name. For sites and engines
    the last definition wins, placed at the position of the first
    occurrence of that name; tags are deduplicated preserving first-seen
    order.

    The merge is all-or-nothing with respect to list membership: if
    loading raises, the site, engine and tag lists are restored to what
    they were before the call and the exception is re-raised.

    Returns ``self`` so that calls can be chained.
    """
    # Shallow copies: the loader may append to the existing lists in place,
    # so keeping a reference to the same list objects would not be enough.
    saved_sites = list(self._sites)
    saved_engines = list(self._engines)
    saved_tags = list(self._tags)
    try:
        self.load_from_path(path)
    except Exception:
        # Drop whatever was appended before the failure so that no
        # partially merged, un-deduplicated entries are left behind.
        self._sites = saved_sites
        self._engines = saved_engines
        self._tags = saved_tags
        raise

    # Dict keys keep the slot of the first occurrence while the value is
    # overwritten by later duplicates, which gives "last wins" semantics.
    self._sites = list({s.name: s for s in self._sites}.values())
    self._engines = list({e.name: e for e in self._engines}.values())
    self._tags = list(dict.fromkeys(self._tags))
    return self
def load_from_http(self, url: str) -> "MaigretDatabase":
is_url_valid = url.startswith("http://") or url.startswith("https://")
+1 -1
View File
@@ -3146,7 +3146,7 @@ Rank data fetched from Majestic Million by domains.
1. ![](https://www.google.com/s2/favicons?domain=https://flarum.es) [flarum.es (https://flarum.es)](https://flarum.es)*: top 100M, es, forum*
1. ![](https://www.google.com/s2/favicons?domain=https://forum.fibra.click) [forum.fibra.click (https://forum.fibra.click)](https://forum.fibra.click)*: top 100M, forum, it*
The list was updated at (2026-04-22)
The list was updated at (2026-04-23)
## Statistics
Enabled/total sites: 2529/3142 = 80.49%
+33
View File
@@ -51,6 +51,7 @@ DEFAULT_ARGS: Dict[str, Any] = {
'md': False,
'no_autoupdate': False,
'force_update': False,
'extra_db_files': [],
}
@@ -126,6 +127,38 @@ def test_args_exclude_tags(argparser):
assert getattr(args, arg) == want_args[arg]
def test_args_single_extra_db(argparser):
    """A single ``--extra-db`` yields a one-element ``extra_db_files`` list."""
    parsed = argparser.parse_args('--extra-db extras.json username'.split())

    expected = {
        **DEFAULT_ARGS,
        'extra_db_files': ['extras.json'],
        'username': ['username'],
    }

    # Every attribute on the namespace must match the expected value.
    for key, value in vars(parsed).items():
        assert value == expected[key]
def test_args_multiple_extra_dbs(argparser):
    """Repeated ``--extra-db`` flags accumulate in command-line order."""
    command_line = '--extra-db a.json --extra-db https://example.com/b.json username'
    parsed = argparser.parse_args(command_line.split())

    expected = {
        **DEFAULT_ARGS,
        'extra_db_files': ['a.json', 'https://example.com/b.json'],
        'username': ['username'],
    }

    # Every attribute on the namespace must match the expected value.
    for key, value in vars(parsed).items():
        assert value == expected[key]
def test_args_tags_with_exclude_tags(argparser):
args = argparser.parse_args('--tags coding --exclude-tags porn username'.split())
+158
View File
@@ -1,5 +1,6 @@
"""Maigret Database test functions"""
import json
from typing import Any, Dict
from maigret.sites import MaigretDatabase, MaigretSite
@@ -96,6 +97,163 @@ def test_site_strip_engine_data_with_site_prior_updates():
assert amperka_stripped.json == UPDATED_EXAMPLE_DB['sites']['Amperka']
def _write_db(tmp_path, name, data):
    """Dump ``data`` as JSON to ``tmp_path / name`` and return the path as ``str``."""
    target = tmp_path / name
    serialized = json.dumps(data)
    target.write_text(serialized, encoding='utf-8')
    return str(target)
def test_extra_db_new_site(tmp_path):
    """A site that exists only in the extra DB is added next to the main ones."""
    database = MaigretDatabase()
    database.load_from_json(EXAMPLE_DB)
    assert len(database.sites) == 1

    new_site = {
        'tags': ['us'],
        'checkType': 'status_code',
        'url': 'https://example.com/{username}',
        'urlMain': 'https://example.com/',
        'usernameClaimed': 'test',
        'usernameUnclaimed': 'noonewouldeverusethis7',
    }
    overlay = {
        'engines': {},
        'sites': {'ExampleExtra': new_site},
        'tags': ['us'],
    }
    overlay_path = _write_db(tmp_path, 'extra.json', overlay)
    database.load_extra_from_path(overlay_path)

    assert len(database.sites) == 2
    assert set(database.sites_dict.keys()) == {'Amperka', 'ExampleExtra'}
    # No duplicate entries may remain in the underlying list.
    assert len(database._sites) == len(database.sites_dict)
def test_extra_db_site_override_last_wins(tmp_path):
    """A site re-defined in the extra DB replaces the definition from the main DB."""
    database = MaigretDatabase()
    database.load_from_json(EXAMPLE_DB)
    assert database.sites_dict['Amperka'].url_main == 'http://forum.amperka.ru'

    replacement = {
        'engine': 'XenForo',
        'rank': 1,
        'tags': ['overridden'],
        'urlMain': 'https://overridden.example',
        'usernameClaimed': 'adam',
        'usernameUnclaimed': 'noonewouldeverusethis7',
    }
    overlay = {
        'engines': {},
        'sites': {'Amperka': replacement},
        'tags': [],
    }
    overlay_path = _write_db(tmp_path, 'extra.json', overlay)
    database.load_extra_from_path(overlay_path)

    # Still a single site, now carrying the overriding values.
    assert len(database.sites) == 1
    overridden = database.sites_dict['Amperka']
    assert overridden.url_main == 'https://overridden.example'
    assert 'overridden' in overridden.tags
def test_extra_db_engine_override(tmp_path):
    """An engine re-defined by an extra DB wins, but already-loaded sites keep theirs."""

    def proto_db(site_name, rank, url_main, presence, absence, url):
        # Build a DB with one 'Proto' engine and one site that uses it.
        engine = {
            'presenseStrs': [presence],
            'site': {
                'absenceStrs': [absence],
                'checkType': 'message',
                'url': url,
            },
        }
        site = {
            'engine': 'Proto',
            'rank': rank,
            'tags': [],
            'urlMain': url_main,
            'usernameClaimed': 'a',
            'usernameUnclaimed': 'noonewouldeverusethis7',
        }
        return {
            'engines': {'Proto': engine},
            'sites': {site_name: site},
            'tags': [],
        }

    database = MaigretDatabase()
    database.load_from_json(
        proto_db(
            'MainSite',
            1,
            'https://main.example',
            'orig',
            'original absence',
            '{urlMain}/orig/{username}',
        )
    )

    overlay = proto_db(
        'ExtraSite',
        10,
        'https://extra.example',
        'overridden',
        'overridden absence',
        '{urlMain}/overridden/{username}',
    )
    database.load_extra_from_path(_write_db(tmp_path, 'extra.json', overlay))

    assert len(database._engines) == 1
    assert database.engines_dict['Proto'].presenseStrs == ['overridden']

    # The site from the extra DB is built from the overriding engine...
    assert database.sites_dict['ExtraSite'].absence_strs == ['overridden absence']
    # ...while the site loaded earlier keeps the values it was created with.
    assert database.sites_dict['MainSite'].absence_strs == ['original absence']
def test_extra_db_tag_dedup(tmp_path):
    """Tags repeated across main and extra DBs are kept once, in first-seen order."""
    db = MaigretDatabase()
    db.load_from_json({'engines': {}, 'sites': {}, 'tags': ['forum', 'ru']})

    extra = {'engines': {}, 'sites': {}, 'tags': ['forum', 'us']}
    db.load_extra_from_path(_write_db(tmp_path, 'extra.json', extra))

    assert db._tags.count('forum') == 1
    # Compare the list as-is: sorting it first would hide a violation of the
    # documented "first-seen order" guarantee.
    assert db._tags == ['forum', 'ru', 'us']
def test_extra_db_chain_last_wins(tmp_path):
    """With several extras re-defining one site, the last one loaded wins."""
    database = MaigretDatabase()
    database.load_from_json(EXAMPLE_DB)

    def amperka_overlay(url_main):
        # Extra DB that re-defines 'Amperka' with the given main URL.
        amperka = {
            'engine': 'XenForo',
            'rank': 1,
            'tags': ['ru'],
            'urlMain': url_main,
            'usernameClaimed': 'adam',
            'usernameUnclaimed': 'noonewouldeverusethis7',
        }
        return {'engines': {}, 'sites': {'Amperka': amperka}, 'tags': []}

    layers = (('a.json', 'https://a'), ('b.json', 'https://b'))
    for file_name, url_main in layers:
        layer_path = _write_db(tmp_path, file_name, amperka_overlay(url_main))
        database.load_extra_from_path(layer_path)

    assert len(database.sites) == 1
    assert database.sites_dict['Amperka'].url_main == 'https://b'
def test_saving_site_error():
db = MaigretDatabase()