diff --git a/docs/source/command-line-options.rst b/docs/source/command-line-options.rst index b111d91..1389c00 100644 --- a/docs/source/command-line-options.rst +++ b/docs/source/command-line-options.rst @@ -84,6 +84,9 @@ ids. Useful for repeated scanning with found known irrelevant usernames. ``--db`` - Load Maigret database from a JSON file or an online, valid, JSON file. See :ref:`custom-database` below. +``--extra-db`` - Load an **additional** sites database on top of +``--db`` (overlay). Repeatable. See :ref:`extra-database` below. + ``--no-autoupdate`` - Disable the automatic database update check that runs at startup. The currently cached (or bundled) database is used as-is. @@ -139,6 +142,47 @@ disabled and all sites scanned, looks like:: --db LLM/maigret_private_db.json \ --no-autoupdate -a +.. _extra-database: + +Overlaying additional databases (``--extra-db``) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``--extra-db FILE`` loads an additional sites database **on top of** +``--db``, rather than replacing it. The flag is repeatable, so multiple +extras can be layered in one invocation:: + + python3 -m maigret username \ + --extra-db private_sites.json \ + --extra-db team_sites.json -a + +Each extra accepts the same three forms as ``--db`` (HTTP(S) URL, +absolute or cwd-relative local path, or module-relative path). + +**Merge semantics.** Sites, engines and tags are merged into the main +database. On duplicate names, **last wins**: a site or engine defined +later (either in a subsequent ``--extra-db`` or in an ``--extra-db`` +that re-defines a name from ``--db``) overrides the earlier definition. +Tag lists are deduplicated while preserving first-seen order. + +**Auto-update.** Extras are never auto-updated — they are read exactly +as provided, regardless of ``--no-autoupdate`` / ``--force-update``. + +**Save behaviour.** While any ``--extra-db`` is active, Maigret **skips +every database save** — including the implicit end-of-run save, the +``--self-check --auto-disable`` save, and the ``--submit`` save. This +prevents silently writing merged (main + extras) content back into the +main ``--db`` file. If you need to persist edits, run Maigret again +without ``--extra-db``. You will see a warning at startup:: + + [!] Database modifications will NOT be persisted while --extra-db is active. + +**Missing or unreadable extra.** Maigret exits with a non-zero status — +extras are opt-in, so a silent skip would hide configuration errors. + +**Not supported with** ``--web``. The web UI reloads its own database +from the main ``--db`` path, so extras would be invisible. Passing both +exits with an error. + Reports ------- diff --git a/maigret/maigret.py b/maigret/maigret.py index cead5a0..a2e0dc7 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -202,6 +202,17 @@ def setup_arguments_parser(settings: Settings): default=settings.sites_db_path, help="Load Maigret database from a JSON file or HTTP web resource.", ) + parser.add_argument( + "--extra-db", + metavar="EXTRA_DB_FILE", + dest="extra_db_files", + action="append", + default=[], + help="Load an additional sites database on top of --db. Repeatable. " + "Accepts a local path (absolute or cwd-relative) or HTTP(S) URL. " + "Never auto-updated. Changes from --self-check / --submit are NOT " + "persisted when any --extra-db is loaded.", + ) parser.add_argument( "--no-autoupdate", action="store_true", @@ -614,6 +625,46 @@ async def main(): ) else: raise + + for extra_arg in args.extra_db_files: + try: + extra_path = resolve_db_path( + db_file_arg=extra_arg, + no_autoupdate=True, + meta_url=settings.db_update_meta_url, + check_interval_hours=settings.autoupdate_check_interval_hours, + color=not args.no_color, + ) + except FileNotFoundError as e: + logger.error(f"--extra-db: {e}") + sys.exit(2) + + before = len(db.sites) + try: + db.load_extra_from_path(extra_path) + except Exception as e: + logger.error(f"Failed to load extra database from {extra_path}: {e}") + sys.exit(2) + query_notify.success( + f'Loaded extra database: {extra_path} ' + f'(+{len(db.sites) - before} new, {len(db.sites)} total sites)' + ) + + if args.extra_db_files: + query_notify.warning( + 'Database modifications will NOT be persisted while --extra-db is active.' + ) + + def save_db_if_safe(reason: str) -> bool: + if args.extra_db_files: + logger.warning( + f"Skipping database save ({reason}): --extra-db is active; " + "modifications are in-memory only." + ) + return False + db.save_to_file(db_file) + return True + get_top_sites_for_id = lambda x: db.ranked_sites_dict( top=args.top_sites, tags=args.tags, @@ -629,7 +680,7 @@ async def main(): submitter = Submitter(db=db, logger=logger, settings=settings, args=args) is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file) if is_submitted: - db.save_to_file(db_file) + save_db_if_safe("post-submit") await submitter.close() # Database self-checking @@ -663,8 +714,8 @@ async def main(): 'y', '', ): - db.save_to_file(db_file) - print('Database was successfully updated.') + if save_db_if_safe("post-self-check"): + print('Database was successfully updated.') else: print('Updates will be applied only for current search session.') @@ -687,6 +738,14 @@ async def main(): # Web interface if args.web is not None: + if args.extra_db_files: + logger.error( + '--web is not compatible with --extra-db: the web UI reloads ' + 'the database from --db only, so extras would be silently ' + 'ignored. Remove --extra-db or use the CLI mode.' + ) + sys.exit(2) + from maigret.web.app import app app.config["MAIGRET_DB_FILE"] = db_file @@ -873,7 +932,7 @@ async def main(): print(text_report) # update database - db.save_to_file(db_file) + save_db_if_safe("end-of-run") def run(): diff --git a/maigret/resources/db_meta.json b/maigret/resources/db_meta.json index 10537a0..e8f760e 100644 --- a/maigret/resources/db_meta.json +++ b/maigret/resources/db_meta.json @@ -1,6 +1,6 @@ { "version": 1, - "updated_at": "2026-04-22T16:15:02Z", + "updated_at": "2026-04-23T15:02:48Z", "sites_count": 3142, "min_maigret_version": "0.6.0", "data_sha256": "1e1ed6da2aa9db0f34171f61a044c20bbd1ed53a0430dec4a9ce8f8543655d1a", diff --git a/maigret/sites.py b/maigret/sites.py index b30cfae..cb22c03 100644 --- a/maigret/sites.py +++ b/maigret/sites.py @@ -516,6 +516,15 @@ class MaigretDatabase: else: return self.load_from_file(path) + def load_extra_from_path(self, path: str) -> "MaigretDatabase": + """Merge an additional DB on top of self. Last-wins on duplicate + site/engine names; tags deduped preserving first-seen order.""" + self.load_from_path(path) + self._sites = list({s.name: s for s in self._sites}.values()) + self._engines = list({e.name: e for e in self._engines}.values()) + self._tags = list(dict.fromkeys(self._tags)) + return self + def load_from_http(self, url: str) -> "MaigretDatabase": is_url_valid = url.startswith("http://") or url.startswith("https://") diff --git a/sites.md b/sites.md index 0b58e9f..ab85ad4 100644 --- a/sites.md +++ b/sites.md @@ -3146,7 +3146,7 @@ Rank data fetched from Majestic Million by domains. 1. ![](https://www.google.com/s2/favicons?domain=https://flarum.es) [flarum.es (https://flarum.es)](https://flarum.es)*: top 100M, es, forum* 1. ![](https://www.google.com/s2/favicons?domain=https://forum.fibra.click) [forum.fibra.click (https://forum.fibra.click)](https://forum.fibra.click)*: top 100M, forum, it* -The list was updated at (2026-04-22) +The list was updated at (2026-04-23) ## Statistics Enabled/total sites: 2529/3142 = 80.49% diff --git a/tests/test_cli.py b/tests/test_cli.py index e4f7d77..7aa4f33 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -51,6 +51,7 @@ DEFAULT_ARGS: Dict[str, Any] = { 'md': False, 'no_autoupdate': False, 'force_update': False, + 'extra_db_files': [], } @@ -126,6 +127,38 @@ def test_args_exclude_tags(argparser): assert getattr(args, arg) == want_args[arg] +def test_args_single_extra_db(argparser): + args = argparser.parse_args('--extra-db extras.json username'.split()) + + want_args = dict(DEFAULT_ARGS) + want_args.update( + { + 'extra_db_files': ['extras.json'], + 'username': ['username'], + } + ) + + for arg in vars(args): + assert getattr(args, arg) == want_args[arg] + + +def test_args_multiple_extra_dbs(argparser): + args = argparser.parse_args( + '--extra-db a.json --extra-db https://example.com/b.json username'.split() + ) + + want_args = dict(DEFAULT_ARGS) + want_args.update( + { + 'extra_db_files': ['a.json', 'https://example.com/b.json'], + 'username': ['username'], + } + ) + + for arg in vars(args): + assert getattr(args, arg) == want_args[arg] + + def test_args_tags_with_exclude_tags(argparser): args = argparser.parse_args('--tags coding --exclude-tags porn username'.split()) diff --git a/tests/test_sites.py b/tests/test_sites.py index 464b1cd..fef6aa6 100644 --- a/tests/test_sites.py +++ b/tests/test_sites.py @@ -1,5 +1,6 @@ """Maigret Database test functions""" +import json from typing import Any, Dict from maigret.sites import MaigretDatabase, MaigretSite @@ -96,6 +97,163 @@ def test_site_strip_engine_data_with_site_prior_updates(): assert amperka_stripped.json == UPDATED_EXAMPLE_DB['sites']['Amperka'] +def _write_db(tmp_path, name, data): + p = tmp_path / name + p.write_text(json.dumps(data), encoding='utf-8') + return str(p) + + +def test_extra_db_new_site(tmp_path): + db = MaigretDatabase() + db.load_from_json(EXAMPLE_DB) + assert len(db.sites) == 1 + + extra = { + 'engines': {}, + 'sites': { + 'ExampleExtra': { + 'tags': ['us'], + 'checkType': 'status_code', + 'url': 'https://example.com/{username}', + 'urlMain': 'https://example.com/', + 'usernameClaimed': 'test', + 'usernameUnclaimed': 'noonewouldeverusethis7', + } + }, + 'tags': ['us'], + } + db.load_extra_from_path(_write_db(tmp_path, 'extra.json', extra)) + + assert len(db.sites) == 2 + assert set(db.sites_dict.keys()) == {'Amperka', 'ExampleExtra'} + assert len(db._sites) == len(db.sites_dict) + + +def test_extra_db_site_override_last_wins(tmp_path): + db = MaigretDatabase() + db.load_from_json(EXAMPLE_DB) + assert db.sites_dict['Amperka'].url_main == 'http://forum.amperka.ru' + + extra = { + 'engines': {}, + 'sites': { + 'Amperka': { + 'engine': 'XenForo', + 'rank': 1, + 'tags': ['overridden'], + 'urlMain': 'https://overridden.example', + 'usernameClaimed': 'adam', + 'usernameUnclaimed': 'noonewouldeverusethis7', + } + }, + 'tags': [], + } + db.load_extra_from_path(_write_db(tmp_path, 'extra.json', extra)) + + assert len(db.sites) == 1 + amperka = db.sites_dict['Amperka'] + assert amperka.url_main == 'https://overridden.example' + assert 'overridden' in amperka.tags + + +def test_extra_db_engine_override(tmp_path): + main = { + 'engines': { + 'Proto': { + 'presenseStrs': ['orig'], + 'site': { + 'absenceStrs': ['original absence'], + 'checkType': 'message', + 'url': '{urlMain}/orig/{username}', + }, + } + }, + 'sites': { + 'MainSite': { + 'engine': 'Proto', + 'rank': 1, + 'tags': [], + 'urlMain': 'https://main.example', + 'usernameClaimed': 'a', + 'usernameUnclaimed': 'noonewouldeverusethis7', + } + }, + 'tags': [], + } + db = MaigretDatabase() + db.load_from_json(main) + + extra = { + 'engines': { + 'Proto': { + 'presenseStrs': ['overridden'], + 'site': { + 'absenceStrs': ['overridden absence'], + 'checkType': 'message', + 'url': '{urlMain}/overridden/{username}', + }, + } + }, + 'sites': { + 'ExtraSite': { + 'engine': 'Proto', + 'rank': 10, + 'tags': [], + 'urlMain': 'https://extra.example', + 'usernameClaimed': 'a', + 'usernameUnclaimed': 'noonewouldeverusethis7', + } + }, + 'tags': [], + } + db.load_extra_from_path(_write_db(tmp_path, 'extra.json', extra)) + + assert len(db._engines) == 1 + assert db.engines_dict['Proto'].presenseStrs == ['overridden'] + extra_site = db.sites_dict['ExtraSite'] + assert extra_site.absence_strs == ['overridden absence'] + main_site = db.sites_dict['MainSite'] + assert main_site.absence_strs == ['original absence'] + + +def test_extra_db_tag_dedup(tmp_path): + db = MaigretDatabase() + db.load_from_json({'engines': {}, 'sites': {}, 'tags': ['forum', 'ru']}) + + extra = {'engines': {}, 'sites': {}, 'tags': ['forum', 'us']} + db.load_extra_from_path(_write_db(tmp_path, 'extra.json', extra)) + + assert db._tags.count('forum') == 1 + assert sorted(db._tags) == ['forum', 'ru', 'us'] + + +def test_extra_db_chain_last_wins(tmp_path): + db = MaigretDatabase() + db.load_from_json(EXAMPLE_DB) + + def site_with_url(url): + return { + 'engines': {}, + 'sites': { + 'Amperka': { + 'engine': 'XenForo', + 'rank': 1, + 'tags': ['ru'], + 'urlMain': url, + 'usernameClaimed': 'adam', + 'usernameUnclaimed': 'noonewouldeverusethis7', + } + }, + 'tags': [], + } + + db.load_extra_from_path(_write_db(tmp_path, 'a.json', site_with_url('https://a'))) + db.load_extra_from_path(_write_db(tmp_path, 'b.json', site_with_url('https://b'))) + + assert len(db.sites) == 1 + assert db.sites_dict['Amperka'].url_main == 'https://b' + + def test_saving_site_error(): db = MaigretDatabase()