mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-07 06:24:35 +00:00
Extra db flag
This commit is contained in:
@@ -84,6 +84,9 @@ ids. Useful for repeated scanning with found known irrelevant usernames.
|
|||||||
``--db`` - Load Maigret database from a JSON file or an online, valid,
|
``--db`` - Load Maigret database from a JSON file or an online, valid,
|
||||||
JSON file. See :ref:`custom-database` below.
|
JSON file. See :ref:`custom-database` below.
|
||||||
|
|
||||||
|
``--extra-db`` - Load an **additional** sites database on top of
|
||||||
|
``--db`` (overlay). Repeatable. See :ref:`extra-database` below.
|
||||||
|
|
||||||
``--no-autoupdate`` - Disable the automatic database update check that
|
``--no-autoupdate`` - Disable the automatic database update check that
|
||||||
runs at startup. The currently cached (or bundled) database is used
|
runs at startup. The currently cached (or bundled) database is used
|
||||||
as-is.
|
as-is.
|
||||||
@@ -139,6 +142,47 @@ disabled and all sites scanned, looks like::
|
|||||||
--db LLM/maigret_private_db.json \
|
--db LLM/maigret_private_db.json \
|
||||||
--no-autoupdate -a
|
--no-autoupdate -a
|
||||||
|
|
||||||
|
.. _extra-database:
|
||||||
|
|
||||||
|
Overlaying additional databases (``--extra-db``)
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
``--extra-db FILE`` loads an additional sites database **on top of**
|
||||||
|
``--db``, rather than replacing it. The flag is repeatable, so multiple
|
||||||
|
extras can be layered in one invocation::
|
||||||
|
|
||||||
|
python3 -m maigret username \
|
||||||
|
--extra-db private_sites.json \
|
||||||
|
--extra-db team_sites.json -a
|
||||||
|
|
||||||
|
Each extra accepts the same three forms as ``--db`` (HTTP(S) URL,
|
||||||
|
absolute or cwd-relative local path, or module-relative path).
|
||||||
|
|
||||||
|
**Merge semantics.** Sites, engines and tags are merged into the main
|
||||||
|
database. On duplicate names, **last wins**: a site or engine defined
|
||||||
|
later (either in a subsequent ``--extra-db`` or in an ``--extra-db``
|
||||||
|
that re-defines a name from ``--db``) overrides the earlier definition.
|
||||||
|
Tag lists are deduplicated while preserving first-seen order.
|
||||||
|
|
||||||
|
**Auto-update.** Extras are never auto-updated — they are read exactly
|
||||||
|
as provided, regardless of ``--no-autoupdate`` / ``--force-update``.
|
||||||
|
|
||||||
|
**Save behaviour.** While any ``--extra-db`` is active, Maigret **skips
|
||||||
|
every database save** — including the implicit end-of-run save, the
|
||||||
|
``--self-check --auto-disable`` save, and the ``--submit`` save. This
|
||||||
|
prevents silently writing merged (main + extras) content back into the
|
||||||
|
main ``--db`` file. If you need to persist edits, run Maigret again
|
||||||
|
without ``--extra-db``. You will see a warning at startup::
|
||||||
|
|
||||||
|
[!] Database modifications will NOT be persisted while --extra-db is active.
|
||||||
|
|
||||||
|
**Missing or unreadable extra.** Maigret exits with a non-zero status —
|
||||||
|
extras are opt-in, so a silent skip would hide configuration errors.
|
||||||
|
|
||||||
|
**Not supported with** ``--web``. The web UI reloads its own database
|
||||||
|
from the main ``--db`` path, so extras would be invisible. Passing both
|
||||||
|
exits with an error.
|
||||||
|
|
||||||
Reports
|
Reports
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
|||||||
+63
-4
@@ -202,6 +202,17 @@ def setup_arguments_parser(settings: Settings):
|
|||||||
default=settings.sites_db_path,
|
default=settings.sites_db_path,
|
||||||
help="Load Maigret database from a JSON file or HTTP web resource.",
|
help="Load Maigret database from a JSON file or HTTP web resource.",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--extra-db",
|
||||||
|
metavar="EXTRA_DB_FILE",
|
||||||
|
dest="extra_db_files",
|
||||||
|
action="append",
|
||||||
|
default=[],
|
||||||
|
help="Load an additional sites database on top of --db. Repeatable. "
|
||||||
|
"Accepts a local path (absolute or cwd-relative) or HTTP(S) URL. "
|
||||||
|
"Never auto-updated. Changes from --self-check / --submit are NOT "
|
||||||
|
"persisted when any --extra-db is loaded.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--no-autoupdate",
|
"--no-autoupdate",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
@@ -614,6 +625,46 @@ async def main():
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
for extra_arg in args.extra_db_files:
|
||||||
|
try:
|
||||||
|
extra_path = resolve_db_path(
|
||||||
|
db_file_arg=extra_arg,
|
||||||
|
no_autoupdate=True,
|
||||||
|
meta_url=settings.db_update_meta_url,
|
||||||
|
check_interval_hours=settings.autoupdate_check_interval_hours,
|
||||||
|
color=not args.no_color,
|
||||||
|
)
|
||||||
|
except FileNotFoundError as e:
|
||||||
|
logger.error(f"--extra-db: {e}")
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
|
before = len(db.sites)
|
||||||
|
try:
|
||||||
|
db.load_extra_from_path(extra_path)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to load extra database from {extra_path}: {e}")
|
||||||
|
sys.exit(2)
|
||||||
|
query_notify.success(
|
||||||
|
f'Loaded extra database: {extra_path} '
|
||||||
|
f'(+{len(db.sites) - before} new, {len(db.sites)} total sites)'
|
||||||
|
)
|
||||||
|
|
||||||
|
if args.extra_db_files:
|
||||||
|
query_notify.warning(
|
||||||
|
'Database modifications will NOT be persisted while --extra-db is active.'
|
||||||
|
)
|
||||||
|
|
||||||
|
def save_db_if_safe(reason: str) -> bool:
    """Save the database to ``db_file`` unless ``--extra-db`` is in use.

    When at least one ``--extra-db`` was passed, nothing is written and a
    warning naming ``reason`` is logged instead.

    :param reason: short label for the save attempt, used only in the
        warning message.
    :return: ``True`` if ``db.save_to_file`` was called, ``False`` if the
        save was skipped.
    """
    if not args.extra_db_files:
        db.save_to_file(db_file)
        return True

    logger.warning(
        f"Skipping database save ({reason}): --extra-db is active; "
        "modifications are in-memory only."
    )
    return False
|
||||||
|
|
||||||
get_top_sites_for_id = lambda x: db.ranked_sites_dict(
|
get_top_sites_for_id = lambda x: db.ranked_sites_dict(
|
||||||
top=args.top_sites,
|
top=args.top_sites,
|
||||||
tags=args.tags,
|
tags=args.tags,
|
||||||
@@ -629,7 +680,7 @@ async def main():
|
|||||||
submitter = Submitter(db=db, logger=logger, settings=settings, args=args)
|
submitter = Submitter(db=db, logger=logger, settings=settings, args=args)
|
||||||
is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
|
is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
|
||||||
if is_submitted:
|
if is_submitted:
|
||||||
db.save_to_file(db_file)
|
save_db_if_safe("post-submit")
|
||||||
await submitter.close()
|
await submitter.close()
|
||||||
|
|
||||||
# Database self-checking
|
# Database self-checking
|
||||||
@@ -663,8 +714,8 @@ async def main():
|
|||||||
'y',
|
'y',
|
||||||
'',
|
'',
|
||||||
):
|
):
|
||||||
db.save_to_file(db_file)
|
if save_db_if_safe("post-self-check"):
|
||||||
print('Database was successfully updated.')
|
print('Database was successfully updated.')
|
||||||
else:
|
else:
|
||||||
print('Updates will be applied only for current search session.')
|
print('Updates will be applied only for current search session.')
|
||||||
|
|
||||||
@@ -687,6 +738,14 @@ async def main():
|
|||||||
|
|
||||||
# Web interface
|
# Web interface
|
||||||
if args.web is not None:
|
if args.web is not None:
|
||||||
|
if args.extra_db_files:
|
||||||
|
logger.error(
|
||||||
|
'--web is not compatible with --extra-db: the web UI reloads '
|
||||||
|
'the database from --db only, so extras would be silently '
|
||||||
|
'ignored. Remove --extra-db or use the CLI mode.'
|
||||||
|
)
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
from maigret.web.app import app
|
from maigret.web.app import app
|
||||||
|
|
||||||
app.config["MAIGRET_DB_FILE"] = db_file
|
app.config["MAIGRET_DB_FILE"] = db_file
|
||||||
@@ -873,7 +932,7 @@ async def main():
|
|||||||
print(text_report)
|
print(text_report)
|
||||||
|
|
||||||
# update database
|
# update database
|
||||||
db.save_to_file(db_file)
|
save_db_if_safe("end-of-run")
|
||||||
|
|
||||||
|
|
||||||
def run():
|
def run():
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"version": 1,
|
"version": 1,
|
||||||
"updated_at": "2026-04-22T16:15:02Z",
|
"updated_at": "2026-04-23T15:02:48Z",
|
||||||
"sites_count": 3142,
|
"sites_count": 3142,
|
||||||
"min_maigret_version": "0.6.0",
|
"min_maigret_version": "0.6.0",
|
||||||
"data_sha256": "1e1ed6da2aa9db0f34171f61a044c20bbd1ed53a0430dec4a9ce8f8543655d1a",
|
"data_sha256": "1e1ed6da2aa9db0f34171f61a044c20bbd1ed53a0430dec4a9ce8f8543655d1a",
|
||||||
|
|||||||
@@ -516,6 +516,15 @@ class MaigretDatabase:
|
|||||||
else:
|
else:
|
||||||
return self.load_from_file(path)
|
return self.load_from_file(path)
|
||||||
|
|
||||||
|
def load_extra_from_path(self, path: str) -> "MaigretDatabase":
    """Load the database at ``path`` into this one and drop duplicates.

    ``load_from_path`` is called first; afterwards sites and engines are
    de-duplicated by ``name`` (the entry seen last replaces earlier ones,
    while keeping the position of the first occurrence) and the tag list
    is de-duplicated keeping first-seen order.

    NOTE(review): this relies on ``load_from_path`` adding the new
    entries after the ones already held in ``_sites`` / ``_engines`` /
    ``_tags`` -- confirm against its implementation.

    :param path: location of the extra database, passed unchanged to
        ``load_from_path``.
    :return: ``self``, to allow chaining.
    """
    self.load_from_path(path)

    # Later entries overwrite earlier ones under the same key.
    latest_sites = {}
    for site in self._sites:
        latest_sites[site.name] = site
    self._sites = list(latest_sites.values())

    latest_engines = {}
    for engine in self._engines:
        latest_engines[engine.name] = engine
    self._engines = list(latest_engines.values())

    # dict keys keep insertion order, so the first occurrence fixes the slot.
    self._tags = list(dict.fromkeys(self._tags))
    return self
|
||||||
|
|
||||||
def load_from_http(self, url: str) -> "MaigretDatabase":
|
def load_from_http(self, url: str) -> "MaigretDatabase":
|
||||||
is_url_valid = url.startswith("http://") or url.startswith("https://")
|
is_url_valid = url.startswith("http://") or url.startswith("https://")
|
||||||
|
|
||||||
|
|||||||
@@ -3146,7 +3146,7 @@ Rank data fetched from Majestic Million by domains.
|
|||||||
1.  [flarum.es (https://flarum.es)](https://flarum.es)*: top 100M, es, forum*
|
1.  [flarum.es (https://flarum.es)](https://flarum.es)*: top 100M, es, forum*
|
||||||
1.  [forum.fibra.click (https://forum.fibra.click)](https://forum.fibra.click)*: top 100M, forum, it*
|
1.  [forum.fibra.click (https://forum.fibra.click)](https://forum.fibra.click)*: top 100M, forum, it*
|
||||||
|
|
||||||
The list was updated at (2026-04-22)
|
The list was updated at (2026-04-23)
|
||||||
## Statistics
|
## Statistics
|
||||||
|
|
||||||
Enabled/total sites: 2529/3142 = 80.49%
|
Enabled/total sites: 2529/3142 = 80.49%
|
||||||
|
|||||||
@@ -51,6 +51,7 @@ DEFAULT_ARGS: Dict[str, Any] = {
|
|||||||
'md': False,
|
'md': False,
|
||||||
'no_autoupdate': False,
|
'no_autoupdate': False,
|
||||||
'force_update': False,
|
'force_update': False,
|
||||||
|
'extra_db_files': [],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -126,6 +127,38 @@ def test_args_exclude_tags(argparser):
|
|||||||
assert getattr(args, arg) == want_args[arg]
|
assert getattr(args, arg) == want_args[arg]
|
||||||
|
|
||||||
|
|
||||||
|
def test_args_single_extra_db(argparser):
    """One ``--extra-db`` occurrence is parsed into a one-element list."""
    argv = '--extra-db extras.json username'.split()
    parsed = argparser.parse_args(argv)

    expected = {
        **DEFAULT_ARGS,
        'extra_db_files': ['extras.json'],
        'username': ['username'],
    }

    for key, value in vars(parsed).items():
        assert value == expected[key]
|
||||||
|
|
||||||
|
|
||||||
|
def test_args_multiple_extra_dbs(argparser):
    """Repeated ``--extra-db`` values are collected in command-line order."""
    argv = '--extra-db a.json --extra-db https://example.com/b.json username'.split()
    parsed = argparser.parse_args(argv)

    expected = {
        **DEFAULT_ARGS,
        'extra_db_files': ['a.json', 'https://example.com/b.json'],
        'username': ['username'],
    }

    for key, value in vars(parsed).items():
        assert value == expected[key]
|
||||||
|
|
||||||
|
|
||||||
def test_args_tags_with_exclude_tags(argparser):
|
def test_args_tags_with_exclude_tags(argparser):
|
||||||
args = argparser.parse_args('--tags coding --exclude-tags porn username'.split())
|
args = argparser.parse_args('--tags coding --exclude-tags porn username'.split())
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
"""Maigret Database test functions"""
|
"""Maigret Database test functions"""
|
||||||
|
|
||||||
|
import json
|
||||||
from typing import Any, Dict
|
from typing import Any, Dict
|
||||||
|
|
||||||
from maigret.sites import MaigretDatabase, MaigretSite
|
from maigret.sites import MaigretDatabase, MaigretSite
|
||||||
@@ -96,6 +97,163 @@ def test_site_strip_engine_data_with_site_prior_updates():
|
|||||||
assert amperka_stripped.json == UPDATED_EXAMPLE_DB['sites']['Amperka']
|
assert amperka_stripped.json == UPDATED_EXAMPLE_DB['sites']['Amperka']
|
||||||
|
|
||||||
|
|
||||||
|
def _write_db(tmp_path, name, data):
|
||||||
|
p = tmp_path / name
|
||||||
|
p.write_text(json.dumps(data), encoding='utf-8')
|
||||||
|
return str(p)
|
||||||
|
|
||||||
|
|
||||||
|
def test_extra_db_new_site(tmp_path):
    """An extra DB defining an unseen site adds it alongside the existing one."""
    database = MaigretDatabase()
    database.load_from_json(EXAMPLE_DB)
    assert len(database.sites) == 1

    new_site = {
        'tags': ['us'],
        'checkType': 'status_code',
        'url': 'https://example.com/{username}',
        'urlMain': 'https://example.com/',
        'usernameClaimed': 'test',
        'usernameUnclaimed': 'noonewouldeverusethis7',
    }
    overlay = {
        'engines': {},
        'sites': {'ExampleExtra': new_site},
        'tags': ['us'],
    }
    overlay_path = _write_db(tmp_path, 'extra.json', overlay)
    database.load_extra_from_path(overlay_path)

    assert len(database.sites) == 2
    assert set(database.sites_dict.keys()) == {'Amperka', 'ExampleExtra'}
    # No duplicate entries may be left in the underlying list.
    assert len(database._sites) == len(database.sites_dict)
|
||||||
|
|
||||||
|
|
||||||
|
def test_extra_db_site_override_last_wins(tmp_path):
    """A site re-defined in the extra DB replaces the one from the main DB."""
    database = MaigretDatabase()
    database.load_from_json(EXAMPLE_DB)
    assert database.sites_dict['Amperka'].url_main == 'http://forum.amperka.ru'

    replacement = {
        'engine': 'XenForo',
        'rank': 1,
        'tags': ['overridden'],
        'urlMain': 'https://overridden.example',
        'usernameClaimed': 'adam',
        'usernameUnclaimed': 'noonewouldeverusethis7',
    }
    overlay = {
        'engines': {},
        'sites': {'Amperka': replacement},
        'tags': [],
    }
    overlay_path = _write_db(tmp_path, 'extra.json', overlay)
    database.load_extra_from_path(overlay_path)

    assert len(database.sites) == 1
    overridden = database.sites_dict['Amperka']
    assert overridden.url_main == 'https://overridden.example'
    assert 'overridden' in overridden.tags
|
||||||
|
|
||||||
|
|
||||||
|
def test_extra_db_engine_override(tmp_path):
    """An engine re-defined in the extra DB wins for sites loaded after it.

    The site that came from the main DB is expected to keep the values of
    the engine definition it was loaded with.
    """

    def engine_def(presence, absence, url):
        # Key order mirrors the fixture layout used by the other tests.
        return {
            'presenseStrs': [presence],
            'site': {
                'absenceStrs': [absence],
                'checkType': 'message',
                'url': url,
            },
        }

    def site_def(rank, url_main):
        return {
            'engine': 'Proto',
            'rank': rank,
            'tags': [],
            'urlMain': url_main,
            'usernameClaimed': 'a',
            'usernameUnclaimed': 'noonewouldeverusethis7',
        }

    base = {
        'engines': {
            'Proto': engine_def(
                'orig', 'original absence', '{urlMain}/orig/{username}'
            ),
        },
        'sites': {'MainSite': site_def(1, 'https://main.example')},
        'tags': [],
    }
    database = MaigretDatabase()
    database.load_from_json(base)

    overlay = {
        'engines': {
            'Proto': engine_def(
                'overridden', 'overridden absence', '{urlMain}/overridden/{username}'
            ),
        },
        'sites': {'ExtraSite': site_def(10, 'https://extra.example')},
        'tags': [],
    }
    overlay_path = _write_db(tmp_path, 'extra.json', overlay)
    database.load_extra_from_path(overlay_path)

    assert len(database._engines) == 1
    assert database.engines_dict['Proto'].presenseStrs == ['overridden']
    assert database.sites_dict['ExtraSite'].absence_strs == ['overridden absence']
    assert database.sites_dict['MainSite'].absence_strs == ['original absence']
|
||||||
|
|
||||||
|
|
||||||
|
def test_extra_db_tag_dedup(tmp_path):
    """A tag present in both the main and the extra DB is kept only once."""
    database = MaigretDatabase()
    base = {'engines': {}, 'sites': {}, 'tags': ['forum', 'ru']}
    database.load_from_json(base)

    overlay = {'engines': {}, 'sites': {}, 'tags': ['forum', 'us']}
    overlay_path = _write_db(tmp_path, 'extra.json', overlay)
    database.load_extra_from_path(overlay_path)

    merged_tags = database._tags
    assert merged_tags.count('forum') == 1
    assert sorted(merged_tags) == ['forum', 'ru', 'us']
|
||||||
|
|
||||||
|
|
||||||
|
def test_extra_db_chain_last_wins(tmp_path):
    """With several extras defining the same site, the last one loaded wins."""
    database = MaigretDatabase()
    database.load_from_json(EXAMPLE_DB)

    # Apply the overlays in order; each one re-defines 'Amperka'.
    for filename, url_main in (('a.json', 'https://a'), ('b.json', 'https://b')):
        overlay = {
            'engines': {},
            'sites': {
                'Amperka': {
                    'engine': 'XenForo',
                    'rank': 1,
                    'tags': ['ru'],
                    'urlMain': url_main,
                    'usernameClaimed': 'adam',
                    'usernameUnclaimed': 'noonewouldeverusethis7',
                }
            },
            'tags': [],
        }
        database.load_extra_from_path(_write_db(tmp_path, filename, overlay))

    assert len(database.sites) == 1
    assert database.sites_dict['Amperka'].url_main == 'https://b'
|
||||||
|
|
||||||
|
|
||||||
def test_saving_site_error():
|
def test_saving_site_error():
|
||||||
db = MaigretDatabase()
|
db = MaigretDatabase()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user