From 2e430e5039a8fdd1653f2a7fba57b52470c9f1e6 Mon Sep 17 00:00:00 2001 From: Copilot <198982749+Copilot@users.noreply.github.com> Date: Tue, 24 Mar 2026 22:00:59 +0100 Subject: [PATCH] feat: add tag blacklisting via `--exclude-tags` (#2352) * Initial plan * feat: add tag blacklisting support (--exclude-tags CLI flag, web UI, docs, tests) Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com> Agent-Logs-Url: https://github.com/soxoj/maigret/sessions/1a656af2-36bf-494f-9f03-1b5340f0357c * fix: correct tag cloud label to match click-cycle interaction Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com> Agent-Logs-Url: https://github.com/soxoj/maigret/sessions/1a656af2-36bf-494f-9f03-1b5340f0357c * feat: add all country tags to web interface tag cloud Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com> Agent-Logs-Url: https://github.com/soxoj/maigret/sessions/7e184b24-ff26-48fd-8a93-aea12b0a8d7b --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com> --- docs/source/command-line-options.rst | 6 ++ docs/source/tags.rst | 16 +++ maigret/maigret.py | 12 +++ maigret/sites.py | 23 +++- maigret/web/app.py | 10 +- maigret/web/templates/index.html | 153 +++++++++++++++++++++++++-- tests/test_cli.py | 32 ++++++ tests/test_sites.py | 48 +++++++++ 8 files changed, 288 insertions(+), 12 deletions(-) diff --git a/docs/source/command-line-options.rst b/docs/source/command-line-options.rst index 3932910..475e5d5 100644 --- a/docs/source/command-line-options.rst +++ b/docs/source/command-line-options.rst @@ -31,6 +31,12 @@ two-letter country codes (**not a language!**). E.g. photo, dating, sport; jp, u Multiple tags can be associated with one site. **Warning**: tags markup is not stable now. Read more :doc:`in the separate section <tags>`. +``--exclude-tags`` - Exclude sites with specific tags from the search +(blacklist). E.g. 
``--exclude-tags porn,dating`` will skip all sites +tagged with ``porn`` or ``dating``. Can be combined with ``--tags`` to +include certain categories while excluding others. Read more +:doc:`in the separate section <tags>`. + ``-n``, ``--max-connections`` - Allowed number of concurrent connections **(default: 100)**. diff --git a/docs/source/tags.rst b/docs/source/tags.rst index 6fba528..4e0efa0 100644 --- a/docs/source/tags.rst +++ b/docs/source/tags.rst @@ -23,3 +23,19 @@ Usage ``--tags coding`` -- search on sites related to software development. ``--tags ucoz`` -- search on uCoz sites only (mostly CIS countries) + +Blacklisting (excluding) tags +------------------------------ +You can exclude sites with certain tags from the search using ``--exclude-tags``: + +``--exclude-tags porn,dating`` -- skip all sites tagged with ``porn`` or ``dating``. + +``--exclude-tags ru`` -- skip all Russian sites. + +You can combine ``--tags`` and ``--exclude-tags`` to fine-tune your search: + +``--tags forum --exclude-tags ru`` -- search on forum sites, but skip Russian ones. + +In the web interface, the tag cloud supports three states per tag: +click once to **include** (green), click again to **exclude** (dark/strikethrough), +and click once more to return to **neutral** (red). diff --git a/maigret/maigret.py b/maigret/maigret.py index eede2cf..8226dfa 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -277,6 +277,12 @@ def setup_arguments_parser(settings: Settings): filter_group.add_argument( "--tags", dest="tags", default='', help="Specify tags of sites (see `--stats`)." 
) + filter_group.add_argument( + "--exclude-tags", + dest="exclude_tags", + default='', + help="Specify tags to exclude from search (blacklist).", + ) filter_group.add_argument( "--site", action="append", @@ -532,6 +538,11 @@ async def main(): if args.tags: args.tags = list(set(str(args.tags).split(','))) + if args.exclude_tags: + args.exclude_tags = list(set(str(args.exclude_tags).split(','))) + else: + args.exclude_tags = [] + db_file = args.db_file \ if (args.db_file.startswith("http://") or args.db_file.startswith("https://")) \ else path.join(path.dirname(path.realpath(__file__)), args.db_file) @@ -553,6 +564,7 @@ async def main(): get_top_sites_for_id = lambda x: db.ranked_sites_dict( top=args.top_sites, tags=args.tags, + excluded_tags=args.exclude_tags, names=args.site_list, disabled=args.use_disabled_sites, id_type=x, diff --git a/maigret/sites.py b/maigret/sites.py index db6bf76..1784828 100644 --- a/maigret/sites.py +++ b/maigret/sites.py @@ -318,6 +318,7 @@ class MaigretDatabase: reverse=False, top=sys.maxsize, tags=[], + excluded_tags=[], names=[], disabled=True, id_type="username", @@ -336,7 +337,8 @@ class MaigretDatabase: Args: reverse (bool, optional): Reverse the sorting order. Defaults to False. top (int, optional): Maximum number of sites to return. Defaults to sys.maxsize. - tags (list, optional): List of tags to filter sites by. Defaults to empty list. + tags (list, optional): List of tags to filter sites by (whitelist). Defaults to empty list. + excluded_tags (list, optional): List of tags to exclude sites by (blacklist). Defaults to empty list. names (list, optional): List of site names (or urls, see MaigretSite.__eq__) to filter by. Defaults to empty list. disabled (bool, optional): Whether to include disabled sites. Defaults to True. id_type (str, optional): Type of identifier to filter by. Defaults to "username". 
@@ -347,6 +349,7 @@ class MaigretDatabase: """ normalized_names = list(map(str.lower, names)) normalized_tags = list(map(str.lower, tags)) + normalized_excluded_tags = list(map(str.lower, excluded_tags)) is_name_ok = lambda x: x.name.lower() in normalized_names is_source_ok = lambda x: x.source and x.source.lower() in normalized_names @@ -360,6 +363,22 @@ class MaigretDatabase: ) is_id_type_ok = lambda x: x.type == id_type + is_excluded_by_tag = lambda x: set( + map(str.lower, x.tags) + ).intersection(set(normalized_excluded_tags)) + is_excluded_by_engine = lambda x: ( + isinstance(x.engine, str) + and x.engine.lower() in normalized_excluded_tags + ) + is_excluded_by_protocol = lambda x: ( + x.protocol and x.protocol in normalized_excluded_tags + ) + is_not_excluded = lambda x: not excluded_tags or not ( + is_excluded_by_tag(x) + or is_excluded_by_engine(x) + or is_excluded_by_protocol(x) + ) + filter_tags_engines_fun = ( lambda x: not tags or is_engine_ok(x) @@ -370,6 +389,7 @@ class MaigretDatabase: filter_fun = ( lambda x: filter_tags_engines_fun(x) + and is_not_excluded(x) and filter_names_fun(x) and is_disabled_needed(x) and is_id_type_ok(x) @@ -387,6 +407,7 @@ class MaigretDatabase: if top < sys.maxsize and sorted_list: filter_fun_ranking_parents = ( lambda x: filter_tags_engines_fun(x) + and is_not_excluded(x) and filter_names_fun(x) and is_id_type_ok(x) ) diff --git a/maigret/web/app.py b/maigret/web/app.py index d5eabc3..a1a8857 100644 --- a/maigret/web/app.py +++ b/maigret/web/app.py @@ -49,12 +49,14 @@ async def maigret_search(username, options): top_sites = 999999999 # effectively all tags = options.get('tags', []) + excluded_tags = options.get('excluded_tags', []) site_list = options.get('site_list', []) - logger.info(f"Filtering sites by tags: {tags}") + logger.info(f"Filtering sites by tags: {tags}, excluded: {excluded_tags}") sites = db.ranked_sites_dict( top=top_sites, tags=tags, + excluded_tags=excluded_tags, names=site_list, disabled=False, 
id_type='username', @@ -225,7 +227,8 @@ def search(): # Get selected tags - ensure it's a list selected_tags = request.form.getlist('tags') - logging.info(f"Selected tags: {selected_tags}") + excluded_tags = request.form.getlist('excluded_tags') + logging.info(f"Selected tags: {selected_tags}, Excluded tags: {excluded_tags}") options = { 'top_sites': request.form.get('top_sites') or '500', @@ -240,13 +243,14 @@ def search(): 'i2p_proxy': request.form.get('i2p_proxy', None) or None, 'permute': 'permute' in request.form, 'tags': selected_tags, # Pass selected tags as a list + 'excluded_tags': excluded_tags, # Pass excluded tags as a list 'site_list': [ s.strip() for s in request.form.get('site', '').split(',') if s.strip() ], } logging.info( - f"Starting search for usernames: {usernames} with tags: {selected_tags}" + f"Starting search for usernames: {usernames} with tags: {selected_tags}, excluded: {excluded_tags}" ) # Start background job diff --git a/maigret/web/templates/index.html b/maigret/web/templates/index.html index a7c3287..cd2d28e 100644 --- a/maigret/web/templates/index.html +++ b/maigret/web/templates/index.html @@ -28,6 +28,11 @@ background-color: #28a745; } + .tag.excluded { + background-color: #343a40; + text-decoration: line-through; + } + .tag:hover { transform: translateY(-2px); box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2); @@ -168,7 +173,16 @@
- + +
+ + Included (whitelist) +    + Excluded (blacklist) +    + Neutral + +
+
@@ -292,26 +389,66 @@ } document.addEventListener('DOMContentLoaded', function () { - // Tag cloud functionality + // Tag cloud functionality with include/exclude (whitelist/blacklist) support const tagCloud = document.getElementById('tagCloud'); const hiddenSelect = document.getElementById('tags'); + const excludedSelect = document.getElementById('excludedTags'); const allTags = Array.from(hiddenSelect.options).map(opt => ({ value: opt.value, - label: opt.text + label: opt.text, + group: opt.dataset.group || 'category' })); + function updateTagSelects() { + // Clear and repopulate hidden selects based on tag states + Array.from(hiddenSelect.options).forEach(opt => opt.selected = false); + // Clear excluded select + excludedSelect.innerHTML = ''; + + document.querySelectorAll('#tagCloud .tag').forEach(tagEl => { + const val = tagEl.dataset.value; + if (tagEl.classList.contains('selected')) { + const option = Array.from(hiddenSelect.options).find(opt => opt.value === val); + if (option) option.selected = true; + } else if (tagEl.classList.contains('excluded')) { + const opt = document.createElement('option'); + opt.value = val; + opt.selected = true; + excludedSelect.appendChild(opt); + } + }); + } + + let lastGroup = ''; allTags.forEach(tag => { + if (tag.group !== lastGroup && tag.group === 'country') { + const separator = document.createElement('div'); + separator.style.cssText = 'width:100%;margin:8px 0 4px;padding:4px 0;border-top:1px solid rgba(0,0,0,0.15);font-size:13px;color:#666;'; + separator.textContent = 'Countries'; + tagCloud.appendChild(separator); + } + lastGroup = tag.group; + const tagElement = document.createElement('span'); tagElement.className = 'tag'; tagElement.textContent = tag.label; tagElement.dataset.value = tag.value; - tagElement.addEventListener('click', function () { - const isSelected = this.classList.toggle('selected'); - const option = Array.from(hiddenSelect.options).find(opt => opt.value === tag.value); - if (option) { - 
option.selected = isSelected; + // Single click cycles: neutral -> included -> excluded -> neutral + tagElement.addEventListener('click', function (e) { + e.preventDefault(); + if (this.classList.contains('selected')) { + // included -> excluded + this.classList.remove('selected'); + this.classList.add('excluded'); + } else if (this.classList.contains('excluded')) { + // excluded -> neutral + this.classList.remove('excluded'); + } else { + // neutral -> included + this.classList.add('selected'); } + updateTagSelects(); }); tagCloud.appendChild(tagElement); diff --git a/tests/test_cli.py b/tests/test_cli.py index 07f1d1c..1c69525 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -36,6 +36,7 @@ DEFAULT_ARGS: Dict[str, Any] = { 'site_list': [], 'stats': False, 'tags': '', + 'exclude_tags': '', 'timeout': 30, 'tor_proxy': 'socks5://127.0.0.1:9050', 'i2p_proxy': 'http://127.0.0.1:4444', @@ -105,3 +106,34 @@ def test_args_multiple_sites(argparser): for arg in vars(args): assert getattr(args, arg) == want_args[arg] + + +def test_args_exclude_tags(argparser): + args = argparser.parse_args('--exclude-tags porn,dating username'.split()) + + want_args = dict(DEFAULT_ARGS) + want_args.update( + { + 'exclude_tags': 'porn,dating', + 'username': ['username'], + } + ) + + for arg in vars(args): + assert getattr(args, arg) == want_args[arg] + + +def test_args_tags_with_exclude_tags(argparser): + args = argparser.parse_args('--tags coding --exclude-tags porn username'.split()) + + want_args = dict(DEFAULT_ARGS) + want_args.update( + { + 'tags': 'coding', + 'exclude_tags': 'porn', + 'username': ['username'], + } + ) + + for arg in vars(args): + assert getattr(args, arg) == want_args[arg] diff --git a/tests/test_sites.py b/tests/test_sites.py index dcb8b44..6cbf680 100644 --- a/tests/test_sites.py +++ b/tests/test_sites.py @@ -182,6 +182,54 @@ def test_ranked_sites_dict_id_type(): assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1 +def 
test_ranked_sites_dict_excluded_tags(): + db = MaigretDatabase() + db.update_site(MaigretSite('3', {'alexaRank': 1000, 'engine': 'ucoz'})) + db.update_site(MaigretSite('1', {'alexaRank': 2, 'tags': ['forum']})) + db.update_site(MaigretSite('2', {'alexaRank': 10, 'tags': ['ru', 'forum']})) + + # excluding by tag + assert list(db.ranked_sites_dict(excluded_tags=['ru']).keys()) == ['1', '3'] + assert list(db.ranked_sites_dict(excluded_tags=['forum']).keys()) == ['3'] + + # excluding by engine + assert list(db.ranked_sites_dict(excluded_tags=['ucoz']).keys()) == ['1', '2'] + + # combining include and exclude tags + assert list(db.ranked_sites_dict(tags=['forum'], excluded_tags=['ru']).keys()) == ['1'] + + # excluding non-existent tag has no effect + assert list(db.ranked_sites_dict(excluded_tags=['nonexistent']).keys()) == ['1', '2', '3'] + + # exclude all + assert list(db.ranked_sites_dict(excluded_tags=['forum', 'ucoz']).keys()) == [] + + +def test_ranked_sites_dict_excluded_tags_with_top(): + """Excluded tags should also prevent mirrors from being included.""" + db = MaigretDatabase() + db.update_site( + MaigretSite('Parent', {'alexaRank': 1, 'tags': ['forum'], 'type': 'username'}) + ) + db.update_site( + MaigretSite('Mirror', {'alexaRank': 999999, 'source': 'Parent', 'tags': ['forum'], 'type': 'username'}) + ) + db.update_site( + MaigretSite('Other', {'alexaRank': 2, 'tags': ['coding'], 'type': 'username'}) + ) + + # Without exclusion, mirror should be included + result = db.ranked_sites_dict(top=1, id_type='username') + assert 'Parent' in result + assert 'Mirror' in result + + # With exclusion of 'forum', both Parent and Mirror should be excluded + result = db.ranked_sites_dict(top=2, excluded_tags=['forum'], id_type='username') + assert 'Parent' not in result + assert 'Mirror' not in result + assert 'Other' in result + + def test_ranked_sites_dict_mirrors_disabled_parent(): """Mirror is included when parent ranks in top N but parent is disabled.""" db = 
MaigretDatabase()