From 073c20338b27cbb785cc40c1b82a3a5e3accac57 Mon Sep 17 00:00:00 2001 From: Soxoj Date: Sat, 16 May 2026 21:48:43 +0200 Subject: [PATCH] fix(checking): block URL-incompatible usernames before request --- CONTRIBUTING.md | 7 ++++ docs/source/development.rst | 41 ++++++++++++++++++- maigret/checking.py | 45 +++++++++++++++++++++ maigret/resources/db_meta.json | 2 +- tests/test_checking.py | 74 ++++++++++++++++++++++++++++++++++ 5 files changed, 167 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c44d0b9..e9dd1df 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -95,6 +95,13 @@ Each site entry uses one of three `checkType` modes to decide whether a profile **Errors vs absence.** Anything that means "the server can't answer right now" — rate limits, captchas, "Checking your browser", "unusual traffic", maintenance pages — belongs in `errors` (mapping the substring to a human-readable error string), not in `absenceStrs`. The `errors` mechanism produces an UNKNOWN result instead of a false CLAIMED or false AVAILABLE. +**`regexCheck` and non-ASCII usernames.** When `{username}` is interpolated into a URL **path segment** and the username contains characters that need percent-encoding (Cyrillic, Chinese, Korean, spaces, etc.), Maigret skips the site with a `URL-incompatible username` error rather than send a request that would land on a generic listing/homepage and trip overly-broad `presenseStrs`. This default avoids the cascade of false-positives observed in [#459](https://github.com/soxoj/maigret/issues/459) and [#2633](https://github.com/soxoj/maigret/issues/2633). Two corollaries for site entries: + +- If your site legitimately accepts non-ASCII characters in the URL path (a wiki that mounts Unicode usernames, a Russian forum that serves Cyrillic slugs, etc.), declare the actual format with an explicit `regexCheck`. 
For example, a MediaWiki-style wiki could use `"regexCheck": "^[^\\/\\\\#<>\\[\\]\\|{}]+$"`; a Japanese blog platform might use `"regexCheck": "^[\\w\\-_\\.]+$"` (Python's `\w` matches Unicode letters). Don't paper this over with `regexCheck: "."` — pick a regex that reflects what the site actually accepts. +- If `{username}` is in a query string (`?name={username}`) or only in `requestPayload`, the default has no effect — query/body values are URL-encoded as parameters and most APIs handle that fine. + +The default kicks in *only* when no per-site `regexCheck` is set. Existing per-site regexes always win. + Full reference for `checkType`, `urlProbe`, `engine`, and the rest of the `data.json` schema is in the [development guide](docs/source/development.rst), section *How to fix false-positives*. ### Editing `data.json` safely diff --git a/docs/source/development.rst b/docs/source/development.rst index da4c031..b863cf6 100644 --- a/docs/source/development.rst +++ b/docs/source/development.rst @@ -134,11 +134,50 @@ There are few options for sites data.json helpful in various cases: - ``engine`` - a predefined check for the sites of certain type (e.g. forums), see the ``engines`` section in the JSON file - ``headers`` - a dictionary of additional headers to be sent to the site - ``requestHeadOnly`` - set to ``true`` if it's enough to make a HEAD request to the site -- ``regexCheck`` - a regex to check if the username is valid, in case of frequent false-positives +- ``regexCheck`` - a regex to check if the username is valid, in case of frequent false-positives (see ``regexCheck`` and the non-ASCII default below) - ``requestMethod`` - set the HTTP method to use (e.g., ``POST``). By default, Maigret natively defaults to GET or HEAD. - ``requestPayload`` - a dictionary with the JSON payload to send for POST requests (e.g., ``{"username": "{username}"}``), extremely useful for parsing GraphQL or modern JSON APIs. 
- ``protection`` - a list of protection types detected on the site (see below). +``regexCheck`` and non-ASCII usernames +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When ``{username}`` is interpolated into a URL **path segment** and the user-supplied username contains characters that would be percent-encoded by :py:func:`urllib.parse.quote` (Cyrillic, Chinese, Korean, Arabic, spaces, etc.), Maigret skips the site with a ``URL-incompatible username`` error rather than send a request that would land on a generic listing/homepage and trip overly-broad ``presenseStrs``. This default closes the cascade of false-positives observed in `issue #459 <https://github.com/soxoj/maigret/issues/459>`_ and `issue #2633 <https://github.com/soxoj/maigret/issues/2633>`_. + +Scope of the default: + +- Active **only** when ``{username}`` is in the URL path of ``url`` (or ``urlProbe`` if set), e.g. ``https://example.com/u/{username}``. +- **Not** active when ``{username}`` is in the query string (``?name={username}``) or only in ``requestPayload`` — those values are URL-encoded as parameters and most APIs handle them fine. +- **Always** deferred when the site has its own ``regexCheck`` — an explicit per-site rule wins. + +Opting a site into broader matching: + +If a site genuinely accepts non-ASCII characters in the URL path (a wiki that mounts Unicode usernames, a Russian forum that serves Cyrillic slugs, etc.), declare the actual accepted format with an explicit ``regexCheck`` that matches your reality. A few worked examples: + +- A MediaWiki-style wiki that allows any character except the MediaWiki-forbidden punctuation: + + .. code-block:: json + + { + "url": "https://wiki.example/wiki/User:{username}", + "regexCheck": "^[^\\/\\\\#<>\\[\\]\\|{}]+$" + } + +- A Japanese blog platform that allows Unicode word characters + dash + dot: + + .. 
code-block:: json + + { + "url": "https://blog.example/{username}", + "regexCheck": "^[\\w\\-_\\.]+$" + } + + In Python's regex engine, ``\w`` against a ``str`` pattern matches Unicode letters by default, so Hiragana / Hangul / Cyrillic / etc. all pass. + +**Do not** paper this over with ``"regexCheck": "."`` — that's a placeholder, not a description of what the site accepts; it will let any string through, including URLs and emails that other parts of Maigret may pick up and feed back into recursive search (see ``parse_usernames`` in ``checking.py``). + +The complementary direction also matters: if you notice an existing site with a too-permissive ``regexCheck`` (e.g. ``"^[^\\.]+$"``, which means "anything but a dot" — that gladly lets non-ASCII through), tighten it to the actual accepted character class for the site (typically ``"^[A-Za-z0-9_-]+$"`` for ASCII slugs) when fixing related false-positives. + ``protection`` (site protection tracking) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/maigret/checking.py b/maigret/checking.py index 1088349..990ce3e 100644 --- a/maigret/checking.py +++ b/maigret/checking.py @@ -49,6 +49,34 @@ SUPPORTED_IDS = ( BAD_CHARS = "#" +def _username_fits_url_template(site: MaigretSite, username: str) -> bool: + """Decide whether a username can be safely substituted into a site's URL + path without producing a percent-encoded slug that the site cannot match. + + Rationale: most sites that interpolate ``{username}`` into a URL path + segment treat the slug as an ASCII identifier. When a username contains + non-ASCII characters (or other reserved characters), ``urllib.parse.quote`` + percent-encodes the bytes; the site typically cannot resolve such a slug + and falls back to a generic listing/homepage that trips overly-broad + ``presenseStrs`` markers, producing a false CLAIMED. See issues #459 and + #2633. Sites that genuinely accept broader character sets (e.g. 
wikis + that allow Unicode usernames) opt into permissive matching by setting + their own ``regexCheck``; in that case this helper is bypassed entirely. + + Returns True when the check should proceed, False when the result is + inherently unreliable and the site should be skipped (ILLEGAL). + """ + if site.regex_check: + return True + template = site.url_probe or site.url or "" + if "{username}" not in template: + return True + path_part, _sep, _query = template.partition("?") + if "{username}" not in path_part: + return True + return quote(username, safe='') == username + + def build_cloudflare_bypass_config( settings_obj: Optional[Any], force_enable: bool = False ) -> Optional[Dict[str, Any]]: @@ -880,6 +908,23 @@ def make_site_result( results_site["http_status"] = "" results_site["response_text"] = "" # query_notify.update(results_site["status"]) + # username would be percent-encoded into a path segment — see #459/#2633. + elif not _username_fits_url_template(site, username): + results_site["status"] = MaigretCheckResult( + username, + site.name, + url, + MaigretCheckStatus.ILLEGAL, + error=CheckError( + 'URL-incompatible username', + 'username contains characters that would be percent-encoded ' + 'in this site\'s URL path; result would be unreliable. Add a ' + '`regexCheck` to opt this site in if it accepts these chars.' 
+ ), + ) + results_site["url_user"] = "" + results_site["http_status"] = "" + results_site["response_text"] = "" else: # URL of user on site (if it exists) results_site["url_user"] = url diff --git a/maigret/resources/db_meta.json b/maigret/resources/db_meta.json index a5335c6..c5202f5 100644 --- a/maigret/resources/db_meta.json +++ b/maigret/resources/db_meta.json @@ -1,6 +1,6 @@ { "version": 1, - "updated_at": "2026-05-16T16:00:20Z", + "updated_at": "2026-05-16T19:48:44Z", "sites_count": 3155, "min_maigret_version": "0.6.1", "data_sha256": "0997b68c05eedb6e714432ed79580688d4923c56ef1ebf46db69b90039ef00d7", diff --git a/tests/test_checking.py b/tests/test_checking.py index 826f504..afdbe2f 100644 --- a/tests/test_checking.py +++ b/tests/test_checking.py @@ -13,6 +13,7 @@ from maigret.checking import ( timeout_check, debug_response_logging, process_site_result, + _username_fits_url_template, ) from maigret.errors import CheckError from maigret.result import MaigretCheckResult, MaigretCheckStatus @@ -144,6 +145,79 @@ def test_detect_error_page_instagram_login_wall(): assert "rate-limited" in err.desc +def _site_for_url(url_pattern, regex_check=None, url_probe=None): + """Build a minimal MaigretSite stub for the URL-template helper tests.""" + raw = { + "url": url_pattern, + "urlMain": "https://example.com/", + "checkType": "message", + "usernameClaimed": "alice", + "usernameUnclaimed": "noone", + } + if regex_check is not None: + raw["regexCheck"] = regex_check + if url_probe is not None: + raw["urlProbe"] = url_probe + return MaigretSite("Example", raw) + + +# Regression tests for #459 / #2633 — usernames that would be percent-encoded +# into a URL path segment trip generic presence markers on fallback pages. 
+def test_username_fits_path_segment_ascii_slug_passes(): + site = _site_for_url("https://example.com/u/{username}") + assert _username_fits_url_template(site, "alice") is True + assert _username_fits_url_template(site, "alice-bob") is True + assert _username_fits_url_template(site, "alice.bob_42") is True + + +def test_username_fits_path_segment_non_ascii_blocked(): + site = _site_for_url("https://example.com/u/{username}") + # Cyrillic + assert _username_fits_url_template(site, "Александр") is False + # Chinese + assert _username_fits_url_template(site, "快嘴摩卡酱") is False + # Korean + assert _username_fits_url_template(site, "홍길동") is False + # Space (also percent-encoded) + assert _username_fits_url_template(site, "alice bob") is False + + +def test_username_fits_query_string_is_unconstrained(): + """If {username} sits in the query string, the value is URL-encoded as a + parameter and most APIs handle that fine — don't block.""" + site = _site_for_url("https://example.com/api/users?name={username}") + assert _username_fits_url_template(site, "快嘴摩卡酱") is True + assert _username_fits_url_template(site, "Александр") is True + + +def test_username_fits_explicit_regex_check_bypasses_helper(): + """When the site declares its own regexCheck, the helper defers entirely.""" + # Permissive site: accepts anything via Unicode-friendly regex. 
+ site = _site_for_url( + "https://wiki.example/User:{username}", regex_check=r"^[\w\- .]+$" + ) + assert _username_fits_url_template(site, "Александр") is True + assert _username_fits_url_template(site, "快嘴摩卡酱") is True + + +def test_username_fits_url_probe_overrides_url(): + """urlProbe is the actual request URL; the helper must use it when set.""" + # Path-segment url, but urlProbe is a clean query API → no validation + site = _site_for_url( + "https://example.com/u/{username}", + url_probe="https://example.com/api/u?name={username}", + ) + assert _username_fits_url_template(site, "快嘴摩卡酱") is True + + +def test_username_fits_post_payload_sites_skipped(): + """Sites with {username} only in requestPayload (no {username} in URL + template at all) should pass unconditionally — payload is JSON-encoded, + not URL-path-encoded.""" + site = _site_for_url("https://api.example.com/check") + assert _username_fits_url_template(site, "快嘴摩卡酱") is True + + def test_detect_error_page_instagram_marker_no_false_positive_on_profile(): """The login-wall marker must NOT match a real profile page. On a claimed user page, `routePath` carries the user-route template