From 227a25bfa1228760491c3d615fd9452fbf8ef803 Mon Sep 17 00:00:00 2001
From: Soxoj <31013580+soxoj@users.noreply.github.com>
Date: Sun, 22 Mar 2026 01:14:17 +0100
Subject: [PATCH] Twitter fixed, mirrors mechanism improvement (#2299)

---
 docs/source/command-line-options.rst | 12 ++++++++
 docs/source/development.rst          |  2 ++
 maigret/checking.py                  | 32 +++++++++++++++++++++
 maigret/errors.py                    |  3 ++
 maigret/resources/data.json          | 34 +++++++++++++++++-----
 maigret/sites.py                     | 37 +++++++++++++++++++++++-
 tests/test_sites.py                  | 43 ++++++++++++++++++++++++++++
 7 files changed, 155 insertions(+), 8 deletions(-)
diff --git a/docs/source/command-line-options.rst b/docs/source/command-line-options.rst
index 9d3b8ec..3932910 100644
--- a/docs/source/command-line-options.rst
+++ b/docs/source/command-line-options.rst
@@ -39,6 +39,18 @@ not stable now. Read more :doc:`in the separate section <tags>`.
 ``--top-sites`` - Count of sites for scan ranked by Alexa Top
 **(default: top 500)**.
 
+**Mirrors:** After the top *N* sites by Alexa rank are chosen (respecting
+``--tags``, ``--use-disabled-sites``, etc.), Maigret may add extra sites
+whose ``source`` field names a **parent platform** that itself falls in the
+Alexa top *N* of a ranking that **includes disabled** sites. For example,
+if ``Twitter`` ranks in the first 500 by Alexa, a mirror such as ``memory.lol``
+(with ``source: Twitter``) is included even though it has no rank and would
+otherwise be cut off. The same applies to Instagram-related mirrors (e.g.
+Picuki) when ``Instagram`` is in that parent top *N*: even if the official
+``Instagram`` entry is disabled and not scanned by default, its mirrors can
+still be pulled in. The final list is the ranked top *N* plus these mirrors,
+so it may contain more than *N* sites (the mirror count has no fixed cap).
+
 ``--timeout`` - Time (in seconds) to wait for responses from sites
 **(default: 30)**. A longer timeout will be more likely to get results
 from slow sites. On the other hand, this may cause a long delay to
diff --git a/docs/source/development.rst b/docs/source/development.rst
index 20ce4ff..70fb26d 100644
--- a/docs/source/development.rst
+++ b/docs/source/development.rst
@@ -24,6 +24,8 @@ The supported methods (``checkType`` values in ``data.json``) are:
 
 See the details of check mechanisms in the `checking.py <https://github.com/soxoj/maigret/blob/main/maigret/checking.py#L339>`_ file.
 
+**Mirrors and ``--top-sites``:** When you limit scans with ``--top-sites N``, Maigret also includes *mirror* sites (entries whose ``source`` field points at a parent platform such as Twitter or Instagram) if that parent would appear in the Alexa top *N* when disabled sites are considered for ranking. See the **Mirrors** paragraph under ``--top-sites`` in :doc:`command-line-options`.
+
 Testing
 -------
 
diff --git a/maigret/checking.py b/maigret/checking.py
index b33dc42..97987fd 100644
--- a/maigret/checking.py
+++ b/maigret/checking.py
@@ -547,6 +547,38 @@ async def check_site_for_username(
         return site.name, default_result
 
     response = await checker.check()
+    html_text = response[0] if response and response[0] else ""
+
+    # Retry once after token-style activation (e.g. Twitter guest token refresh).
+    act = site.activation
+    if act and html_text:
+        marks = act.get("marks") or []
+        if marks and any(m in html_text for m in marks):
+            method = act["method"]
+            try:
+                activate_fun = getattr(ParsingActivator(), method)
+                activate_fun(site, logger)
+            except AttributeError as e:
+                logger.warning(
+                    f"Activation method {method} for site {site.name} not found!",
+                    exc_info=True,
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Failed activation {method} for site {site.name}: {str(e)}",
+                    exc_info=True,
+                )
+            else:
+                merged = dict(checker.headers or {})
+                merged.update(site.headers)
+                checker.prepare(
+                    url=checker.url,
+                    headers=merged,
+                    allow_redirects=checker.allow_redirects,
+                    timeout=checker.timeout,
+                    method=checker.method,
+                )
+                response = await checker.check()
 
     response_result = process_site_result(
         response, query_notify, logger, default_result, site
diff --git a/maigret/errors.py b/maigret/errors.py
index 573511e..3b79a6c 100644
--- a/maigret/errors.py
+++ b/maigret/errors.py
@@ -32,6 +32,9 @@ COMMON_ERRORS = {
     '<title>Attention Required! | Cloudflare</title>': CheckError(
         'Captcha', 'Cloudflare'
     ),
+    '<title>Just a moment</title>': CheckError(
+        'Bot protection', 'Cloudflare challenge page'
+    ),
     'Please stand by, while we are checking your browser': CheckError(
         'Bot protection', 'Cloudflare'
     ),
diff --git a/maigret/resources/data.json b/maigret/resources/data.json
index 7fd97bf..1235fed 100644
--- a/maigret/resources/data.json
+++ b/maigret/resources/data.json
@@ -740,8 +740,11 @@
             ],
             "regexCheck": "^[^\\.]+$",
             "checkType": "message",
+            "presenseStrs": [
+                "<meta name=\"description\" content=\""
+            ],
             "absenceStrs": [
-                "<title>404: This page could not be found</title>"
+                "id=\"__next_error__\""
             ],
             "alexaRank": 6524,
             "urlMain": "https://alternativeto.net/",
@@ -12351,6 +12354,10 @@
                 "photo"
             ],
             "checkType": "message",
+            "presenseStrs": [
+                "profile-avatar",
+                "profile-user-info"
+            ],
             "absenceStrs": [
                 "<title>Error 500</title>",
                 "<title>Error 404</title>",
@@ -13662,11 +13669,17 @@
                 "porn",
                 "us"
             ],
-            "checkType": "status_code",
+            "checkType": "message",
+            "presenseStrs": [
+                "Newest Porn Videos | Redtube"
+            ],
+            "absenceStrs": [
+                "Page Not Found"
+            ],
             "alexaRank": 1090,
-            "urlMain": "https://ru.redtube.com/",
-            "url": "https://ru.redtube.com/users/{username}",
-            "usernameClaimed": "adam",
+            "urlMain": "https://www.redtube.com/",
+            "url": "https://www.redtube.com/amateur/{username}",
+            "usernameClaimed": "dollyjeey",
             "usernameUnclaimed": "noonewouldeverusethis7"
         },
         "Reibert": {
@@ -17049,7 +17062,7 @@
                 "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
                 "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
                 "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
-                "x-guest-token": "1411741418192883712"
+                "x-guest-token": "2035504183667757370"
             },
             "errors": {
                 "Bad guest token": "x-guest-token update required"
@@ -17058,6 +17071,7 @@
             "activation": {
                 "method": "twitter",
                 "marks": [
+                    "Bad guest token",
                     "Bad guest token."
                 ],
                 "url": "https://api.twitter.com/1.1/guest/activate.json",
@@ -17066,6 +17080,9 @@
             },
             "urlProbe": "https://twitter.com/i/api/graphql/ZRnOhhXPwue_JGILb9TNug/UserByScreenName?variables=%7B%22screen_name%22%3A%22{username}%22%2C%22withHighlightedLabel%22%3Atrue%7D",
             "checkType": "message",
+            "presenseStrs": [
+                "\"legacy\""
+            ],
             "absenceStrs": [
                 " not found"
             ],
@@ -18026,8 +18043,11 @@
                 "us"
             ],
             "checkType": "message",
+            "presenseStrs": [
+                "Explore Cannabis Brands"
+            ],
             "absenceStrs": [
-                "<title>Find Marijuana Dispensaries, Brands"
+                "<title data-next-head=\"\">Find Marijuana Dispensaries, Brands, Delivery, Deals"
             ],
             "alexaRank": 7929,
             "urlMain": "https://weedmaps.com",
diff --git a/maigret/sites.py b/maigret/sites.py
index 4220c2f..db6bf76 100644
--- a/maigret/sites.py
+++ b/maigret/sites.py
@@ -325,6 +325,14 @@ class MaigretDatabase:
         """
         Ranking and filtering of the sites list
 
+        When ``top`` is limited (not "all sites"), **mirrors** may be appended after
+        the Alexa-ranked slice. A mirror is any filtered site with a non-empty
+        ``source`` field equal to the name of a site that appears in the first
+        ``top`` positions of a **parent ranking** that includes disabled sites.
+        Thus mirrors such as third-party viewers (e.g. for Twitter or Instagram)
+        are still scanned when their parent platform ranks highly, even if the
+        official site is disabled and omitted from the main list.
+
         Args:
             reverse (bool, optional): Reverse the sorting order. Defaults to False.
             top (int, optional): Maximum number of sites to return. Defaults to sys.maxsize.
@@ -334,7 +342,8 @@ class MaigretDatabase:
             id_type (str, optional): Type of identifier to filter by. Defaults to "username".
 
         Returns:
-            dict: Dictionary of filtered and ranked sites, with site names as keys and MaigretSite objects as values
+            dict: Dictionary of filtered and ranked sites (base top slice plus mirrors),
+            with site names as keys and MaigretSite objects as values
         """
         normalized_names = list(map(str.lower, names))
         normalized_tags = list(map(str.lower, tags))
@@ -371,6 +380,32 @@ class MaigretDatabase:
         sorted_list = sorted(
             filtered_list, key=lambda x: x.alexa_rank, reverse=reverse
         )[:top]
+
+        # Mirrors: sites whose `source` matches a parent platform that ranks in the
+        # top `top` by Alexa when disabled entries are included in the ranking pool
+        # (so e.g. Instagram can be a parent for Picuki even if Instagram is disabled).
+        if top < sys.maxsize and sorted_list:
+            filter_fun_ranking_parents = (
+                lambda x: filter_tags_engines_fun(x)
+                and filter_names_fun(x)
+                and is_id_type_ok(x)
+            )
+            ranking_pool = [s for s in self.sites if filter_fun_ranking_parents(s)]
+            sorted_parents = sorted(
+                ranking_pool, key=lambda x: x.alexa_rank, reverse=reverse
+            )[:top]
+            parent_names_lower = {s.name.lower() for s in sorted_parents}
+            base_names = {s.name for s in sorted_list}
+
+            def is_mirror(s) -> bool:
+                if not s.source or s.name in base_names:
+                    return False
+                return s.source.lower() in parent_names_lower
+
+            mirrors = [s for s in filtered_list if is_mirror(s)]
+            mirrors.sort(key=lambda x: (x.alexa_rank, x.name))
+            sorted_list = list(sorted_list) + mirrors
+
         return {site.name: site for site in sorted_list}
 
     @property
diff --git a/tests/test_sites.py b/tests/test_sites.py
index 7c00f4e..dcb8b44 100644
--- a/tests/test_sites.py
+++ b/tests/test_sites.py
@@ -182,6 +182,49 @@ def test_ranked_sites_dict_id_type():
     assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
 
 
+def test_ranked_sites_dict_mirrors_disabled_parent():
+    """Mirror is included when parent ranks in top N but parent is disabled."""
+    db = MaigretDatabase()
+    db.update_site(  # parent: best Alexa rank of the three, but disabled
+        MaigretSite(
+            'ParentPlatform',
+            {'alexaRank': 5, 'disabled': True, 'type': 'username'},
+        )
+    )
+    db.update_site(  # ordinary enabled site with a real rank
+        MaigretSite(
+            'OtherSite',
+            {'alexaRank': 100, 'type': 'username'},
+        )
+    )
+    db.update_site(  # mirror: worst rank, linked to the parent through `source`
+        MaigretSite(
+            'MirrorSite',
+            {
+                'alexaRank': 99999999,
+                'source': 'ParentPlatform',
+                'type': 'username',
+            },
+        )
+    )
+    # Expected order: the ranked slice (top=1, disabled excluded) first, then the mirror.
+    result = db.ranked_sites_dict(top=1, disabled=False, id_type='username')
+    assert list(result.keys()) == ['OtherSite', 'MirrorSite']
+
+
+def test_ranked_sites_dict_mirrors_no_extra_without_parent_in_top():
+    db = MaigretDatabase()
+    db.update_site(MaigretSite('A', {'alexaRank': 1, 'type': 'username'}))
+    db.update_site(  # B's `source` names a site that is never added to this database
+        MaigretSite(
+            'B',
+            {'alexaRank': 2, 'source': 'NotInDb', 'type': 'username'},
+        )
+    )
+    # B misses the top=1 cut and its `source` matches no ranked parent, so only A is expected.
+    assert list(db.ranked_sites_dict(top=1, id_type='username').keys()) == ['A']
+
+
 def test_get_url_template():
     site = MaigretSite(
         "test",