From 227a25bfa1228760491c3d615fd9452fbf8ef803 Mon Sep 17 00:00:00 2001 From: Soxoj <31013580+soxoj@users.noreply.github.com> Date: Sun, 22 Mar 2026 01:14:17 +0100 Subject: [PATCH] Twitter fixed, mirrors mechanism improvement (#2299) --- docs/source/command-line-options.rst | 12 ++++++++ docs/source/development.rst | 2 ++ maigret/checking.py | 32 +++++++++++++++++++++ maigret/errors.py | 3 ++ maigret/resources/data.json | 34 +++++++++++++++++----- maigret/sites.py | 37 +++++++++++++++++++++++- tests/test_sites.py | 43 ++++++++++++++++++++++++++++ 7 files changed, 155 insertions(+), 8 deletions(-) diff --git a/docs/source/command-line-options.rst b/docs/source/command-line-options.rst index 9d3b8ec..3932910 100644 --- a/docs/source/command-line-options.rst +++ b/docs/source/command-line-options.rst @@ -39,6 +39,18 @@ not stable now. Read more :doc:`in the separate section `. ``--top-sites`` - Count of sites for scan ranked by Alexa Top **(default: top 500)**. +**Mirrors:** After the top *N* sites by Alexa rank are chosen (respecting +``--tags``, ``--use-disabled-sites``, etc.), Maigret may add extra sites +whose database field ``source`` names a **parent platform** that itself falls +in the Alexa top *N* when the ranking **includes disabled** sites. For example, +if ``Twitter`` ranks in the first 500 by Alexa, a mirror such as ``memory.lol`` +(with ``source: Twitter``) is included even though it has no rank and would +otherwise be cut off. The same applies to Instagram-related mirrors (e.g. +Picuki) when ``Instagram`` is in that parent top *N* by rank—even if the +official ``Instagram`` entry is disabled and not scanned by default, its +mirrors can still be pulled in. The final list is the ranked top *N* plus +these mirrors (no fixed upper bound on mirror count). + ``--timeout`` - Time (in seconds) to wait for responses from sites **(default: 30)**. A longer timeout will be more likely to get results from slow sites. 
On the other hand, this may cause a long delay to diff --git a/docs/source/development.rst b/docs/source/development.rst index 20ce4ff..70fb26d 100644 --- a/docs/source/development.rst +++ b/docs/source/development.rst @@ -24,6 +24,8 @@ The supported methods (``checkType`` values in ``data.json``) are: See the details of check mechanisms in the `checking.py `_ file. +**Mirrors and ``--top-sites``:** When you limit scans with ``--top-sites N``, Maigret also includes *mirror* sites (entries whose ``source`` field points at a parent platform such as Twitter or Instagram) if that parent would appear in the Alexa top *N* when disabled sites are considered for ranking. See the **Mirrors** paragraph under ``--top-sites`` in :doc:`command-line-options`. + Testing ------- diff --git a/maigret/checking.py b/maigret/checking.py index b33dc42..97987fd 100644 --- a/maigret/checking.py +++ b/maigret/checking.py @@ -547,6 +547,38 @@ async def check_site_for_username( return site.name, default_result response = await checker.check() + html_text = response[0] if response and response[0] else "" + + # Retry once after token-style activation (e.g. Twitter guest token refresh). 
+ act = site.activation + if act and html_text: + marks = act.get("marks") or [] + if marks and any(m in html_text for m in marks): + method = act["method"] + try: + activate_fun = getattr(ParsingActivator(), method) + activate_fun(site, logger) + except AttributeError as e: + logger.warning( + f"Activation method {method} for site {site.name} not found!", + exc_info=True, + ) + except Exception as e: + logger.warning( + f"Failed activation {method} for site {site.name}: {str(e)}", + exc_info=True, + ) + else: + merged = dict(checker.headers or {}) + merged.update(site.headers) + checker.prepare( + url=checker.url, + headers=merged, + allow_redirects=checker.allow_redirects, + timeout=checker.timeout, + method=checker.method, + ) + response = await checker.check() response_result = process_site_result( response, query_notify, logger, default_result, site diff --git a/maigret/errors.py b/maigret/errors.py index 573511e..3b79a6c 100644 --- a/maigret/errors.py +++ b/maigret/errors.py @@ -32,6 +32,9 @@ COMMON_ERRORS = { 'Attention Required! 
| Cloudflare': CheckError( 'Captcha', 'Cloudflare' ), + 'Just a moment': CheckError( + 'Bot protection', 'Cloudflare challenge page' + ), 'Please stand by, while we are checking your browser': CheckError( 'Bot protection', 'Cloudflare' ), diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 7fd97bf..1235fed 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -740,8 +740,11 @@ ], "regexCheck": "^[^\\.]+$", "checkType": "message", + "presenseStrs": [ + "404: This page could not be found" + "id=\"__next_error__\"" ], "alexaRank": 6524, "urlMain": "https://alternativeto.net/", @@ -12351,6 +12354,10 @@ "photo" ], "checkType": "message", + "presenseStrs": [ + "profile-avatar", + "profile-user-info" + ], "absenceStrs": [ "Error 500", "Error 404", @@ -13662,11 +13669,17 @@ "porn", "us" ], - "checkType": "status_code", + "checkType": "message", + "presenseStrs": [ + "Newest Porn Videos | Redtube" + ], + "absenceStrs": [ + "Page Not Found" + ], "alexaRank": 1090, - "urlMain": "https://ru.redtube.com/", - "url": "https://ru.redtube.com/users/{username}", - "usernameClaimed": "adam", + "urlMain": "https://www.redtube.com/", + "url": "https://www.redtube.com/amateur/{username}", + "usernameClaimed": "dollyjeey", "usernameUnclaimed": "noonewouldeverusethis7" }, "Reibert": { @@ -17049,7 +17062,7 @@ "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"", "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA", "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36", - "x-guest-token": "1411741418192883712" + "x-guest-token": "2035504183667757370" }, "errors": { "Bad guest token": "x-guest-token update required" @@ -17058,6 +17071,7 @@ "activation": { "method": "twitter", "marks": [ + "Bad guest token", "Bad guest token." 
], "url": "https://api.twitter.com/1.1/guest/activate.json", @@ -17066,6 +17080,9 @@ }, "urlProbe": "https://twitter.com/i/api/graphql/ZRnOhhXPwue_JGILb9TNug/UserByScreenName?variables=%7B%22screen_name%22%3A%22{username}%22%2C%22withHighlightedLabel%22%3Atrue%7D", "checkType": "message", + "presenseStrs": [ + "\"legacy\"" + ], "absenceStrs": [ " not found" ], @@ -18026,8 +18043,11 @@ "us" ], "checkType": "message", + "presenseStrs": [ + "Explore Cannabis Brands" + ], "absenceStrs": [ - "Find Marijuana Dispensaries, Brands" + "<title data-next-head=\"\">Find Marijuana Dispensaries, Brands, Delivery, Deals" ], "alexaRank": 7929, "urlMain": "https://weedmaps.com", diff --git a/maigret/sites.py b/maigret/sites.py index 4220c2f..db6bf76 100644 --- a/maigret/sites.py +++ b/maigret/sites.py @@ -325,6 +325,14 @@ class MaigretDatabase: """ Ranking and filtering of the sites list + When ``top`` is limited (not "all sites"), **mirrors** may be appended after + the Alexa-ranked slice. A mirror is any filtered site with a non-empty + ``source`` field equal to the name of a site that appears in the first + ``top`` positions of a **parent ranking** that includes disabled sites. + Thus mirrors such as third-party viewers (e.g. for Twitter or Instagram) + are still scanned when their parent platform ranks highly, even if the + official site is disabled and omitted from the main list. + Args: reverse (bool, optional): Reverse the sorting order. Defaults to False. top (int, optional): Maximum number of sites to return. Defaults to sys.maxsize. @@ -334,7 +342,8 @@ class MaigretDatabase: id_type (str, optional): Type of identifier to filter by. Defaults to "username". 
Returns: - dict: Dictionary of filtered and ranked sites, with site names as keys and MaigretSite objects as values + dict: Dictionary of filtered and ranked sites (base top slice plus mirrors), + with site names as keys and MaigretSite objects as values """ normalized_names = list(map(str.lower, names)) normalized_tags = list(map(str.lower, tags)) @@ -371,6 +380,32 @@ class MaigretDatabase: sorted_list = sorted( filtered_list, key=lambda x: x.alexa_rank, reverse=reverse )[:top] + + # Mirrors: sites whose `source` matches a parent platform that ranks in the + # top `top` by Alexa when disabled entries are included in the ranking pool + # (so e.g. Instagram can be a parent for Picuki even if Instagram is disabled). + if top < sys.maxsize and sorted_list: + filter_fun_ranking_parents = ( + lambda x: filter_tags_engines_fun(x) + and filter_names_fun(x) + and is_id_type_ok(x) + ) + ranking_pool = [s for s in self.sites if filter_fun_ranking_parents(s)] + sorted_parents = sorted( + ranking_pool, key=lambda x: x.alexa_rank, reverse=reverse + )[:top] + parent_names_lower = {s.name.lower() for s in sorted_parents} + base_names = {s.name for s in sorted_list} + + def is_mirror(s) -> bool: + if not s.source or s.name in base_names: + return False + return s.source.lower() in parent_names_lower + + mirrors = [s for s in filtered_list if is_mirror(s)] + mirrors.sort(key=lambda x: (x.alexa_rank, x.name)) + sorted_list = list(sorted_list) + mirrors + return {site.name: site for site in sorted_list} @property diff --git a/tests/test_sites.py b/tests/test_sites.py index 7c00f4e..dcb8b44 100644 --- a/tests/test_sites.py +++ b/tests/test_sites.py @@ -182,6 +182,49 @@ def test_ranked_sites_dict_id_type(): assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1 +def test_ranked_sites_dict_mirrors_disabled_parent(): + """Mirror is included when parent ranks in top N but parent is disabled.""" + db = MaigretDatabase() + db.update_site( + MaigretSite( + 'ParentPlatform', + {'alexaRank': 
5, 'disabled': True, 'type': 'username'}, + ) + ) + db.update_site( + MaigretSite( + 'OtherSite', + {'alexaRank': 100, 'type': 'username'}, + ) + ) + db.update_site( + MaigretSite( + 'MirrorSite', + { + 'alexaRank': 99999999, + 'source': 'ParentPlatform', + 'type': 'username', + }, + ) + ) + + result = db.ranked_sites_dict(top=1, disabled=False, id_type='username') + assert list(result.keys()) == ['OtherSite', 'MirrorSite'] + + +def test_ranked_sites_dict_mirrors_no_extra_without_parent_in_top(): + db = MaigretDatabase() + db.update_site(MaigretSite('A', {'alexaRank': 1, 'type': 'username'})) + db.update_site( + MaigretSite( + 'B', + {'alexaRank': 2, 'source': 'NotInDb', 'type': 'username'}, + ) + ) + + assert list(db.ranked_sites_dict(top=1, id_type='username').keys()) == ['A'] + + def test_get_url_template(): site = MaigretSite( "test",