mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-06 22:19:01 +00:00
Twitter fixed, mirrors mechanism improvement (#2299)
This commit is contained in:
@@ -39,6 +39,18 @@ not stable now. Read more :doc:`in the separate section <tags>`.
|
|||||||
``--top-sites`` - Count of sites for scan ranked by Alexa Top
|
``--top-sites`` - Count of sites for scan ranked by Alexa Top
|
||||||
**(default: top 500)**.
|
**(default: top 500)**.
|
||||||
|
|
||||||
|
**Mirrors:** After the top *N* sites by Alexa rank are chosen (respecting
|
||||||
|
``--tags``, ``--use-disabled-sites``, etc.), Maigret may add extra sites
|
||||||
|
whose database field ``source`` names a **parent platform** that itself falls
|
||||||
|
in the Alexa top *N* when the ranking **includes disabled** sites. For example,
|
||||||
|
if ``Twitter`` ranks in the first 500 by Alexa, a mirror such as ``memory.lol``
|
||||||
|
(with ``source: Twitter``) is included even though it has no rank and would
|
||||||
|
otherwise be cut off. The same applies to Instagram-related mirrors (e.g.
|
||||||
|
Picuki) when ``Instagram`` is in that parent top *N* by rank—even if the
|
||||||
|
official ``Instagram`` entry is disabled and not scanned by default, its
|
||||||
|
mirrors can still be pulled in. The final list is the ranked top *N* plus
|
||||||
|
these mirrors (no fixed upper bound on mirror count).
|
||||||
|
|
||||||
``--timeout`` - Time (in seconds) to wait for responses from sites
|
``--timeout`` - Time (in seconds) to wait for responses from sites
|
||||||
**(default: 30)**. A longer timeout will be more likely to get results
|
**(default: 30)**. A longer timeout will be more likely to get results
|
||||||
from slow sites. On the other hand, this may cause a long delay to
|
from slow sites. On the other hand, this may cause a long delay to
|
||||||
|
|||||||
@@ -24,6 +24,8 @@ The supported methods (``checkType`` values in ``data.json``) are:
|
|||||||
|
|
||||||
See the details of check mechanisms in the `checking.py <https://github.com/soxoj/maigret/blob/main/maigret/checking.py#L339>`_ file.
|
See the details of check mechanisms in the `checking.py <https://github.com/soxoj/maigret/blob/main/maigret/checking.py#L339>`_ file.
|
||||||
|
|
||||||
|
**Mirrors and the top-sites limit:** When you limit scans with ``--top-sites N``, Maigret also includes *mirror* sites (entries whose ``source`` field points at a parent platform such as Twitter or Instagram) if that parent would appear in the Alexa top *N* when disabled sites are considered for ranking. See the **Mirrors** paragraph under ``--top-sites`` in :doc:`command-line-options`.
|
||||||
|
|
||||||
Testing
|
Testing
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
|||||||
@@ -547,6 +547,38 @@ async def check_site_for_username(
|
|||||||
return site.name, default_result
|
return site.name, default_result
|
||||||
|
|
||||||
response = await checker.check()
|
response = await checker.check()
|
||||||
|
html_text = response[0] if response and response[0] else ""
|
||||||
|
|
||||||
|
# Retry once after token-style activation (e.g. Twitter guest token refresh).
|
||||||
|
act = site.activation
|
||||||
|
if act and html_text:
|
||||||
|
marks = act.get("marks") or []
|
||||||
|
if marks and any(m in html_text for m in marks):
|
||||||
|
method = act["method"]
|
||||||
|
try:
|
||||||
|
activate_fun = getattr(ParsingActivator(), method)
|
||||||
|
activate_fun(site, logger)
|
||||||
|
except AttributeError as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Activation method {method} for site {site.name} not found!",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Failed activation {method} for site {site.name}: {str(e)}",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
merged = dict(checker.headers or {})
|
||||||
|
merged.update(site.headers)
|
||||||
|
checker.prepare(
|
||||||
|
url=checker.url,
|
||||||
|
headers=merged,
|
||||||
|
allow_redirects=checker.allow_redirects,
|
||||||
|
timeout=checker.timeout,
|
||||||
|
method=checker.method,
|
||||||
|
)
|
||||||
|
response = await checker.check()
|
||||||
|
|
||||||
response_result = process_site_result(
|
response_result = process_site_result(
|
||||||
response, query_notify, logger, default_result, site
|
response, query_notify, logger, default_result, site
|
||||||
|
|||||||
@@ -32,6 +32,9 @@ COMMON_ERRORS = {
|
|||||||
'<title>Attention Required! | Cloudflare</title>': CheckError(
|
'<title>Attention Required! | Cloudflare</title>': CheckError(
|
||||||
'Captcha', 'Cloudflare'
|
'Captcha', 'Cloudflare'
|
||||||
),
|
),
|
||||||
|
'<title>Just a moment</title>': CheckError(
|
||||||
|
'Bot protection', 'Cloudflare challenge page'
|
||||||
|
),
|
||||||
'Please stand by, while we are checking your browser': CheckError(
|
'Please stand by, while we are checking your browser': CheckError(
|
||||||
'Bot protection', 'Cloudflare'
|
'Bot protection', 'Cloudflare'
|
||||||
),
|
),
|
||||||
|
|||||||
@@ -740,8 +740,11 @@
|
|||||||
],
|
],
|
||||||
"regexCheck": "^[^\\.]+$",
|
"regexCheck": "^[^\\.]+$",
|
||||||
"checkType": "message",
|
"checkType": "message",
|
||||||
|
"presenseStrs": [
|
||||||
|
"<meta name=\"description\" content=\""
|
||||||
|
],
|
||||||
"absenceStrs": [
|
"absenceStrs": [
|
||||||
"<title>404: This page could not be found</title>"
|
"id=\"__next_error__\""
|
||||||
],
|
],
|
||||||
"alexaRank": 6524,
|
"alexaRank": 6524,
|
||||||
"urlMain": "https://alternativeto.net/",
|
"urlMain": "https://alternativeto.net/",
|
||||||
@@ -12351,6 +12354,10 @@
|
|||||||
"photo"
|
"photo"
|
||||||
],
|
],
|
||||||
"checkType": "message",
|
"checkType": "message",
|
||||||
|
"presenseStrs": [
|
||||||
|
"profile-avatar",
|
||||||
|
"profile-user-info"
|
||||||
|
],
|
||||||
"absenceStrs": [
|
"absenceStrs": [
|
||||||
"<title>Error 500</title>",
|
"<title>Error 500</title>",
|
||||||
"<title>Error 404</title>",
|
"<title>Error 404</title>",
|
||||||
@@ -13662,11 +13669,17 @@
|
|||||||
"porn",
|
"porn",
|
||||||
"us"
|
"us"
|
||||||
],
|
],
|
||||||
"checkType": "status_code",
|
"checkType": "message",
|
||||||
|
"presenseStrs": [
|
||||||
|
"Newest Porn Videos | Redtube"
|
||||||
|
],
|
||||||
|
"absenceStrs": [
|
||||||
|
"Page Not Found"
|
||||||
|
],
|
||||||
"alexaRank": 1090,
|
"alexaRank": 1090,
|
||||||
"urlMain": "https://ru.redtube.com/",
|
"urlMain": "https://www.redtube.com/",
|
||||||
"url": "https://ru.redtube.com/users/{username}",
|
"url": "https://www.redtube.com/amateur/{username}",
|
||||||
"usernameClaimed": "adam",
|
"usernameClaimed": "dollyjeey",
|
||||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||||
},
|
},
|
||||||
"Reibert": {
|
"Reibert": {
|
||||||
@@ -17049,7 +17062,7 @@
|
|||||||
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
|
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
|
||||||
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
|
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
|
||||||
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
|
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
|
||||||
"x-guest-token": "1411741418192883712"
|
"x-guest-token": "2035504183667757370"
|
||||||
},
|
},
|
||||||
"errors": {
|
"errors": {
|
||||||
"Bad guest token": "x-guest-token update required"
|
"Bad guest token": "x-guest-token update required"
|
||||||
@@ -17058,6 +17071,7 @@
|
|||||||
"activation": {
|
"activation": {
|
||||||
"method": "twitter",
|
"method": "twitter",
|
||||||
"marks": [
|
"marks": [
|
||||||
|
"Bad guest token",
|
||||||
"Bad guest token."
|
"Bad guest token."
|
||||||
],
|
],
|
||||||
"url": "https://api.twitter.com/1.1/guest/activate.json",
|
"url": "https://api.twitter.com/1.1/guest/activate.json",
|
||||||
@@ -17066,6 +17080,9 @@
|
|||||||
},
|
},
|
||||||
"urlProbe": "https://twitter.com/i/api/graphql/ZRnOhhXPwue_JGILb9TNug/UserByScreenName?variables=%7B%22screen_name%22%3A%22{username}%22%2C%22withHighlightedLabel%22%3Atrue%7D",
|
"urlProbe": "https://twitter.com/i/api/graphql/ZRnOhhXPwue_JGILb9TNug/UserByScreenName?variables=%7B%22screen_name%22%3A%22{username}%22%2C%22withHighlightedLabel%22%3Atrue%7D",
|
||||||
"checkType": "message",
|
"checkType": "message",
|
||||||
|
"presenseStrs": [
|
||||||
|
"\"legacy\""
|
||||||
|
],
|
||||||
"absenceStrs": [
|
"absenceStrs": [
|
||||||
" not found"
|
" not found"
|
||||||
],
|
],
|
||||||
@@ -18026,8 +18043,11 @@
|
|||||||
"us"
|
"us"
|
||||||
],
|
],
|
||||||
"checkType": "message",
|
"checkType": "message",
|
||||||
|
"presenseStrs": [
|
||||||
|
"Explore Cannabis Brands"
|
||||||
|
],
|
||||||
"absenceStrs": [
|
"absenceStrs": [
|
||||||
"<title>Find Marijuana Dispensaries, Brands"
|
"<title data-next-head=\"\">Find Marijuana Dispensaries, Brands, Delivery, Deals"
|
||||||
],
|
],
|
||||||
"alexaRank": 7929,
|
"alexaRank": 7929,
|
||||||
"urlMain": "https://weedmaps.com",
|
"urlMain": "https://weedmaps.com",
|
||||||
|
|||||||
+36
-1
@@ -325,6 +325,14 @@ class MaigretDatabase:
|
|||||||
"""
|
"""
|
||||||
Ranking and filtering of the sites list
|
Ranking and filtering of the sites list
|
||||||
|
|
||||||
|
When ``top`` is limited (not "all sites"), **mirrors** may be appended after
|
||||||
|
the Alexa-ranked slice. A mirror is any filtered site with a non-empty
|
||||||
|
``source`` field equal to the name of a site that appears in the first
|
||||||
|
``top`` positions of a **parent ranking** that includes disabled sites.
|
||||||
|
Thus mirrors such as third-party viewers (e.g. for Twitter or Instagram)
|
||||||
|
are still scanned when their parent platform ranks highly, even if the
|
||||||
|
official site is disabled and omitted from the main list.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
reverse (bool, optional): Reverse the sorting order. Defaults to False.
|
reverse (bool, optional): Reverse the sorting order. Defaults to False.
|
||||||
top (int, optional): Maximum number of sites to return. Defaults to sys.maxsize.
|
top (int, optional): Maximum number of sites to return. Defaults to sys.maxsize.
|
||||||
@@ -334,7 +342,8 @@ class MaigretDatabase:
|
|||||||
id_type (str, optional): Type of identifier to filter by. Defaults to "username".
|
id_type (str, optional): Type of identifier to filter by. Defaults to "username".
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict: Dictionary of filtered and ranked sites, with site names as keys and MaigretSite objects as values
|
dict: Dictionary of filtered and ranked sites (base top slice plus mirrors),
|
||||||
|
with site names as keys and MaigretSite objects as values
|
||||||
"""
|
"""
|
||||||
normalized_names = list(map(str.lower, names))
|
normalized_names = list(map(str.lower, names))
|
||||||
normalized_tags = list(map(str.lower, tags))
|
normalized_tags = list(map(str.lower, tags))
|
||||||
@@ -371,6 +380,32 @@ class MaigretDatabase:
|
|||||||
sorted_list = sorted(
|
sorted_list = sorted(
|
||||||
filtered_list, key=lambda x: x.alexa_rank, reverse=reverse
|
filtered_list, key=lambda x: x.alexa_rank, reverse=reverse
|
||||||
)[:top]
|
)[:top]
|
||||||
|
|
||||||
|
# Mirrors: sites whose `source` matches a parent platform that ranks in the
|
||||||
|
# top `top` by Alexa when disabled entries are included in the ranking pool
|
||||||
|
# (so e.g. Instagram can be a parent for Picuki even if Instagram is disabled).
|
||||||
|
if top < sys.maxsize and sorted_list:
|
||||||
|
filter_fun_ranking_parents = (
|
||||||
|
lambda x: filter_tags_engines_fun(x)
|
||||||
|
and filter_names_fun(x)
|
||||||
|
and is_id_type_ok(x)
|
||||||
|
)
|
||||||
|
ranking_pool = [s for s in self.sites if filter_fun_ranking_parents(s)]
|
||||||
|
sorted_parents = sorted(
|
||||||
|
ranking_pool, key=lambda x: x.alexa_rank, reverse=reverse
|
||||||
|
)[:top]
|
||||||
|
parent_names_lower = {s.name.lower() for s in sorted_parents}
|
||||||
|
base_names = {s.name for s in sorted_list}
|
||||||
|
|
||||||
|
def is_mirror(s) -> bool:
|
||||||
|
if not s.source or s.name in base_names:
|
||||||
|
return False
|
||||||
|
return s.source.lower() in parent_names_lower
|
||||||
|
|
||||||
|
mirrors = [s for s in filtered_list if is_mirror(s)]
|
||||||
|
mirrors.sort(key=lambda x: (x.alexa_rank, x.name))
|
||||||
|
sorted_list = list(sorted_list) + mirrors
|
||||||
|
|
||||||
return {site.name: site for site in sorted_list}
|
return {site.name: site for site in sorted_list}
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -182,6 +182,49 @@ def test_ranked_sites_dict_id_type():
|
|||||||
assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
|
assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_ranked_sites_dict_mirrors_disabled_parent():
|
||||||
|
"""Mirror is included when parent ranks in top N but parent is disabled."""
|
||||||
|
db = MaigretDatabase()
|
||||||
|
db.update_site(
|
||||||
|
MaigretSite(
|
||||||
|
'ParentPlatform',
|
||||||
|
{'alexaRank': 5, 'disabled': True, 'type': 'username'},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
db.update_site(
|
||||||
|
MaigretSite(
|
||||||
|
'OtherSite',
|
||||||
|
{'alexaRank': 100, 'type': 'username'},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
db.update_site(
|
||||||
|
MaigretSite(
|
||||||
|
'MirrorSite',
|
||||||
|
{
|
||||||
|
'alexaRank': 99999999,
|
||||||
|
'source': 'ParentPlatform',
|
||||||
|
'type': 'username',
|
||||||
|
},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
result = db.ranked_sites_dict(top=1, disabled=False, id_type='username')
|
||||||
|
assert list(result.keys()) == ['OtherSite', 'MirrorSite']
|
||||||
|
|
||||||
|
|
||||||
|
def test_ranked_sites_dict_mirrors_no_extra_without_parent_in_top():
|
||||||
|
db = MaigretDatabase()
|
||||||
|
db.update_site(MaigretSite('A', {'alexaRank': 1, 'type': 'username'}))
|
||||||
|
db.update_site(
|
||||||
|
MaigretSite(
|
||||||
|
'B',
|
||||||
|
{'alexaRank': 2, 'source': 'NotInDb', 'type': 'username'},
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
assert list(db.ranked_sites_dict(top=1, id_type='username').keys()) == ['A']
|
||||||
|
|
||||||
|
|
||||||
def test_get_url_template():
|
def test_get_url_template():
|
||||||
site = MaigretSite(
|
site = MaigretSite(
|
||||||
"test",
|
"test",
|
||||||
|
|||||||
Reference in New Issue
Block a user