mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-07 06:24:35 +00:00
c2f912b164
- Added social tag to social networks (33 sites) - Fixed wrong tags (8 sites) - Filled empty tags for 213 sites in top-1000 - Country tag cleanup (~374 sites) - Site naming normalization (75 sites) - New tests (3) - Documentation updates
105 lines
2.9 KiB
Python
105 lines
2.9 KiB
Python
"""Maigret data test functions"""
|
|
|
|
import pytest
|
|
from maigret.utils import is_country_tag
|
|
|
|
|
|
TOP_SITES_ALEXA_RANK_LIMIT = 50
|
|
|
|
KNOWN_SOCIAL_DOMAINS = [
|
|
"facebook.com",
|
|
"instagram.com",
|
|
"twitter.com",
|
|
"tiktok.com",
|
|
"vk.com",
|
|
"reddit.com",
|
|
"pinterest.com",
|
|
"snapchat.com",
|
|
"linkedin.com",
|
|
"tumblr.com",
|
|
"threads.net",
|
|
"bsky.app",
|
|
"myspace.com",
|
|
"weibo.com",
|
|
"mastodon.social",
|
|
"gab.com",
|
|
"minds.com",
|
|
"clubhouse.com",
|
|
]
|
|
|
|
|
|
@pytest.mark.slow
|
|
def test_tags_validity(default_db):
|
|
unknown_tags = set()
|
|
|
|
tags = default_db._tags
|
|
|
|
for site in default_db.sites:
|
|
for tag in filter(lambda x: not is_country_tag(x), site.tags):
|
|
if tag not in tags:
|
|
unknown_tags.add(tag)
|
|
|
|
# make sure all tags are known
|
|
# if you see "unchecked" tag error, please, do
|
|
# maigret --db `pwd`/maigret/resources/data.json --self-check --tag unchecked --use-disabled-sites
|
|
assert unknown_tags == set()
|
|
|
|
|
|
@pytest.mark.slow
|
|
def test_top_sites_have_category_tag(default_db):
|
|
"""Top sites by alexaRank must have at least one category tag (not just country codes)."""
|
|
sites_ranked = sorted(
|
|
[s for s in default_db.sites if s.alexa_rank],
|
|
key=lambda s: s.alexa_rank,
|
|
)[:TOP_SITES_ALEXA_RANK_LIMIT]
|
|
|
|
missing_category = []
|
|
for site in sites_ranked:
|
|
category_tags = [t for t in site.tags if not is_country_tag(t)]
|
|
if not category_tags:
|
|
missing_category.append(f"{site.name} (rank {site.alexa_rank})")
|
|
|
|
assert missing_category == [], (
|
|
f"{len(missing_category)} top-{TOP_SITES_ALEXA_RANK_LIMIT} sites have no category tag: "
|
|
+ ", ".join(missing_category[:20])
|
|
)
|
|
|
|
|
|
@pytest.mark.slow
|
|
def test_no_unused_tags_in_registry(default_db):
|
|
"""Every tag in the registry should be used by at least one site."""
|
|
all_used_tags = set()
|
|
for site in default_db.sites:
|
|
for tag in site.tags:
|
|
if not is_country_tag(tag):
|
|
all_used_tags.add(tag)
|
|
|
|
registered_tags = set(default_db._tags)
|
|
unused = registered_tags - all_used_tags
|
|
|
|
assert unused == set(), f"Tags registered but not used by any site: {unused}"
|
|
|
|
|
|
@pytest.mark.slow
|
|
def test_social_networks_have_social_tag(default_db):
|
|
"""Known social network domains must have the 'social' tag."""
|
|
from urllib.parse import urlparse
|
|
|
|
missing_social = []
|
|
for site in default_db.sites:
|
|
url = site.url_main or ""
|
|
try:
|
|
hostname = urlparse(url).hostname or ""
|
|
except Exception:
|
|
continue
|
|
for domain in KNOWN_SOCIAL_DOMAINS:
|
|
if hostname == domain or hostname.endswith("." + domain):
|
|
if "social" not in site.tags:
|
|
missing_social.append(f"{site.name} ({domain})")
|
|
break
|
|
|
|
assert missing_social == [], (
|
|
f"{len(missing_social)} known social networks missing 'social' tag: "
|
|
+ ", ".join(missing_social)
|
|
)
|