mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-07 06:24:35 +00:00
b145e7b26f
* feat(core): add POST request support, new sites, migrate to Majestic Million ranking - Added native POST request support to the Maigret engine (requestMethod, requestPayload) to enable querying modern JSON registration endpoints. - Replaced the discontinued Alexa rank API with the Majestic Million dataset for global popularity sorting and automated CI updates. - Fixed multiple false positives among top 500 sites and bypassed standard anti-bot protections using custom User-Agents. - Updated public documentation and internal playbooks to reflect the new features. * feat(data): apply all data.json site check updates from main branch - Added CTFtime and PentesterLab (new sites added in main) - Removed forums.imore.com (deleted in main as dead site) - Disabled 5 sites per main branch fixes: Librusec, MirTesen, amateurvoyeurforum.com, forums.stevehoffman.tv, vegalab - Fixed 5 site checks per main branch: SoundCloud, Taplink, Setlist, RoyalCams, club.cnews.ru (switched from status_code to message checkType with proper markers) Co-authored-by: soxoj <31013580+soxoj@users.noreply.github.com> Agent-Logs-Url: https://github.com/soxoj/maigret/sessions/a1d194d9-c0ff-4e2b-974c-c5e4b59548bf --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
165 lines
5.2 KiB
Python
Executable File
165 lines
5.2 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Maigret: Supported Site Listing with Alexa ranking and country tags
|
|
This module generates the listing of supported sites in file `SITES.md`
|
|
and pretty prints file with sites data.
|
|
"""
|
|
import sys
|
|
import requests
|
|
import logging
|
|
import threading
|
|
import xml.etree.ElementTree as ET
|
|
from datetime import datetime, timezone
|
|
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
|
|
|
from maigret.maigret import MaigretDatabase
|
|
|
|
# Human-readable labels for popularity-rank buckets (consumed by
# get_step_rank). Thresholds up to 500 label themselves; larger
# thresholds use K/M suffixes.
RANKS = {str(step): str(step) for step in (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500)}
RANKS['1000'] = '1K'
RANKS['5000'] = '5K'
RANKS['10000'] = '10K'
RANKS['100000'] = '100K'
RANKS['10000000'] = '10M'
RANKS['50000000'] = '50M'
RANKS['100000000'] = '100M'
|
|
|
|
|
|
|
|
import csv
|
|
import io
|
|
from urllib.parse import urlparse
|
|
|
|
def fetch_majestic_million(timeout=60):
    """Download the Majestic Million CSV and return a domain -> rank mapping.

    Args:
        timeout: seconds to wait for the HTTP response. New optional
            parameter (defaults to 60) so a stalled download cannot hang
            the script forever — the original call had no timeout.

    Returns:
        dict mapping lowercased domain names to their integer global rank.
        Empty on any download error (errors are logged, not raised).
    """
    print("Fetching Majestic Million CSV (this may take a few seconds)...")
    ranks = {}
    url = "https://downloads.majestic.com/majestic_million.csv"
    try:
        # Dropped stream=True: accessing .text forces a full read anyway,
        # so streaming bought nothing.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        reader = csv.reader(io.StringIO(response.text))
        # Skip the header row; the default of None avoids StopIteration
        # on an empty body.
        next(reader, None)

        # Majestic CSV columns: GlobalRank, TldRank, Domain, ...
        for row in reader:
            if not row or len(row) < 3:
                continue
            try:
                rank = int(row[0])
            except ValueError:
                # A malformed rank cell used to abort the whole parse via
                # the outer except; skip just that row instead.
                continue
            ranks[row[2].lower()] = rank

    except Exception as e:
        logging.error(f"Error fetching Majestic Million: {e}")

    print(f"Loaded {len(ranks)} domains from Majestic Million.")
    return ranks
|
|
|
|
def get_base_domain(url):
    """Return the bare host of *url*, lowercased and stripped of a leading
    'www.' prefix; returns '' when the URL cannot be parsed."""
    try:
        host = urlparse(url).netloc
    except Exception:
        return ""
    prefix = 'www.'
    if host.startswith(prefix):
        host = host[len(prefix):]
    return host.lower()
|
|
|
|
|
|
def get_step_rank(rank):
    """Map a raw popularity rank to a human-readable step label from RANKS.

    Args:
        rank: integer global rank; 0 and sys.maxsize both mean "unknown".

    Returns:
        The RANKS label of the smallest step threshold >= rank. Unknown
        ranks — and ranks beyond the largest threshold — fall back to the
        largest (worst) step label.
    """
    steps = sorted(map(int, RANKS.keys()))

    if rank == 0 or rank == sys.maxsize:
        # Unknown/unset ranks are bucketed into the largest step.
        return RANKS[str(steps[-1])]

    for step in steps:
        if step >= rank:
            return RANKS[str(step)]

    # Fix: a rank above the largest threshold used to raise IndexError
    # (empty filter result); clamp it to the largest step instead.
    return RANKS[str(steps[-1])]
|
|
|
|
|
|
def main():
    """Regenerate `sites.md` and (optionally) refresh site ranks.

    Loads the sites database, optionally re-ranks every site against the
    Majestic Million dataset, writes the markdown listing sorted by rank,
    and saves the updated database back to the base JSON file.
    """
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")

    parser.add_argument('--with-rank', help='update with use of local data only', action='store_true')
    parser.add_argument('--empty-only', help='update only sites without rating', action='store_true')
    parser.add_argument('--exclude-engine', help='do not update score with certain engine',
                        action="append", dest="exclude_engine_list", default=[])

    # NOTE: removed the unused `pool = list()` leftover from the old
    # threaded Alexa-ranking implementation.
    args = parser.parse_args()

    db = MaigretDatabase()
    sites_subset = db.load_from_file(args.base_file).sites

    print("\nUpdating supported sites list (don't worry, it's needed)...")

    with open("sites.md", "w") as site_file:
        site_file.write(f"""
## List of supported sites (search methods): total {len(sites_subset)}\n
Rank data fetched from Majestic Million by domains.

""")

        majestic_ranks = {}
        if args.with_rank:
            majestic_ranks = fetch_majestic_million()

        # Rank refresh: match each site's base domain against the dataset.
        if args.with_rank:
            for site in sites_subset:
                # --empty-only: leave already-ranked sites untouched.
                if site.alexa_rank < sys.maxsize and args.empty_only:
                    continue
                if args.exclude_engine_list and site.engine in args.exclude_engine_list:
                    continue

                domain = get_base_domain(site.url_main)
                # Unmatched domains get maxsize, meaning "unknown rank".
                site.alexa_rank = majestic_ranks.get(domain, sys.maxsize)

            print("Successfully updated ranks matching Majestic Million dataset.")

        # Sort ascending by rank (best-known sites first).
        sites_full_list = [(s, int(s.alexa_rank)) for s in sites_subset]
        sites_full_list.sort(key=lambda pair: pair[1])

        # Move zero-ranked (unknown) entries to the end. The list is sorted,
        # so zeros form a contiguous prefix; a single slice rotation replaces
        # the old pop/append loop, which raised IndexError on an empty list
        # and looped forever when every rank was 0.
        zero_count = sum(1 for _, rank in sites_full_list if rank == 0)
        sites_full_list = sites_full_list[zero_count:] + sites_full_list[:zero_count]

        for site, rank in sites_full_list:
            url_main = site.url_main
            valid_rank = get_step_rank(rank)

            # Sorting in place intentionally persists sorted tags via
            # db.update_site below.
            all_tags = site.tags
            all_tags.sort()
            tags = (', ' + ', '.join(all_tags)) if all_tags else ''

            note = ''
            if site.disabled:
                note = ', search is disabled'

            # Placeholder kept for layout parity with earlier favicon output
            # (plain string now — the f-prefix was pointless).
            favicon = ""
            site_file.write(f'1. {favicon} [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n')
            db.update_site(site)

        site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n')
        db.save_to_file(args.base_file)

        statistics_text = db.get_db_stats(is_markdown=True)
        site_file.write('## Statistics\n\n')
        site_file.write(statistics_text)

    print("Finished updating supported site listing!")
|
|
|
|
|
|
# Script entry point: regenerate sites.md and refresh ranks in data.json.
if __name__ == '__main__':
    main()
|