Mirror of https://github.com/soxoj/maigret.git (synced 2026-05-06 14:08:59 +00:00)
Commit 97cc4b46d9
- Fix VK and TradingView checkType; add Reddit and Microsoft Learn API-style probes where appropriate; adjust or disable entries that are unreliable under anti-bot protection.
- Self-check: stop aggressive auto-disable; default to reporting issues only; add --auto-disable and --diagnose for optional fixes and deeper output.
- Tooling: add utils/site_check.py and utils/check_top_n.py (and related helpers) to inspect and rank site behavior against the top-N list.
- Scope: aligns with fixing top-traffic / high-impact sites and making diagnostics repeatable without silently flipping disabled flags.
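A rough sketch of how these pieces might be invoked (commands are illustrative and not taken from the project docs; the flag names come from the commit message, and treating the probe script below as utils/check_top_n.py is an assumption -- check each tool's --help for the real interface):

    maigret --self-check ...                      # report problematic sites only (no longer auto-disables)
    maigret --self-check --diagnose ...           # deeper diagnostic output for failing checks
    maigret --self-check --auto-disable ...       # opt back in to disabling broken entries
    python utils/check_top_n.py --top-sites 500   # repeatable probe of the top-N site list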
224 lines · 6.2 KiB · Python
#!/usr/bin/env python3
"""
|
|
Probe likely false-positive sites among the top-N Alexa-ranked entries.
|
|
|
|
For each of K random *distinct* usernames taken from ``usernameClaimed`` fields in
|
|
the Maigret database, runs a clean ``maigret`` scan (``--top-sites N --json simple|ndjson``).
|
|
Sites that return CLAIMED in *every* run are reported: unrelated random claimed
|
|
handles are unlikely to all exist on the same third-party site, so such sites are
|
|
candidates for broken checks.
|
|
"""
|
|
|
|
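# Example invocation (illustrative; the flags below are the ones defined by this
# script's argparse setup, and the utils/ path is an assumption about where the
# file lives in the repository):
#
#   python utils/check_top_n.py --top-sites 500 --samples 5 --seed 42 --json ndjson
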
from __future__ import annotations

import argparse
import json
import random
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

def repo_root() -> Path:
    """Return the repository root, assuming this script lives one level below it."""
    return Path(__file__).resolve().parent.parent

def load_username_claimed_pool(db_path: Path) -> list[str]:
    """Collect distinct, non-empty usernameClaimed values from the Maigret database."""
    with db_path.open(encoding="utf-8") as f:
        data = json.load(f)
    sites = data.get("sites") or {}
    seen: set[str] = set()
    pool: list[str] = []
    for _name, site in sites.items():
        u = (site or {}).get("usernameClaimed")
        if not u or not isinstance(u, str):
            continue
        u = u.strip()
        if not u or u in seen:
            continue
        seen.add(u)
        pool.append(u)
    return pool

def run_maigret(
    *,
    username: str,
    db_path: Path,
    out_dir: Path,
    top_sites: int,
    json_format: str,
    quiet: bool,
) -> Path:
    """Run maigret subprocess; return path to the written JSON report."""
    safe = username.replace("/", "_")
    report_name = f"report_{safe}_{json_format}.json"
    report_path = out_dir / report_name

    cmd = [
        sys.executable,
        "-m",
        "maigret",
        username,
        "--db",
        str(db_path),
        "--top-sites",
        str(top_sites),
        "--json",
        json_format,
        "--folderoutput",
        str(out_dir),
        "--no-progressbar",
        "--no-color",
        "--no-recursion",
        "--no-extracting",
    ]
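    # The assembled command is roughly:
    #   python -m maigret <username> --db <data.json> --top-sites <N> --json <simple|ndjson> \
    #       --folderoutput <tmpdir> --no-progressbar --no-color --no-recursion --no-extracting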
    sink = subprocess.DEVNULL if quiet else None
    proc = subprocess.run(
        cmd,
        cwd=str(repo_root()),
        text=True,
        stdout=sink,
        stderr=sink,
    )
    if proc.returncode != 0:
        raise RuntimeError(
            f"maigret exited with {proc.returncode} for username {username!r}"
        )
    if not report_path.is_file():
        raise FileNotFoundError(f"Expected report missing: {report_path}")
    return report_path

def claimed_sites_from_report(path: Path, json_format: str) -> set[str]:
    if json_format == "simple":
        with path.open(encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, dict):
            return set()
        return set(data.keys())
    # ndjson: one object per line, each has "sitename"
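    # e.g. {"sitename": "GitHub", ...} -- illustrative; only the "sitename" field
    # is relied upon here, other fields may vary by maigret version.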
    sites: set[str] = set()
    with path.open(encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            name = obj.get("sitename")
            if isinstance(name, str) and name:
                sites.add(name)
    return sites

def main() -> int:
    parser = argparse.ArgumentParser(
        description=(
            "Pick random distinct usernameClaimed values, run maigret --top-sites N "
            "with JSON reports, and list sites that claimed all of them (suspicious FP)."
        )
    )
    parser.add_argument(
        "--db",
        "-b",
        type=Path,
        default=repo_root() / "maigret" / "resources" / "data.json",
        help="Path to Maigret data.json (a temp copy is used for runs).",
    )
    parser.add_argument(
        "--top-sites",
        "-n",
        type=int,
        default=500,
        metavar="N",
        help="Value for maigret --top-sites (default: 500).",
    )
    parser.add_argument(
        "--samples",
        "-k",
        type=int,
        default=5,
        metavar="K",
        help="How many distinct random usernames to draw (default: 5).",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="RNG seed for reproducible username selection.",
    )
    parser.add_argument(
        "--json",
        dest="json_format",
        default="simple",
        choices=["simple", "ndjson"],
        help="JSON report type passed to maigret -J (default: simple).",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        help="Print maigret stdout/stderr (default: suppress child output).",
    )
    args = parser.parse_args()
    quiet = not args.verbose

    db_src = args.db.resolve()
    if not db_src.is_file():
        print(f"Database not found: {db_src}", file=sys.stderr)
        return 2

    pool = load_username_claimed_pool(db_src)
    if len(pool) < args.samples:
        print(
            f"Need at least {args.samples} distinct usernameClaimed entries, "
            f"found {len(pool)}.",
            file=sys.stderr,
        )
        return 2

    rng = random.Random(args.seed)
    picked = rng.sample(pool, args.samples)

    print(f"Database: {db_src}")
    print(f"--top-sites {args.top_sites}, {args.samples} random usernameClaimed:")
    for i, u in enumerate(picked, 1):
        print(f" {i}. {u}")

    site_sets: list[set[str]] = []
    with tempfile.TemporaryDirectory(prefix="maigret_fp_probe_") as tmp:
        tmp_path = Path(tmp)
        db_work = tmp_path / "data.json"
        shutil.copyfile(db_src, db_work)

        for u in picked:
            print(f"\nRunning maigret for {u!r} ...", flush=True)
            report = run_maigret(
                username=u,
                db_path=db_work,
                out_dir=tmp_path,
                top_sites=args.top_sites,
                json_format=args.json_format,
                quiet=quiet,
            )
            sites = claimed_sites_from_report(report, args.json_format)
            site_sets.append(sites)
            print(f" -> {len(sites)} positive site(s) in JSON", flush=True)

    always = set.intersection(*site_sets) if site_sets else set()
    print("\n--- Sites with CLAIMED in all runs (candidates for false positives) ---")
    if not always:
        print("(none)")
    else:
        for name in sorted(always):
            print(name)

    return 0

if __name__ == "__main__":
    raise SystemExit(main())
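# When every sampled username comes back CLAIMED on the same sites, the run ends
# with a section like the following (site names here are made up):
#
#   --- Sites with CLAIMED in all runs (candidates for false positives) ---
#   SomeSiteA
#   SomeSiteB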