mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-06 22:19:01 +00:00
Compare commits
93 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 7fd9bb3692 | |||
| 385f9f5bb3 | |||
| dc8751ac55 | |||
| 9303b1686d | |||
| aa80bd4232 | |||
| f5c4b1c35d | |||
| 5e24117e93 | |||
| 777e503e30 | |||
| c222c96aeb | |||
| b213f6e079 | |||
| 9354331874 | |||
| 8a82eb6ee6 | |||
| a61f3b32c4 | |||
| fbb8255518 | |||
| 9bad5d8269 | |||
| a8e7ab4540 | |||
| 6db1df2ddb | |||
| 23adc178ea | |||
| 6834483360 | |||
| 6ed8fdefcc | |||
| 3fd34afb77 | |||
| ad95302745 | |||
| 44a6c729e3 | |||
| 6d0a22b738 | |||
| abce3c9be4 | |||
| 269d50eedc | |||
| e8f4318e5d | |||
| 75289c78bf | |||
| eeb38ccdc0 | |||
| d136014576 | |||
| 5d502eaef6 | |||
| 9e8a701c54 | |||
| 7b67c61240 | |||
| 0e113c4592 | |||
| fb4e17be92 | |||
| adb19e5930 | |||
| 116fae3e0f | |||
| bf495cd57e | |||
| e49aa533df | |||
| 5aa7f6429b | |||
| a5d337b765 | |||
| 5aa0c908b0 | |||
| 51b452ad71 | |||
| fa1a4d1b4a | |||
| 184519b202 | |||
| a203eecbb2 | |||
| dde1cd5d78 | |||
| 547512519b | |||
| b333a2e2b2 | |||
| 2835ec71c7 | |||
| af67a6a3f3 | |||
| 4f737b5260 | |||
| 185e09e4ea | |||
| 5865e0f375 | |||
| 815c8cb2f3 | |||
| 656fe1df24 | |||
| 1c5dc5f152 | |||
| bc3d9faad9 | |||
| 5aae2ee005 | |||
| b145e7b26f | |||
| abd9aa57fe | |||
| 2e430e5039 | |||
| f5786f11ce | |||
| 3e56c95e16 | |||
| 28f35f9a4f | |||
| 79cea49526 | |||
| 2d94269656 | |||
| 829bda885a | |||
| eb541dcf51 | |||
| 4c97025a32 | |||
| 2775181a6a | |||
| b00ef1f5dd | |||
| d3f13ac295 | |||
| 479a614d1d | |||
| e0559e4320 | |||
| 00a9249229 | |||
| 005863c2e0 | |||
| e3aada6aef | |||
| 9b35fc1ab0 | |||
| 146bc0481b | |||
| 5930a3022e | |||
| b4482e0ba4 | |||
| 2c55501bc2 | |||
| 3ba07591a1 | |||
| a2d4373b68 | |||
| b960acec10 | |||
| b1a211c3cd | |||
| 56d0c9f2f1 | |||
| 01049b730d | |||
| 2c2d3409e2 | |||
| e81b50ef61 | |||
| 9ac0a65914 | |||
| 4f397fed1c |
@@ -1,3 +1,10 @@
|
||||
#!/bin/sh
|
||||
echo 'Activating update_sitesmd hook script...'
|
||||
poetry run update_sitesmd
|
||||
poetry run update_sitesmd
|
||||
|
||||
echo 'Regenerating db_meta.json...'
|
||||
python3 utils/generate_db_meta.py
|
||||
|
||||
git add maigret/resources/db_meta.json
|
||||
git add maigret/resources/data.json
|
||||
git add sites.md
|
||||
|
||||
@@ -1,21 +1,30 @@
|
||||
name: Upload Python Package to PyPI when a Release is Created
|
||||
name: Upload Python Package to PyPI when a Release is Published
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [created]
|
||||
push:
|
||||
tags:
|
||||
- "v*"
|
||||
permissions:
|
||||
id-token: write
|
||||
contents: read
|
||||
types: [published]
|
||||
|
||||
jobs:
|
||||
build-and-publish:
|
||||
pypi-publish:
|
||||
name: Publish release to PyPI
|
||||
runs-on: ubuntu-latest
|
||||
environment:
|
||||
name: pypi
|
||||
url: https://pypi.org/p/maigret
|
||||
permissions:
|
||||
id-token: write
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: astral-sh/setup-uv@v3
|
||||
- run: uv build
|
||||
- name: Publish to PyPI (Trusted Publishing)
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
packages-dir: dist
|
||||
python-version: "3.x"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install build
|
||||
- name: Build package
|
||||
run: |
|
||||
python -m build
|
||||
- name: Publish package distributions to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
|
||||
@@ -4,13 +4,18 @@ on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
|
||||
concurrency:
|
||||
group: update-sites-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2.3.2
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: main
|
||||
fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository.
|
||||
|
||||
- name: Install system dependencies
|
||||
@@ -22,6 +27,12 @@ jobs:
|
||||
pip3 install .
|
||||
python3 ./utils/update_site_data.py --empty-only
|
||||
|
||||
- name: Regenerate db_meta.json
|
||||
run: python3 utils/generate_db_meta.py
|
||||
|
||||
- name: Remove ambiguous main tag
|
||||
run: git tag -d main || true
|
||||
|
||||
- name: Check for meaningful changes
|
||||
id: check
|
||||
run: |
|
||||
@@ -32,13 +43,18 @@ jobs:
|
||||
echo "has_changes=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Delete existing PR branch
|
||||
if: steps.check.outputs.has_changes == 'true'
|
||||
run: git push origin --delete auto/update-sites-list || true
|
||||
|
||||
- name: Create Pull Request
|
||||
if: steps.check.outputs.has_changes == 'true'
|
||||
uses: peter-evans/create-pull-request@v5
|
||||
uses: peter-evans/create-pull-request@v7
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
commit-message: "Updated site list and statistics"
|
||||
title: "Automated Sites List Update"
|
||||
body: "Automated changes to sites.md based on new Alexa rankings/statistics."
|
||||
branch: "auto/update-sites-list"
|
||||
base: main
|
||||
delete-branch: true
|
||||
+2
-1
@@ -42,4 +42,5 @@ settings.json
|
||||
|
||||
# other
|
||||
*.egg-info
|
||||
build
|
||||
build
|
||||
LLM
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2019 Sherlock Project
|
||||
Copyright (c) 2020-2021 Soxoj
|
||||
Copyright (c) 2020-2026 Soxoj
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -1,451 +0,0 @@
|
||||
# Site checks — guide (Maigret)
|
||||
|
||||
Working document for future changes: workflow, findings from reviews, and practical steps. See also [`site-checks-playbook.md`](site-checks-playbook.md) (short checklist), [`socid_extractor_improvements.log`](socid_extractor_improvements.log) (proposals for upstream identity extraction), and the code in [`maigret/checking.py`](../maigret/checking.py).
|
||||
|
||||
**Documentation maintenance:** whenever you improve Maigret, add search tooling, or change check logic, update **this file** and [`site-checks-playbook.md`](site-checks-playbook.md) in sync (see the section at the end). If you change rules about the JSON API check or the `socid_extractor` log format, update **[`socid_extractor_improvements.log`](socid_extractor_improvements.log)** (template / header) together with this guide.
|
||||
|
||||
---
|
||||
|
||||
## 1. How checks work
|
||||
|
||||
Logic lives in `process_site_result` ([`maigret/checking.py`](../maigret/checking.py)):
|
||||
|
||||
| `checkType` | Meaning |
|
||||
|-------------|---------|
|
||||
| `message` | Profile is “found” if the HTML contains **none** of the `absenceStrs` substrings **and** at least one `presenseStrs` marker matches. If `presenseStrs` is **empty**, presence is treated as true for **any** page (risky configuration). |
|
||||
| `status_code` | HTTP **2xx** is enough — only safe if the server does **not** return 200 for “user not found”. |
|
||||
| `response_url` | Custom flow with **redirects disabled** so the status/URL of the *first* response can be used. |
|
||||
|
||||
For other `checkType` values, [`make_site_result`](../maigret/checking.py) sets **`allow_redirects=True`**: the client follows redirects and `process_site_result` sees the **final** response body and status (not the pre-redirect hop). You do **not** need to “turn on” follow-redirect separately for most sites.
|
||||
|
||||
Sites with an `engine` field (e.g. XenForo) are merged with a template from the `engines` section in [`maigret/resources/data.json`](../maigret/resources/data.json) ([`MaigretSite.update_from_engine`](../maigret/sites.py)).
|
||||
|
||||
### `urlProbe`: probe URL vs reported profile URL
|
||||
|
||||
- **`url`** — pattern for the **public profile page** users should open (what appears in reports as `url_user`). Supports `{username}`, `{urlMain}`, `{urlSubpath}`; the username segment is URL-encoded when the string is built ([`make_site_result`](../maigret/checking.py)).
|
||||
- **`urlProbe`** (optional) — if set, Maigret sends the HTTP **GET** (or HEAD where applicable) to **this** URL for the check, instead of to `url`. Same placeholders. Use it when the reliable signal is a **JSON/API** endpoint but the human-facing link must stay on the main site (e.g. `https://picsart.com/u/{username}` + probe `https://api.picsart.com/users/show/{username}.json`, or GitHub’s `https://github.com/{username}` + `https://api.github.com/users/{username}`).
|
||||
|
||||
If `urlProbe` is omitted, the probe URL defaults to `url`.
|
||||
|
||||
### Redirects and final URL as a signal
|
||||
|
||||
If the **HTML shell** looks the same for “user exists” and “user does not exist” (typical SPA), it is still worth checking whether the **server** behaves differently:
|
||||
|
||||
- **Final URL** after redirects (e.g. profile canonical URL vs `/404` path).
|
||||
- **Redirect chain** length or target host (e.g. lander vs profile).
|
||||
|
||||
If that differs reliably, you may be able to use **`checkType`: `response_url`** in [`data.json`](../maigret/resources/data.json) (no auto-follow) or extend logic — but only when the difference is stable.
|
||||
|
||||
**Server-side HTTP vs client-side navigation.** Maigret follows **HTTP** redirects only; it does **not** run JavaScript. If the browser shows a navigation to `/u/name/posts` or `/not-found` **after** the SPA bundle loads, that may never appear as an extra hop in `curl`/aiohttp — only a **trailing-slash** `301` might show up. Always confirm with `curl -sIL` / a small script whether the **Location** chain differs for real vs fake users before relying on URL-based rules.
|
||||
|
||||
**Empirical check (claimed vs non-existent usernames, `GET` with follow redirects, no JS):**
|
||||
|
||||
| Site | Result |
|
||||
|------|--------|
|
||||
| **Kaskus** | No HTTP redirects beyond the request path; same generic `<title>` and near-identical body length — **no** discriminating signal from redirects alone. |
|
||||
| **Bibsonomy** | Both requests redirect to **`/pow-challenge/?return=/user/...`** (proof-of-work). Only the `return` path changes with the username; **both** existing and fake hit the same challenge flow — not a profile-vs-missing distinction. |
|
||||
| **Picsart (web UI `https://picsart.com/u/{username}`)** | Only a **trailing-slash** `301`; the first HTML is the same empty app shell (~3 KiB) for real and fake users. Browser-only routes such as `…/posts` vs `…/not-found` are **not** visible as additional HTTP redirects in this pipeline. |
|
||||
|
||||
**Picsart — workable check via public API.** The site exposes **`https://api.picsart.com/users/show/{username}.json`**: JSON with `"status":"success"` and a user object when the account exists, and `"reason":"user_not_found"` when it does not. Put that URL in **`urlProbe`**, set **`url`** to the web profile pattern **`https://picsart.com/u/{username}`**, and use **`checkType`: `message`** with narrow `presenseStrs` / `absenceStrs` so reports show the human link while the request hits the API (see **`urlProbe`** above).
|
||||
|
||||
For **Kaskus** and **Bibsonomy**, HTTP-level comparison still does **not** unlock a safe check without PoW / richer signals; keep **`disabled: true`** until something stable appears (API, SSR markers, etc.).
|
||||
|
||||
---
|
||||
|
||||
## 2. Standard checks: public JSON API and `socid_extractor` log
|
||||
|
||||
### 2.1 Public JSON API (always)
|
||||
|
||||
When diagnosing a site—especially **SPAs**, **soft 404s**, or **near-identical HTML** for real vs fake users—**routinely look for a public JSON (or JSON-like) API** used for profile or user lookup. Typical leads: paths containing `/api/`, `/v1/`, `graphql`, `users/show`, `.json` suffixes, or the same endpoints mobile apps use. Verify with `curl` (or the Maigret request path) that **claimed** and **unclaimed** usernames produce **reliably different** bodies or status codes. If such an endpoint is more stable than HTML, put it in **`urlProbe`** and keep **`url`** as the canonical profile page on the main site (see **`urlProbe`** in section 1). If there is no separate public URL for humans, you may still point **`url`** at the API only (reports will show that URL).
|
||||
|
||||
This is a **standard** part of site-check work, not an optional extra.
|
||||
|
||||
### 2.2 Mandatory: [`LLM/socid_extractor_improvements.log`](socid_extractor_improvements.log)
|
||||
|
||||
If you discover **either**:
|
||||
|
||||
1. **JSON embedded in HTML** with user/profile fields (inline scripts, `__NEXT_DATA__`, `application/ld+json`, hydration blobs, etc.), or
|
||||
2. A **standalone JSON HTTP response** (public API) with user/profile data for that service,
|
||||
|
||||
you **must append** a proposal block to **[`LLM/socid_extractor_improvements.log`](socid_extractor_improvements.log)**.
|
||||
|
||||
**Why:** Maigret calls [`socid_extractor.extract`](https://pypi.org/project/socid-extractor/) on the response body ([`extract_ids_data` in `checking.py`](../maigret/checking.py)) to fill `ids_data`. New payloads usually need a **new scheme** upstream (`flags`, `regex`, optional `extract_json`, `fields`, optional `url_mutations` / `transforms`), matching patterns such as **`GitHub API`** or **`Gitlab API`** in `socid_extractor`’s `schemes.py`.
|
||||
|
||||
**Each log entry must include:**
|
||||
|
||||
- **Date** — ISO `YYYY-MM-DD` (day you add the entry).
|
||||
- **Example username** — Prefer the site’s `usernameClaimed` from `data.json`, or any account that reproduces the payload.
|
||||
- **Proposal** — Use the **block template** in the log file: detection idea, optional URL mutation, and field mappings in the same style as existing schemes.
|
||||
|
||||
If the service is **already covered** by an existing `socid_extractor` scheme, add a **short** entry anyway (date, example username, scheme name, “already implemented”) so there is an audit trail.
|
||||
|
||||
Do **not** paste secrets, cookies, or full private JSON; short key names and structure hints are enough.
|
||||
|
||||
---
|
||||
|
||||
## 3. Improvement workflow
|
||||
|
||||
### Phase A — Reproduce
|
||||
|
||||
1. Targeted run:
|
||||
```bash
|
||||
maigret --db /path/to/maigret/resources/data.json \
|
||||
TEST_USERNAME \
|
||||
--site "SiteName" \
|
||||
--print-not-found --print-errors \
|
||||
--no-progressbar -vv
|
||||
```
|
||||
2. Run separately with a **real** existing username and a **definitely non-existent** one (as `usernameClaimed` / `usernameUnclaimed` in JSON).
|
||||
3. If needed: `-vvv` and `debug.log` (raw response).
|
||||
4. Automated pair check:
|
||||
```bash
|
||||
maigret --db ... --self-check --site "SiteName" --no-progressbar
|
||||
```
|
||||
|
||||
### Phase B — Classify the cause
|
||||
|
||||
| Symptom | Likely cause |
|
||||
|---------|----------------|
|
||||
| False “found” with `status_code` | Soft 404 (200 on a “not found” page). |
|
||||
| False “found” with `message` | Overly broad `presenseStrs` (`name`, `email`, JSON keys) or stale `absenceStrs`. |
|
||||
| Same HTML for different users | SPA / skeleton shell before hydration — also compare **final URL / redirect chain** (see above); if still identical, often `disabled`. |
|
||||
| Login page instead of profile | XenForo etc.: guest, `ignore403`, “must be logged in” strings. |
|
||||
| reCAPTCHA / “Checking your browser” / “not a bot” | Bot protection; Maigret’s default User-Agent may worsen the response. |
|
||||
| Redirect to another domain / lander | Stale URL template. |
|
||||
|
||||
### Phase C — Edits in [`data.json`](../maigret/resources/data.json)
|
||||
|
||||
1. Update `url` / `urlMain` if needed (HTTPS, new profile path).
|
||||
2. Replace inappropriate `status_code` with `message` (or `response_url`), choosing:
|
||||
- **`absenceStrs`** — only what reliably appears on the “user does not exist” page;
|
||||
- **`presenseStrs`** — narrow markers of a real profile (avoid generic words).
|
||||
3. For XenForo: override only fields that differ in the site entry; do not break the global `engines` template.
|
||||
4. Refresh `usernameClaimed` / `usernameUnclaimed` if reference accounts disappeared.
|
||||
5. Set **`headers`** (e.g. another `User-Agent`) if the site serves a captcha only to “suspicious” clients.
|
||||
6. Use **`errors`**: HTML substring → meaningful check error (UNKNOWN), so it is not confused with “available”.
|
||||
|
||||
### Phase D — Decision criteria
|
||||
|
||||
| Outcome | When to use |
|
||||
|---------|-------------|
|
||||
| **Check fixed** | The `claimed` / `unclaimed` pair behaves predictably, `--self-check` passes, no regression on a similar site with the same engine. |
|
||||
| **Check disabled** (`disabled: true`) | Cloudflare / anti-bot / login required / indistinguishable SPA without stable markers. |
|
||||
| **Entry removed** | **Only** if the domain/service is gone (NXDOMAIN, clearly dead project), not “because it is hard to fix”. |
|
||||
|
||||
### Phase E — Before commit
|
||||
|
||||
- `maigret --self-check` for affected sites.
|
||||
- `make test`.
|
||||
|
||||
---
|
||||
|
||||
## 4. Findings from reviews (concrete site batch)
|
||||
|
||||
Summary from an earlier false-positive review for: OpenSea, Mercado Livre, Redtube, Tom’s Guide, Kaggle, Kaskus, Livemaster, TechPowerUp, authorSTREAM, Bibsonomy, Bulbagarden, iXBT, Serebii, Picsart, Hashnode, hi5.
|
||||
|
||||
### What most often broke checks
|
||||
|
||||
1. **`status_code` where content checks are needed** — soft 404 with status 200.
|
||||
2. **Broad `presenseStrs`** — matches on error pages or generic SPA shells.
|
||||
3. **XenForo + guest** — HTML includes strings like “You must be logged in” that overlap the engine template.
|
||||
4. **User-Agent** — on some sites (e.g. Kaggle) the default UA triggered a reCAPTCHA page instead of profile HTML; a deliberate `User-Agent` in site `headers` helped.
|
||||
5. **SPAs and redirects** — identical first HTML, redirect to lander / another product (hi5 → Tagged), URL format changes by region (Mercado Livre).
|
||||
|
||||
### What worked as a fix
|
||||
|
||||
- Switching to **`message`** with narrow strings from **`<title>`** or unique markup where stable (**Kaggle**, **Mercado Livre**, **Hashnode**).
|
||||
- For **Kaggle**, additionally: **`headers`**, **`errors`** for browser-check text.
|
||||
- **Redtube** stayed valid on **`status_code`** with a stable **404** for non-existent users.
|
||||
- **Picsart**: the web profile URL is a thin SPA shell; use the **JSON API** (`api.picsart.com/users/show/{username}.json`) in **`urlProbe`** (keeping **`url`** as the web profile `https://picsart.com/u/{username}`) with **`message`**-style markers (`"status":"success"` vs `user_not_found`), not the browser-only `/posts` vs `/not-found` navigation.
|
||||
|
||||
### What required disabling checks
|
||||
|
||||
Where you **cannot** reliably tell “profile exists” from “no profile” without bypassing protection, login, or full JS:
|
||||
|
||||
- Anti-bot / captcha / “not a bot” page;
|
||||
- The needed page is not accessible to guests (login required);
|
||||
- SPA with indistinguishable first response;
|
||||
- Forums returning **403** and a login page instead of a member profile for the member-search URL;
|
||||
- Stale URLs that redirect to a stub.
|
||||
|
||||
In those cases **`disabled: true`** is better than false “found”; remove the DB entry only on **actual** domain death.
|
||||
|
||||
### Code notes
|
||||
|
||||
- For the `status_code` branch in `process_site_result`, use **strict** comparison `check_type == "status_code"`, not a substring match inside `"status_code"`.
|
||||
- Treat empty `presenseStrs` with `message` as risky: when debugging, watch the DEBUG-level logs if such a diagnostic exists in the code.
|
||||
|
||||
---
|
||||
|
||||
## 5. Future ideas (Maigret improvements)
|
||||
|
||||
- A mode or script: one site, two usernames, print statuses and first N bytes of the response (wrapper around `maigret()`).
|
||||
- Document in CLI help that **`--use-disabled-sites`** is needed to analyze disabled entries.
|
||||
|
||||
---
|
||||
|
||||
## 6. Development utilities
|
||||
|
||||
### 6.1 `utils/site_check.py` — Single site diagnostics
|
||||
|
||||
A comprehensive utility for testing individual sites with multiple modes:
|
||||
|
||||
```bash
|
||||
# Basic comparison of claimed vs unclaimed (aiohttp)
|
||||
python utils/site_check.py --site "VK" --check-claimed
|
||||
|
||||
# Test via Maigret's checker directly
|
||||
python utils/site_check.py --site "VK" --maigret
|
||||
|
||||
# Compare aiohttp vs Maigret results (find discrepancies)
|
||||
python utils/site_check.py --site "VK" --compare-methods
|
||||
|
||||
# Full diagnosis with recommendations
|
||||
python utils/site_check.py --site "VK" --diagnose
|
||||
|
||||
# Test with custom URL
|
||||
python utils/site_check.py --url "https://example.com/{username}" --compare user1 user2
|
||||
|
||||
# Find a valid username for a site
|
||||
python utils/site_check.py --site "VK" --find-user
|
||||
```
|
||||
|
||||
**Key features:**
|
||||
- `--maigret` — Uses Maigret's actual checking code, not raw aiohttp
|
||||
- `--compare-methods` — Shows if aiohttp and Maigret see different results (useful for debugging)
|
||||
- `--diagnose` — Validates checkType against actual responses, suggests fixes
|
||||
- Color output with markers detection (captcha, cloudflare, login, etc.)
|
||||
- `--json` flag for machine-readable output
|
||||
|
||||
**When to use each mode:**
|
||||
|
||||
| Mode | Use case |
|
||||
|------|----------|
|
||||
| `--check-claimed` | Quick sanity check: do claimed/unclaimed still differ? |
|
||||
| `--maigret` | Verify Maigret's actual behavior matches expectations |
|
||||
| `--compare-methods` | Debug "works in curl but fails in Maigret" issues |
|
||||
| `--diagnose` | Full analysis when a site is broken, get fix recommendations |
|
||||
|
||||
### 6.2 `utils/check_top_n.py` — Mass site checking
|
||||
|
||||
Batch-check top N sites by Alexa rank with categorized reporting:
|
||||
|
||||
```bash
|
||||
# Check top 100 sites
|
||||
python utils/check_top_n.py --top 100
|
||||
|
||||
# Faster with more parallelism
|
||||
python utils/check_top_n.py --top 100 --parallel 10
|
||||
|
||||
# Output JSON report
|
||||
python utils/check_top_n.py --top 100 --output report.json
|
||||
|
||||
# Only show broken sites
|
||||
python utils/check_top_n.py --top 100 --only-broken
|
||||
```
|
||||
|
||||
**Output categories:**
|
||||
- `working` — Site check passes
|
||||
- `broken` — Check fails (wrong status, missing markers)
|
||||
- `timeout` — Request timed out
|
||||
- `anti_bot` — 403/429 or captcha detected
|
||||
- `error` — Connection or other errors
|
||||
- `disabled` — Already disabled in data.json
|
||||
|
||||
**Report includes:**
|
||||
- Summary counts by category
|
||||
- List of broken sites with issues
|
||||
- Recommendations for fixes (e.g., "Switch to checkType: status_code")
|
||||
|
||||
### 6.3 Self-check behavior (`--self-check`)
|
||||
|
||||
The self-check command has been improved to be less aggressive:
|
||||
|
||||
```bash
|
||||
# Check sites WITHOUT auto-disabling (default)
|
||||
maigret --self-check --site "VK"
|
||||
|
||||
# Auto-disable failing sites (old behavior)
|
||||
maigret --self-check --site "VK" --auto-disable
|
||||
|
||||
# Show detailed diagnosis for each failure
|
||||
maigret --self-check --site "VK" --diagnose
|
||||
```
|
||||
|
||||
**Behavior changes:**
|
||||
|
||||
| Flag | Effect |
|
||||
|------|--------|
|
||||
| `--self-check` alone | Reports issues but does NOT disable sites |
|
||||
| `--auto-disable` | Automatically disables sites that fail (opt-in) |
|
||||
| `--diagnose` | Prints detailed diagnosis with recommendations |
|
||||
|
||||
**Why this matters:**
|
||||
- Old behavior was too aggressive — sites got disabled without explanation
|
||||
- New behavior reports issues and suggests fixes
|
||||
- Explicit `--auto-disable` required to modify database
|
||||
|
||||
---
|
||||
|
||||
## 7. Lessons learned (practical observations)
|
||||
|
||||
Collected from hands-on work fixing top-ranked sites (Reddit, Wikipedia, Microsoft Learn, Baidu, etc.).
|
||||
|
||||
### 7.1 JSON API is the first thing to look for
|
||||
|
||||
Both Reddit and Microsoft Learn had working public APIs that solved the problem entirely. The web pages were SPAs or blocked by anti-bot measures, but the APIs worked reliably:
|
||||
|
||||
- **Reddit**: `https://api.reddit.com/user/{username}/about` — returns JSON with user data or `{"message": "Not Found", "error": 404}`.
|
||||
- **Microsoft Learn**: `https://learn.microsoft.com/api/profiles/{username}` — returns JSON with `userName` field or HTTP 404.
|
||||
|
||||
This confirms the playbook recommendation: always check for `/api/`, `.json`, GraphQL endpoints before giving up on a site.
|
||||
|
||||
### 7.2 `urlProbe` is a powerful tool
|
||||
|
||||
It separates "what we check" (API) from "what we show the user" (human-readable profile URL). Reddit is a perfect example:
|
||||
|
||||
```json
|
||||
{
|
||||
"url": "https://www.reddit.com/user/{username}",
|
||||
"urlProbe": "https://api.reddit.com/user/{username}/about",
|
||||
"checkType": "message",
|
||||
"presenseStrs": ["\"name\":"],
|
||||
"absenceStrs": ["Not Found"]
|
||||
}
|
||||
```
|
||||
|
||||
The check hits the API, but reports display `www.reddit.com/user/blue`.
|
||||
|
||||
### 7.3 aiohttp ≠ curl ≠ requests
|
||||
|
||||
Wikipedia returned HTTP 200 for `curl` and Python `requests`, but HTTP 403 for `aiohttp`. This is **TLS fingerprinting** — the server identifies the HTTP library by cryptographic characteristics of the TLS handshake, not by headers.
|
||||
|
||||
**Key insight:** Changing `User-Agent` does **not** help against TLS fingerprinting. Always test with aiohttp directly (or via Maigret with `-vvv` and `debug.log`), not just `curl`.
|
||||
|
||||
```python
|
||||
# This returns 403 for Wikipedia even with browser UA:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(url, headers={"User-Agent": "Mozilla/5.0 ..."}) as resp:
|
||||
print(resp.status) # 403
|
||||
```
|
||||
|
||||
### 7.4 HTTP 403 in Maigret can mean different things
|
||||
|
||||
Initially it seemed Wikipedia was returning 403, but `curl` showed 200. Only `debug.log` revealed the real picture — aiohttp was getting blocked at TLS level.
|
||||
|
||||
**Lesson:** Use `-vvv` flag and inspect `debug.log` for raw response status and body. The warning message alone may be misleading.
|
||||
|
||||
### 7.5 Dead services migrate, not disappear
|
||||
|
||||
MSDN Social and TechNet profiles redirected to Microsoft Learn. Instead of deleting old entries:
|
||||
|
||||
1. Keep old entries with `disabled: true` as a historical record.
|
||||
2. Create a new entry for the current service with a working API.
|
||||
|
||||
This preserves the audit trail and avoids breaking existing workflows.
|
||||
|
||||
### 7.6 `status_code` is more reliable than `message` for APIs
|
||||
|
||||
Microsoft Learn API returns HTTP 404 for non-existent users — a clean signal without HTML parsing. For JSON APIs that return proper HTTP status codes, `status_code` is often the best choice:
|
||||
|
||||
```json
|
||||
{
|
||||
"checkType": "status_code",
|
||||
"urlProbe": "https://learn.microsoft.com/api/profiles/{username}"
|
||||
}
|
||||
```
|
||||
|
||||
No need for fragile string matching when the API speaks HTTP correctly.
|
||||
|
||||
### 7.8 Engine templates can silently break across many sites
|
||||
|
||||
The **vBulletin** engine template has `absenceStrs` in five languages ("This user has not registered…", "Пользователь не зарегистрирован…", etc.). In a batch review of ~12 vBulletin forums (oneclickchicks, mirf, Pesiq, VKMOnline, forum.zone-game.info, etc.), **none** of the absence strings matched — the forums returned identical pages for both claimed and unclaimed usernames. Root cause: many of these forums require login to view member profiles, so they serve a generic page (no "user not registered" message at all) instead of an informative error.
|
||||
|
||||
**Lesson:** When a whole engine class shows false positives, do not patch sites one by one — check whether the **engine template** itself still matches the actual error pages. A template written for one version/language pack may silently stop working after a forum upgrade or config change.
|
||||
|
||||
### 7.9 Search-by-author URLs are architecturally unreliable
|
||||
|
||||
Several sites (OnanistovNet, Shoppingzone, Pogovorim, Astrogalaxy, Sexwin) used a phpBB-style `search.php?keywords=&terms=all&author={username}` URL as the check endpoint. This searches for **posts** by that author, not for the user account itself. Even if the markers worked, a user who exists but has zero posts would be indistinguishable from a non-existent user. And in practice, the sites changed their response format — some now return HTTP 404, others dropped the expected Russian absence text altogether.
|
||||
|
||||
**Lesson:** Avoid author-search URLs as the check endpoint; they test "has posts" rather than "account exists" and are doubly fragile (both logic mismatch and format drift).
|
||||
|
||||
### 7.10 Some sites generate a page for any path — permanent false positives
|
||||
|
||||
Two distinct patterns:
|
||||
|
||||
- **Pbase** creates a stub page titled "pbase Artist {username}" for **every** URL, real or fake. Both return HTTP 200 with nearly identical content (~3.3 KB). No markers can distinguish them.
|
||||
- **ffm.bio** is even trickier: for the non-existent username `a.slomkoowski` it generated a page titled "mr.a" with description "a is a", apparently fuzzy-matching the path to the closest real entry. Both return HTTP 200 with large, content-rich pages.
|
||||
|
||||
**Lesson:** Before writing markers for a site, verify that the "unclaimed" URL actually produces an **error-like** response (different status, different title, unique error text). If the site always returns a plausible-looking page, no combination of `presenseStrs` / `absenceStrs` will help — `disabled: true` is the only safe option.
|
||||
|
||||
### 7.11 TLS fingerprinting can degrade over time (Kaggle)
|
||||
|
||||
Kaggle was previously fixed with a custom `User-Agent` header and `errors` for the "Checking your browser" captcha page. In the latest batch review, aiohttp receives HTTP 404 with identical content for **both** claimed and unclaimed usernames — the site now blocks the entire request before it reaches the profile page. This matches the TLS fingerprinting pattern seen earlier with Wikipedia (section 7.3), but here the degradation happened **after** a working fix was already in place.
|
||||
|
||||
**Lesson:** Sites that rely on bot-detection can tighten their rules at any time. A working `User-Agent` override today may fail tomorrow. When a previously fixed site starts returning identical responses for both usernames, suspect TLS fingerprinting first, and accept `disabled: true` if no public API is available.
|
||||
|
||||
### 7.12 API endpoints may bypass Cloudflare even when the main site is blocked
|
||||
|
||||
All four Fandom wikis returned HTTP 403 with a Cloudflare "Just a moment..." challenge when aiohttp accessed the user profile page (`/wiki/User:{username}`). However, the **MediaWiki API** on the same domain (`/api.php?action=query&list=users&ususers={username}&format=json`) returned clean JSON without any challenge. Similarly, **Substack** served a captcha-laden SPA for `/@{username}`, but its `public_profile` API (`/api/v1/user/{username}/public_profile`) responded with proper JSON and correct HTTP 404 for missing users.
|
||||
|
||||
This is likely because API routes are excluded from the Cloudflare WAF rules or use a different pipeline than the HTML-serving paths.
|
||||
|
||||
**Lesson:** When a site's main pages are blocked by Cloudflare or similar WAF, still check API endpoints on the **same domain** — they may not go through the same protection layer. This is especially true for:
|
||||
- MediaWiki's `api.php` on wiki farms (Fandom, Wikia, self-hosted MediaWiki)
|
||||
- REST API paths (`/api/v1/`, `/api/v2/`) on SPA-heavy sites
|
||||
- Internal data endpoints that the SPA itself calls
|
||||
|
||||
### 7.13 GraphQL APIs often support GET, not just POST
|
||||
|
||||
**hashnode** exposes a GraphQL endpoint at `https://gql.hashnode.com`. While GraphQL is typically associated with POST requests, many implementations also support **GET** with the query passed as a URL parameter. This is critical for Maigret, which only supports GET/HEAD for `urlProbe`.
|
||||
|
||||
```
|
||||
GET https://gql.hashnode.com?query=%7Buser(username%3A%20%22melwinalm%22)%20%7B%20name%20username%20%7D%7D
|
||||
→ {"data":{"user":{"name":"Melwin D'Almeida","username":"melwinalm"}}}
|
||||
|
||||
GET https://gql.hashnode.com?query=%7Buser(username%3A%20%22a.slomkoowski%22)%20%7B%20name%20username%20%7D%7D
|
||||
→ {"data":{"user":null}}
|
||||
```
|
||||
|
||||
**Lesson:** Before giving up on a GraphQL-only site, try the same query via GET with `?query=...` (URL-encoded). Many GraphQL servers accept both methods.
|
||||
|
||||
### 7.14 URL-encoding resolves template placeholder conflicts
|
||||
|
||||
The hashnode GraphQL query `{user(username: "{username}") { name }}` contains curly braces that conflict with Maigret's `{username}` placeholder — Python's `str.format()` would raise a `KeyError` on `{user(username...}`.
|
||||
|
||||
The fix: URL-encode the GraphQL braces (`{` → `%7B`, `}` → `%7D`) but leave `{username}` as-is. Python's `.format()` only interprets literal `{…}` as placeholders, not `%7B…%7D`, and the GraphQL server decodes the percent-encoding on its end:
|
||||
|
||||
```
|
||||
urlProbe: https://gql.hashnode.com?query=%7Buser(username%3A%20%22{username}%22)%20%7B%20name%20username%20%7D%7D
|
||||
```
|
||||
|
||||
After `.format(username="melwinalm")`:
|
||||
```
|
||||
https://gql.hashnode.com?query=%7Buser(username%3A%20%22melwinalm%22)%20%7B%20name%20username%20%7D%7D
|
||||
```
|
||||
|
||||
**Lesson:** When a `urlProbe` needs literal curly braces (GraphQL, JSON in URL, etc.), percent-encode them. This is a general technique for any `data.json` URL field processed by `.format()`.
|
||||
|
||||
### 7.7 The playbook classification works
|
||||
|
||||
The decision tree from the documentation accurately describes real-world cases:
|
||||
|
||||
| Situation | Playbook says | Actual result |
|
||||
|-----------|---------------|---------------|
|
||||
| Captcha (Baidu) | `disabled: true` | Correct |
|
||||
| TLS fingerprinting (Wikipedia) | `disabled: true` (anti-bot) | Correct |
|
||||
| Working API available (Reddit, MS Learn) | Use `urlProbe` | Correct |
|
||||
| Service migrated (MSDN → MS Learn) | Update URL or create new entry | Correct |
|
||||
|
||||
---
|
||||
|
||||
## Documentation maintenance
|
||||
|
||||
For any of the changes below, **always** keep these artifacts in sync — this file ([`site-checks-guide.md`](site-checks-guide.md)), [`site-checks-playbook.md`](site-checks-playbook.md), and (when rules or templates change) the header/template in [`socid_extractor_improvements.log`](socid_extractor_improvements.log):
|
||||
|
||||
- Maigret code changes (including [`maigret/checking.py`](../maigret/checking.py), request executors, CLI);
|
||||
- New or changed search tools / helper utilities for site checks;
|
||||
- Changes to rules or semantics of `checkType`, `data.json` fields, self-check, etc.;
|
||||
- Changes to the **public JSON API** diagnostic step or **mandatory** `socid_extractor` logging rules.
|
||||
|
||||
Prefer updating the guide, playbook, and log template in one commit or in the same task so instructions do not diverge. **Append-only:** new proposals go at the bottom of `socid_extractor_improvements.log`; do not delete historical entries when editing the template.
|
||||
@@ -1,84 +0,0 @@
|
||||
# Site checks — playbook (Maigret)
|
||||
|
||||
Short checklist for edits to [`maigret/resources/data.json`](../maigret/resources/data.json) and, when needed, [`maigret/checking.py`](../maigret/checking.py). Full guide: [`site-checks-guide.md`](site-checks-guide.md). Upstream extraction proposals: [`socid_extractor_improvements.log`](socid_extractor_improvements.log).
|
||||
|
||||
**Documentation maintenance:** whenever you improve Maigret, add search tooling, or change check logic, update **both** this file and [`site-checks-guide.md`](site-checks-guide.md) (see the “Documentation maintenance” section at the end of that file). When JSON API / `socid_extractor` logging rules change, update the **template header** in [`socid_extractor_improvements.log`](socid_extractor_improvements.log) in the same change.
|
||||
|
||||
## 0. Standard checks (do alongside reproduce / classify)
|
||||
|
||||
- **Public JSON API:** always look for a stable JSON (or GraphQL JSON) profile endpoint (`/api/`, `.json`, mobile-style URLs). When the API is more reliable than HTML, set **`urlProbe`** to that endpoint and keep **`url`** as the human-readable profile link (e.g. `https://picsart.com/u/{username}`). If there is no separate profile URL, use the API as `url` only. Details: **`urlProbe`** and section **2.1** in [`site-checks-guide.md`](site-checks-guide.md).
|
||||
- **`socid_extractor` log (mandatory):** if you find **embedded user JSON in HTML** or a **standalone JSON profile API**, append a dated entry (with **example username**) to [`socid_extractor_improvements.log`](socid_extractor_improvements.log). Details: section **2.2** in [`site-checks-guide.md`](site-checks-guide.md).
|
||||
|
||||
## 1. Reproduce
|
||||
|
||||
- Run a targeted check:
|
||||
`maigret USER --db /path/to/maigret/resources/data.json --site "SiteName" --print-not-found --print-errors --no-progressbar -vv`
|
||||
- Compare an **existing** and a **non-existent** username (as `usernameClaimed` / `usernameUnclaimed` in JSON).
|
||||
- With `-vvv`, inspect `debug.log` (raw response in the log).
|
||||
|
||||
## 2. Classify the cause
|
||||
|
||||
| Symptom | Typical cause | Action |
|
||||
|--------|-----------------|--------|
|
||||
| HTTP 200 for “user does not exist” | Soft 404 | Move from `status_code` to `message` or `response_url`; add `absenceStrs` / narrow `presenseStrs` |
|
||||
| Generic words match (`name`, `email`) | `presenseStrs` too broad | Remove generic markers; add profile-specific ones |
|
||||
| Same HTML without JS | SPA / skeleton shell | Compare **final URL and HTTP redirects** (Maigret already follows redirects by default). If the browser shows extra routes (`/posts`, `/not-found`) only **after JS**, they will **not** appear to Maigret — try a **public JSON/API** endpoint for the same site if one exists. See **Redirects and final URL** and **Picsart** in [`site-checks-guide.md`](site-checks-guide.md). |
|
||||
| 403 / “Log in” / guest-only | Auth or anti-bot required | `disabled: true` |
|
||||
| reCAPTCHA / “Checking your browser” | Bot protection | Try a reasonable `User-Agent` in `headers`; else `errors` + UNKNOWN or `disabled` |
|
||||
| Domain does not resolve / persistent timeout | Dead service | Remove entry **only** after confirming the domain is dead |
|
||||
|
||||
## 3. Data edits
|
||||
|
||||
1. Update `url` / `urlMain` if needed (HTTPS redirects). Use optional **`urlProbe`** when the HTTP check should hit a different URL than the profile link shown in reports (API vs web UI).
|
||||
2. For `message`: **always** tune string pairs so `absenceStrs` fire on “no user” pages and `presenseStrs` fire on real profiles without false absence hits.
|
||||
3. Engine (`engine`, e.g. XenForo): override only differing fields in the site entry so other sites are not broken.
|
||||
4. Keep `status_code` only if the response **reliably** differs by status code without soft 404.
|
||||
|
||||
## 4. Verify
|
||||
|
||||
- `maigret --self-check --site "SiteName" --db ...` for touched entries.
|
||||
- `make test` before commit.
|
||||
|
||||
## 5. Code notes
|
||||
|
||||
- `process_site_result` uses strict comparison to `"status_code"` for `checkType` (not a substring trick).
|
||||
- Empty `presenseStrs` with `message` means “presence always true”; a debug line is logged only at DEBUG level.
|
||||
|
||||
## 6. Development utilities
|
||||
|
||||
Quick reference for site check utilities. Full details: section **6** in [`site-checks-guide.md`](site-checks-guide.md).
|
||||
|
||||
| Command | Purpose |
|
||||
|---------|---------|
|
||||
| `python utils/site_check.py --site "X" --check-claimed` | Quick aiohttp comparison |
|
||||
| `python utils/site_check.py --site "X" --maigret` | Test via Maigret checker |
|
||||
| `python utils/site_check.py --site "X" --compare-methods` | Find aiohttp vs Maigret discrepancies |
|
||||
| `python utils/site_check.py --site "X" --diagnose` | Full diagnosis with fix recommendations |
|
||||
| `python utils/check_top_n.py --top 100` | Mass-check top 100 sites |
|
||||
| `maigret --self-check --site "X"` | Self-check (reports only, no auto-disable) |
|
||||
| `maigret --self-check --site "X" --auto-disable` | Self-check with auto-disable |
|
||||
| `maigret --self-check --site "X" --diagnose` | Self-check with detailed diagnosis |
|
||||
|
||||
## 7. Quick tips (lessons learned)
|
||||
|
||||
Practical observations from fixing top-ranked sites. Full details: section **7** in [`site-checks-guide.md`](site-checks-guide.md).
|
||||
|
||||
| Tip | Why it matters |
|
||||
|-----|----------------|
|
||||
| **API first** | Reddit, Microsoft Learn — APIs worked when web pages were blocked. Always check `/api/`, `.json` endpoints. |
|
||||
| **`urlProbe` separates check from display** | Check via API, show human URL in reports. Example: Reddit API → `www.reddit.com/user/` link. |
|
||||
| **aiohttp ≠ curl** | Wikipedia returned 200 for curl, 403 for aiohttp (TLS fingerprinting). Always test with Maigret directly. |
|
||||
| **Use `debug.log`** | Run with `-vvv` to see raw response. Warning messages alone can be misleading. |
|
||||
| **`status_code` for clean APIs** | If API returns proper 404 for missing users, prefer `status_code` over `message`. |
|
||||
| **Migrate, don't delete** | MSDN → Microsoft Learn: keep old entry disabled, create new one for current service. |
|
||||
| **Engine templates break silently** | vBulletin `absenceStrs` failed on ~12 forums at once — many require login, showing a generic page with no error text. Check the engine template first. |
|
||||
| **Search-by-author is unreliable** | phpBB `search.php?author=` checks for posts, not accounts. A user with zero posts looks identical to a non-existent user. Avoid these URLs. |
|
||||
| **Some sites always generate a page** | Pbase stubs "pbase Artist {name}" for any path; ffm.bio fuzzy-matches to the nearest real entry. No markers can help — `disabled: true`. |
|
||||
| **TLS fingerprinting degrades over time** | Kaggle's custom `User-Agent` fix stopped working — aiohttp now gets 404 for both usernames. Accept `disabled: true` when no API exists. |
|
||||
| **API endpoints bypass Cloudflare** | Fandom `api.php` and Substack `/api/v1/` returned clean JSON while main pages were blocked by Cloudflare. Always try API paths on the same domain. |
|
||||
| **GraphQL supports GET too** | hashnode GraphQL works via `GET ?query=...` (URL-encoded). Don't assume POST-only — Maigret can use GET `urlProbe` for GraphQL. |
|
||||
| **URL-encode braces for template safety** | GraphQL `{...}` conflicts with Maigret's `{username}`. Use `%7B`/`%7D` for literal braces in `urlProbe` — `.format()` ignores percent-encoded chars. |
|
||||
|
||||
## 8. Documentation maintenance
|
||||
|
||||
When you change Maigret, add search tools, or change check logic, keep **this playbook**, [`site-checks-guide.md`](site-checks-guide.md), and (when applicable) the template in [`socid_extractor_improvements.log`](socid_extractor_improvements.log) aligned. New log **entries** are append-only at the bottom of that file.
|
||||
@@ -1,4 +0,0 @@
|
||||
include LICENSE
|
||||
include README.md
|
||||
include requirements.txt
|
||||
include maigret/resources/*
|
||||
@@ -25,7 +25,7 @@
|
||||
|
||||
<i>The Commissioner Jules Maigret is a fictional French police detective, created by Georges Simenon. His investigation method is based on understanding the personality of different people and their interactions.</i>
|
||||
|
||||
<b>👉👉👉 [Online Telegram bot](https://t.me/maigret_search_bot)</b>
|
||||
<b>👉👉👉 [Online Telegram bot](https://t.me/maigret_search_bot) | 🏢 [Commercial use & API](#commercial-use)</b>
|
||||
|
||||
## About
|
||||
|
||||
@@ -112,6 +112,10 @@ docker run -v /mydir:/app/reports soxoj/maigret:latest username --html
|
||||
docker build -t maigret .
|
||||
```
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
If you encounter build errors during installation, check the [troubleshooting guide](https://maigret.readthedocs.io/en/latest/installation.html#troubleshooting).
|
||||
|
||||
## Usage examples
|
||||
|
||||
```bash
|
||||
@@ -192,6 +196,16 @@ The authors and developers of this tool bear no responsibility for any misuse or
|
||||
|
||||
If you have any questions, suggestions, or feedback, please feel free to [open an issue](https://github.com/soxoj/maigret/issues), create a [GitHub discussion](https://github.com/soxoj/maigret/discussions), or contact the author directly via [Telegram](https://t.me/soxoj).
|
||||
|
||||
## Commercial Use
|
||||
|
||||
If you need a **daily updated database** of supported sites or an **API for username checks**, feel free to reach out:
|
||||
|
||||
📧 [maigret@soxoj.com](mailto:maigret@soxoj.com)
|
||||
|
||||
Available options:
|
||||
- Up-to-date site database - regularly maintained and updated list of 5K+ sites, delivered daily
|
||||
- Username check API - programmatic access to Maigret's search capabilities for integration into your products
|
||||
|
||||
## SOWEL classification
|
||||
|
||||
This tool uses the following OSINT techniques:
|
||||
|
||||
@@ -31,19 +31,25 @@ two-letter country codes (**not a language!**). E.g. photo, dating, sport; jp, u
|
||||
Multiple tags can be associated with one site. **Warning**: tags markup is
|
||||
not stable now. Read more :doc:`in the separate section <tags>`.
|
||||
|
||||
``--exclude-tags`` - Exclude sites with specific tags from the search
|
||||
(blacklist). E.g. ``--exclude-tags porn,dating`` will skip all sites
|
||||
tagged with ``porn`` or ``dating``. Can be combined with ``--tags`` to
|
||||
include certain categories while excluding others. Read more
|
||||
:doc:`in the separate section <tags>`.
|
||||
|
||||
``-n``, ``--max-connections`` - Allowed number of concurrent connections
|
||||
**(default: 100)**.
|
||||
|
||||
``-a``, ``--all-sites`` - Use all sites for scan **(default: top 500)**.
|
||||
|
||||
``--top-sites`` - Count of sites for scan ranked by Alexa Top
|
||||
``--top-sites`` - Count of sites for scan ranked by Majestic Million
|
||||
**(default: top 500)**.
|
||||
|
||||
**Mirrors:** After the top *N* sites by Alexa rank are chosen (respecting
|
||||
**Mirrors:** After the top *N* sites by Majestic Million rank are chosen (respecting
|
||||
``--tags``, ``--use-disabled-sites``, etc.), Maigret may add extra sites
|
||||
whose database field ``source`` names a **parent platform** that itself falls
|
||||
in the Alexa top *N* when ranking **including disabled** sites. For example,
|
||||
if ``Twitter`` ranks in the first 500 by Alexa, a mirror such as ``memory.lol``
|
||||
in the Majestic Million top *N* when ranking **including disabled** sites. For example,
|
||||
if ``Twitter`` ranks in the first 500 by Majestic Million, a mirror such as ``memory.lol``
|
||||
(with ``source: Twitter``) is included even though it has no rank and would
|
||||
otherwise be cut off. The same applies to Instagram-related mirrors (e.g.
|
||||
Picuki) when ``Instagram`` is in that parent top *N* by rank—even if the
|
||||
@@ -76,11 +82,63 @@ id types, sites will be filtered automatically.
|
||||
ids. Useful for repeated scanning with found known irrelevant usernames.
|
||||
|
||||
``--db`` - Load Maigret database from a JSON file or an online, valid,
|
||||
JSON file.
|
||||
JSON file. See :ref:`custom-database` below.
|
||||
|
||||
``--no-autoupdate`` - Disable the automatic database update check that
|
||||
runs at startup. The currently cached (or bundled) database is used
|
||||
as-is.
|
||||
|
||||
``--force-update`` - Force a database update check at startup, ignoring
|
||||
the usual check interval. Implies ``--no-autoupdate`` for the rest of
|
||||
the run after the explicit update finishes.
|
||||
|
||||
``--retries RETRIES`` - Count of attempts to restart temporarily failed
|
||||
requests.
|
||||
|
||||
.. _custom-database:
|
||||
|
||||
Using a custom sites database
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The ``--db`` flag accepts three forms:
|
||||
|
||||
1. **HTTP(S) URL** — fetched as-is, e.g.
|
||||
``--db https://example.com/my_db.json``.
|
||||
2. **Local file path** — absolute (``--db /tmp/private.json``) or
|
||||
relative to the current working directory
|
||||
(``--db LLM/maigret_private_db.json``).
|
||||
3. **Module-relative path** — kept for backwards compatibility, resolved
|
||||
against the installed ``maigret/`` package directory (e.g. the
|
||||
default ``resources/data.json``).
|
||||
|
||||
Resolution order for local paths: the path is first tried as given
|
||||
(absolute or cwd-relative); if that file does not exist, Maigret falls
|
||||
back to the legacy module-relative resolution. If neither location
|
||||
contains the file, Maigret exits with an error rather than silently
|
||||
loading the bundled database.
|
||||
|
||||
When ``--db`` points to a custom file, automatic database updates are
|
||||
skipped — the file is used exactly as provided.
|
||||
|
||||
On every run Maigret prints the database it actually loaded, for
|
||||
example::
|
||||
|
||||
[+] Using sites database: /path/to/maigret_private_db.json (6 sites)
|
||||
|
||||
If loading the requested database fails for any other reason (corrupt
|
||||
JSON, missing required keys, …), Maigret prints a warning, falls back
|
||||
to the bundled database, and reports the fallback explicitly::
|
||||
|
||||
[-] Falling back to bundled database: /…/maigret/resources/data.json
|
||||
[+] Using sites database: /…/maigret/resources/data.json (3154 sites)
|
||||
|
||||
A typical invocation against a private database, with auto-update
|
||||
disabled and all sites scanned, looks like::
|
||||
|
||||
python3 -m maigret username \
|
||||
--db LLM/maigret_private_db.json \
|
||||
--no-autoupdate -a
|
||||
|
||||
Reports
|
||||
-------
|
||||
|
||||
@@ -100,6 +158,9 @@ username).
|
||||
``-J``, ``--json`` - Generate a JSON report of specific type: simple,
|
||||
ndjson (one report per username). E.g. ``--json ndjson``
|
||||
|
||||
``-M``, ``--md`` - Generate a Markdown report (general report on all
|
||||
usernames). See :ref:`markdown-report` below.
|
||||
|
||||
``-fo``, ``--folderoutput`` - Results will be saved to this folder,
|
||||
``results`` by default. Will be created if it doesn’t exist.
|
||||
|
||||
@@ -124,16 +185,60 @@ Other operations modes
|
||||
|
||||
``--version`` - Display version information and dependencies.
|
||||
|
||||
``--self-check`` - Do self-checking for sites and database and disable
|
||||
non-working ones **for current search session** by default. It’s useful
|
||||
for testing new internet connection (it depends on provider/hosting on
|
||||
which sites there will be censorship stub or captcha display). After
|
||||
checking Maigret asks if you want to save updates, answering y/Y will
|
||||
rewrite the local database.
|
||||
``--self-check`` - Do self-checking for sites and database. Each site is
|
||||
tested by looking up its known-claimed and known-unclaimed usernames and
|
||||
verifying that the results match expectations. Individual site failures
|
||||
(network errors, unexpected exceptions, etc.) are caught and logged
|
||||
without stopping the overall process, so the check always runs to
|
||||
completion. After checking, Maigret reports a summary of issues found.
|
||||
If any sites were disabled (see ``--auto-disable``), Maigret asks if you
|
||||
want to save updates; answering y/Y will rewrite the local database.
|
||||
|
||||
``--auto-disable`` - Used with ``--self-check``: automatically disable
|
||||
sites that fail checks (incorrect detection of claimed/unclaimed
|
||||
usernames, connection errors, or unexpected exceptions). Without this
|
||||
flag, ``--self-check`` only **reports** issues without modifying the
|
||||
database.
|
||||
|
||||
``--diagnose`` - Used with ``--self-check``: print detailed diagnosis
|
||||
information for each failing site, including the check type, the list
|
||||
of issues found, and recommendations (e.g. suggesting a different
|
||||
``checkType``).
|
||||
|
||||
``--submit URL`` - Do an automatic analysis of the given account URL or
|
||||
site main page URL to determine the site engine and methods to check
|
||||
account presence. After checking Maigret asks if you want to add the
|
||||
site, answering y/Y will rewrite the local database.
|
||||
|
||||
.. _markdown-report:
|
||||
|
||||
Markdown report (LLM-friendly)
|
||||
------------------------------
|
||||
|
||||
The ``--md`` / ``-M`` flag generates a Markdown report designed for both human reading and analysis by AI assistants (ChatGPT, Claude, etc.).
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret username --md
|
||||
|
||||
The report includes:
|
||||
|
||||
- **Summary** with aggregated personal data (all fullnames, locations, bios found across accounts), country tags, website tags, first/last seen timestamps.
|
||||
- **Per-account sections** with profile URL, site tags, and all extracted fields (username, bio, follower count, linked accounts, etc.).
|
||||
- **Possible false positives** disclaimer explaining that accounts may belong to different people.
|
||||
- **Ethical use** notice about applicable data protection laws.
|
||||
|
||||
**Using with AI tools:**
|
||||
|
||||
The Markdown format is optimized for LLM context windows. You can feed the report directly to an AI assistant for follow-up analysis:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
# Generate the report
|
||||
maigret johndoe --md
|
||||
|
||||
# Feed it to an AI tool
|
||||
cat reports/report_johndoe.md | llm "Analyze this OSINT report and summarize key findings"
|
||||
|
||||
The structured Markdown with per-site sections makes it easy for AI tools to extract relationships, cross-reference identities, and identify patterns across accounts.
|
||||
|
||||
|
||||
@@ -22,9 +22,15 @@ The supported methods (``checkType`` values in ``data.json``) are:
|
||||
- ``status_code`` - checks that status code of the response is 2XX
|
||||
- ``response_url`` - checks that there is no redirect and the response is 2XX
|
||||
|
||||
.. note::
|
||||
Maigret natively treats specific anti-bot HTTP status codes (like LinkedIn's ``HTTP 999``) as a standard "Not Found/Available" signal instead of throwing an infrastructure Server Error, gracefully preventing false positives.
|
||||
|
||||
See the details of check mechanisms in the `checking.py <https://github.com/soxoj/maigret/blob/main/maigret/checking.py#L339>`_ file.
|
||||
|
||||
**Mirrors and ``--top-sites``:** When you limit scans with ``--top-sites N``, Maigret also includes *mirror* sites (entries whose ``source`` field points at a parent platform such as Twitter or Instagram) if that parent would appear in the Alexa top *N* when disabled sites are considered for ranking. See the **Mirrors** paragraph under ``--top-sites`` in :doc:`command-line-options`.
|
||||
.. note::
|
||||
Maigret now uses the **Majestic Million** dataset for site popularity sorting instead of the discontinued Alexa Rank API. For backward compatibility with existing configurations and parsers, the ranking fields in ``data.json`` and in the internal site models are still named ``alexaRank`` and ``alexa_rank``, respectively.
|
||||
|
||||
**Mirrors and ``--top-sites``:** When you limit scans with ``--top-sites N``, Maigret also includes *mirror* sites (entries whose ``source`` field points at a parent platform such as Twitter or Instagram) if that parent would appear in the Majestic Million top *N* when disabled sites are considered for ranking. See the **Mirrors** paragraph under ``--top-sites`` in :doc:`command-line-options`.
|
||||
|
||||
Testing
|
||||
-------
|
||||
@@ -63,6 +69,21 @@ Use the following commands to check Maigret:
|
||||
make speed
|
||||
|
||||
|
||||
Site naming conventions
|
||||
-----------------------------------------------
|
||||
|
||||
Site names are the keys in ``data.json`` and appear in user-facing reports. Follow these rules:
|
||||
|
||||
- **Title Case** by default: ``Product Hunt``, ``Hacker News``.
|
||||
- **Lowercase** only if the brand itself is written that way: ``kofi``, ``note``, ``hi5``.
|
||||
- **No domain suffix** (``calendly.com`` → ``Calendly``), unless the domain is part of the recognized brand name: ``last.fm``, ``VC.ru``, ``Archive.org``.
|
||||
- **No full UPPERCASE** unless the brand is an acronym: ``VK``, ``CNET``, ``ICQ``, ``IFTTT``.
|
||||
- **No** ``www.`` **or** ``https://`` **prefix** in the name.
|
||||
- **Spaces** are allowed when the brand uses them: ``Star Citizen``, ``Google Maps``.
|
||||
- **{username} templates** in names are acceptable: ``{username}.tilda.ws``.
|
||||
|
||||
When in doubt, check how the service refers to itself on its homepage.
|
||||
|
||||
How to fix false-positives
|
||||
-----------------------------------------------
|
||||
|
||||
@@ -114,6 +135,32 @@ There are few options for sites data.json helpful in various cases:
|
||||
- ``headers`` - a dictionary of additional headers to be sent to the site
|
||||
- ``requestHeadOnly`` - set to ``true`` if it's enough to make a HEAD request to the site
|
||||
- ``regexCheck`` - a regex to check if the username is valid, in case of frequent false-positives
|
||||
- ``requestMethod`` - set the HTTP method to use (e.g., ``POST``). By default, Maigret uses GET or HEAD.
|
||||
- ``requestPayload`` - a dictionary with the JSON payload to send for POST requests (e.g., ``{"username": "{username}"}``); useful for querying GraphQL or other modern JSON APIs.
|
||||
- ``protection`` - a list of protection types detected on the site (see below).
|
||||
|
||||
``protection`` (site protection tracking)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The ``protection`` field records what kind of anti-bot protection a site uses. Maigret reads this field and automatically applies the appropriate bypass mechanism.
|
||||
|
||||
Supported values:
|
||||
|
||||
- ``tls_fingerprint`` — the site fingerprints the TLS handshake (JA3/JA4) and blocks non-browser clients. Maigret automatically uses ``curl_cffi`` with Chrome browser emulation to bypass this. Requires the ``curl_cffi`` package (included as a dependency). Examples: Instagram, NPM, Codepen, Kickstarter, Letterboxd.
|
||||
- ``ip_reputation`` — the site blocks requests from datacenter/cloud IPs regardless of headers or TLS. Cannot be bypassed automatically; run Maigret from a regular internet connection (not a datacenter) or use a proxy (``--proxy``). Examples: Reddit, Patreon, Figma.
|
||||
- ``js_challenge`` — the site serves a JavaScript challenge page (e.g. "Just a moment...") that cannot be solved without a browser. Maigret detects challenge signatures and returns UNKNOWN instead of a false positive.
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block:: json
|
||||
|
||||
"Instagram": {
|
||||
"url": "https://www.instagram.com/{username}/",
|
||||
"checkType": "message",
|
||||
"presenseStrs": ["\"routePath\":\"\\/"],
|
||||
"absenceStrs": ["\"routePath\":null"],
|
||||
"protection": ["tls_fingerprint"]
|
||||
}
|
||||
|
||||
``urlProbe`` (optional profile probe URL)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
@@ -170,6 +170,35 @@ Maigret will do retries of the requests with temporary errors got (connection fa
|
||||
|
||||
One attempt by default, can be changed with option ``--retries N``.
|
||||
|
||||
Database self-check
|
||||
-------------------
|
||||
|
||||
Maigret includes a self-check mode (``--self-check``) that validates every site
|
||||
in the database by looking up its known-claimed and known-unclaimed usernames
|
||||
and verifying that the detection results match expectations.
|
||||
|
||||
The self-check is **error-resilient**: if an individual site check raises an
|
||||
unexpected exception (e.g. a network error or a parsing failure), the error is
|
||||
caught, logged, and recorded as an issue — the remaining sites continue to be
|
||||
checked without interruption. This means the process always runs to completion,
|
||||
even when checking hundreds of sites with ``-a --self-check``.
|
||||
|
||||
Use ``--auto-disable`` together with ``--self-check`` to automatically disable
|
||||
sites that fail checks. Without it, issues are only reported. Use ``--diagnose``
|
||||
to print detailed per-site diagnosis including the check type, specific issues,
|
||||
and recommendations.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
# Report-only mode (no changes to the database)
|
||||
maigret --self-check
|
||||
|
||||
# Automatically disable failing sites and save updates
|
||||
maigret -a --self-check --auto-disable
|
||||
|
||||
# Show detailed diagnosis for each failing site
|
||||
maigret -a --self-check --diagnose
|
||||
|
||||
Archives and mirrors checking
|
||||
-----------------------------
|
||||
|
||||
|
||||
@@ -90,3 +90,39 @@ Docker
|
||||
|
||||
# manual build
|
||||
docker build -t maigret .
|
||||
|
||||
Troubleshooting
|
||||
---------------
|
||||
|
||||
If you encounter build errors during installation such as ``cannot find ft2build.h``
|
||||
or errors related to ``reportlab`` / ``_renderPM``, you need to install system-level
|
||||
dependencies required to compile native extensions.
|
||||
|
||||
**Debian/Ubuntu/Kali:**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo apt install -y libfreetype6-dev libjpeg-dev libffi-dev
|
||||
|
||||
**Fedora/RHEL/CentOS:**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo dnf install -y freetype-devel libjpeg-devel libffi-devel
|
||||
|
||||
**Arch Linux:**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo pacman -S freetype2 libjpeg-turbo libffi
|
||||
|
||||
**macOS (Homebrew):**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
brew install freetype
|
||||
|
||||
After installing the system dependencies, retry the maigret installation.
|
||||
|
||||
If you continue to have issues, consider using Docker instead, which includes all
|
||||
necessary dependencies.
|
||||
|
||||
@@ -27,3 +27,77 @@ Missing any of these files is not an error.
|
||||
If the next settings file contains an already known option,
|
||||
this option will be overwritten. So it is possible to make
|
||||
custom configuration for different users and directories.
|
||||
|
||||
.. _database-auto-update:
|
||||
|
||||
Database auto-update
|
||||
--------------------
|
||||
|
||||
Maigret ships with a bundled site database, but it gets outdated between releases. To keep the database current, Maigret automatically checks for updates on startup.
|
||||
|
||||
**How it works:**
|
||||
|
||||
1. On startup, Maigret checks if more than 24 hours have passed since the last update check.
|
||||
2. If so, it fetches a lightweight metadata file (~200 bytes) from GitHub to see if a newer database is available.
|
||||
3. If a newer, compatible database exists, Maigret downloads it to ``~/.maigret/data.json`` and uses it instead of the bundled copy.
|
||||
4. If the download fails or the new database is incompatible with your Maigret version, the bundled database is used as a fallback.
|
||||
|
||||
The downloaded database has **higher priority** than the bundled one — it replaces, not overlays.
|
||||
|
||||
**Status messages** are printed only when an action occurs:
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
[*] DB auto-update: checking for updates...
|
||||
[+] DB auto-update: database updated successfully (3180 sites)
|
||||
[*] DB auto-update: database is up to date (3157 sites)
|
||||
[!] DB auto-update: latest database requires maigret >= 0.6.0, you have 0.5.0
|
||||
|
||||
**Forcing an update:**
|
||||
|
||||
Use the ``--force-update`` flag to check for updates immediately, ignoring the check interval:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret username --force-update
|
||||
|
||||
The update happens at startup, then the search continues normally with the freshly downloaded database.
|
||||
|
||||
**Disabling auto-update:**
|
||||
|
||||
Use the ``--no-autoupdate`` flag to skip the update check entirely:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret username --no-autoupdate
|
||||
|
||||
Or set it permanently in ``~/.maigret/settings.json``:
|
||||
|
||||
.. code-block:: json
|
||||
|
||||
{
|
||||
"no_autoupdate": true
|
||||
}
|
||||
|
||||
This is recommended for **Docker containers**, **CI pipelines**, and **air-gapped environments**.
|
||||
|
||||
**Configuration options** (in ``settings.json``):
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 35 15 50
|
||||
|
||||
* - Setting
|
||||
- Default
|
||||
- Description
|
||||
* - ``no_autoupdate``
|
||||
- ``false``
|
||||
- Disable auto-update entirely
|
||||
* - ``autoupdate_check_interval_hours``
|
||||
- ``24``
|
||||
- How often to check for updates (in hours)
|
||||
* - ``db_update_meta_url``
|
||||
- GitHub raw URL
|
||||
- URL of the metadata file (for custom mirrors)
|
||||
|
||||
**Using a custom database** with ``--db`` always skips auto-update — you are explicitly choosing your data source.
|
||||
|
||||
+22
-1
@@ -10,7 +10,12 @@ The use of tags allows you to select a subset of the sites from big Maigret DB f
|
||||
|
||||
There are several types of tags:
|
||||
|
||||
1. **Country codes**: ``us``, ``jp``, ``br``... (`ISO 3166-1 alpha-2 <https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2>`_). These tags reflect the site language and regional origin of its users and are then used to locate the owner of a username. If the regional origin is difficult to establish or a site is positioned as worldwide, `no country code is given`. There could be multiple country code tags for one site.
|
||||
1. **Country codes**: ``us``, ``jp``, ``br``... (`ISO 3166-1 alpha-2 <https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2>`_). A country tag means that having an account on the site implies a connection to that country — either origin or residence. The goal is attribution, not perfect accuracy.
|
||||
|
||||
- **Global sites** (GitHub, YouTube, Reddit, Medium, etc.) get **no country tag** — an account there says nothing about where a person is from.
|
||||
- **Regional/local sites** where an account implies a specific country **must** have a country tag: ``VK`` → ``ru``, ``Naver`` → ``kr``, ``Zhihu`` → ``cn``.
|
||||
- Multiple country tags are allowed when a service is used predominantly in a few countries (e.g. ``Xing`` → ``de``, ``eu``).
|
||||
- Do **not** assign country tags based on traffic statistics alone — a site popular in India by traffic is not "Indian" if it is used globally.
|
||||
|
||||
2. **Site engines**. Most of them are forum engines now: ``uCoz``, ``vBulletin``, ``XenForo`` et al. The full list of engines is stored in the Maigret database.
|
||||
|
||||
@@ -23,3 +28,19 @@ Usage
|
||||
``--tags coding`` -- search on sites related to software development.
|
||||
|
||||
``--tags ucoz`` -- search on uCoz sites only (mostly CIS countries)
|
||||
|
||||
Blacklisting (excluding) tags
|
||||
------------------------------
|
||||
You can exclude sites with certain tags from the search using ``--exclude-tags``:
|
||||
|
||||
``--exclude-tags porn,dating`` -- skip all sites tagged with ``porn`` or ``dating``.
|
||||
|
||||
``--exclude-tags ru`` -- skip all Russian sites.
|
||||
|
||||
You can combine ``--tags`` and ``--exclude-tags`` to fine-tune your search:
|
||||
|
||||
``--tags forum --exclude-tags ru`` -- search on forum sites, but skip Russian ones.
|
||||
|
||||
In the web interface, the tag cloud supports three states per tag:
|
||||
click once to **include** (green), click again to **exclude** (dark/strikethrough),
|
||||
and click once more to return to **neutral** (red).
|
||||
|
||||
@@ -13,7 +13,7 @@ Use Cases
|
||||
---------
|
||||
|
||||
|
||||
1. Search for accounts with username ``machine42`` on top 500 sites (by default, according to Alexa rank) from the Maigret DB.
|
||||
1. Search for accounts with username ``machine42`` on top 500 sites (by default, according to Majestic Million rank) from the Maigret DB.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
@@ -33,7 +33,7 @@ Use Cases
|
||||
If you experience many false positives, you can do the following:
|
||||
|
||||
- Install the latest development version of Maigret from GitHub
|
||||
- Run Maigret with ``--self-check`` flag and agree on disabling of problematic sites
|
||||
- Run Maigret with the ``--self-check --auto-disable`` flags to automatically disable problematic sites
|
||||
|
||||
3. Search for accounts with username ``machine42`` and generate HTML and PDF reports.
|
||||
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
"""Maigret version file"""
|
||||
|
||||
__version__ = '0.5.0'
|
||||
__version__ = '0.6.0'
|
||||
|
||||
+3
-14
@@ -30,17 +30,6 @@ class ParsingActivator:
|
||||
jwt_token = r.json()["jwt"]
|
||||
site.headers["Authorization"] = "jwt " + jwt_token
|
||||
|
||||
@staticmethod
|
||||
def spotify(site, logger, cookies={}):
|
||||
headers = dict(site.headers)
|
||||
if "Authorization" in headers:
|
||||
del headers["Authorization"]
|
||||
import requests
|
||||
|
||||
r = requests.get(site.activation["url"])
|
||||
bearer_token = r.json()["accessToken"]
|
||||
site.headers["authorization"] = f"Bearer {bearer_token}"
|
||||
|
||||
@staticmethod
|
||||
def weibo(site, logger):
|
||||
headers = dict(site.headers)
|
||||
@@ -54,7 +43,7 @@ class ParsingActivator:
|
||||
logger.debug(
|
||||
f"1 stage: {'success' if r.status_code == 302 else 'no 302 redirect, fail!'}"
|
||||
)
|
||||
location = r.headers.get("Location")
|
||||
location = r.headers.get("Location", "")
|
||||
|
||||
# 2 stage: go to passport visitor page
|
||||
headers["Referer"] = location
|
||||
@@ -84,9 +73,9 @@ def import_aiohttp_cookies(cookiestxt_filename):
|
||||
cookies = CookieJar()
|
||||
|
||||
cookies_list = []
|
||||
for domain in cookies_obj._cookies.values():
|
||||
for domain in cookies_obj._cookies.values(): # type: ignore[attr-defined]
|
||||
for key, cookie in list(domain.values())[0].items():
|
||||
c = Morsel()
|
||||
c: Morsel = Morsel()
|
||||
c.set(key, cookie.value, cookie.value)
|
||||
c["domain"] = cookie.domain
|
||||
c["path"] = cookie.path
|
||||
|
||||
+287
-141
@@ -6,7 +6,7 @@ import random
|
||||
import re
|
||||
import ssl
|
||||
import sys
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from urllib.parse import quote
|
||||
|
||||
# Third party imports
|
||||
@@ -15,7 +15,7 @@ from alive_progress import alive_bar
|
||||
from aiohttp import ClientSession, TCPConnector, http_exceptions
|
||||
from aiohttp.client_exceptions import ClientConnectorError, ServerDisconnectedError
|
||||
from python_socks import _errors as proxy_errors
|
||||
from socid_extractor import extract
|
||||
from socid_extractor import extract # type: ignore[import-not-found]
|
||||
|
||||
try:
|
||||
from mock import Mock
|
||||
@@ -61,30 +61,49 @@ class SimpleAiohttpChecker(CheckerBase):
|
||||
self.headers = None
|
||||
self.allow_redirects = True
|
||||
self.timeout = 0
|
||||
self.allow_redirects = True
|
||||
self.timeout = 0
|
||||
self.method = 'get'
|
||||
self.payload = None
|
||||
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
|
||||
self.url = url
|
||||
self.headers = headers
|
||||
self.allow_redirects = allow_redirects
|
||||
self.timeout = timeout
|
||||
self.method = method
|
||||
self.payload = payload
|
||||
return None
|
||||
|
||||
async def close(self):
|
||||
pass
|
||||
|
||||
async def _make_request(
|
||||
self, session, url, headers, allow_redirects, timeout, method, logger
|
||||
) -> Tuple[str, int, Optional[CheckError]]:
|
||||
self, session, url, headers, allow_redirects, timeout, method, logger, payload=None
|
||||
) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
try:
|
||||
request_method = session.get if method == 'get' else session.head
|
||||
async with request_method(
|
||||
url=url,
|
||||
headers=headers,
|
||||
allow_redirects=allow_redirects,
|
||||
timeout=timeout,
|
||||
) as response:
|
||||
if method.lower() == 'get':
|
||||
request_method = session.get
|
||||
elif method.lower() == 'post':
|
||||
request_method = session.post
|
||||
elif method.lower() == 'head':
|
||||
request_method = session.head
|
||||
else:
|
||||
request_method = session.get
|
||||
|
||||
kwargs = {
|
||||
'url': url,
|
||||
'headers': headers,
|
||||
'allow_redirects': allow_redirects,
|
||||
'timeout': timeout,
|
||||
}
|
||||
if payload and method.lower() == 'post':
|
||||
if headers and headers.get('Content-Type') == 'application/x-www-form-urlencoded':
|
||||
kwargs['data'] = payload
|
||||
else:
|
||||
kwargs['json'] = payload
|
||||
|
||||
async with request_method(**kwargs) as response:
|
||||
status_code = response.status
|
||||
response_content = await response.content.read()
|
||||
charset = response.charset or "utf-8"
|
||||
@@ -117,15 +136,21 @@ class SimpleAiohttpChecker(CheckerBase):
|
||||
logger.debug(e, exc_info=True)
|
||||
return None, 0, CheckError("Unexpected", str(e))
|
||||
|
||||
async def check(self) -> Tuple[str, int, Optional[CheckError]]:
|
||||
async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
from aiohttp_socks import ProxyConnector
|
||||
|
||||
# Use a real SSL context instead of ssl=False to avoid TLS fingerprinting
|
||||
# blocks by Cloudflare and similar WAFs. Certificate verification is
|
||||
# disabled to handle sites with invalid/expired certs.
|
||||
ssl_context = ssl.create_default_context()
|
||||
ssl_context.check_hostname = False
|
||||
ssl_context.verify_mode = ssl.CERT_NONE
|
||||
|
||||
connector = (
|
||||
ProxyConnector.from_url(self.proxy)
|
||||
if self.proxy
|
||||
else TCPConnector(ssl=False)
|
||||
else TCPConnector(ssl=ssl_context)
|
||||
)
|
||||
connector.verify_ssl = False
|
||||
|
||||
async with ClientSession(
|
||||
connector=connector,
|
||||
@@ -141,6 +166,7 @@ class SimpleAiohttpChecker(CheckerBase):
|
||||
self.timeout,
|
||||
self.method,
|
||||
self.logger,
|
||||
self.payload,
|
||||
)
|
||||
|
||||
if error and str(error) == "Invalid proxy response":
|
||||
@@ -165,11 +191,11 @@ class AiodnsDomainResolver(CheckerBase):
|
||||
self.logger = kwargs.get('logger', Mock())
|
||||
self.resolver = aiodns.DNSResolver(loop=loop)
|
||||
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
|
||||
self.url = url
|
||||
return None
|
||||
|
||||
async def check(self) -> Tuple[str, int, Optional[CheckError]]:
|
||||
async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
status = 404
|
||||
error = None
|
||||
text = ''
|
||||
@@ -187,14 +213,84 @@ class AiodnsDomainResolver(CheckerBase):
|
||||
return text, status, error
|
||||
|
||||
|
||||
try:
|
||||
from curl_cffi.requests import AsyncSession as CurlCffiAsyncSession
|
||||
|
||||
CURL_CFFI_AVAILABLE = True
|
||||
except ImportError:
|
||||
CURL_CFFI_AVAILABLE = False
|
||||
|
||||
|
||||
class CurlCffiChecker(CheckerBase):
    """Checker using curl_cffi to emulate browser TLS fingerprint and bypass WAF.

    The request parameters are stored by ``prepare()`` and the request itself
    is performed by ``check()`` with a fresh ``CurlCffiAsyncSession``.
    """

    def __init__(self, *args, **kwargs):
        """Store the logger and the browser profile to impersonate.

        Keyword Args:
            logger: logger used for debug output (a ``Mock`` when omitted).
            browser_emulate: value passed to curl_cffi as ``impersonate``
                (``'chrome'`` when omitted).
        """
        self.logger = kwargs.get('logger', Mock())
        self.browser_emulate = kwargs.get('browser_emulate', 'chrome')
        self.url = None
        self.headers = None
        self.allow_redirects = True
        self.timeout = 0
        self.method = 'get'
        self.payload = None

    def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
        """Remember the parameters of the request that ``check()`` will send."""
        self.url = url
        self.headers = headers
        self.allow_redirects = allow_redirects
        self.timeout = timeout
        self.method = method
        self.payload = payload
        return None

    async def close(self):
        """Nothing to release: the session is opened and closed inside ``check()``."""
        pass

    async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
        """Send the prepared request and return ``(text, status_code, error)``.

        On any failure the text is ``None``, the status code is ``0`` and the
        third element is a ``CheckError`` describing the problem.
        """
        try:
            method = (self.method or 'get').lower()

            async with CurlCffiAsyncSession() as session:
                kwargs = {
                    'url': self.url,
                    'headers': self.headers,
                    'allow_redirects': self.allow_redirects,
                    # A falsy timeout (0/None) falls back to 10 seconds.
                    'timeout': self.timeout if self.timeout else 10,
                    'impersonate': self.browser_emulate,
                }
                if self.payload and method == 'post':
                    # Keep the body encoding consistent with the declared
                    # Content-Type, the same way the aiohttp checker does:
                    # form-encoded payloads go via ``data``, everything else as JSON.
                    if self.headers and self.headers.get('Content-Type') == 'application/x-www-form-urlencoded':
                        kwargs['data'] = self.payload
                    else:
                        kwargs['json'] = self.payload

                if method == 'post':
                    response = await session.post(**kwargs)
                elif method == 'head':
                    response = await session.head(**kwargs)
                else:
                    # Unknown methods fall back to GET.
                    response = await session.get(**kwargs)

                status_code = response.status_code
                decoded_content = response.text

                self.logger.debug(decoded_content)

                error = CheckError("Connection lost") if status_code == 0 else None
                return decoded_content, status_code, error

        except asyncio.TimeoutError as e:
            return None, 0, CheckError("Request timeout", str(e))
        except KeyboardInterrupt:
            return None, 0, CheckError("Interrupted")
        except Exception as e:
            self.logger.debug(e, exc_info=True)
            return None, 0, CheckError("Unexpected", str(e))
|
||||
|
||||
|
||||
class CheckerMock:
|
||||
def __init__(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
|
||||
return None
|
||||
|
||||
async def check(self) -> Tuple[str, int, Optional[CheckError]]:
|
||||
async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
await asyncio.sleep(0)
|
||||
return '', 0, None
|
||||
|
||||
@@ -220,6 +316,11 @@ def detect_error_page(
|
||||
if status_code == 403 and not ignore_403:
|
||||
return CheckError("Access denied", "403 status code, use proxy/vpn")
|
||||
|
||||
elif status_code == 999:
|
||||
# LinkedIn anti-bot / HTTP 999 workaround. It shouldn't trigger an infrastructure
|
||||
# Server Error because it represents a valid "Not Found / Blocked" state for the username.
|
||||
pass
|
||||
|
||||
elif status_code >= 500:
|
||||
return CheckError("Server", f"{status_code} status code")
|
||||
|
||||
@@ -438,8 +539,18 @@ def make_site_result(
|
||||
# workaround to prevent slash errors
|
||||
url = re.sub("(?<!:)/+", "/", url)
|
||||
|
||||
# always clearweb_checker for now
|
||||
checker = options["checkers"][site.protocol]
|
||||
# Select checker: use curl_cffi for sites requiring TLS impersonation
|
||||
needs_impersonation = 'tls_fingerprint' in site.protection
|
||||
if needs_impersonation and CURL_CFFI_AVAILABLE:
|
||||
checker = CurlCffiChecker(logger=logger, browser_emulate='chrome')
|
||||
elif needs_impersonation and not CURL_CFFI_AVAILABLE:
|
||||
logger.warning(
|
||||
f"Site {site.name} requires TLS impersonation (curl_cffi) but it's not installed. "
|
||||
"Install with: pip install curl_cffi"
|
||||
)
|
||||
checker = options["checkers"][site.protocol]
|
||||
else:
|
||||
checker = options["checkers"][site.protocol]
|
||||
|
||||
# site check is disabled
|
||||
if site.disabled and not options['forced']:
|
||||
@@ -494,7 +605,9 @@ def make_site_result(
|
||||
for k, v in site.get_params.items():
|
||||
url_probe += f"&{k}={v}"
|
||||
|
||||
if site.check_type == "status_code" and site.request_head_only:
|
||||
if site.request_method:
|
||||
request_method = site.request_method.lower()
|
||||
elif site.check_type == "status_code" and site.request_head_only:
|
||||
# In most cases when we are detecting by status code,
|
||||
# it is not necessary to get the entire body: we can
|
||||
# detect fine with just the HEAD response.
|
||||
@@ -505,6 +618,15 @@ def make_site_result(
|
||||
# not respond properly unless we request the whole page.
|
||||
request_method = 'get'
|
||||
|
||||
payload = None
|
||||
if site.request_payload:
|
||||
payload = {}
|
||||
for k, v in site.request_payload.items():
|
||||
if isinstance(v, str):
|
||||
payload[k] = v.format(username=username)
|
||||
else:
|
||||
payload[k] = v
|
||||
|
||||
if site.check_type == "response_url":
|
||||
# Site forwards request to a different URL if username not
|
||||
# found. Disallow the redirect so we can capture the
|
||||
@@ -521,6 +643,7 @@ def make_site_result(
|
||||
headers=headers,
|
||||
allow_redirects=allow_redirects,
|
||||
timeout=options['timeout'],
|
||||
payload=payload,
|
||||
)
|
||||
|
||||
# Store future request object in the results object
|
||||
@@ -577,6 +700,7 @@ async def check_site_for_username(
|
||||
allow_redirects=checker.allow_redirects,
|
||||
timeout=checker.timeout,
|
||||
method=checker.method,
|
||||
payload=getattr(checker, 'payload', None),
|
||||
)
|
||||
response = await checker.check()
|
||||
|
||||
@@ -761,7 +885,7 @@ async def maigret(
|
||||
with alive_bar(
|
||||
len(tasks_dict), title="Searching", force_tty=True, disable=no_progressbar
|
||||
) as progress:
|
||||
async for result in executor.run(tasks_dict.values()):
|
||||
async for result in executor.run(list(tasks_dict.values())): # type: ignore[arg-type]
|
||||
cur_results.append(result)
|
||||
progress()
|
||||
|
||||
@@ -837,135 +961,149 @@ async def site_self_check(
|
||||
If False (default), only report issues without disabling.
|
||||
diagnose: If True, print detailed diagnosis information.
|
||||
"""
|
||||
changes = {
|
||||
changes: Dict[str, Any] = {
|
||||
"disabled": False,
|
||||
"issues": [],
|
||||
"recommendations": [],
|
||||
}
|
||||
|
||||
check_data = [
|
||||
(site.username_claimed, MaigretCheckStatus.CLAIMED),
|
||||
(site.username_unclaimed, MaigretCheckStatus.AVAILABLE),
|
||||
]
|
||||
try:
|
||||
check_data = [
|
||||
(site.username_claimed, MaigretCheckStatus.CLAIMED),
|
||||
(site.username_unclaimed, MaigretCheckStatus.AVAILABLE),
|
||||
]
|
||||
|
||||
logger.info(f"Checking {site.name}...")
|
||||
logger.info(f"Checking {site.name}...")
|
||||
|
||||
results_cache = {}
|
||||
results_cache = {}
|
||||
|
||||
for username, status in check_data:
|
||||
async with semaphore:
|
||||
results_dict = await maigret(
|
||||
username=username,
|
||||
site_dict={site.name: site},
|
||||
logger=logger,
|
||||
timeout=30,
|
||||
id_type=site.type,
|
||||
forced=True,
|
||||
no_progressbar=True,
|
||||
retries=1,
|
||||
proxy=proxy,
|
||||
tor_proxy=tor_proxy,
|
||||
i2p_proxy=i2p_proxy,
|
||||
cookies=cookies,
|
||||
)
|
||||
|
||||
# don't disable entries with other ids types
|
||||
# TODO: make normal checking
|
||||
if site.name not in results_dict:
|
||||
logger.info(results_dict)
|
||||
changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)")
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
continue
|
||||
|
||||
logger.debug(results_dict)
|
||||
|
||||
result = results_dict[site.name]["status"]
|
||||
results_cache[username] = results_dict[site.name]
|
||||
|
||||
if result.error and 'Cannot connect to host' in result.error.desc:
|
||||
changes["issues"].append(f"Cannot connect to host")
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
|
||||
site_status = result.status
|
||||
|
||||
if site_status != status:
|
||||
if site_status == MaigretCheckStatus.UNKNOWN:
|
||||
msgs = site.absence_strs
|
||||
etype = site.check_type
|
||||
error_msg = f"Error checking {username}: {result.context}"
|
||||
changes["issues"].append(error_msg)
|
||||
logger.warning(
|
||||
f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}"
|
||||
for username, status in check_data:
|
||||
async with semaphore:
|
||||
results_dict = await maigret(
|
||||
username=username,
|
||||
site_dict={site.name: site},
|
||||
logger=logger,
|
||||
timeout=30,
|
||||
id_type=site.type,
|
||||
forced=True,
|
||||
no_progressbar=True,
|
||||
retries=1,
|
||||
proxy=proxy,
|
||||
tor_proxy=tor_proxy,
|
||||
i2p_proxy=i2p_proxy,
|
||||
cookies=cookies,
|
||||
)
|
||||
# don't disable sites after the error
|
||||
# meaning that the site could be available, but returned error for the check
|
||||
# e.g. many sites protected by cloudflare and available in general
|
||||
if skip_errors:
|
||||
pass
|
||||
# don't disable in case of available username
|
||||
elif status == MaigretCheckStatus.CLAIMED and auto_disable:
|
||||
changes["disabled"] = True
|
||||
elif status == MaigretCheckStatus.CLAIMED:
|
||||
changes["issues"].append(f"Claimed user '{username}' not detected as claimed")
|
||||
logger.warning(
|
||||
f"Not found `{username}` in {site.name}, must be claimed"
|
||||
)
|
||||
logger.info(results_dict[site.name])
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
else:
|
||||
changes["issues"].append(f"Unclaimed user '{username}' detected as claimed")
|
||||
logger.warning(f"Found `{username}` in {site.name}, must be available")
|
||||
logger.info(results_dict[site.name])
|
||||
|
||||
# don't disable entries with other ids types
|
||||
# TODO: make normal checking
|
||||
if site.name not in results_dict:
|
||||
logger.info(results_dict)
|
||||
changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)")
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
continue
|
||||
|
||||
logger.debug(results_dict)
|
||||
|
||||
result = results_dict[site.name]["status"]
|
||||
results_cache[username] = results_dict[site.name]
|
||||
|
||||
if result.error and 'Cannot connect to host' in result.error.desc:
|
||||
changes["issues"].append("Cannot connect to host")
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
|
||||
logger.info(f"Site {site.name} checking is finished")
|
||||
site_status = result.status
|
||||
|
||||
# Generate recommendations based on issues
|
||||
if changes["issues"] and len(results_cache) == 2:
|
||||
claimed_result = results_cache.get(site.username_claimed, {})
|
||||
unclaimed_result = results_cache.get(site.username_unclaimed, {})
|
||||
if site_status != status:
|
||||
if site_status == MaigretCheckStatus.UNKNOWN:
|
||||
msgs = site.absence_strs
|
||||
etype = site.check_type
|
||||
error_msg = f"Error checking {username}: {result.context}"
|
||||
changes["issues"].append(error_msg)
|
||||
logger.warning(
|
||||
f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}"
|
||||
)
|
||||
# don't disable sites after the error
|
||||
# meaning that the site could be available, but returned error for the check
|
||||
# e.g. many sites protected by cloudflare and available in general
|
||||
if skip_errors:
|
||||
pass
|
||||
# don't disable in case of available username
|
||||
elif status == MaigretCheckStatus.CLAIMED and auto_disable:
|
||||
changes["disabled"] = True
|
||||
elif status == MaigretCheckStatus.CLAIMED:
|
||||
changes["issues"].append(f"Claimed user '{username}' not detected as claimed")
|
||||
logger.warning(
|
||||
f"Not found `{username}` in {site.name}, must be claimed"
|
||||
)
|
||||
logger.info(results_dict[site.name])
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
else:
|
||||
changes["issues"].append(f"Unclaimed user '{username}' detected as claimed")
|
||||
logger.warning(f"Found `{username}` in {site.name}, must be available")
|
||||
logger.info(results_dict[site.name])
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
|
||||
claimed_http = claimed_result.get("http_status")
|
||||
unclaimed_http = unclaimed_result.get("http_status")
|
||||
logger.info(f"Site {site.name} checking is finished")
|
||||
|
||||
if claimed_http and unclaimed_http:
|
||||
if claimed_http != unclaimed_http and site.check_type != "status_code":
|
||||
changes["recommendations"].append(
|
||||
f"Consider checkType: status_code (HTTP {claimed_http} vs {unclaimed_http})"
|
||||
)
|
||||
# Generate recommendations based on issues
|
||||
if changes["issues"] and len(results_cache) == 2:
|
||||
claimed_result = results_cache.get(site.username_claimed, {})
|
||||
unclaimed_result = results_cache.get(site.username_unclaimed, {})
|
||||
|
||||
# Print diagnosis if requested
|
||||
if diagnose and changes["issues"]:
|
||||
print(f"\n--- {site.name} DIAGNOSIS ---")
|
||||
print(f" Check type: {site.check_type}")
|
||||
print(f" Issues:")
|
||||
for issue in changes["issues"]:
|
||||
print(f" - {issue}")
|
||||
if changes["recommendations"]:
|
||||
print(f" Recommendations:")
|
||||
for rec in changes["recommendations"]:
|
||||
print(f" -> {rec}")
|
||||
claimed_http = claimed_result.get("http_status")
|
||||
unclaimed_http = unclaimed_result.get("http_status")
|
||||
|
||||
# Only modify site if auto_disable is enabled
|
||||
if auto_disable and changes["disabled"] != site.disabled:
|
||||
site.disabled = changes["disabled"]
|
||||
logger.info(f"Switching property 'disabled' for {site.name} to {site.disabled}")
|
||||
db.update_site(site)
|
||||
if not silent:
|
||||
action = "Disabled" if site.disabled else "Enabled"
|
||||
print(f"{action} site {site.name}...")
|
||||
elif changes["issues"] and not silent and not diagnose:
|
||||
# Report issues without disabling
|
||||
print(f"Issues found in {site.name}: {len(changes['issues'])} (not auto-disabled)")
|
||||
if claimed_http and unclaimed_http:
|
||||
if claimed_http != unclaimed_http and site.check_type != "status_code":
|
||||
changes["recommendations"].append(
|
||||
f"Consider checkType: status_code (HTTP {claimed_http} vs {unclaimed_http})"
|
||||
)
|
||||
|
||||
# remove service tag "unchecked"
|
||||
if "unchecked" in site.tags:
|
||||
site.tags.remove("unchecked")
|
||||
db.update_site(site)
|
||||
# Print diagnosis if requested
|
||||
if diagnose and changes["issues"]:
|
||||
print(f"\n--- {site.name} DIAGNOSIS ---")
|
||||
print(f" Check type: {site.check_type}")
|
||||
print(" Issues:")
|
||||
for issue in changes["issues"]:
|
||||
print(f" - {issue}")
|
||||
if changes["recommendations"]:
|
||||
print(" Recommendations:")
|
||||
for rec in changes["recommendations"]:
|
||||
print(f" -> {rec}")
|
||||
|
||||
# Only modify site if auto_disable is enabled
|
||||
if auto_disable and changes["disabled"] != site.disabled:
|
||||
site.disabled = changes["disabled"]
|
||||
logger.info(f"Switching property 'disabled' for {site.name} to {site.disabled}")
|
||||
db.update_site(site)
|
||||
if not silent:
|
||||
action = "Disabled" if site.disabled else "Enabled"
|
||||
print(f"{action} site {site.name}...")
|
||||
elif changes["issues"] and not silent and not diagnose:
|
||||
# Report issues without disabling
|
||||
print(f"Issues found in {site.name}: {len(changes['issues'])} (not auto-disabled)")
|
||||
|
||||
# remove service tag "unchecked"
|
||||
if "unchecked" in site.tags:
|
||||
site.tags.remove("unchecked")
|
||||
db.update_site(site)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Self-check of {site.name} failed with unexpected error: {e}",
|
||||
exc_info=True,
|
||||
)
|
||||
changes["issues"].append(f"Unexpected error: {e}")
|
||||
if auto_disable and not site.disabled:
|
||||
changes["disabled"] = True
|
||||
site.disabled = True
|
||||
db.update_site(site)
|
||||
if not silent:
|
||||
print(f"Disabled site {site.name} (unexpected error)...")
|
||||
|
||||
return changes
|
||||
|
||||
@@ -981,6 +1119,7 @@ async def self_check(
|
||||
i2p_proxy=None,
|
||||
auto_disable=False,
|
||||
diagnose=False,
|
||||
no_progressbar=False,
|
||||
) -> dict:
|
||||
"""
|
||||
Run self-check on sites.
|
||||
@@ -1015,9 +1154,20 @@ async def self_check(
|
||||
tasks.append((site.name, future))
|
||||
|
||||
if tasks:
|
||||
with alive_bar(len(tasks), title='Self-checking', force_tty=True) as progress:
|
||||
with alive_bar(len(tasks), title='Self-checking', force_tty=True, disable=no_progressbar) as progress:
|
||||
for site_name, f in tasks:
|
||||
result = await f
|
||||
try:
|
||||
result = await f
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Self-check task for {site_name} raised unexpected error: {e}",
|
||||
exc_info=True,
|
||||
)
|
||||
result = {
|
||||
"disabled": False,
|
||||
"issues": [f"Unexpected error: {e}"],
|
||||
"recommendations": [],
|
||||
}
|
||||
result['site_name'] = site_name
|
||||
all_results.append(result)
|
||||
progress() # Update the progress bar
|
||||
@@ -1053,10 +1203,6 @@ async def self_check(
|
||||
|
||||
needs_update = total_disabled != 0 or unchecked_new_count != unchecked_old_count
|
||||
|
||||
# For backwards compatibility, return bool if auto_disable is True
|
||||
if auto_disable:
|
||||
return needs_update
|
||||
|
||||
return {
|
||||
'needs_update': needs_update,
|
||||
'results': all_results,
|
||||
@@ -1080,7 +1226,7 @@ def parse_usernames(extracted_ids_data, logger) -> Dict:
|
||||
elif "usernames" in k:
|
||||
try:
|
||||
tree = ast.literal_eval(v)
|
||||
if type(tree) == list:
|
||||
if isinstance(tree, list):
|
||||
for n in tree:
|
||||
new_usernames[n] = "username"
|
||||
except Exception as e:
|
||||
|
||||
@@ -0,0 +1,342 @@
|
||||
"""
|
||||
Database auto-update logic for maigret.
|
||||
|
||||
Checks a lightweight meta file to determine if a newer site database is available,
|
||||
downloads it if compatible, and caches it locally in ~/.maigret/.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import os.path as path
|
||||
import tempfile
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from colorama import Fore, Style
|
||||
|
||||
from .__version__ import __version__
|
||||
|
||||
logger = logging.getLogger("maigret")
|
||||
|
||||
_use_color = True
|
||||
|
||||
|
||||
def _print_info(msg: str) -> None:
    """Print *msg* prefixed with ``[*]``; bright green when color output is enabled."""
    line = f"[*] {msg}"
    if not _use_color:
        print(line)
        return
    print(Style.BRIGHT + Fore.GREEN + line + Style.RESET_ALL)
|
||||
|
||||
|
||||
def _print_success(msg: str) -> None:
    """Print a success ``[+]`` line, in bright green when color is enabled."""
    line = f"[+] {msg}"
    if not _use_color:
        print(line)
        return
    print(Style.BRIGHT + Fore.GREEN + line + Style.RESET_ALL)
|
||||
|
||||
|
||||
def _print_warning(msg: str) -> None:
    """Print a warning ``[!]`` line, in bright yellow when color is enabled."""
    line = f"[!] {msg}"
    if not _use_color:
        print(line)
        return
    print(Style.BRIGHT + Fore.YELLOW + line + Style.RESET_ALL)
|
||||
|
||||
|
||||
# Location of the lightweight meta file consulted to detect a newer database.
DEFAULT_META_URL = "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json"
# Minimum number of hours between two update checks.
DEFAULT_CHECK_INTERVAL_HOURS = 24
# Per-user directory holding the cached database and the updater state.
MAIGRET_HOME = path.expanduser("~/.maigret")
CACHED_DB_PATH = path.join(MAIGRET_HOME, "data.json")
STATE_PATH = path.join(MAIGRET_HOME, "autoupdate_state.json")
# Database shipped inside the package, next to this module.
BUNDLED_DB_PATH = path.join(
    path.dirname(path.realpath(__file__)),
    "resources",
    "data.json",
)
|
||||
|
||||
|
||||
def _parse_version(version_str: str) -> tuple:
|
||||
"""Parse a version string like '0.5.0' into a comparable tuple (0, 5, 0)."""
|
||||
try:
|
||||
return tuple(int(x) for x in version_str.strip().split("."))
|
||||
except (ValueError, AttributeError):
|
||||
return (0, 0, 0)
|
||||
|
||||
|
||||
def _ensure_maigret_home() -> None:
    """Create the ``MAIGRET_HOME`` directory (with parents) unless it already exists."""
    os.makedirs(name=MAIGRET_HOME, exist_ok=True)
|
||||
|
||||
|
||||
def _load_state() -> dict:
    """Load the persisted auto-update state from ``STATE_PATH``.

    Returns an empty dict when the file is missing, unreadable, not valid
    UTF-8 / JSON, or does not hold a JSON object, so callers can always use
    ``state.get(...)`` on the result.
    """
    try:
        with open(STATE_PATH, "r", encoding="utf-8") as f:
            state = json.load(f)
    except (OSError, ValueError):
        # OSError includes FileNotFoundError; ValueError includes both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {}
    # A hand-edited or corrupted file may hold a list, string or null.
    return state if isinstance(state, dict) else {}
|
||||
|
||||
|
||||
def _save_state(state: dict) -> None:
    """Persist *state* to ``STATE_PATH`` on a best-effort basis.

    The data is written to a sibling ``.tmp`` file and moved into place with
    ``os.replace`` so a partially written state file is never left behind.
    Filesystem errors (including failure to create the home directory) are
    logged at debug level and otherwise ignored.
    """
    tmp_path = STATE_PATH + ".tmp"
    try:
        _ensure_maigret_home()
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(state, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, STATE_PATH)
    except OSError as e:
        logger.debug("DB auto-update: could not save state to %s: %s", STATE_PATH, e)
        # Remove the leftover temp file, if one was created.
        try:
            os.unlink(tmp_path)
        except OSError:
            pass
|
||||
|
||||
|
||||
def _needs_check(state: dict, interval_hours: int) -> bool:
|
||||
last_check = state.get("last_check_at")
|
||||
if not last_check:
|
||||
return True
|
||||
try:
|
||||
last_dt = datetime.fromisoformat(last_check.replace("Z", "+00:00"))
|
||||
elapsed = (datetime.now(timezone.utc) - last_dt).total_seconds() / 3600
|
||||
return elapsed >= interval_hours
|
||||
except (ValueError, TypeError):
|
||||
return True
|
||||
|
||||
|
||||
def _fetch_meta(meta_url: str, timeout: int = 10) -> Optional[dict]:
    """Fetch and decode the database meta file.

    Args:
        meta_url: URL of the JSON meta file.
        timeout: Request timeout in seconds.

    Returns:
        The decoded JSON object, or None when the request fails, the status
        code is not 200, the body is not valid JSON, or the JSON value is not
        an object. Failures are logged at debug level and never raised.
    """
    try:
        response = requests.get(meta_url, timeout=timeout)
        if response.status_code != 200:
            logger.debug(
                "DB auto-update: meta request to %s returned HTTP %s",
                meta_url,
                response.status_code,
            )
            return None
        meta = response.json()
    except Exception as e:
        # Deliberately broad: the update check must never abort startup.
        logger.debug("DB auto-update: failed to fetch meta from %s: %s", meta_url, e)
        return None

    # Callers use ``meta.get(...)``; reject lists, strings, null, etc.
    if not isinstance(meta, dict):
        logger.debug("DB auto-update: meta from %s is not a JSON object", meta_url)
        return None
    return meta
|
||||
|
||||
|
||||
def _is_version_compatible(meta: dict) -> bool:
    """Tell whether the running maigret version satisfies ``min_maigret_version`` from *meta*."""
    required = _parse_version(meta.get("min_maigret_version", "0.0.0"))
    installed = _parse_version(__version__)
    return installed >= required
|
||||
|
||||
|
||||
def _is_update_available(meta: dict, state: dict) -> bool:
    """Tell whether the remote database is newer than the cached one.

    Returns True when no cached database file exists. Otherwise the
    ``updated_at`` string of *meta* is compared with the ``updated_at`` stored
    under ``state["last_meta"]``; the comparison is a plain string comparison.
    Missing or non-string values are treated as the empty string, so
    malformed meta or state never raises.
    """
    if not path.isfile(CACHED_DB_PATH):
        return True

    last_meta = state.get("last_meta")
    if not isinstance(last_meta, dict):
        last_meta = {}

    remote_date = meta.get("updated_at")
    cached_date = last_meta.get("updated_at")
    if not isinstance(remote_date, str):
        remote_date = ""
    if not isinstance(cached_date, str):
        cached_date = ""
    return remote_date > cached_date
|
||||
|
||||
|
||||
def _download_and_verify(data_url: str, expected_sha256: str, timeout: int = 60) -> Optional[str]:
    """Download the database, verify it, and install it as the cached database.

    The payload is accepted only if the HTTP status is 200, its SHA-256 digest
    equals *expected_sha256* (compared case-insensitively), and it decodes to
    a JSON object containing the ``sites``, ``engines`` and ``tags`` keys.
    It is then written to a temporary file inside ``MAIGRET_HOME`` and moved
    over ``CACHED_DB_PATH`` with ``os.replace``.

    Args:
        data_url: URL of the database JSON file.
        expected_sha256: Hex SHA-256 digest the payload must match.
        timeout: Request timeout in seconds.

    Returns:
        ``CACHED_DB_PATH`` on success, None on any failure. Failures are
        reported as a warning or logged at debug level, never raised.
    """
    expected = expected_sha256.strip().lower() if isinstance(expected_sha256, str) else ""
    tmp_path = None
    try:
        response = requests.get(data_url, timeout=timeout)
        if response.status_code != 200:
            return None

        content = response.content
        actual_sha256 = hashlib.sha256(content).hexdigest()
        if actual_sha256 != expected:
            _print_warning("DB auto-update: SHA-256 mismatch, download rejected")
            return None

        # Validate JSON structure
        data = json.loads(content)
        if not isinstance(data, dict) or not all(k in data for k in ("sites", "engines", "tags")):
            _print_warning("DB auto-update: invalid database structure")
            return None

        # Create the temp file only once the payload is known to be good.
        _ensure_maigret_home()
        tmp_fd, tmp_path = tempfile.mkstemp(dir=MAIGRET_HOME, suffix=".json")
        # The file object owns the descriptor: it writes the whole buffer
        # (a bare os.write may write only part of it) and closes it on exit.
        with os.fdopen(tmp_fd, "wb") as tmp_file:
            tmp_file.write(content)
        os.replace(tmp_path, CACHED_DB_PATH)
        tmp_path = None  # moved into place, nothing left to clean up
        return CACHED_DB_PATH
    except Exception as e:
        # Deliberately broad: a failed update must fall back to the local DB.
        logger.debug("DB auto-update: download from %s failed: %s", data_url, e)
        return None
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
|
||||
|
||||
|
||||
def _best_local() -> str:
    """Return cached DB if it exists and is valid, otherwise bundled.

    The cached file counts as valid when it can be read, decodes as UTF-8
    JSON, and is a JSON object containing a ``sites`` key.
    """
    if path.isfile(CACHED_DB_PATH):
        try:
            with open(CACHED_DB_PATH, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError):
            # ValueError includes json.JSONDecodeError and UnicodeDecodeError.
            data = None
        # Guard the membership test: a JSON number or null is not a container.
        if isinstance(data, dict) and "sites" in data:
            return CACHED_DB_PATH
    return BUNDLED_DB_PATH
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def resolve_db_path(
    db_file_arg: str,
    no_autoupdate: bool = False,
    meta_url: str = DEFAULT_META_URL,
    check_interval_hours: int = DEFAULT_CHECK_INTERVAL_HOURS,
    color: bool = True,
) -> str:
    """
    Determine which database file to use, potentially downloading an update.

    Resolution order:
      * an ``http://`` / ``https://`` argument is returned unchanged;
      * a custom (non-default) path is resolved as given, then relative to
        this module's directory, and never triggers an update;
      * for the default database, the cached copy is refreshed when
        auto-update is enabled and the check interval has elapsed.

    Args:
        db_file_arg: Database path or URL supplied by the user or settings.
        no_autoupdate: When True, never contact the update server.
        meta_url: URL of the meta file describing the latest database.
        check_interval_hours: Minimum number of hours between update checks.
        color: Whether status messages are colorized.

    Returns the path to the database file that should be loaded.

    Raises:
        FileNotFoundError: A custom database path does not exist.
    """
    global _use_color
    _use_color = color

    default_db_name = "resources/data.json"

    # A URL is passed through untouched.
    if db_file_arg.startswith(("http://", "https://")):
        return db_file_arg

    # User specified a custom DB — skip auto-update
    if db_file_arg != default_db_name:
        # Try the path as-is (absolute or relative to cwd) first.
        if path.isfile(db_file_arg):
            return path.abspath(db_file_arg)
        # Fall back to legacy behavior: resolve relative to the maigret module dir.
        module_relative = path.join(path.dirname(path.realpath(__file__)), db_file_arg)
        if module_relative == db_file_arg:
            # Absolute argument: joining did not produce a second candidate.
            raise FileNotFoundError(f"Custom database file not found: {db_file_arg!r}")
        if path.isfile(module_relative):
            return module_relative
        raise FileNotFoundError(
            f"Custom database file not found: {db_file_arg!r} "
            f"(also tried {module_relative!r})"
        )

    # Auto-update disabled
    if no_autoupdate:
        return _best_local()

    # Without a writable home directory nothing can be cached; a local
    # database is still usable, so do not abort startup.
    try:
        _ensure_maigret_home()
    except OSError as e:
        _print_warning(f"DB auto-update: cannot use {MAIGRET_HOME} ({e}), using local database")
        return _best_local()

    # Check interval
    state = _load_state()
    if not _needs_check(state, check_interval_hours):
        return _best_local()

    # Time to check
    _print_info("DB auto-update: checking for updates...")
    meta = _fetch_meta(meta_url)
    if meta is None:
        _print_warning("DB auto-update: could not reach update server, using local database")
        state["last_check_at"] = _now_iso()
        _save_state(state)
        return _best_local()

    # Version compatibility
    if not _is_version_compatible(meta):
        min_ver = meta.get("min_maigret_version", "?")
        _print_warning(
            f"DB auto-update: latest database requires maigret >= {min_ver}, "
            f"you have {__version__}. Please upgrade with: pip install -U maigret"
        )
        state["last_check_at"] = _now_iso()
        _save_state(state)
        return _best_local()

    # Check if update available
    if not _is_update_available(meta, state):
        sites_count = meta.get("sites_count", "?")
        _print_info(f"DB auto-update: database is up to date ({sites_count} sites)")
        state["last_check_at"] = _now_iso()
        state["last_meta"] = meta
        _save_state(state)
        return _best_local()

    # Download update
    new_count = meta.get("sites_count", "?")
    old_count = state.get("last_meta", {}).get("sites_count")
    if old_count:
        _print_info(f"DB auto-update: downloading updated database ({new_count} sites, was {old_count})...")
    else:
        _print_info(f"DB auto-update: downloading database ({new_count} sites)...")

    data_url = meta.get("data_url", "")
    expected_sha = meta.get("data_sha256", "")
    result = _download_and_verify(data_url, expected_sha)

    if result is None:
        _print_warning("DB auto-update: download failed, using local database")
        # Record the attempt so a failing server is not contacted on every run.
        state["last_check_at"] = _now_iso()
        _save_state(state)
        return _best_local()

    _print_success(f"DB auto-update: database updated successfully ({new_count} sites)")
    state["last_check_at"] = _now_iso()
    state["last_meta"] = meta
    state["cached_db_sha256"] = expected_sha
    _save_state(state)
    return CACHED_DB_PATH
|
||||
|
||||
|
||||
def force_update(
    meta_url: str = DEFAULT_META_URL,
    color: bool = True,
) -> bool:
    """
    Force check for database updates and download if available.

    Unlike the automatic check, the configured check interval is not
    consulted: the meta file is always fetched.

    Args:
        meta_url: URL of the meta file describing the latest database.
        color: Whether status messages are colorized.

    Returns True if database was updated, False otherwise.
    """
    global _use_color
    _use_color = color

    # Report an unusable home directory instead of raising.
    try:
        _ensure_maigret_home()
    except OSError as e:
        _print_warning(f"DB update: cannot use {MAIGRET_HOME} ({e})")
        return False

    _print_info("DB update: checking for updates...")
    meta = _fetch_meta(meta_url)
    if meta is None:
        _print_warning("DB update: could not reach update server")
        return False

    if not _is_version_compatible(meta):
        min_ver = meta.get("min_maigret_version", "?")
        _print_warning(
            f"DB update: latest database requires maigret >= {min_ver}, "
            f"you have {__version__}. Please upgrade with: pip install -U maigret"
        )
        return False

    state = _load_state()
    new_count = meta.get("sites_count", "?")
    old_count = state.get("last_meta", {}).get("sites_count")

    if not _is_update_available(meta, state):
        _print_info(f"DB update: database is already up to date ({new_count} sites)")
        state["last_check_at"] = _now_iso()
        state["last_meta"] = meta
        _save_state(state)
        return False

    if old_count:
        _print_info(f"DB update: downloading updated database ({new_count} sites, was {old_count})...")
    else:
        _print_info(f"DB update: downloading database ({new_count} sites)...")

    data_url = meta.get("data_url", "")
    expected_sha = meta.get("data_sha256", "")
    result = _download_and_verify(data_url, expected_sha)

    if result is None:
        _print_warning("DB update: download failed")
        return False

    _print_success(f"DB update: database updated successfully ({new_count} sites)")
    state["last_check_at"] = _now_iso()
    state["last_meta"] = meta
    state["cached_db_sha256"] = expected_sha
    _save_state(state)
    return True
|
||||
@@ -58,6 +58,8 @@ COMMON_ERRORS = {
|
||||
'Censorship', 'MGTS'
|
||||
),
|
||||
'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
|
||||
'<title>Client Challenge</title>': CheckError('Bot protection', 'Anti-bot challenge'),
|
||||
'<title>DDoS-Guard</title>': CheckError('Bot protection', 'DDoS-Guard'),
|
||||
'Сайт заблокирован хостинг-провайдером': CheckError(
|
||||
'Site-specific', 'Site is disabled (Beget)'
|
||||
),
|
||||
|
||||
@@ -103,7 +103,7 @@ class AsyncioProgressbarQueueExecutor(AsyncExecutor):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.workers_count = kwargs.get('in_parallel', 10)
|
||||
self.queue = asyncio.Queue(self.workers_count)
|
||||
self.queue: asyncio.Queue = asyncio.Queue(self.workers_count)
|
||||
self.timeout = kwargs.get('timeout')
|
||||
# Pass a progress function; alive_bar by default
|
||||
self.progress_func = kwargs.get('progress_func', alive_bar)
|
||||
@@ -184,10 +184,10 @@ class AsyncioQueueGeneratorExecutor:
|
||||
# Deprecated: will be removed soon, don't use it
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.workers_count = kwargs.get('in_parallel', 10)
|
||||
self.queue = asyncio.Queue()
|
||||
self.queue: asyncio.Queue = asyncio.Queue()
|
||||
self.timeout = kwargs.get('timeout')
|
||||
self.logger = kwargs['logger']
|
||||
self._results = asyncio.Queue()
|
||||
self._results: asyncio.Queue = asyncio.Queue()
|
||||
self._stop_signal = object()
|
||||
|
||||
async def worker(self):
|
||||
@@ -209,7 +209,7 @@ class AsyncioQueueGeneratorExecutor:
|
||||
result = kwargs.get('default')
|
||||
await self._results.put(result)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error in worker: {e}")
|
||||
self.logger.error(f"Error in worker: {e}", exc_info=True)
|
||||
finally:
|
||||
self.queue.task_done()
|
||||
|
||||
|
||||
+91
-12
@@ -13,7 +13,7 @@ from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||
from typing import List, Tuple
|
||||
import os.path as path
|
||||
|
||||
from socid_extractor import extract, parse
|
||||
from socid_extractor import extract, parse # type: ignore[import-not-found]
|
||||
|
||||
from .__version__ import __version__
|
||||
from .checking import (
|
||||
@@ -37,6 +37,7 @@ from .report import (
|
||||
get_plaintext_report,
|
||||
sort_report_by_data_points,
|
||||
save_graph_report,
|
||||
save_markdown_report,
|
||||
)
|
||||
from .sites import MaigretDatabase
|
||||
from .submit import Submitter
|
||||
@@ -75,7 +76,7 @@ def extract_ids_from_page(url, logger, timeout=5) -> dict:
|
||||
elif 'usernames' in k:
|
||||
try:
|
||||
tree = ast.literal_eval(v)
|
||||
if type(tree) == list:
|
||||
if isinstance(tree, list):
|
||||
for n in tree:
|
||||
results[n] = 'username'
|
||||
except Exception as e:
|
||||
@@ -201,6 +202,20 @@ def setup_arguments_parser(settings: Settings):
|
||||
default=settings.sites_db_path,
|
||||
help="Load Maigret database from a JSON file or HTTP web resource.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-autoupdate",
|
||||
action="store_true",
|
||||
dest="no_autoupdate",
|
||||
default=settings.no_autoupdate,
|
||||
help="Disable automatic database updates on startup.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force-update",
|
||||
action="store_true",
|
||||
dest="force_update",
|
||||
default=False,
|
||||
help="Force check for database updates and download if available.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cookies-jar-file",
|
||||
metavar="COOKIE_FILE",
|
||||
@@ -277,6 +292,12 @@ def setup_arguments_parser(settings: Settings):
|
||||
filter_group.add_argument(
|
||||
"--tags", dest="tags", default='', help="Specify tags of sites (see `--stats`)."
|
||||
)
|
||||
filter_group.add_argument(
|
||||
"--exclude-tags",
|
||||
dest="exclude_tags",
|
||||
default='',
|
||||
help="Specify tags to exclude from search (blacklist).",
|
||||
)
|
||||
filter_group.add_argument(
|
||||
"--site",
|
||||
action="append",
|
||||
@@ -445,6 +466,14 @@ def setup_arguments_parser(settings: Settings):
|
||||
default=settings.pdf_report,
|
||||
help="Generate a PDF report (general report on all usernames).",
|
||||
)
|
||||
report_group.add_argument(
|
||||
"-M",
|
||||
"--md",
|
||||
action="store_true",
|
||||
dest="md",
|
||||
default=settings.md_report,
|
||||
help="Generate a Markdown report (general report on all usernames).",
|
||||
)
|
||||
report_group.add_argument(
|
||||
"-G",
|
||||
"--graph",
|
||||
@@ -532,9 +561,30 @@ async def main():
|
||||
if args.tags:
|
||||
args.tags = list(set(str(args.tags).split(',')))
|
||||
|
||||
db_file = args.db_file \
|
||||
if (args.db_file.startswith("http://") or args.db_file.startswith("https://")) \
|
||||
else path.join(path.dirname(path.realpath(__file__)), args.db_file)
|
||||
if args.exclude_tags:
|
||||
args.exclude_tags = list(set(str(args.exclude_tags).split(',')))
|
||||
else:
|
||||
args.exclude_tags = []
|
||||
|
||||
from .db_updater import resolve_db_path, force_update, BUNDLED_DB_PATH
|
||||
|
||||
if args.force_update:
|
||||
force_update(
|
||||
meta_url=settings.db_update_meta_url,
|
||||
color=not args.no_color,
|
||||
)
|
||||
|
||||
try:
|
||||
db_file = resolve_db_path(
|
||||
db_file_arg=args.db_file,
|
||||
no_autoupdate=args.no_autoupdate or args.force_update,
|
||||
meta_url=settings.db_update_meta_url,
|
||||
check_interval_hours=settings.autoupdate_check_interval_hours,
|
||||
color=not args.no_color,
|
||||
)
|
||||
except FileNotFoundError as e:
|
||||
logger.error(str(e))
|
||||
sys.exit(2)
|
||||
|
||||
if args.top_sites == 0 or args.all_sites:
|
||||
args.top_sites = sys.maxsize
|
||||
@@ -549,10 +599,25 @@ async def main():
|
||||
)
|
||||
|
||||
# Create object with all information about sites we are aware of.
|
||||
db = MaigretDatabase().load_from_path(db_file)
|
||||
try:
|
||||
db = MaigretDatabase().load_from_path(db_file)
|
||||
query_notify.success(f'Using sites database: {db_file} ({len(db.sites)} sites)')
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load database from {db_file}: {e}")
|
||||
if db_file != BUNDLED_DB_PATH:
|
||||
query_notify.warning(
|
||||
f'Falling back to bundled database: {BUNDLED_DB_PATH}'
|
||||
)
|
||||
db = MaigretDatabase().load_from_path(BUNDLED_DB_PATH)
|
||||
query_notify.success(
|
||||
f'Using sites database: {BUNDLED_DB_PATH} ({len(db.sites)} sites)'
|
||||
)
|
||||
else:
|
||||
raise
|
||||
get_top_sites_for_id = lambda x: db.ranked_sites_dict(
|
||||
top=args.top_sites,
|
||||
tags=args.tags,
|
||||
excluded_tags=args.exclude_tags,
|
||||
names=args.site_list,
|
||||
disabled=args.use_disabled_sites,
|
||||
id_type=x,
|
||||
@@ -588,13 +653,10 @@ async def main():
|
||||
i2p_proxy=args.i2p_proxy,
|
||||
auto_disable=args.auto_disable,
|
||||
diagnose=args.diagnose,
|
||||
no_progressbar=args.no_progressbar,
|
||||
)
|
||||
|
||||
# Handle both old (bool) and new (dict) return types
|
||||
if isinstance(check_result, dict):
|
||||
is_need_update = check_result.get('needs_update', False)
|
||||
else:
|
||||
is_need_update = check_result
|
||||
is_need_update = check_result.get('needs_update', False)
|
||||
|
||||
if is_need_update:
|
||||
if input('Do you want to save changes permanently? [Yn]\n').lower() in (
|
||||
@@ -760,7 +822,7 @@ async def main():
|
||||
|
||||
# reporting for all the result
|
||||
if general_results:
|
||||
if args.html or args.pdf:
|
||||
if args.html or args.pdf or args.md:
|
||||
query_notify.warning('Generating report info...')
|
||||
report_context = generate_report_context(general_results)
|
||||
# determine main username
|
||||
@@ -780,6 +842,23 @@ async def main():
|
||||
save_pdf_report(filename, report_context)
|
||||
query_notify.warning(f'PDF report on all usernames saved in {filename}')
|
||||
|
||||
if args.md:
|
||||
username = username.replace('/', '_')
|
||||
filename = report_filepath_tpl.format(username=username, postfix='.md')
|
||||
run_flags = []
|
||||
if args.tags:
|
||||
run_flags.append(f"--tags {args.tags}")
|
||||
if args.site_list:
|
||||
run_flags.append(f"--site {','.join(args.site_list)}")
|
||||
if args.all_sites:
|
||||
run_flags.append("--all-sites")
|
||||
run_info = {
|
||||
"sites_count": sum(len(d) for _, _, d in general_results),
|
||||
"flags": " ".join(run_flags) if run_flags else None,
|
||||
}
|
||||
save_markdown_report(filename, report_context, run_info=run_info)
|
||||
query_notify.warning(f'Markdown report on all usernames saved in {filename}')
|
||||
|
||||
if args.graph:
|
||||
username = username.replace('/', '_')
|
||||
filename = report_filepath_tpl.format(
|
||||
|
||||
+1
-1
@@ -174,7 +174,7 @@ class QueryNotifyPrint(QueryNotify):
|
||||
else:
|
||||
return self.make_simple_terminal_notify(*args)
|
||||
|
||||
def start(self, message, id_type):
|
||||
def start(self, message=None, id_type="username"):
|
||||
"""Notify Start.
|
||||
|
||||
Will print the title to the standard output.
|
||||
|
||||
+151
-12
@@ -7,7 +7,7 @@ import os
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
|
||||
import xmind
|
||||
import xmind # type: ignore[import-untyped]
|
||||
from dateutil.tz import gettz
|
||||
from dateutil.parser import parse as parse_datetime_str
|
||||
from jinja2 import Template
|
||||
@@ -79,7 +79,7 @@ def save_pdf_report(filename: str, context: dict):
|
||||
filled_template = template.render(**context)
|
||||
|
||||
# moved here to speed up the launch of Maigret
|
||||
from xhtml2pdf import pisa
|
||||
from xhtml2pdf import pisa # type: ignore[import-untyped]
|
||||
|
||||
with open(filename, "w+b") as f:
|
||||
pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css)
|
||||
@@ -91,9 +91,9 @@ def save_json_report(filename: str, username: str, results: dict, report_type: s
|
||||
|
||||
|
||||
class MaigretGraph:
|
||||
other_params = {'size': 10, 'group': 3}
|
||||
site_params = {'size': 15, 'group': 2}
|
||||
username_params = {'size': 20, 'group': 1}
|
||||
other_params: dict = {'size': 10, 'group': 3}
|
||||
site_params: dict = {'size': 15, 'group': 2}
|
||||
username_params: dict = {'size': 20, 'group': 1}
|
||||
|
||||
def __init__(self, graph):
|
||||
self.G = graph
|
||||
@@ -121,12 +121,12 @@ class MaigretGraph:
|
||||
def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
|
||||
import networkx as nx
|
||||
|
||||
G = nx.Graph()
|
||||
G: Any = nx.Graph()
|
||||
graph = MaigretGraph(G)
|
||||
|
||||
base_site_nodes = {}
|
||||
site_account_nodes = {}
|
||||
processed_values = {} # Track processed values to avoid duplicates
|
||||
processed_values: Dict[str, Any] = {} # Track processed values to avoid duplicates
|
||||
|
||||
for username, id_type, results in username_results:
|
||||
# Add username node, using normalized version directly if different
|
||||
@@ -239,7 +239,7 @@ def save_graph_report(filename: str, username_results: list, db: MaigretDatabase
|
||||
G.remove_nodes_from(single_degree_sites)
|
||||
|
||||
# Generate interactive visualization
|
||||
from pyvis.network import Network
|
||||
from pyvis.network import Network # type: ignore[import-untyped]
|
||||
|
||||
nt = Network(notebook=True, height="750px", width="100%")
|
||||
nt.from_nx(G)
|
||||
@@ -257,6 +257,144 @@ def get_plaintext_report(context: dict) -> str:
|
||||
return output.strip()
|
||||
|
||||
|
||||
def _md_format_value(value) -> str:
|
||||
"""Format a value for Markdown output, detecting links."""
|
||||
if isinstance(value, list):
|
||||
return ", ".join(str(v) for v in value)
|
||||
s = str(value)
|
||||
if s.startswith("http://") or s.startswith("https://"):
|
||||
return f"[{s}]({s})"
|
||||
return s
|
||||
|
||||
|
||||
def save_markdown_report(filename: str, context: dict, run_info: dict = None):
    """Write a Markdown report for a username search to *filename*.

    Args:
        filename: Path of the UTF-8 text file to create or overwrite.
        context: Report context. The keys read here are ``username``,
            ``generated_at``, ``brief``, ``countries_tuple_list``,
            ``interests_tuple_list``, ``first_seen`` and ``results``
            (an iterable of 3-tuples whose last item maps site name to a
            per-site result dict).
        run_info: Optional run details; ``sites_count`` and ``flags`` are
            appended to the "Generated at" line when present.

    Only results with a truthy ``found`` and a falsy ``is_similar`` are
    reported.
    """
    username = context.get("username", "unknown")
    generated_at = context.get("generated_at", "")
    brief = context.get("brief", "")
    countries = context.get("countries_tuple_list", [])
    interests = context.get("interests_tuple_list", [])
    first_seen = context.get("first_seen")
    results = context.get("results", [])

    # Map multiple source fields to unified output fields
    field_sources = {
        "fullname": ("fullname", "name"),
        "location": ("location", "country", "city", "country_code", "locale", "region"),
        "gender": ("gender",),
        "bio": ("bio", "about", "description"),
    }
    timestamp_fields = ("last_online", "latest_activity_at", "updated_at")

    def _found_accounts():
        """Yield (site_name, result) for every reportable account."""
        for _, _, data in results:
            for site_name, v in data.items():
                if v.get("found") and not v.get("is_similar"):
                    yield site_name, v

    # Collect ALL values for key fields across all accounts
    all_fields: Dict[str, list] = {}
    last_seen = None
    for _, v in _found_accounts():
        # ``ids_data`` may be absent or explicitly None.
        ids_data = v.get("ids_data") or {}
        for out_field, source_keys in field_sources.items():
            for src in source_keys:
                val = ids_data.get(src)
                if not val:
                    continue
                val_str = str(val)
                collected = all_fields.setdefault(out_field, [])
                if val_str not in collected:
                    collected.append(val_str)
        # Track last_seen (timestamps are compared as strings)
        for ts_field in timestamp_fields:
            ts = ids_data.get(ts_field)
            if ts and (last_seen is None or str(ts) > str(last_seen)):
                last_seen = ts

    lines = []
    lines.append(f"# Report by searching on username \"{username}\"\n")

    # Generated line with run info
    gen_line = f"Generated at {generated_at} by [Maigret](https://github.com/soxoj/maigret)"
    if run_info:
        parts = []
        if run_info.get("sites_count"):
            parts.append(f"{run_info['sites_count']} sites checked")
        if run_info.get("flags"):
            parts.append(f"flags: `{run_info['flags']}`")
        if parts:
            gen_line += f" ({', '.join(parts)})"
    lines.append(f"{gen_line}\n")

    # Summary
    lines.append("## Summary\n")
    lines.append(f"{brief}\n")

    if all_fields:
        lines.append("**Information extracted from accounts:**\n")
        for field, values in all_fields.items():
            title = CaseConverter.snake_to_title(field)
            lines.append(f"- {title}: {'; '.join(values)}")
        lines.append("")

    if countries:
        geo = ", ".join(f"{code} (x{count})" for code, count in countries)
        lines.append(f"**Country tags:** {geo}\n")

    if interests:
        tags = ", ".join(f"{tag} (x{count})" for tag, count in interests)
        lines.append(f"**Website tags:** {tags}\n")

    if first_seen:
        lines.append(f"**First seen:** {first_seen}")
    if last_seen:
        lines.append(f"**Last seen:** {last_seen}")
    if first_seen or last_seen:
        lines.append("")

    # Accounts found
    lines.append("## Accounts found\n")

    for site_name, v in _found_accounts():
        url_user = v.get('url_user', '')
        lines.append(f"### {site_name}\n")
        lines.append(f"- **URL:** [{url_user}]({url_user})")

        status = v.get("status")
        tags = (status.tags if status else None) or []
        if tags:
            lines.append(f"- **Tags:** {', '.join(tags)}")
        lines.append("")

        for field, value in (v.get("ids_data") or {}).items():
            if field == "image":
                continue
            title = CaseConverter.snake_to_title(field)
            lines.append(f"- {title}: {_md_format_value(value)}")

        lines.append("")

    # Possible false positives
    lines.append("## Possible false positives\n")
    lines.append(
        f"This report was generated by searching for accounts matching the username `{username}`. "
        f"Accounts listed above may belong to different people who happen to use the same "
        f"or similar username. Results without extracted personal information could contain "
        f"some false positive findings. Always verify findings before drawing conclusions.\n"
    )

    # Ethical use
    lines.append("## Ethical use\n")
    lines.append(
        "This report is a result of a technical collection of publicly available information "
        "from online accounts and does not constitute personal data processing. If you intend "
        "to use this data for personal data processing or collection purposes, ensure your use "
        "complies with applicable laws and regulations in your jurisdiction (such as GDPR, "
        "CCPA, and similar).\n"
    )

    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
|
||||
|
||||
|
||||
"""
|
||||
REPORTS GENERATING
|
||||
"""
|
||||
@@ -353,11 +491,12 @@ def generate_report_context(username_results: list):
|
||||
if k in ["country", "locale"]:
|
||||
try:
|
||||
if is_country_tag(k):
|
||||
tag = pycountry.countries.get(alpha_2=v).alpha_2.lower()
|
||||
country = pycountry.countries.get(alpha_2=v)
|
||||
tag = country.alpha_2.lower() # type: ignore[union-attr]
|
||||
else:
|
||||
tag = pycountry.countries.search_fuzzy(v)[
|
||||
0
|
||||
].alpha_2.lower()
|
||||
].alpha_2.lower() # type: ignore[attr-defined]
|
||||
# TODO: move countries to another struct
|
||||
tags[tag] = tags.get(tag, 0) + 1
|
||||
except Exception as e:
|
||||
@@ -513,8 +652,8 @@ def add_xmind_subtopic(userlink, k, v, supposed_data):
|
||||
|
||||
|
||||
def design_xmind_sheet(sheet, username, results):
|
||||
alltags = {}
|
||||
supposed_data = {}
|
||||
alltags: Dict[str, Any] = {}
|
||||
supposed_data: Dict[str, Any] = {}
|
||||
|
||||
sheet.setTitle("%s Analysis" % (username))
|
||||
root_topic1 = sheet.getRootTopic()
|
||||
|
||||
+24251
-24931
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"version": 1,
|
||||
"updated_at": "2026-04-10T10:28:14Z",
|
||||
"sites_count": 3150,
|
||||
"min_maigret_version": "0.6.0",
|
||||
"data_sha256": "72a493fef4eb8958fe8ed0c9b895841ec10c335f1b8e5e9b24b50784be6ad017",
|
||||
"data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
|
||||
}
|
||||
@@ -54,5 +54,9 @@
|
||||
"graph_report": false,
|
||||
"pdf_report": false,
|
||||
"html_report": false,
|
||||
"web_interface_port": 5000
|
||||
"md_report": false,
|
||||
"web_interface_port": 5000,
|
||||
"no_autoupdate": false,
|
||||
"db_update_meta_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json",
|
||||
"autoupdate_check_interval_hours": 24
|
||||
}
|
||||
@@ -42,7 +42,11 @@ class Settings:
|
||||
pdf_report: bool
|
||||
html_report: bool
|
||||
graph_report: bool
|
||||
md_report: bool
|
||||
web_interface_port: int
|
||||
no_autoupdate: bool
|
||||
db_update_meta_url: str
|
||||
autoupdate_check_interval_hours: int
|
||||
|
||||
# submit mode settings
|
||||
presence_strings: list
|
||||
|
||||
+62
-8
@@ -65,6 +65,10 @@ class MaigretSite:
|
||||
url_probe = None
|
||||
# Type of check to perform
|
||||
check_type = ""
|
||||
# HTTP request method (GET, POST, HEAD, etc.)
|
||||
request_method = ""
|
||||
# HTTP request payload (for POST, PUT, etc.)
|
||||
request_payload: Dict[str, Any] = {}
|
||||
# Whether to only send HEAD requests (GET by default)
|
||||
request_head_only = ""
|
||||
# GET parameters to include in requests
|
||||
@@ -88,10 +92,12 @@ class MaigretSite:
|
||||
# Alexa traffic rank
|
||||
alexa_rank = None
|
||||
# Source (in case a site is a mirror of another site)
|
||||
source = None
|
||||
source: Optional[str] = None
|
||||
|
||||
# URL protocol (http/https)
|
||||
protocol = ''
|
||||
# Protection types detected on this site (e.g. ["tls_fingerprint", "ddos_guard"])
|
||||
protection: List[str] = []
|
||||
|
||||
def __init__(self, name, information):
|
||||
self.name = name
|
||||
@@ -137,6 +143,8 @@ class MaigretSite:
|
||||
'regex_check',
|
||||
'url_probe',
|
||||
'check_type',
|
||||
'request_method',
|
||||
'request_payload',
|
||||
'request_head_only',
|
||||
'get_params',
|
||||
'presense_strs',
|
||||
@@ -167,7 +175,7 @@ class MaigretSite:
|
||||
self.__dict__[CaseConverter.camel_to_snake(group)],
|
||||
)
|
||||
|
||||
self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check)
|
||||
self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check or "")
|
||||
|
||||
def detect_username(self, url: str) -> Optional[str]:
|
||||
if self.url_regexp:
|
||||
@@ -318,6 +326,7 @@ class MaigretDatabase:
|
||||
reverse=False,
|
||||
top=sys.maxsize,
|
||||
tags=[],
|
||||
excluded_tags=[],
|
||||
names=[],
|
||||
disabled=True,
|
||||
id_type="username",
|
||||
@@ -336,7 +345,8 @@ class MaigretDatabase:
|
||||
Args:
|
||||
reverse (bool, optional): Reverse the sorting order. Defaults to False.
|
||||
top (int, optional): Maximum number of sites to return. Defaults to sys.maxsize.
|
||||
tags (list, optional): List of tags to filter sites by. Defaults to empty list.
|
||||
tags (list, optional): List of tags to filter sites by (whitelist). Defaults to empty list.
|
||||
excluded_tags (list, optional): List of tags to exclude sites by (blacklist). Defaults to empty list.
|
||||
names (list, optional): List of site names (or urls, see MaigretSite.__eq__) to filter by. Defaults to empty list.
|
||||
disabled (bool, optional): Whether to include disabled sites. Defaults to True.
|
||||
id_type (str, optional): Type of identifier to filter by. Defaults to "username".
|
||||
@@ -347,6 +357,7 @@ class MaigretDatabase:
|
||||
"""
|
||||
normalized_names = list(map(str.lower, names))
|
||||
normalized_tags = list(map(str.lower, tags))
|
||||
normalized_excluded_tags = list(map(str.lower, excluded_tags))
|
||||
|
||||
is_name_ok = lambda x: x.name.lower() in normalized_names
|
||||
is_source_ok = lambda x: x.source and x.source.lower() in normalized_names
|
||||
@@ -360,6 +371,22 @@ class MaigretDatabase:
|
||||
)
|
||||
is_id_type_ok = lambda x: x.type == id_type
|
||||
|
||||
is_excluded_by_tag = lambda x: set(
|
||||
map(str.lower, x.tags)
|
||||
).intersection(set(normalized_excluded_tags))
|
||||
is_excluded_by_engine = lambda x: (
|
||||
isinstance(x.engine, str)
|
||||
and x.engine.lower() in normalized_excluded_tags
|
||||
)
|
||||
is_excluded_by_protocol = lambda x: (
|
||||
x.protocol and x.protocol in normalized_excluded_tags
|
||||
)
|
||||
is_not_excluded = lambda x: not excluded_tags or not (
|
||||
is_excluded_by_tag(x)
|
||||
or is_excluded_by_engine(x)
|
||||
or is_excluded_by_protocol(x)
|
||||
)
|
||||
|
||||
filter_tags_engines_fun = (
|
||||
lambda x: not tags
|
||||
or is_engine_ok(x)
|
||||
@@ -370,6 +397,7 @@ class MaigretDatabase:
|
||||
|
||||
filter_fun = (
|
||||
lambda x: filter_tags_engines_fun(x)
|
||||
and is_not_excluded(x)
|
||||
and filter_names_fun(x)
|
||||
and is_disabled_needed(x)
|
||||
and is_id_type_ok(x)
|
||||
@@ -387,6 +415,7 @@ class MaigretDatabase:
|
||||
if top < sys.maxsize and sorted_list:
|
||||
filter_fun_ranking_parents = (
|
||||
lambda x: filter_tags_engines_fun(x)
|
||||
and is_not_excluded(x)
|
||||
and filter_names_fun(x)
|
||||
and is_id_type_ok(x)
|
||||
)
|
||||
@@ -435,9 +464,9 @@ class MaigretDatabase:
|
||||
"tags": self._tags,
|
||||
}
|
||||
|
||||
json_data = json.dumps(db_data, indent=4)
|
||||
json_data = json.dumps(db_data, indent=4, ensure_ascii=False)
|
||||
|
||||
with open(filename, "w") as f:
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
f.write(json_data)
|
||||
|
||||
return self
|
||||
@@ -537,7 +566,7 @@ class MaigretDatabase:
|
||||
|
||||
def get_scan_stats(self, sites_dict):
|
||||
sites = sites_dict or self.sites_dict
|
||||
found_flags = {}
|
||||
found_flags: Dict[str, int] = {}
|
||||
for _, s in sites.items():
|
||||
if "presense_flag" in s.stats:
|
||||
flag = s.stats["presense_flag"]
|
||||
@@ -558,8 +587,10 @@ class MaigretDatabase:
|
||||
def get_db_stats(self, is_markdown=False):
|
||||
# Initialize counters
|
||||
sites_dict = self.sites_dict
|
||||
urls = {}
|
||||
tags = {}
|
||||
urls: Dict[str, int] = {}
|
||||
tags: Dict[str, int] = {}
|
||||
engine_total: Dict[str, int] = {}
|
||||
engine_enabled: Dict[str, int] = {}
|
||||
disabled_count = 0
|
||||
message_checks_one_factor = 0
|
||||
status_checks = 0
|
||||
@@ -582,6 +613,14 @@ class MaigretDatabase:
|
||||
elif site.check_type == 'status_code':
|
||||
status_checks += 1
|
||||
|
||||
# Count engines
|
||||
if site.engine:
|
||||
engine_total[site.engine] = engine_total.get(site.engine, 0) + 1
|
||||
if not site.disabled:
|
||||
engine_enabled[site.engine] = (
|
||||
engine_enabled.get(site.engine, 0) + 1
|
||||
)
|
||||
|
||||
# Count tags
|
||||
if not site.tags:
|
||||
tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1
|
||||
@@ -618,11 +657,26 @@ class MaigretDatabase:
|
||||
f"Sites with probing: {', '.join(sorted(site_with_probing))}",
|
||||
f"Sites with activation: {', '.join(sorted(site_with_activation))}",
|
||||
self._format_top_items("profile URLs", urls, 20, is_markdown),
|
||||
self._format_engine_stats(engine_total, engine_enabled, is_markdown),
|
||||
self._format_top_items("tags", tags, 20, is_markdown, self._tags),
|
||||
]
|
||||
|
||||
return separator.join(output)
|
||||
|
||||
def _format_engine_stats(self, engine_total, engine_enabled, is_markdown):
|
||||
"""Format per-engine enabled/total counts, sorted by total descending."""
|
||||
output = "Sites by engine:\n"
|
||||
for engine, total in sorted(
|
||||
engine_total.items(), key=lambda x: x[1], reverse=True
|
||||
):
|
||||
enabled = engine_enabled.get(engine, 0)
|
||||
perc = round(100 * enabled / total, 1) if total else 0.0
|
||||
if is_markdown:
|
||||
output += f"- `{engine}`: {enabled}/{total} ({perc}%)\n"
|
||||
else:
|
||||
output += f"{enabled}/{total} ({perc}%)\t{engine}\n"
|
||||
return output
|
||||
|
||||
def _format_top_items(
|
||||
self, title, items_dict, limit, is_markdown, valid_items=None
|
||||
):
|
||||
|
||||
+41
-28
@@ -6,8 +6,7 @@ import logging
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from aiohttp import ClientSession, TCPConnector
|
||||
from aiohttp_socks import ProxyConnector
|
||||
import cloudscraper
|
||||
import cloudscraper # type: ignore[import-untyped]
|
||||
from colorama import Fore, Style
|
||||
|
||||
from .activation import import_aiohttp_cookies
|
||||
@@ -68,8 +67,10 @@ class Submitter:
|
||||
else:
|
||||
cookie_jar = import_aiohttp_cookies(args.cookie_file)
|
||||
|
||||
connector = ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=False)
|
||||
connector.verify_ssl = False
|
||||
ssl_context = __import__('ssl').create_default_context()
|
||||
ssl_context.check_hostname = False
|
||||
ssl_context.verify_mode = __import__('ssl').CERT_NONE
|
||||
connector = ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=ssl_context)
|
||||
self.session = ClientSession(
|
||||
connector=connector, trust_env=True, cookie_jar=cookie_jar
|
||||
)
|
||||
@@ -88,7 +89,9 @@ class Submitter:
|
||||
alexa_rank = 0
|
||||
|
||||
try:
|
||||
alexa_rank = int(root.find('.//REACH').attrib['RANK'])
|
||||
reach_elem = root.find('.//REACH')
|
||||
if reach_elem is not None:
|
||||
alexa_rank = int(reach_elem.attrib['RANK'])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -127,7 +130,7 @@ class Submitter:
|
||||
|
||||
async def detect_known_engine(
|
||||
self, url_exists, url_mainpage, session, follow_redirects, headers
|
||||
) -> [List[MaigretSite], str]:
|
||||
) -> Tuple[List[MaigretSite], str]:
|
||||
|
||||
session = session or self.session
|
||||
resp_text, _ = await self.get_html_response_to_compare(
|
||||
@@ -191,8 +194,9 @@ class Submitter:
|
||||
# TODO: replace with checking.py/SimpleAiohttpChecker call
|
||||
@staticmethod
|
||||
async def get_html_response_to_compare(
|
||||
url: str, session: ClientSession = None, redirects=False, headers: Dict = None
|
||||
url: str, session: Optional[ClientSession] = None, redirects=False, headers: Optional[Dict] = None
|
||||
):
|
||||
assert session is not None, "session must not be None"
|
||||
async with session.get(
|
||||
url, allow_redirects=redirects, headers=headers
|
||||
) as response:
|
||||
@@ -211,10 +215,10 @@ class Submitter:
|
||||
username: str,
|
||||
url_exists: str,
|
||||
cookie_filename="", # TODO: use cookies
|
||||
session: ClientSession = None,
|
||||
session: Optional[ClientSession] = None,
|
||||
follow_redirects=False,
|
||||
headers: dict = None,
|
||||
) -> Tuple[List[str], List[str], str, str]:
|
||||
headers: Optional[dict] = None,
|
||||
) -> Tuple[Optional[List[str]], Optional[List[str]], str, str]:
|
||||
|
||||
random_username = generate_random_username()
|
||||
url_of_non_existing_account = url_exists.lower().replace(
|
||||
@@ -269,11 +273,8 @@ class Submitter:
|
||||
tokens_a = set(re.split(f'[{self.SEPARATORS}]', first_html_response))
|
||||
tokens_b = set(re.split(f'[{self.SEPARATORS}]', second_html_response))
|
||||
|
||||
a_minus_b = tokens_a.difference(tokens_b)
|
||||
b_minus_a = tokens_b.difference(tokens_a)
|
||||
|
||||
a_minus_b = list(map(lambda x: x.strip('\\'), a_minus_b))
|
||||
b_minus_a = list(map(lambda x: x.strip('\\'), b_minus_a))
|
||||
a_minus_b: List[str] = [x.strip('\\') for x in tokens_a.difference(tokens_b)]
|
||||
b_minus_a: List[str] = [x.strip('\\') for x in tokens_b.difference(tokens_a)]
|
||||
|
||||
# Filter out strings containing usernames
|
||||
a_minus_b = [s for s in a_minus_b if username.lower() not in s.lower()]
|
||||
@@ -378,7 +379,7 @@ class Submitter:
|
||||
).strip()
|
||||
|
||||
if field in ['tags', 'presense_strs', 'absence_strs']:
|
||||
new_value = list(map(str.strip, new_value.split(',')))
|
||||
new_value = list(map(str.strip, new_value.split(','))) # type: ignore[assignment]
|
||||
|
||||
if new_value:
|
||||
setattr(site, field, new_value)
|
||||
@@ -409,8 +410,13 @@ class Submitter:
|
||||
self.logger.info('Domain is %s', domain_raw)
|
||||
|
||||
# check for existence
|
||||
domain_re = re.compile(
|
||||
r'://(www\.)?' + re.escape(domain_raw) + r'(/|$)'
|
||||
)
|
||||
matched_sites = list(
|
||||
filter(lambda x: domain_raw in x.url_main + x.url, self.db.sites)
|
||||
filter(
|
||||
lambda x: domain_re.search(x.url_main + x.url), self.db.sites
|
||||
)
|
||||
)
|
||||
|
||||
if matched_sites:
|
||||
@@ -419,12 +425,12 @@ class Submitter:
|
||||
f"{Fore.YELLOW}[!] Sites with domain \"{domain_raw}\" already exists in the Maigret database!{Style.RESET_ALL}"
|
||||
)
|
||||
|
||||
status = lambda s: "(disabled)" if s.disabled else ""
|
||||
site_status = lambda s: "(disabled)" if s.disabled else ""
|
||||
url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
|
||||
print(
|
||||
"\n".join(
|
||||
[
|
||||
f"{site.name} {status(site)}{url_block(site)}"
|
||||
f"{site.name} {site_status(site)}{url_block(site)}"
|
||||
for site in matched_sites
|
||||
]
|
||||
)
|
||||
@@ -448,9 +454,14 @@ class Submitter:
|
||||
old_site = next(
|
||||
(site for site in matched_sites if site.name == site_name), None
|
||||
)
|
||||
print(
|
||||
f'{Fore.GREEN}[+] We will update site "{old_site.name}" in case of success.{Style.RESET_ALL}'
|
||||
)
|
||||
if old_site is None:
|
||||
print(
|
||||
f'{Fore.RED}[!] Site "{site_name}" not found in the matched list. Proceeding without updating an existing site.{Style.RESET_ALL}'
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f'{Fore.GREEN}[+] We will update site "{old_site.name}" in case of success.{Style.RESET_ALL}'
|
||||
)
|
||||
|
||||
# Check if the site check is ordinary or not
|
||||
if old_site and (old_site.url_probe or old_site.activation):
|
||||
@@ -487,7 +498,7 @@ class Submitter:
|
||||
)
|
||||
|
||||
print('Detecting site engine, please wait...')
|
||||
sites = []
|
||||
sites: List[MaigretSite] = []
|
||||
text = None
|
||||
try:
|
||||
sites, text = await self.detect_known_engine(
|
||||
@@ -500,7 +511,7 @@ class Submitter:
|
||||
except KeyboardInterrupt:
|
||||
print('Engine detect process is interrupted.')
|
||||
|
||||
if 'cloudflare' in text.lower():
|
||||
if text and 'cloudflare' in text.lower():
|
||||
print(
|
||||
'Cloudflare protection detected. I will use cloudscraper for further work'
|
||||
)
|
||||
@@ -563,6 +574,8 @@ class Submitter:
|
||||
found = True
|
||||
break
|
||||
|
||||
assert chosen_site is not None, "No sites to check"
|
||||
|
||||
if not found:
|
||||
print(
|
||||
f"{Fore.RED}[!] The check for site '{chosen_site.name}' failed!{Style.RESET_ALL}"
|
||||
@@ -621,8 +634,8 @@ class Submitter:
|
||||
# chosen_site.alexa_rank = rank
|
||||
|
||||
self.logger.info(chosen_site.json)
|
||||
site_data = chosen_site.strip_engine_data()
|
||||
self.logger.info(site_data.json)
|
||||
stripped_site = chosen_site.strip_engine_data()
|
||||
self.logger.info(stripped_site.json)
|
||||
|
||||
if old_site:
|
||||
# Update old site with new values and log changes
|
||||
@@ -641,7 +654,7 @@ class Submitter:
|
||||
|
||||
for field, display_name in fields_to_check.items():
|
||||
old_value = getattr(old_site, field)
|
||||
new_value = getattr(site_data, field)
|
||||
new_value = getattr(stripped_site, field)
|
||||
if field == 'tags' and not new_tags:
|
||||
continue
|
||||
if str(old_value) != str(new_value):
|
||||
@@ -651,7 +664,7 @@ class Submitter:
|
||||
old_site.__dict__[field] = new_value
|
||||
|
||||
# update the site
|
||||
final_site = old_site if old_site else site_data
|
||||
final_site = old_site if old_site else stripped_site
|
||||
self.db.update_site(final_site)
|
||||
|
||||
# save the db in file
|
||||
|
||||
+6
-3
@@ -8,7 +8,7 @@ from typing import Any
|
||||
|
||||
|
||||
DEFAULT_USER_AGENTS = [
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
|
||||
]
|
||||
|
||||
|
||||
@@ -71,7 +71,10 @@ class URLMatcher:
|
||||
|
||||
|
||||
def ascii_data_display(data: str) -> Any:
    """Convert a string holding a Python literal into the object it denotes.

    Args:
        data: Text such as ``"['a', 'b']"`` or ``"{'k': 1}"``.

    Returns:
        The evaluated literal, or ``data`` unchanged when it is not a valid
        Python literal (``ast.literal_eval`` raised ``ValueError`` or
        ``SyntaxError``).
    """
    try:
        return ast.literal_eval(data)
    except (ValueError, SyntaxError):
        # Not a literal: fall back to the raw text instead of failing.
        return data
|
||||
|
||||
|
||||
def get_dict_ascii_tree(items, prepend="", new_line=True):
|
||||
@@ -86,7 +89,7 @@ def get_dict_ascii_tree(items, prepend="", new_line=True):
|
||||
new_result + new_line if num != len(items) - 1 else last_result + new_line
|
||||
)
|
||||
|
||||
if type(item) == tuple:
|
||||
if isinstance(item, tuple):
|
||||
field_name, field_value = item
|
||||
if field_value.startswith("['"):
|
||||
is_last_item = num == len(items) - 1
|
||||
|
||||
+10
-5
@@ -13,6 +13,7 @@ import os
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from threading import Thread
|
||||
from typing import Any, Dict
|
||||
import maigret
|
||||
import maigret.settings
|
||||
from maigret.sites import MaigretDatabase
|
||||
@@ -23,7 +24,7 @@ app = Flask(__name__)
|
||||
app.secret_key = os.getenv('FLASK_SECRET_KEY', os.urandom(24).hex())
|
||||
|
||||
# add background job tracking
|
||||
background_jobs = {}
|
||||
background_jobs: Dict[str, Any] = {}
|
||||
job_results = {}
|
||||
|
||||
# Configuration
|
||||
@@ -49,12 +50,14 @@ async def maigret_search(username, options):
|
||||
top_sites = 999999999 # effectively all
|
||||
|
||||
tags = options.get('tags', [])
|
||||
excluded_tags = options.get('excluded_tags', [])
|
||||
site_list = options.get('site_list', [])
|
||||
logger.info(f"Filtering sites by tags: {tags}")
|
||||
logger.info(f"Filtering sites by tags: {tags}, excluded: {excluded_tags}")
|
||||
|
||||
sites = db.ranked_sites_dict(
|
||||
top=top_sites,
|
||||
tags=tags,
|
||||
excluded_tags=excluded_tags,
|
||||
names=site_list,
|
||||
disabled=False,
|
||||
id_type='username',
|
||||
@@ -225,7 +228,8 @@ def search():
|
||||
|
||||
# Get selected tags - ensure it's a list
|
||||
selected_tags = request.form.getlist('tags')
|
||||
logging.info(f"Selected tags: {selected_tags}")
|
||||
excluded_tags = request.form.getlist('excluded_tags')
|
||||
logging.info(f"Selected tags: {selected_tags}, Excluded tags: {excluded_tags}")
|
||||
|
||||
options = {
|
||||
'top_sites': request.form.get('top_sites') or '500',
|
||||
@@ -240,13 +244,14 @@ def search():
|
||||
'i2p_proxy': request.form.get('i2p_proxy', None) or None,
|
||||
'permute': 'permute' in request.form,
|
||||
'tags': selected_tags, # Pass selected tags as a list
|
||||
'excluded_tags': excluded_tags, # Pass excluded tags as a list
|
||||
'site_list': [
|
||||
s.strip() for s in request.form.get('site', '').split(',') if s.strip()
|
||||
],
|
||||
}
|
||||
|
||||
logging.info(
|
||||
f"Starting search for usernames: {usernames} with tags: {selected_tags}"
|
||||
f"Starting search for usernames: {usernames} with tags: {selected_tags}, excluded: {excluded_tags}"
|
||||
)
|
||||
|
||||
# Start background job
|
||||
@@ -256,7 +261,7 @@ def search():
|
||||
target=process_search_task, args=(usernames, options, timestamp)
|
||||
),
|
||||
}
|
||||
background_jobs[timestamp]['thread'].start()
|
||||
background_jobs[timestamp]['thread'].start() # type: ignore[union-attr]
|
||||
|
||||
return redirect(url_for('status', timestamp=timestamp))
|
||||
|
||||
|
||||
@@ -28,6 +28,11 @@
|
||||
background-color: #28a745;
|
||||
}
|
||||
|
||||
.tag.excluded {
|
||||
background-color: #343a40;
|
||||
text-decoration: line-through;
|
||||
}
|
||||
|
||||
.tag:hover {
|
||||
transform: translateY(-2px);
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
||||
@@ -168,7 +173,16 @@
|
||||
</div>
|
||||
|
||||
<div class="mb-3">
|
||||
<label class="form-label">Tags (click to select)</label>
|
||||
<label class="form-label">Tags (click to cycle: include → exclude → neutral)</label>
|
||||
<div class="mb-2">
|
||||
<small class="text-muted">
|
||||
<span style="display:inline-block;width:12px;height:12px;background:#28a745;border-radius:50%;"></span> Included (whitelist)
|
||||
|
||||
<span style="display:inline-block;width:12px;height:12px;background:#343a40;border-radius:50%;"></span> Excluded (blacklist)
|
||||
|
||||
<span style="display:inline-block;width:12px;height:12px;background:#dc3545;border-radius:50%;"></span> Neutral
|
||||
</small>
|
||||
</div>
|
||||
<div class="tag-cloud" id="tagCloud"></div>
|
||||
<select multiple class="hidden-select" id="tags" name="tags">
|
||||
<option value="gaming">Gaming</option>
|
||||
@@ -230,6 +244,89 @@
|
||||
<option value="q&a">Q&A</option>
|
||||
<option value="crypto">Crypto</option>
|
||||
<option value="ai">AI</option>
|
||||
<!-- Country tags -->
|
||||
<option value="ae" data-group="country">AE - United Arab Emirates</option>
|
||||
<option value="ao" data-group="country">AO - Angola</option>
|
||||
<option value="ar" data-group="country">AR - Argentina</option>
|
||||
<option value="at" data-group="country">AT - Austria</option>
|
||||
<option value="au" data-group="country">AU - Australia</option>
|
||||
<option value="az" data-group="country">AZ - Azerbaijan</option>
|
||||
<option value="bd" data-group="country">BD - Bangladesh</option>
|
||||
<option value="be" data-group="country">BE - Belgium</option>
|
||||
<option value="bg" data-group="country">BG - Bulgaria</option>
|
||||
<option value="br" data-group="country">BR - Brazil</option>
|
||||
<option value="by" data-group="country">BY - Belarus</option>
|
||||
<option value="ca" data-group="country">CA - Canada</option>
|
||||
<option value="ch" data-group="country">CH - Switzerland</option>
|
||||
<option value="cl" data-group="country">CL - Chile</option>
|
||||
<option value="cn" data-group="country">CN - China</option>
|
||||
<option value="co" data-group="country">CO - Colombia</option>
|
||||
<option value="cr" data-group="country">CR - Costa Rica</option>
|
||||
<option value="cz" data-group="country">CZ - Czechia</option>
|
||||
<option value="de" data-group="country">DE - Germany</option>
|
||||
<option value="dk" data-group="country">DK - Denmark</option>
|
||||
<option value="dz" data-group="country">DZ - Algeria</option>
|
||||
<option value="ee" data-group="country">EE - Estonia</option>
|
||||
<option value="eg" data-group="country">EG - Egypt</option>
|
||||
<option value="es" data-group="country">ES - Spain</option>
|
||||
<option value="eu" data-group="country">EU - European Union</option>
|
||||
<option value="fi" data-group="country">FI - Finland</option>
|
||||
<option value="fr" data-group="country">FR - France</option>
|
||||
<option value="gb" data-group="country">GB - United Kingdom</option>
|
||||
<option value="global" data-group="country">🌍 Global</option>
|
||||
<option value="gr" data-group="country">GR - Greece</option>
|
||||
<option value="hk" data-group="country">HK - Hong Kong</option>
|
||||
<option value="hr" data-group="country">HR - Croatia</option>
|
||||
<option value="hu" data-group="country">HU - Hungary</option>
|
||||
<option value="id" data-group="country">ID - Indonesia</option>
|
||||
<option value="ie" data-group="country">IE - Ireland</option>
|
||||
<option value="il" data-group="country">IL - Israel</option>
|
||||
<option value="in" data-group="country">IN - India</option>
|
||||
<option value="ir" data-group="country">IR - Iran</option>
|
||||
<option value="it" data-group="country">IT - Italy</option>
|
||||
<option value="jp" data-group="country">JP - Japan</option>
|
||||
<option value="kg" data-group="country">KG - Kyrgyzstan</option>
|
||||
<option value="kr" data-group="country">KR - Korea</option>
|
||||
<option value="kz" data-group="country">KZ - Kazakhstan</option>
|
||||
<option value="la" data-group="country">LA - Laos</option>
|
||||
<option value="lk" data-group="country">LK - Sri Lanka</option>
|
||||
<option value="lt" data-group="country">LT - Lithuania</option>
|
||||
<option value="ma" data-group="country">MA - Morocco</option>
|
||||
<option value="md" data-group="country">MD - Moldova</option>
|
||||
<option value="mg" data-group="country">MG - Madagascar</option>
|
||||
<option value="mk" data-group="country">MK - North Macedonia</option>
|
||||
<option value="mx" data-group="country">MX - Mexico</option>
|
||||
<option value="ng" data-group="country">NG - Nigeria</option>
|
||||
<option value="nl" data-group="country">NL - Netherlands</option>
|
||||
<option value="no" data-group="country">NO - Norway</option>
|
||||
<option value="ph" data-group="country">PH - Philippines</option>
|
||||
<option value="pk" data-group="country">PK - Pakistan</option>
|
||||
<option value="pl" data-group="country">PL - Poland</option>
|
||||
<option value="pt" data-group="country">PT - Portugal</option>
|
||||
<option value="re" data-group="country">RE - Réunion</option>
|
||||
<option value="ro" data-group="country">RO - Romania</option>
|
||||
<option value="rs" data-group="country">RS - Serbia</option>
|
||||
<option value="ru" data-group="country">RU - Russia</option>
|
||||
<option value="sa" data-group="country">SA - Saudi Arabia</option>
|
||||
<option value="sd" data-group="country">SD - Sudan</option>
|
||||
<option value="se" data-group="country">SE - Sweden</option>
|
||||
<option value="sg" data-group="country">SG - Singapore</option>
|
||||
<option value="sk" data-group="country">SK - Slovakia</option>
|
||||
<option value="sv" data-group="country">SV - El Salvador</option>
|
||||
<option value="th" data-group="country">TH - Thailand</option>
|
||||
<option value="tn" data-group="country">TN - Tunisia</option>
|
||||
<option value="tr" data-group="country">TR - Türkiye</option>
|
||||
<option value="tw" data-group="country">TW - Taiwan</option>
|
||||
<option value="ua" data-group="country">UA - Ukraine</option>
|
||||
<option value="uk" data-group="country">UK - United Kingdom</option>
|
||||
<option value="us" data-group="country">US - United States</option>
|
||||
<option value="uz" data-group="country">UZ - Uzbekistan</option>
|
||||
<option value="ve" data-group="country">VE - Venezuela</option>
|
||||
<option value="vi" data-group="country">VI - Virgin Islands</option>
|
||||
<option value="vn" data-group="country">VN - Viet Nam</option>
|
||||
<option value="za" data-group="country">ZA - South Africa</option>
|
||||
</select>
|
||||
<select multiple class="hidden-select" id="excludedTags" name="excluded_tags">
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
@@ -292,26 +389,66 @@
|
||||
}
|
||||
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
// Tag cloud functionality
|
||||
// Tag cloud functionality with include/exclude (whitelist/blacklist) support
|
||||
const tagCloud = document.getElementById('tagCloud');
|
||||
const hiddenSelect = document.getElementById('tags');
|
||||
const excludedSelect = document.getElementById('excludedTags');
|
||||
const allTags = Array.from(hiddenSelect.options).map(opt => ({
|
||||
value: opt.value,
|
||||
label: opt.text
|
||||
label: opt.text,
|
||||
group: opt.dataset.group || 'category'
|
||||
}));
|
||||
|
||||
// Sync both hidden <select> elements with the current state of the tag cloud:
// tags carrying the "selected" class are marked selected in the include
// select, tags carrying the "excluded" class become options of the exclude
// select, and all other tags are left out of both.
function updateTagSelects() {
    // Reset the include select before re-applying the current selection.
    Array.from(hiddenSelect.options).forEach(opt => {
        opt.selected = false;
    });
    // The exclude select is rebuilt from scratch on every update.
    excludedSelect.innerHTML = '';

    document.querySelectorAll('#tagCloud .tag').forEach(tagEl => {
        const val = tagEl.dataset.value;
        if (tagEl.classList.contains('selected')) {
            const option = Array.from(hiddenSelect.options).find(opt => opt.value === val);
            if (option) {
                option.selected = true;
            }
        } else if (tagEl.classList.contains('excluded')) {
            const opt = document.createElement('option');
            opt.value = val;
            opt.selected = true;
            excludedSelect.appendChild(opt);
        }
    });
}
|
||||
|
||||
let lastGroup = '';
|
||||
allTags.forEach(tag => {
|
||||
if (tag.group !== lastGroup && tag.group === 'country') {
|
||||
const separator = document.createElement('div');
|
||||
separator.style.cssText = 'width:100%;margin:8px 0 4px;padding:4px 0;border-top:1px solid rgba(0,0,0,0.15);font-size:13px;color:#666;';
|
||||
separator.textContent = 'Countries';
|
||||
tagCloud.appendChild(separator);
|
||||
}
|
||||
lastGroup = tag.group;
|
||||
|
||||
const tagElement = document.createElement('span');
|
||||
tagElement.className = 'tag';
|
||||
tagElement.textContent = tag.label;
|
||||
tagElement.dataset.value = tag.value;
|
||||
|
||||
tagElement.addEventListener('click', function () {
|
||||
const isSelected = this.classList.toggle('selected');
|
||||
const option = Array.from(hiddenSelect.options).find(opt => opt.value === tag.value);
|
||||
if (option) {
|
||||
option.selected = isSelected;
|
||||
// Single click cycles: neutral -> included -> excluded -> neutral
|
||||
tagElement.addEventListener('click', function (e) {
|
||||
e.preventDefault();
|
||||
if (this.classList.contains('selected')) {
|
||||
// included -> excluded
|
||||
this.classList.remove('selected');
|
||||
this.classList.add('excluded');
|
||||
} else if (this.classList.contains('excluded')) {
|
||||
// excluded -> neutral
|
||||
this.classList.remove('excluded');
|
||||
} else {
|
||||
// neutral -> included
|
||||
this.classList.add('selected');
|
||||
}
|
||||
updateTagSelects();
|
||||
});
|
||||
|
||||
tagCloud.appendChild(tagElement);
|
||||
|
||||
Generated
+1120
-957
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,5 @@
|
||||
maigret @ https://github.com/soxoj/maigret/archive/refs/heads/main.zip
|
||||
pefile==2023.2.7 # do not bump while pyinstaller is 6.11.1, there is a conflict
|
||||
psutil==7.1.3
|
||||
pyinstaller==6.16.0
|
||||
psutil==7.2.2
|
||||
pyinstaller==6.19.0
|
||||
pywin32-ctypes==0.2.3
|
||||
|
||||
+12
-10
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry]
|
||||
name = "maigret"
|
||||
version = "0.5.0"
|
||||
version = "0.6.0"
|
||||
description = "🕵️♂️ Collect a dossier on a person by username from thousands of sites."
|
||||
authors = ["Soxoj <soxoj@protonmail.com>"]
|
||||
readme = "README.md"
|
||||
@@ -31,32 +31,33 @@ classifiers = [
|
||||
# Install with dev dependencies:
|
||||
# poetry install --with dev
|
||||
python = "^3.10"
|
||||
aiodns = "^3.0.0"
|
||||
aiodns = ">=3,<5"
|
||||
aiohttp = "^3.12.14"
|
||||
aiohttp-socks = "^0.10.1"
|
||||
aiohttp-socks = ">=0.10.1,<0.12.0"
|
||||
arabic-reshaper = "^3.0.0"
|
||||
async-timeout = "^5.0.1"
|
||||
attrs = "^25.3.0"
|
||||
certifi = "^2025.6.15"
|
||||
chardet = "^5.0.0"
|
||||
attrs = ">=25.3,<27.0"
|
||||
certifi = ">=2025.6.15,<2027.0.0"
|
||||
chardet = ">=5,<8"
|
||||
colorama = "^0.4.6"
|
||||
future = "^1.0.0"
|
||||
future-annotations= "^1.0.0"
|
||||
html5lib = "^1.1"
|
||||
idna = "^3.4"
|
||||
Jinja2 = "^3.1.6"
|
||||
lxml = ">=5.3,<7.0"
|
||||
lxml = ">=6.0.2,<7.0"
|
||||
MarkupSafe = "^3.0.2"
|
||||
mock = "^5.1.0"
|
||||
multidict = "^6.6.3"
|
||||
pycountry = "^24.6.1"
|
||||
pycountry = ">=24.6.1,<27.0.0"
|
||||
PyPDF2 = "^3.0.1"
|
||||
PySocks = "^1.7.1"
|
||||
python-bidi = "^0.6.3"
|
||||
requests = "^2.32.4"
|
||||
requests-futures = "^1.0.2"
|
||||
requests-toolbelt = "^1.0.0"
|
||||
six = "^1.17.0"
|
||||
socid-extractor = "^0.0.27"
|
||||
socid-extractor = ">=0.0.27,<0.0.29"
|
||||
soupsieve = "^2.6"
|
||||
stem = "^1.8.1"
|
||||
torrequest = "^0.1.0"
|
||||
@@ -73,6 +74,7 @@ cloudscraper = "^1.2.71"
|
||||
flask = {extras = ["async"], version = "^3.1.1"}
|
||||
asgiref = "^3.9.1"
|
||||
platformdirs = "^4.3.8"
|
||||
curl-cffi = ">=0.14,<1.0"
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
@@ -93,4 +95,4 @@ black = ">=25.1,<27.0"
|
||||
[tool.poetry.scripts]
|
||||
# Run with: poetry run maigret <username>
|
||||
maigret = "maigret.maigret:run"
|
||||
update_sitesmd = "utils.update_site_data:main"
|
||||
update_sitesmd = "utils.update_site_data:main"
|
||||
|
||||
@@ -36,6 +36,7 @@ DEFAULT_ARGS: Dict[str, Any] = {
|
||||
'site_list': [],
|
||||
'stats': False,
|
||||
'tags': '',
|
||||
'exclude_tags': '',
|
||||
'timeout': 30,
|
||||
'tor_proxy': 'socks5://127.0.0.1:9050',
|
||||
'i2p_proxy': 'http://127.0.0.1:4444',
|
||||
@@ -47,6 +48,9 @@ DEFAULT_ARGS: Dict[str, Any] = {
|
||||
'web': None,
|
||||
'with_domains': False,
|
||||
'xmind': False,
|
||||
'md': False,
|
||||
'no_autoupdate': False,
|
||||
'force_update': False,
|
||||
}
|
||||
|
||||
|
||||
@@ -105,3 +109,34 @@ def test_args_multiple_sites(argparser):
|
||||
|
||||
for arg in vars(args):
|
||||
assert getattr(args, arg) == want_args[arg]
|
||||
|
||||
|
||||
def test_args_exclude_tags(argparser):
    """``--exclude-tags`` alone: only ``exclude_tags`` and ``username`` differ from the defaults."""
    args = argparser.parse_args('--exclude-tags porn,dating username'.split())

    want_args = dict(DEFAULT_ARGS)
    want_args.update(
        {
            'exclude_tags': 'porn,dating',
            'username': ['username'],
        }
    )

    # Every parsed attribute must equal its expected value.
    for arg in vars(args):
        assert getattr(args, arg) == want_args[arg]
|
||||
|
||||
|
||||
def test_args_tags_with_exclude_tags(argparser):
    """``--tags`` and ``--exclude-tags`` together are parsed into separate attributes."""
    args = argparser.parse_args('--tags coding --exclude-tags porn username'.split())

    want_args = dict(DEFAULT_ARGS)
    want_args.update(
        {
            'tags': 'coding',
            'exclude_tags': 'porn',
            'username': ['username'],
        }
    )

    # Every parsed attribute must equal its expected value.
    for arg in vars(args):
        assert getattr(args, arg) == want_args[arg]
|
||||
|
||||
@@ -4,6 +4,30 @@ import pytest
|
||||
from maigret.utils import is_country_tag
|
||||
|
||||
|
||||
# Number of best-ranked sites (lowest alexa_rank first) checked for a category tag.
TOP_SITES_ALEXA_RANK_LIMIT = 50

# Domains whose sites (including subdomains) are expected to carry the "social" tag.
KNOWN_SOCIAL_DOMAINS = [
    "facebook.com",
    "instagram.com",
    "twitter.com",
    "tiktok.com",
    "vk.com",
    "reddit.com",
    "pinterest.com",
    "snapchat.com",
    "linkedin.com",
    "tumblr.com",
    "threads.net",
    "bsky.app",
    "myspace.com",
    "weibo.com",
    "mastodon.social",
    "gab.com",
    "minds.com",
    "clubhouse.com",
]
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_tags_validity(default_db):
|
||||
unknown_tags = set()
|
||||
@@ -19,3 +43,62 @@ def test_tags_validity(default_db):
|
||||
# if you see "unchecked" tag error, please, do
|
||||
# maigret --db `pwd`/maigret/resources/data.json --self-check --tag unchecked --use-disabled-sites
|
||||
assert unknown_tags == set()
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_top_sites_have_category_tag(default_db):
    """Top sites by alexaRank must have at least one category tag (not just country codes)."""
    # Sites with a falsy alexa_rank are left out of the ranking.
    sites_ranked = sorted(
        [s for s in default_db.sites if s.alexa_rank],
        key=lambda s: s.alexa_rank,
    )[:TOP_SITES_ALEXA_RANK_LIMIT]

    missing_category = []
    for site in sites_ranked:
        category_tags = [t for t in site.tags if not is_country_tag(t)]
        if not category_tags:
            missing_category.append(f"{site.name} (rank {site.alexa_rank})")

    # Only the first 20 offenders are listed to keep the failure message short.
    assert missing_category == [], (
        f"{len(missing_category)} top-{TOP_SITES_ALEXA_RANK_LIMIT} sites have no category tag: "
        + ", ".join(missing_category[:20])
    )
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_no_unused_tags_in_registry(default_db):
    """Every tag in the registry should be used by at least one site."""
    # Collect the non-country tags that sites actually use.
    all_used_tags = set()
    for site in default_db.sites:
        for tag in site.tags:
            if not is_country_tag(tag):
                all_used_tags.add(tag)

    registered_tags = set(default_db._tags)
    unused = registered_tags - all_used_tags

    assert unused == set(), f"Tags registered but not used by any site: {unused}"
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_social_networks_have_social_tag(default_db):
    """Check that sites hosted on a known social domain are tagged 'social'.

    A site matches when the hostname of its main URL equals an entry of
    ``KNOWN_SOCIAL_DOMAINS`` or is a subdomain of one.
    """
    from urllib.parse import urlparse

    missing_social = []
    for site in default_db.sites:
        url = site.url_main or ""
        try:
            hostname = urlparse(url).hostname or ""
        except Exception:
            # The main URL could not be parsed, so there is no hostname to match.
            continue

        # Only the first matching domain is reported for a site.
        matched_domain = next(
            (
                domain
                for domain in KNOWN_SOCIAL_DOMAINS
                if hostname == domain or hostname.endswith("." + domain)
            ),
            None,
        )
        if matched_domain is not None and "social" not in site.tags:
            missing_social.append(f"{site.name} ({matched_domain})")

    assert missing_social == [], (
        f"{len(missing_social)} known social networks missing 'social' tag: "
        + ", ".join(missing_social)
    )
|
||||
|
||||
@@ -0,0 +1,236 @@
|
||||
"""Tests for the database auto-update system."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import hashlib
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from maigret.db_updater import (
|
||||
_parse_version,
|
||||
_needs_check,
|
||||
_is_version_compatible,
|
||||
_is_update_available,
|
||||
_load_state,
|
||||
_save_state,
|
||||
_best_local,
|
||||
_now_iso,
|
||||
resolve_db_path,
|
||||
force_update,
|
||||
CACHED_DB_PATH,
|
||||
BUNDLED_DB_PATH,
|
||||
STATE_PATH,
|
||||
MAIGRET_HOME,
|
||||
)
|
||||
|
||||
|
||||
def test_parse_version():
    """Check version parsing, including the all-zero result for bad input."""
    expectations = [
        ("0.5.0", (0, 5, 0)),
        ("1.2.3", (1, 2, 3)),
        ("bad", (0, 0, 0)),
        ("", (0, 0, 0)),
    ]
    for raw_version, parsed in expectations:
        assert _parse_version(raw_version) == parsed
|
||||
|
||||
|
||||
def test_needs_check_no_state():
    """An empty state (no recorded check) must trigger a check."""
    empty_state = {}
    assert _needs_check(empty_state, 24) is True
|
||||
|
||||
|
||||
def test_needs_check_recent():
    """A check recorded just now must not trigger another one within 24 hours."""
    just_checked = {"last_check_at": _now_iso()}
    assert _needs_check(just_checked, 24) is False
|
||||
|
||||
|
||||
def test_needs_check_expired():
    """A check recorded 25 hours ago must be considered stale for a 24-hour interval."""
    stale_moment = datetime.now(timezone.utc) - timedelta(hours=25)
    state = {"last_check_at": stale_moment.strftime("%Y-%m-%dT%H:%M:%SZ")}
    assert _needs_check(state, 24) is True
|
||||
|
||||
|
||||
def test_needs_check_corrupt():
    """An unparseable timestamp in the state must trigger a check."""
    broken_state = {"last_check_at": "not-a-date"}
    assert _needs_check(broken_state, 24) is True
|
||||
|
||||
|
||||
def test_version_compatible():
    """Check the minimum-version gate against a pinned running version of 0.5.0."""
    cases = [
        ({"min_maigret_version": "0.5.0"}, True),
        ({"min_maigret_version": "0.4.0"}, True),
        ({"min_maigret_version": "0.6.0"}, False),
        ({}, True),  # missing field = compatible
    ]
    with patch("maigret.db_updater.__version__", "0.5.0"):
        for meta, expected in cases:
            assert _is_version_compatible(meta) is expected
|
||||
|
||||
|
||||
def test_update_available_no_cache(tmp_path):
    """Without a cached database file, any remote metadata counts as an update."""
    absent_cache = str(tmp_path / "nonexistent.json")
    remote_meta = {"updated_at": "2026-01-01T00:00:00Z"}
    with patch("maigret.db_updater.CACHED_DB_PATH", absent_cache):
        assert _is_update_available(remote_meta, {}) is True
|
||||
|
||||
|
||||
def test_update_available_newer(tmp_path):
    """A remote ``updated_at`` later than the stored one signals an update."""
    cache = tmp_path / "data.json"
    cache.write_text("{}")
    state = {"last_meta": {"updated_at": "2026-01-01T00:00:00Z"}}
    meta = {"updated_at": "2026-02-01T00:00:00Z"}
    with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
        assert _is_update_available(meta, state) is True
|
||||
|
||||
|
||||
def test_update_available_same(tmp_path):
    """Identical remote and stored ``updated_at`` values mean no update."""
    cache = tmp_path / "data.json"
    cache.write_text("{}")
    timestamp = "2026-01-01T00:00:00Z"
    state = {"last_meta": {"updated_at": timestamp}}
    meta = {"updated_at": timestamp}
    with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
        assert _is_update_available(meta, state) is False
|
||||
|
||||
|
||||
def test_load_state_missing(tmp_path):
    """Loading state from a path that does not exist yields an empty dict."""
    absent_state = str(tmp_path / "missing.json")
    with patch("maigret.db_updater.STATE_PATH", absent_state):
        assert _load_state() == {}
|
||||
|
||||
|
||||
def test_load_state_corrupt(tmp_path):
    """Loading a state file that is not valid JSON yields an empty dict."""
    state_file = tmp_path / "state.json"
    state_file.write_text("not json{{{")
    with patch("maigret.db_updater.STATE_PATH", str(state_file)):
        assert _load_state() == {}
|
||||
|
||||
|
||||
def test_save_and_load_state(tmp_path):
    """State written by ``_save_state`` is read back unchanged by ``_load_state``."""
    state_file = tmp_path / "state.json"
    saved = {"last_check_at": "2026-01-01T00:00:00Z"}
    # Both the state file and the home directory are redirected into tmp_path.
    with patch("maigret.db_updater.STATE_PATH", str(state_file)), \
            patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
        _save_state(saved)
        loaded = _load_state()
    assert loaded["last_check_at"] == "2026-01-01T00:00:00Z"
|
||||
|
||||
|
||||
def test_best_local_with_valid_cache(tmp_path):
    """A well-formed cached database is preferred as the local database."""
    cache = tmp_path / "data.json"
    cache.write_text('{"sites": {}, "engines": {}, "tags": []}')
    cache_path = str(cache)
    with patch("maigret.db_updater.CACHED_DB_PATH", cache_path):
        assert _best_local() == cache_path
|
||||
|
||||
|
||||
def test_best_local_with_corrupt_cache(tmp_path):
    """A cached file that is not valid JSON falls back to the bundled database."""
    broken_cache = tmp_path / "data.json"
    broken_cache.write_text("not json")
    with patch("maigret.db_updater.CACHED_DB_PATH", str(broken_cache)):
        chosen = _best_local()
    assert chosen == BUNDLED_DB_PATH
|
||||
|
||||
|
||||
def test_best_local_no_cache(tmp_path):
    """With no cached file present, the bundled database is used."""
    absent_cache = str(tmp_path / "missing.json")
    with patch("maigret.db_updater.CACHED_DB_PATH", absent_cache):
        chosen = _best_local()
    assert chosen == BUNDLED_DB_PATH
|
||||
|
||||
|
||||
def test_resolve_db_path_custom_url():
    """A custom database URL is returned unchanged."""
    custom_url = "https://example.com/db.json"
    assert resolve_db_path(custom_url) == "https://example.com/db.json"
|
||||
|
||||
|
||||
def test_resolve_db_path_custom_file(tmp_path):
    """A custom database file path resolves to that same file.

    The expected suffix is built with ``os.path.join`` and the result is
    normalised first, so the check does not depend on ``/`` being the
    platform's path separator.
    """
    custom_db = tmp_path / "custom" / "path.json"
    custom_db.parent.mkdir(parents=True)
    custom_db.write_text("{}")

    result = resolve_db_path(str(custom_db))

    expected_suffix = os.path.join("custom", "path.json")
    assert os.path.normpath(result).endswith(expected_suffix)
|
||||
|
||||
|
||||
def test_resolve_db_path_no_autoupdate(tmp_path):
    """With auto-update disabled and no cache, the bundled database is returned."""
    absent_cache = str(tmp_path / "missing.json")
    with patch("maigret.db_updater.CACHED_DB_PATH", absent_cache):
        resolved = resolve_db_path("resources/data.json", no_autoupdate=True)
    assert resolved == BUNDLED_DB_PATH
|
||||
|
||||
|
||||
def test_resolve_db_path_no_autoupdate_with_cache(tmp_path):
    """With auto-update disabled, an existing valid cache is still used."""
    cache = tmp_path / "data.json"
    cache.write_text('{"sites": {}, "engines": {}, "tags": []}')
    cache_path = str(cache)
    with patch("maigret.db_updater.CACHED_DB_PATH", cache_path):
        resolved = resolve_db_path("resources/data.json", no_autoupdate=True)
    assert resolved == cache_path
|
||||
|
||||
|
||||
@patch("maigret.db_updater._fetch_meta")
def test_resolve_db_path_network_failure(mock_fetch, tmp_path):
    """When metadata cannot be fetched and no cache exists, use the bundled database."""
    mock_fetch.return_value = None  # simulated fetch failure
    with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)), \
            patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")), \
            patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
        resolved = resolve_db_path("resources/data.json")
    assert resolved == BUNDLED_DB_PATH
|
||||
|
||||
|
||||
# --- force_update tests ---
|
||||
|
||||
|
||||
@patch("maigret.db_updater._fetch_meta")
def test_force_update_network_failure(mock_fetch, tmp_path):
    """A forced update reports failure when metadata cannot be fetched."""
    mock_fetch.return_value = None  # simulated fetch failure
    with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)), \
            patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
        updated = force_update()
    assert updated is False
|
||||
|
||||
|
||||
@patch("maigret.db_updater._fetch_meta")
def test_force_update_incompatible_version(mock_fetch, tmp_path):
    """A forced update is refused when the remote database requires version 99.0.0."""
    remote_meta = {"min_maigret_version": "99.0.0", "sites_count": 100}
    mock_fetch.return_value = remote_meta
    with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)), \
            patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
        updated = force_update()
    assert updated is False
|
||||
|
||||
|
||||
@patch("maigret.db_updater._download_and_verify")
@patch("maigret.db_updater._fetch_meta")
def test_force_update_success(mock_fetch, mock_download, tmp_path):
    """A successful forced update returns True and stores the fetched metadata."""
    remote_meta = {
        "min_maigret_version": "0.1.0",
        "sites_count": 3200,
        "updated_at": "2099-01-01T00:00:00Z",
        "data_url": "https://example.com/data.json",
        "data_sha256": "abc123",
    }
    mock_fetch.return_value = remote_meta
    mock_download.return_value = str(tmp_path / "data.json")

    with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)), \
            patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")), \
            patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
        assert force_update() is True
        # Read the state back while STATE_PATH still points at tmp_path.
        persisted = _load_state()
        assert persisted["last_meta"]["sites_count"] == 3200
|
||||
|
||||
|
||||
@patch("maigret.db_updater._fetch_meta")
def test_force_update_already_up_to_date(mock_fetch, tmp_path):
    """A forced update returns False when remote metadata matches the stored state."""
    cache = tmp_path / "data.json"
    cache.write_text('{"sites": {}, "engines": {}, "tags": []}')

    stored_state = {
        "last_check_at": _now_iso(),
        "last_meta": {"updated_at": "2026-01-01T00:00:00Z", "sites_count": 3000},
    }
    state_file = tmp_path / "state.json"
    state_file.write_text(json.dumps(stored_state))

    # Remote metadata carries the same updated_at as the stored state.
    mock_fetch.return_value = {
        "min_maigret_version": "0.1.0",
        "sites_count": 3000,
        "updated_at": "2026-01-01T00:00:00Z",
    }

    with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)), \
            patch("maigret.db_updater.STATE_PATH", str(state_file)), \
            patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
        updated = force_update()
    assert updated is False
|
||||
|
||||
|
||||
@patch("maigret.db_updater._download_and_verify")
@patch("maigret.db_updater._fetch_meta")
def test_force_update_download_fails(mock_fetch, mock_download, tmp_path):
    """A forced update returns False when the download step yields nothing."""
    remote_meta = {
        "min_maigret_version": "0.1.0",
        "sites_count": 3200,
        "updated_at": "2099-01-01T00:00:00Z",
        "data_url": "https://example.com/data.json",
        "data_sha256": "abc123",
    }
    mock_fetch.return_value = remote_meta
    mock_download.return_value = None  # simulated download/verification failure

    with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)), \
            patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")), \
            patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
        updated = force_update()
    assert updated is False
|
||||
@@ -36,7 +36,7 @@ def test_notify_about_errors():
|
||||
},
|
||||
}
|
||||
|
||||
results = notify_about_errors(results, query_notify=None, show_statistics=True)
|
||||
notifications = notify_about_errors(results, query_notify=None, show_statistics=True)
|
||||
|
||||
# Check the output
|
||||
expected_output = [
|
||||
@@ -55,4 +55,4 @@ def test_notify_about_errors():
|
||||
('Access denied: 25.0%', '!'),
|
||||
('You can see detailed site check errors with a flag `--print-errors`', '-'),
|
||||
]
|
||||
assert results == expected_output
|
||||
assert notifications == expected_output
|
||||
|
||||
+10
-9
@@ -3,6 +3,7 @@
|
||||
import pytest
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Any, List, Tuple, Callable, Dict
|
||||
from maigret.executors import (
|
||||
AsyncioSimpleExecutor,
|
||||
AsyncioProgressbarExecutor,
|
||||
@@ -21,7 +22,7 @@ async def func(n):
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_simple_asyncio_executor():
|
||||
tasks = [(func, [n], {}) for n in range(10)]
|
||||
tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]
|
||||
executor = AsyncioSimpleExecutor(logger=logger)
|
||||
assert await executor.run(tasks) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
|
||||
assert executor.execution_time > 0.2
|
||||
@@ -30,7 +31,7 @@ async def test_simple_asyncio_executor():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_asyncio_progressbar_executor():
|
||||
tasks = [(func, [n], {}) for n in range(10)]
|
||||
tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]
|
||||
|
||||
executor = AsyncioProgressbarExecutor(logger=logger)
|
||||
# no guarantees for the results order
|
||||
@@ -41,7 +42,7 @@ async def test_asyncio_progressbar_executor():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_asyncio_progressbar_semaphore_executor():
|
||||
tasks = [(func, [n], {}) for n in range(10)]
|
||||
tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]
|
||||
|
||||
executor = AsyncioProgressbarSemaphoreExecutor(logger=logger, in_parallel=5)
|
||||
# no guarantees for the results order
|
||||
@@ -53,7 +54,7 @@ async def test_asyncio_progressbar_semaphore_executor():
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.asyncio
|
||||
async def test_asyncio_progressbar_queue_executor():
|
||||
tasks = [(func, [n], {}) for n in range(10)]
|
||||
tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]
|
||||
|
||||
executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=2)
|
||||
assert await executor.run(tasks) == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8]
|
||||
@@ -81,22 +82,22 @@ async def test_asyncio_progressbar_queue_executor():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_asyncio_queue_generator_executor():
|
||||
tasks = [(func, [n], {}) for n in range(10)]
|
||||
tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]
|
||||
|
||||
executor = AsyncioQueueGeneratorExecutor(logger=logger, in_parallel=2)
|
||||
results = [result async for result in executor.run(tasks)]
|
||||
results = [result async for result in executor.run(tasks)] # type: ignore[arg-type]
|
||||
assert results == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8]
|
||||
assert executor.execution_time > 0.5
|
||||
assert executor.execution_time < 0.6
|
||||
|
||||
executor = AsyncioQueueGeneratorExecutor(logger=logger, in_parallel=3)
|
||||
results = [result async for result in executor.run(tasks)]
|
||||
results = [result async for result in executor.run(tasks)] # type: ignore[arg-type]
|
||||
assert results == [0, 3, 1, 4, 6, 2, 7, 9, 5, 8]
|
||||
assert executor.execution_time > 0.4
|
||||
assert executor.execution_time < 0.5
|
||||
|
||||
executor = AsyncioQueueGeneratorExecutor(logger=logger, in_parallel=5)
|
||||
results = [result async for result in executor.run(tasks)]
|
||||
results = [result async for result in executor.run(tasks)] # type: ignore[arg-type]
|
||||
assert results in (
|
||||
[0, 3, 6, 1, 4, 7, 9, 2, 5, 8],
|
||||
[0, 3, 6, 1, 4, 9, 7, 2, 5, 8],
|
||||
@@ -105,7 +106,7 @@ async def test_asyncio_queue_generator_executor():
|
||||
assert executor.execution_time < 0.4
|
||||
|
||||
executor = AsyncioQueueGeneratorExecutor(logger=logger, in_parallel=10)
|
||||
results = [result async for result in executor.run(tasks)]
|
||||
results = [result async for result in executor.run(tasks)] # type: ignore[arg-type]
|
||||
assert results == [0, 3, 6, 9, 1, 4, 7, 2, 5, 8]
|
||||
assert executor.execution_time > 0.2
|
||||
assert executor.execution_time < 0.3
|
||||
|
||||
+84
-2
@@ -2,6 +2,7 @@
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from mock import Mock
|
||||
@@ -11,7 +12,8 @@ from maigret.maigret import (
|
||||
extract_ids_from_page,
|
||||
extract_ids_from_results,
|
||||
)
|
||||
from maigret.sites import MaigretSite
|
||||
from maigret.checking import site_self_check
|
||||
from maigret.sites import MaigretSite, MaigretDatabase
|
||||
from maigret.result import MaigretCheckResult, MaigretCheckStatus
|
||||
from tests.conftest import RESULTS_EXAMPLE
|
||||
|
||||
@@ -37,6 +39,86 @@ async def test_self_check_db(test_db):
|
||||
assert test_db.sites_dict['InvalidInactive'].disabled is True
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.asyncio
async def test_self_check_no_progressbar(test_db):
    """Check that ``no_progressbar=True`` reaches ``alive_bar`` as ``disable=True``."""
    logger = Mock()

    with patch('maigret.checking.alive_bar') as mock_alive_bar:
        # Make the patched alive_bar usable as a context manager.
        bar_context = mock_alive_bar.return_value
        bar_context.__enter__ = Mock(return_value=Mock())
        bar_context.__exit__ = Mock(return_value=False)

        await self_check(
            test_db,
            test_db.sites_dict,
            logger,
            silent=True,
            no_progressbar=True,
        )

        # The first call is the self-check progress bar; later calls come
        # from inner search() invocations.
        _positional, call_kwargs = mock_alive_bar.call_args_list[0]
        assert call_kwargs.get('title') == 'Self-checking'
        assert call_kwargs.get('disable') is True
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.asyncio
async def test_self_check_progressbar_enabled_by_default(test_db):
    """Check that ``alive_bar`` gets ``disable=False`` when ``no_progressbar`` is omitted."""
    logger = Mock()

    with patch('maigret.checking.alive_bar') as mock_alive_bar:
        # Make the patched alive_bar usable as a context manager.
        bar_context = mock_alive_bar.return_value
        bar_context.__enter__ = Mock(return_value=Mock())
        bar_context.__exit__ = Mock(return_value=False)

        await self_check(test_db, test_db.sites_dict, logger, silent=True)

        # Only the first recorded call is inspected.
        _positional, call_kwargs = mock_alive_bar.call_args_list[0]
        assert call_kwargs.get('title') == 'Self-checking'
        assert call_kwargs.get('disable') is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_site_self_check_handles_exception(test_db):
    """Check that ``site_self_check`` turns an unexpected error into a reported issue."""
    logger = Mock()
    semaphore = asyncio.Semaphore(1)
    target_site = test_db.sites_dict['ValidActive']

    # Every call to the patched maigret() raises.
    crash = RuntimeError("test crash")
    with patch('maigret.checking.maigret', side_effect=crash):
        result = await site_self_check(target_site, logger, semaphore, test_db)

    assert isinstance(result, dict)
    assert "issues" in result
    issues = result["issues"]
    assert len(issues) > 0
    assert any("Unexpected error" in issue for issue in issues)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_self_check_handles_task_exception(test_db):
    """Check that ``self_check`` still returns one result per site when checks crash."""
    logger = Mock()

    # Every call to the patched maigret() raises.
    crash = RuntimeError("test crash")
    with patch('maigret.checking.maigret', side_effect=crash):
        result = await self_check(
            test_db,
            test_db.sites_dict,
            logger,
            silent=True,
            no_progressbar=True,
        )

    assert isinstance(result, dict)
    assert 'results' in result
    per_site = result['results']
    assert len(per_site) == len(test_db.sites_dict)
    for entry in per_site:
        assert 'site_name' in entry
        assert 'issues' in entry
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.skip(reason="broken, fixme")
|
||||
def test_maigret_results(test_db):
|
||||
@@ -112,7 +194,7 @@ def test_extract_ids_from_page(test_db):
|
||||
|
||||
|
||||
def test_extract_ids_from_results(test_db):
|
||||
TEST_EXAMPLE = copy.deepcopy(RESULTS_EXAMPLE)
|
||||
TEST_EXAMPLE: dict = copy.deepcopy(RESULTS_EXAMPLE)
|
||||
TEST_EXAMPLE['Reddit']['ids_usernames'] = {'test1': 'yandex_public_id'}
|
||||
TEST_EXAMPLE['Reddit']['ids_links'] = ['https://www.reddit.com/user/test2']
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ import os
|
||||
import pytest
|
||||
from io import StringIO
|
||||
|
||||
import xmind
|
||||
import xmind # type: ignore[import-untyped]
|
||||
from jinja2 import Template
|
||||
|
||||
from maigret.report import (
|
||||
|
||||
+51
-1
@@ -1,8 +1,10 @@
|
||||
"""Maigret Database test functions"""
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
from maigret.sites import MaigretDatabase, MaigretSite
|
||||
|
||||
EXAMPLE_DB = {
|
||||
EXAMPLE_DB: Dict[str, Any] = {
|
||||
'engines': {
|
||||
"XenForo": {
|
||||
"presenseStrs": ["XenForo"],
|
||||
@@ -182,6 +184,54 @@ def test_ranked_sites_dict_id_type():
|
||||
assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
|
||||
|
||||
|
||||
def test_ranked_sites_dict_excluded_tags():
    """Check ``excluded_tags`` filtering by tag and by engine in ``ranked_sites_dict``."""
    db = MaigretDatabase()
    site_specs = [
        ('3', {'alexaRank': 1000, 'engine': 'ucoz'}),
        ('1', {'alexaRank': 2, 'tags': ['forum']}),
        ('2', {'alexaRank': 10, 'tags': ['ru', 'forum']}),
    ]
    for name, info in site_specs:
        db.update_site(MaigretSite(name, info))

    def ranked_names(**filters):
        return list(db.ranked_sites_dict(**filters).keys())

    # excluding by tag
    assert ranked_names(excluded_tags=['ru']) == ['1', '3']
    assert ranked_names(excluded_tags=['forum']) == ['3']

    # excluding by engine
    assert ranked_names(excluded_tags=['ucoz']) == ['1', '2']

    # combining include and exclude tags
    assert ranked_names(tags=['forum'], excluded_tags=['ru']) == ['1']

    # excluding non-existent tag has no effect
    assert ranked_names(excluded_tags=['nonexistent']) == ['1', '2', '3']

    # exclude all
    assert ranked_names(excluded_tags=['forum', 'ucoz']) == []
|
||||
|
||||
|
||||
def test_ranked_sites_dict_excluded_tags_with_top():
    """Check that an excluded tag removes both a parent site and its mirror."""
    db = MaigretDatabase()
    site_specs = [
        ('Parent', {'alexaRank': 1, 'tags': ['forum'], 'type': 'username'}),
        ('Mirror', {'alexaRank': 999999, 'source': 'Parent', 'tags': ['forum'], 'type': 'username'}),
        ('Other', {'alexaRank': 2, 'tags': ['coding'], 'type': 'username'}),
    ]
    for name, info in site_specs:
        db.update_site(MaigretSite(name, info))

    # Without exclusion, the mirror is included alongside its parent.
    unfiltered = db.ranked_sites_dict(top=1, id_type='username')
    assert 'Parent' in unfiltered
    assert 'Mirror' in unfiltered

    # Excluding 'forum' must drop both Parent and Mirror but keep Other.
    filtered = db.ranked_sites_dict(top=2, excluded_tags=['forum'], id_type='username')
    assert 'Parent' not in filtered
    assert 'Mirror' not in filtered
    assert 'Other' in filtered
|
||||
|
||||
|
||||
def test_ranked_sites_dict_mirrors_disabled_parent():
|
||||
"""Mirror is included when parent ranks in top N but parent is disabled."""
|
||||
db = MaigretDatabase()
|
||||
|
||||
+86
-3
@@ -1,8 +1,10 @@
|
||||
import re
|
||||
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
from maigret.submit import Submitter
|
||||
from aiohttp import ClientSession
|
||||
from maigret.sites import MaigretDatabase
|
||||
from maigret.sites import MaigretDatabase, MaigretSite
|
||||
import logging
|
||||
|
||||
|
||||
@@ -26,7 +28,7 @@ async def test_detect_known_engine(test_db, local_test_db):
|
||||
url_exists = "https://devforum.zoom.us/u/adam"
|
||||
url_mainpage = "https://devforum.zoom.us/"
|
||||
# Mock extract_username_dialog to return "adam"
|
||||
submitter.extract_username_dialog = MagicMock(return_value="adam")
|
||||
submitter.extract_username_dialog = MagicMock(return_value="adam") # type: ignore[method-assign]
|
||||
|
||||
sites, resp_text = await submitter.detect_known_engine(
|
||||
url_exists, url_mainpage, session=None, follow_redirects=False, headers=None
|
||||
@@ -109,7 +111,7 @@ async def test_check_features_manually_success(settings):
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.asyncio
|
||||
async def test_check_features_manually_success(settings):
|
||||
async def test_check_features_manually_cloudflare(settings):
|
||||
# Setup
|
||||
db = MaigretDatabase()
|
||||
logger = logging.getLogger("test_logger")
|
||||
@@ -275,3 +277,84 @@ async def test_dialog_adds_site_negative(settings):
|
||||
await submitter.close()
|
||||
|
||||
assert result is False
|
||||
|
||||
|
||||
def test_domain_matching_exact():
    """Check that the domain pattern respects boundaries instead of matching substrings.

    x.com should NOT match sites like 500px.com, mix.com, etc.
    """
    domain_raw = "x.com"
    domain_re = re.compile(r'://(www\.)?' + re.escape(domain_raw) + r'(/|$)')

    # (name, url, urlMain) of sites whose domains merely end in "x.com".
    lookalikes = [
        ("500px", "https://500px.com/p/{username}", "https://500px.com/"),
        ("Mix", "https://mix.com/{username}", "https://mix.com"),
        ("Screwfix", "{urlMain}{urlSubpath}/members/?username={username}", "https://community.screwfix.com"),
        ("Wix", "https://{username}.wix.com", "https://wix.com/"),
        ("1x", "https://1x.com/{username}", "https://1x.com"),
        ("Roblox", "https://www.roblox.com/user.aspx?username={username}", "https://www.roblox.com/"),
    ]
    non_matching = [
        MaigretSite(name, {"url": url, "urlMain": url_main})
        for name, url, url_main in lookalikes
    ]

    for site in non_matching:
        haystack = site.url_main + site.url
        assert domain_re.search(haystack) is None, \
            f"x.com should NOT match site {site.name} ({site.url_main})"
|
||||
|
||||
|
||||
def test_domain_matching_positive():
    """Check that the domain pattern matches the exact domain, with or without ``www.``."""
    domain_raw = "x.com"
    domain_re = re.compile(r'://(www\.)?' + re.escape(domain_raw) + r'(/|$)')

    # (name, url, urlMain) of sites that really live on x.com.
    genuine = [
        ("X", "https://x.com/{username}", "https://x.com"),
        ("X-www", "https://www.x.com/{username}", "https://www.x.com"),
    ]
    matching = [
        MaigretSite(name, {"url": url, "urlMain": url_main})
        for name, url, url_main in genuine
    ]

    for site in matching:
        haystack = site.url_main + site.url
        assert domain_re.search(haystack), \
            f"x.com SHOULD match site {site.name} ({site.url_main})"
|
||||
|
||||
|
||||
def test_dialog_nonexistent_site_name_no_crash():
    """Check the lookup-by-name pattern when the entered name matches no site.

    Guards against: AttributeError: 'NoneType' object has no attribute 'name'.
    The lookup logic is re-created locally here; the test does not call the
    dialog code itself.
    """
    matched_sites = [
        MaigretSite("ValidActive", {"url": "https://example.com/{username}", "urlMain": "https://example.com"}),
        MaigretSite("InvalidActive", {"url": "https://example.com/alt/{username}", "urlMain": "https://example.com"}),
    ]

    def find_site(wanted_name):
        # First site with the given name, or None when nothing matches.
        for candidate in matched_sites:
            if candidate.name == wanted_name:
                return candidate
        return None

    # A name outside matched_sites resolves to None.
    old_site = find_site("NonExistentSite")
    assert old_site is None

    # The guard: check for None before touching .name.
    result = old_site.name if old_site is not None else "not found"
    assert result == "not found"

    # A name that IS in matched_sites resolves to that site.
    old_site = find_site("ValidActive")
    assert old_site is not None
    assert old_site.name == "ValidActive"
|
||||
|
||||
@@ -0,0 +1,59 @@
|
||||
"""Generate db_meta.json from data.json for the auto-update system."""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os.path as path
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
|
||||
# Two directory levels above this file; presumably the repository root.
_REPO_ROOT = path.dirname(path.dirname(path.abspath(__file__)))

RESOURCES_DIR = path.join(_REPO_ROOT, "maigret", "resources")
# Input database and the metadata file generated from it.
DATA_JSON_PATH = path.join(RESOURCES_DIR, "data.json")
META_JSON_PATH = path.join(RESOURCES_DIR, "db_meta.json")
# Default download location recorded in the generated metadata.
DEFAULT_DATA_URL = "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
|
||||
|
||||
|
||||
def get_current_version():
    """Return the version string declared in ``maigret/__version__.py``.

    The file is located relative to this script (two directory levels up,
    then ``maigret/__version__.py``). The first line starting with
    ``__version__`` is used: the text after the first ``=`` is stripped of
    whitespace and surrounding quotes. Returns ``"0.0.0"`` when no such line
    exists.
    """
    project_root = path.dirname(path.dirname(path.abspath(__file__)))
    version_file = path.join(project_root, "maigret", "__version__.py")

    with open(version_file) as handle:
        declaration = next(
            (line for line in handle if line.startswith("__version__")),
            None,
        )

    if declaration is None:
        return "0.0.0"
    return declaration.split("=")[1].strip().strip("'\"")
|
||||
|
||||
|
||||
def main():
    """Generate ``db_meta.json`` describing the current ``data.json``.

    Command-line options:
        --min-version: minimum compatible maigret version; falls back to
            ``get_current_version()`` when not given.
        --data-url: URL recorded as the download location of ``data.json``.

    The metadata holds a format version, a UTC timestamp, the number of
    sites, the minimum version, and the SHA-256 of the raw file bytes. A
    short summary is printed after writing.
    """
    parser = argparse.ArgumentParser(description="Generate db_meta.json from data.json")
    parser.add_argument(
        "--min-version",
        default=None,
        help="Minimum compatible maigret version (default: current version)",
    )
    parser.add_argument(
        "--data-url",
        default=DEFAULT_DATA_URL,
        help="URL where data.json can be downloaded",
    )
    args = parser.parse_args()

    min_version = args.min_version if args.min_version else get_current_version()

    with open(DATA_JSON_PATH, "rb") as data_file:
        raw = data_file.read()

    # The hash is taken over the exact bytes on disk, not the re-serialized JSON.
    sha256 = hashlib.sha256(raw).hexdigest()
    sites_count = len(json.loads(raw).get("sites", {}))

    meta = {
        "version": 1,
        "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "sites_count": sites_count,
        "min_maigret_version": min_version,
        "data_sha256": sha256,
        "data_url": args.data_url,
    }

    with open(META_JSON_PATH, "w", encoding="utf-8") as meta_file:
        json.dump(meta, meta_file, indent=4, ensure_ascii=False)

    summary = (
        f"Generated {META_JSON_PATH}",
        f"  sites: {sites_count}",
        f"  sha256: {sha256[:16]}...",
        f"  min_version: {min_version}",
    )
    for summary_line in summary:
        print(summary_line)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
+67
-9
@@ -26,6 +26,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
try:
|
||||
import aiohttp
|
||||
from yarl import URL as YarlURL
|
||||
except ImportError:
|
||||
print("aiohttp not installed. Run: pip install aiohttp")
|
||||
sys.exit(1)
|
||||
@@ -74,8 +75,14 @@ def color(text: str, c: str) -> str:
|
||||
|
||||
|
||||
async def check_url_aiohttp(url: str, headers: dict = None, follow_redirects: bool = True,
|
||||
timeout: int = 15, ssl_verify: bool = False) -> dict:
|
||||
"""Check a URL using aiohttp and return detailed response info."""
|
||||
timeout: int = 15, ssl_verify: bool = False,
|
||||
method: str = "GET", payload: dict = None) -> dict:
|
||||
"""Check a URL using aiohttp and return detailed response info.
|
||||
|
||||
Args:
|
||||
method: HTTP method ("GET" or "POST").
|
||||
payload: JSON payload for POST requests (dict, will be serialized).
|
||||
"""
|
||||
headers = headers or DEFAULT_HEADERS.copy()
|
||||
result = {
|
||||
"method": "aiohttp",
|
||||
@@ -96,7 +103,14 @@ async def check_url_aiohttp(url: str, headers: dict = None, follow_redirects: bo
|
||||
timeout_obj = aiohttp.ClientTimeout(total=timeout)
|
||||
|
||||
async with aiohttp.ClientSession(connector=connector, timeout=timeout_obj) as session:
|
||||
async with session.get(url, headers=headers, allow_redirects=follow_redirects) as resp:
|
||||
# Use encoded=True if URL contains percent-encoded chars to prevent double-encoding
|
||||
request_url = YarlURL(url, encoded=True) if '%' in url else url
|
||||
request_kwargs = dict(headers=headers, allow_redirects=follow_redirects)
|
||||
if method.upper() == "POST" and payload is not None:
|
||||
request_kwargs["json"] = payload
|
||||
|
||||
request_fn = session.post if method.upper() == "POST" else session.get
|
||||
async with request_fn(request_url, **request_kwargs) as resp:
|
||||
result["status"] = resp.status
|
||||
result["final_url"] = str(resp.url)
|
||||
|
||||
@@ -438,21 +452,54 @@ async def diagnose_site(site_config: dict, site_name: str) -> dict:
|
||||
print(f" {color('[!]', Colors.RED)} No usernameClaimed defined")
|
||||
return diagnosis
|
||||
|
||||
# Build full URL
|
||||
# Build full URL (display URL)
|
||||
url_template = url.replace("{urlMain}", url_main).replace("{urlSubpath}", site_config.get("urlSubpath", ""))
|
||||
|
||||
# Build probe URL (what Maigret actually requests)
|
||||
url_probe = site_config.get("urlProbe", "")
|
||||
if url_probe:
|
||||
probe_template = url_probe.replace("{urlMain}", url_main).replace("{urlSubpath}", site_config.get("urlSubpath", ""))
|
||||
else:
|
||||
probe_template = url_template
|
||||
|
||||
# Detect request method and payload
|
||||
request_method = site_config.get("requestMethod", "GET").upper()
|
||||
request_payload_template = site_config.get("requestPayload")
|
||||
|
||||
headers = DEFAULT_HEADERS.copy()
|
||||
# For API probes (urlProbe, POST), use neutral Accept header instead of text/html
|
||||
# which can cause servers to return HTML instead of JSON
|
||||
if url_probe or request_method == "POST":
|
||||
headers["Accept"] = "*/*"
|
||||
if site_config.get("headers"):
|
||||
headers.update(site_config["headers"])
|
||||
|
||||
if url_probe:
|
||||
print(f" urlProbe: {url_probe}")
|
||||
if request_method != "GET":
|
||||
print(f" requestMethod: {request_method}")
|
||||
if request_payload_template:
|
||||
print(f" requestPayload: {request_payload_template}")
|
||||
|
||||
# 2. Connectivity test
|
||||
print(f"\n--- {color('2. CONNECTIVITY TEST', Colors.BOLD)} ---")
|
||||
url_claimed = url_template.replace("{username}", claimed)
|
||||
url_unclaimed = url_template.replace("{username}", unclaimed)
|
||||
probe_claimed = probe_template.replace("{username}", claimed)
|
||||
probe_unclaimed = probe_template.replace("{username}", unclaimed)
|
||||
|
||||
# Build payloads with username substituted
|
||||
payload_claimed = None
|
||||
payload_unclaimed = None
|
||||
if request_payload_template and request_method == "POST":
|
||||
payload_claimed = json.loads(
|
||||
json.dumps(request_payload_template).replace("{username}", claimed)
|
||||
)
|
||||
payload_unclaimed = json.loads(
|
||||
json.dumps(request_payload_template).replace("{username}", unclaimed)
|
||||
)
|
||||
|
||||
result_claimed, result_unclaimed = await asyncio.gather(
|
||||
check_url_aiohttp(url_claimed, headers),
|
||||
check_url_aiohttp(url_unclaimed, headers)
|
||||
check_url_aiohttp(probe_claimed, headers, method=request_method, payload=payload_claimed),
|
||||
check_url_aiohttp(probe_unclaimed, headers, method=request_method, payload=payload_unclaimed)
|
||||
)
|
||||
|
||||
print(f" Claimed ({claimed}): status={result_claimed['status']}, error={result_claimed['error']}")
|
||||
@@ -523,7 +570,18 @@ async def diagnose_site(site_config: dict, site_name: str) -> dict:
|
||||
diagnosis["warnings"].append(f"absenceStrs not found in unclaimed page")
|
||||
print(f" {color('[WARN]', Colors.YELLOW)} absenceStrs not found in unclaimed page")
|
||||
|
||||
if presense_found_claimed and not absence_found_claimed and absence_found_unclaimed:
|
||||
# Check works if: claimed is detected as present AND unclaimed is detected as absent.
|
||||
# Presence detection: presenseStrs found (or empty = always true).
|
||||
# Absence detection: absenceStrs found in unclaimed (or empty = never, rely on presenseStrs only).
|
||||
# With only presenseStrs: works if found in claimed but NOT in unclaimed.
|
||||
# With only absenceStrs: works if found in unclaimed but NOT in claimed.
|
||||
# With both: standard combination.
|
||||
claimed_is_present = presense_found_claimed and not absence_found_claimed
|
||||
unclaimed_is_absent = (
|
||||
(absence_strs and absence_found_unclaimed) or
|
||||
(presense_strs and not presense_found_unclaimed)
|
||||
)
|
||||
if claimed_is_present and unclaimed_is_absent:
|
||||
print(f" {color('[OK]', Colors.GREEN)} Message check should work correctly")
|
||||
diagnosis["working"] = True
|
||||
|
||||
|
||||
+134
-39
@@ -4,6 +4,7 @@ This module generates the listing of supported sites in file `SITES.md`
|
||||
and pretty prints file with sites data.
|
||||
"""
|
||||
import sys
|
||||
import socket
|
||||
import requests
|
||||
import logging
|
||||
import threading
|
||||
@@ -24,36 +25,87 @@ RANKS.update({
|
||||
'100000000': '100M',
|
||||
})
|
||||
|
||||
SEMAPHORE = threading.Semaphore(20)
|
||||
|
||||
|
||||
def get_rank(domain_to_query, site, print_errors=True):
|
||||
with SEMAPHORE:
|
||||
# Retrieve ranking data via alexa API
|
||||
url = f"http://data.alexa.com/data?cli=10&url={domain_to_query}"
|
||||
xml_data = requests.get(url).text
|
||||
root = ET.fromstring(xml_data)
|
||||
import csv
|
||||
import io
|
||||
from urllib.parse import urlparse
|
||||
|
||||
try:
|
||||
#Get ranking for this site.
|
||||
site.alexa_rank = int(root.find('.//REACH').attrib['RANK'])
|
||||
# country = root.find('.//COUNTRY')
|
||||
# if not country is None and country.attrib:
|
||||
# country_code = country.attrib['CODE']
|
||||
# tags = set(site.tags)
|
||||
# if country_code:
|
||||
# tags.add(country_code.lower())
|
||||
# site.tags = sorted(list(tags))
|
||||
# if site.type != 'username':
|
||||
# site.disabled = False
|
||||
except Exception as e:
|
||||
if print_errors:
|
||||
logging.error(e)
|
||||
# We did not find the rank for some reason.
|
||||
print(f"Error retrieving rank information for '{domain_to_query}'")
|
||||
print(f" Returned XML is |{xml_data}|")
|
||||
def fetch_majestic_million(timeout=60):
    """Download the Majestic Million CSV and map each domain to its rank.

    The first CSV row is treated as a header and skipped. For every other
    row, column 0 is parsed as the integer rank and column 2 is used
    (lower-cased) as the domain key. Rows with fewer than three columns or
    with a non-numeric rank are skipped individually.

    Args:
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole script.

    Returns:
        dict mapping lower-cased domain -> rank. On a download or parsing
        error the error is logged and whatever was collected so far
        (possibly an empty dict) is returned.
    """
    print("Fetching Majestic Million CSV (this may take a few seconds)...")
    ranks = {}
    url = "https://downloads.majestic.com/majestic_million.csv"
    try:
        # The whole body is needed in memory anyway (response.text), so no
        # streaming; the context manager releases the connection afterwards.
        with requests.get(url, timeout=timeout) as response:
            response.raise_for_status()

            reader = csv.reader(io.StringIO(response.text))
            next(reader, None)  # skip headers; tolerate an empty body

            for row in reader:
                if len(row) < 3:
                    continue
                try:
                    rank = int(row[0])
                except ValueError:
                    # One malformed row must not discard the rest of the file.
                    continue
                ranks[row[2].lower()] = rank
    except Exception as e:
        # Deliberately best-effort: ranking is optional, so log and carry on
        # with whatever was loaded instead of aborting the caller.
        logging.error(f"Error fetching Majestic Million: {e}")

    print(f"Loaded {len(ranks)} domains from Majestic Million.")
    return ranks
|
||||
|
||||
return
|
||||
def get_base_domain(url):
    """Return the lower-cased host name of *url* without a leading ``www.``.

    The host is taken from the parsed URL's ``hostname`` so that a port
    (``example.com:8443``) or userinfo (``user@example.com``) never leaks
    into the result; such extras would break DNS lookups and rank-table
    matching done with the returned value.

    Args:
        url: Absolute URL string, e.g. ``"https://www.example.com/path"``.

    Returns:
        The bare host name (e.g. ``"example.com"``), or ``""`` when the URL
        has no host component or cannot be parsed.
    """
    try:
        # `hostname` is already lower-cased and stripped of port/credentials.
        host = urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        # e.g. malformed IPv6 literal or a non-string argument
        return ""

    if not host:
        return ""
    if isinstance(host, bytes):
        # bytes input yields a bytes host; normalise to text for callers
        host = host.decode("ascii", "ignore")
    if host.startswith('www.'):
        host = host[4:]
    return host
|
||||
|
||||
|
||||
def check_dns(domain, timeout=5):
    """Check if a domain resolves via DNS. Returns True if it resolves.

    The lookup runs in a daemon helper thread that is joined with
    *timeout*: ``socket.getaddrinfo`` is a blocking resolver call that is
    not governed by socket timeouts, and changing the process-wide default
    socket timeout would affect every socket created afterwards.

    Args:
        domain: Host name to resolve.
        timeout: Maximum number of seconds to wait for the resolver
            (``None`` waits indefinitely).

    Returns:
        True if the name resolved within *timeout*; False if resolution
        failed, the name is not a valid host name, or the wait timed out.
    """
    outcome = []

    def _lookup():
        try:
            socket.getaddrinfo(domain, None)
        except (OSError, UnicodeError):
            # OSError covers socket.gaierror / socket.timeout;
            # UnicodeError is raised for names the IDNA encoder rejects.
            outcome.append(False)
        else:
            outcome.append(True)

    # Daemon thread: a resolver call that never returns must not keep the
    # interpreter alive at exit.
    worker = threading.Thread(target=_lookup, daemon=True)
    worker.start()
    worker.join(timeout)

    # An empty outcome means the lookup is still running, i.e. it timed out.
    return bool(outcome) and outcome[0]
|
||||
|
||||
|
||||
def check_sites_dns(sites, max_workers=20):
    """Check DNS resolution for all sites. Returns a set of site names that failed.

    Sites are grouped by the base domain of their ``url_main`` so each
    domain is looked up once. Domains ending in ``.onion`` or ``.i2p`` are
    skipped, as are sites for which no domain can be extracted.

    Args:
        sites: Iterable of site objects exposing ``url_main`` and ``name``.
        max_workers: Upper bound on concurrent lookups, so a large site
            list does not start one thread per domain all at once.

    Returns:
        Set of ``site.name`` values whose domain did not resolve.
    """
    from concurrent.futures import ThreadPoolExecutor

    skip_tlds = ('.onion', '.i2p')
    sites_by_domain = {}
    for site in sites:
        domain = get_base_domain(site.url_main)
        if domain and not domain.endswith(skip_tlds):
            sites_by_domain.setdefault(domain, []).append(site)

    failed_sites = set()
    if not sites_by_domain:
        return failed_sites

    def resolves(domain):
        try:
            return check_dns(domain)
        except Exception as e:
            # A crashing lookup must not vanish silently (which would make
            # the domain look healthy); report it and count it as a failure.
            logging.warning(f"DNS lookup for {domain} raised {e!r}; treating as unresolved")
            return False

    domains = list(sites_by_domain)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # map() yields results in the same order as `domains`
        for domain, resolved in zip(domains, pool.map(resolves, domains)):
            if resolved:
                continue
            for site in sites_by_domain[domain]:
                failed_sites.add(site.name)
            logging.warning(f"DNS resolution failed for {domain}")

    return failed_sites
|
||||
|
||||
|
||||
def get_step_rank(rank):
|
||||
@@ -78,6 +130,8 @@ def main():
|
||||
parser.add_argument('--empty-only', help='update only sites without rating', action='store_true')
|
||||
parser.add_argument('--exclude-engine', help='do not update score with certain engine',
|
||||
action="append", dest="exclude_engine_list", default=[])
|
||||
parser.add_argument('--dns-check', help='disable sites whose domains do not resolve via DNS',
|
||||
action='store_true')
|
||||
|
||||
pool = list()
|
||||
|
||||
@@ -91,30 +145,51 @@ def main():
|
||||
with open("sites.md", "w") as site_file:
|
||||
site_file.write(f"""
|
||||
## List of supported sites (search methods): total {len(sites_subset)}\n
|
||||
Rank data fetched from Alexa by domains.
|
||||
Rank data fetched from Majestic Million by domains.
|
||||
|
||||
""")
|
||||
|
||||
if args.dns_check:
|
||||
print("Checking DNS resolution for all site domains...")
|
||||
failed = check_sites_dns(sites_subset)
|
||||
disabled_count = 0
|
||||
re_enabled_count = 0
|
||||
for site in sites_subset:
|
||||
if site.name in failed:
|
||||
if not site.disabled:
|
||||
site.disabled = True
|
||||
disabled_count += 1
|
||||
print(f" Disabled {site.name}: DNS does not resolve ({get_base_domain(site.url_main)})")
|
||||
else:
|
||||
if site.disabled:
|
||||
# Re-enable previously disabled site if DNS now resolves
|
||||
# (only if it was likely disabled due to DNS failure)
|
||||
pass
|
||||
print(f"DNS check complete: {disabled_count} site(s) disabled, {len(failed)} domain(s) unresolvable.")
|
||||
|
||||
majestic_ranks = {}
|
||||
if args.with_rank:
|
||||
majestic_ranks = fetch_majestic_million()
|
||||
|
||||
for site in sites_subset:
|
||||
if not args.with_rank:
|
||||
break
|
||||
url_main = site.url_main
|
||||
|
||||
if site.alexa_rank < sys.maxsize and args.empty_only:
|
||||
continue
|
||||
if args.exclude_engine_list and site.engine in args.exclude_engine_list:
|
||||
continue
|
||||
site.alexa_rank = 0
|
||||
th = threading.Thread(target=get_rank, args=(url_main, site,))
|
||||
pool.append((site.name, url_main, th))
|
||||
th.start()
|
||||
|
||||
|
||||
domain = get_base_domain(site.url_main)
|
||||
|
||||
if domain in majestic_ranks:
|
||||
site.alexa_rank = majestic_ranks[domain]
|
||||
else:
|
||||
site.alexa_rank = sys.maxsize
|
||||
|
||||
# In memory matching complete, no threads to join
|
||||
if args.with_rank:
|
||||
index = 1
|
||||
for site_name, url_main, th in pool:
|
||||
th.join()
|
||||
sys.stdout.write("\r{0}".format(f"Updated {index} out of {len(sites_subset)} entries"))
|
||||
sys.stdout.flush()
|
||||
index = index + 1
|
||||
print("Successfully updated ranks matching Majestic Million dataset.")
|
||||
|
||||
sites_full_list = [(s, int(s.alexa_rank)) for s in sites_subset]
|
||||
|
||||
@@ -142,6 +217,26 @@ Rank data fetched from Alexa by domains.
|
||||
site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n')
|
||||
db.save_to_file(args.base_file)
|
||||
|
||||
# Regenerate db_meta.json to stay in sync with data.json
|
||||
try:
|
||||
import hashlib, json, os
|
||||
db_data_raw = open(args.base_file, 'rb').read()
|
||||
db_data_parsed = json.loads(db_data_raw)
|
||||
meta = {
|
||||
"version": 1,
|
||||
"updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
"sites_count": len(db_data_parsed.get("sites", {})),
|
||||
"min_maigret_version": "0.5.0",
|
||||
"data_sha256": hashlib.sha256(db_data_raw).hexdigest(),
|
||||
"data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json",
|
||||
}
|
||||
meta_path = os.path.join(os.path.dirname(args.base_file), "db_meta.json")
|
||||
with open(meta_path, "w", encoding="utf-8") as mf:
|
||||
json.dump(meta, mf, indent=4, ensure_ascii=False)
|
||||
print(f"Updated {meta_path} ({meta['sites_count']} sites)")
|
||||
except Exception as e:
|
||||
print(f"Warning: could not regenerate db_meta.json: {e}")
|
||||
|
||||
statistics_text = db.get_db_stats(is_markdown=True)
|
||||
site_file.write('## Statistics\n\n')
|
||||
site_file.write(statistics_text)
|
||||
|
||||
Reference in New Issue
Block a user