Mirror of https://github.com/soxoj/maigret.git, synced 2026-05-15 10:55:43 +00:00.
Compare commits
270 Commits
v0.5.0...freelabzfix
| SHA1 |
|---|
| 7a8b82f1da |
| f83b73bf89 |
| 6083ea2664 |
| 7307e5328b |
| 86fb3cb414 |
| be01cb5d1b |
| 2f5df91a82 |
| 5578b6881a |
| f9f9ec8ada |
| f3093fd5af |
| 66b741793e |
| 59b1570f1f |
| 4f070f5e6c |
| 2329bd62fd |
| 99847ad3e7 |
| c8a183683a |
| 77288cd4ca |
| 870bb1761e |
| 02de3b649a |
| cb0288ceeb |
| 07d1839148 |
| 687ef4d249 |
| 661cfb8a69 |
| f41f9439fc |
| c2f912b164 |
| b653d1dda8 |
| d27acbed86 |
| 42de0a526d |
| f150966056 |
| 8e8d410ee8 |
| aefc236af3 |
| adcbefb59a |
| ac5b4df258 |
| 299e88a4a4 |
| 20b74383ee |
| 48559a2fe8 |
| 19abcc8586 |
| bc3a069624 |
| 337b9b5270 |
| 0c6388aff5 |
| f860ce788a |
| 55fc8694ed |
| 31b759c08b |
| 06c3360e3d |
| e95d061460 |
| 3097b37eb3 |
| 7d1911288f |
| c9272983fc |
| 797daf0aa1 |
| abfb579755 |
| ff13d5fafb |
| 2c30d19c9f |
| 089cb48773 |
| 2bb473d919 |
| 1a8a76ca21 |
| ddb5f26813 |
| 6b5955d676 |
| ebbff47829 |
| 9d6319aebd |
| 05015a9cce |
| 444eba65cd |
| 86777eeccf |
| 829063d1a1 |
| b6d4473f6f |
| 93e48e583b |
| d2b71dce57 |
| d8f2a48870 |
| b5aa76a52c |
| f8bd42baa8 |
| ce4c768891 |
| d566d48d1f |
| 278a5082ce |
| 87c8356ff1 |
| d1ecd8a965 |
| 44991e0b7c |
| 1a3ce4f114 |
| 6c5f67f30b |
| a2982da0f3 |
| 6b8ad90172 |
| e95057abf8 |
| 5fa86187f5 |
| c9ab9d676b |
| 4784ecdacc |
| f24a73846e |
| 9e64f7260d |
| ee68542028 |
| c2240b6607 |
| 54f1a46d79 |
| f5151ca0fd |
| 43db010dfe |
| 59535c59e5 |
| eccc09275a |
| 2e94bafb7b |
| 5a6ee2ef90 |
| d180101a06 |
| c0af820de3 |
| b7125dc97e |
| 108fef50ee |
| 4470c3a440 |
| 84529cd5b4 |
| 23adc178ea |
| 6834483360 |
| 6ed8fdefcc |
| 3fd34afb77 |
| ad95302745 |
| 44a6c729e3 |
| 6d0a22b738 |
| abce3c9be4 |
| 269d50eedc |
| e8f4318e5d |
| 75289c78bf |
| eeb38ccdc0 |
| d136014576 |
| 5d502eaef6 |
| 9e8a701c54 |
| 7b67c61240 |
| 0e113c4592 |
| fb4e17be92 |
| adb19e5930 |
| 116fae3e0f |
| bf495cd57e |
| e49aa533df |
| 5aa7f6429b |
| a5d337b765 |
| 5aa0c908b0 |
| 51b452ad71 |
| fa1a4d1b4a |
| 184519b202 |
| a203eecbb2 |
| dde1cd5d78 |
| 547512519b |
| b333a2e2b2 |
| 2835ec71c7 |
| af67a6a3f3 |
| 4f737b5260 |
| 185e09e4ea |
| 5865e0f375 |
| 815c8cb2f3 |
| 656fe1df24 |
| 1c5dc5f152 |
| bc3d9faad9 |
| 5aae2ee005 |
| b145e7b26f |
| abd9aa57fe |
| 2e430e5039 |
| f5786f11ce |
| 3e56c95e16 |
| 28f35f9a4f |
| 79cea49526 |
| 2d94269656 |
| 829bda885a |
| eb541dcf51 |
| 4c97025a32 |
| 2775181a6a |
| b00ef1f5dd |
| d3f13ac295 |
| 479a614d1d |
| e0559e4320 |
| 00a9249229 |
| 005863c2e0 |
| e3aada6aef |
| 9b35fc1ab0 |
| 146bc0481b |
| 5930a3022e |
| b4482e0ba4 |
| 2c55501bc2 |
| 3ba07591a1 |
| a2d4373b68 |
| b960acec10 |
| b1a211c3cd |
| 56d0c9f2f1 |
| 01049b730d |
| 2c2d3409e2 |
| e81b50ef61 |
| 9ac0a65914 |
| 4f397fed1c |
| a17e0c7a13 |
| e84e394e6f |
| b8ada1c818 |
| 959b2be136 |
| 97cc4b46d9 |
| f3b741d283 |
| 33620853a1 |
| 19ed03a94d |
| 35372446e0 |
| 519bb46db6 |
| 227a25bfa1 |
| 5da4e78092 |
| e4d6b064df |
| f99091f5f7 |
| f26976f1dd |
| 83ae9c0133 |
| 93c4fdeba9 |
| 6ec3c47769 |
| 3dc3fe9371 |
| ebf8227bf1 |
| 5b7b28e683 |
| 0e95e2e3cc |
| 4cd1fccaa3 |
| 7ee1872dbc |
| 5130206b98 |
| 83a9dafe55 |
| b4147d2cd3 |
| aa591da913 |
| 2d4d3ba0cc |
| ec21bbe974 |
| 1a4190ee03 |
| fe60783a68 |
| 8aa0fab314 |
| 941a5171ae |
| 9a1bd8ffdb |
| 68f586fcca |
| e39476c4c7 |
| 6a7f778c80 |
| 7679f98e58 |
| c6dbc09ba5 |
| b8352c3406 |
| 8a02ad5ed7 |
| 8fda5776c6 |
| 2347bd2f7d |
| 229472f323 |
| 6acc22dd69 |
| 8af07b3889 |
| e9df40bdce |
| d5bef9e3ac |
| 25121754bd |
| 198c11b8d4 |
| bf9bc5a518 |
| 41e246f6a6 |
| 9f58fb27ad |
| b344a5d98a |
| d8b26181f1 |
| a60d96c7f2 |
| a3159b213b |
| 123ead4c03 |
| cd7571ef57 |
| d922f9be25 |
| 3b20b36609 |
| ba86981cf4 |
| 561ced647f |
| 7be3ee8240 |
| 48ca13dc4d |
| 7f94e86259 |
| c2ed1af4b4 |
| 648ba6e64c |
| 56815d8368 |
| b178e97d90 |
| a764198c2c |
| 2c4684e4a9 |
| 8713e1a63e |
| 55adc70d10 |
| 53fc83dbce |
| e8bd00f013 |
| a0ba853e64 |
| 54b4c7d2ab |
| 8791bca866 |
| fb26ccd1f6 |
| c22abdb834 |
| 0689470506 |
| 410d7568b7 |
| 7280033198 |
| 3c6af42916 |
| cdb896ba32 |
| 6bd047fda3 |
| e30cf353a6 |
| bd9e48de7c |
| aec4fef8db |
| 1da49bd208 |
| 6da39cf3d5 |
| f869eb49ca |
```diff
@@ -1,3 +1,10 @@
 #!/bin/sh
 echo 'Activating update_sitesmd hook script...'
 poetry run update_sitesmd
+
+echo 'Regenerating db_meta.json...'
+python3 utils/generate_db_meta.py
+
+git add maigret/resources/db_meta.json
+git add maigret/resources/data.json
+git add sites.md
```
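The `utils/generate_db_meta.py` script itself is not included in this compare. A minimal sketch of what such a metadata generator might do (the output field names below are assumptions for illustration, not the script's actual format):

```python
import json
from pathlib import Path

DATA_JSON = Path("maigret/resources/data.json")
DB_META_JSON = Path("maigret/resources/db_meta.json")


def generate_db_meta() -> None:
    # Summarize the bundled site database so consumers can check it cheaply.
    db = json.loads(DATA_JSON.read_text(encoding="utf-8"))
    meta = {
        "sites_count": len(db.get("sites", {})),      # assumed field name
        "engines_count": len(db.get("engines", {})),  # assumed field name
    }
    DB_META_JSON.write_text(json.dumps(meta, indent=4), encoding="utf-8")


if __name__ == "__main__":
    generate_db_meta()
```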
```diff
@@ -2,54 +2,69 @@ name: Package exe with PyInstaller - Windows
 
 on:
   push:
-    branches: [ main, dev ]
+    branches: [main, dev]
 
 jobs:
   build:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout
        uses: actions/checkout@v4
 
+      # Wine Python (not Linux) runs PyInstaller; altgraph needs pkg_resources — reinstall setuptools after all deps.
+      - name: Prepare requirements for Wine (setuptools last)
+        run: |
+          set -euo pipefail
+          cp pyinstaller/requirements.txt pyinstaller/requirements-wine.txt
+          {
+            echo ""
+            echo "# CI: setuptools last so pkg_resources exists for PyInstaller/altgraph in Wine"
+            echo "setuptools==70.0.0"
+          } >> pyinstaller/requirements-wine.txt
+
       - name: PyInstaller Windows Build
         uses: JackMcKew/pyinstaller-action-windows@main
         with:
           path: pyinstaller
+          requirements: requirements-wine.txt
 
       - name: Upload PyInstaller Binary to Workflow as Artifact
+        if: success()
         uses: actions/upload-artifact@v4
         with:
           name: maigret_standalone_win32
           path: pyinstaller/dist/windows
 
       - name: Download PyInstaller Binary
+        if: success()
         uses: actions/download-artifact@v4
         with:
           name: maigret_standalone_win32
 
       - name: Create New Release and Upload PyInstaller Binary to Release
+        if: success()
         uses: ncipollo/release-action@v1.14.0
         id: create_release
         with:
           allowUpdates: true
           draft: false
           prerelease: false
           artifactErrorsFailBuild: true
           makeLatest: true
           replacesArtifacts: true
           artifacts: maigret_standalone.exe
           name: Development Windows Release [${{ github.ref_name }}]
           tag: ${{ github.ref_name }}
           body: |
             This is a development release built from the **${{ github.ref_name }}** branch.
 
             Take into account that `dev` releases may be unstable.
             Please, use [the development release](https://github.com/soxoj/maigret/releases/tag/main) build from the **main** branch.
 
             Instructions:
             - Download the attached file `maigret_standalone.exe` to get the Windows executable.
             - Video guide on how to run it: https://youtu.be/qIgwTZOmMmM
             - For detailed documentation, visit: https://maigret.readthedocs.io/en/latest/
 
         env:
           GITHUB_TOKEN: ${{ github.token }}
```
```diff
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
 
     steps:
       - name: Checkout
@@ -22,6 +22,9 @@
         uses: actions/setup-python@v2
         with:
           python-version: ${{ matrix.python-version }}
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update && sudo apt-get install -y libcairo2-dev
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
```
```diff
@@ -1,30 +1,21 @@
 name: Upload Python Package to PyPI when a Release is Created
 
 on:
-  release:
-    types: [created]
+  push:
+    tags:
+      - "v*"
+permissions:
+  id-token: write
+  contents: read
 
 jobs:
-  pypi-publish:
-    name: Publish release to PyPI
+  build-and-publish:
     runs-on: ubuntu-latest
-    environment:
-      name: pypi
-      url: https://pypi.org/p/maigret
-    permissions:
-      id-token: write
     steps:
       - uses: actions/checkout@v4
-      - name: Set up Python
-        uses: actions/setup-python@v4
+      - uses: astral-sh/setup-uv@v3
+      - run: uv build
+      - name: Publish to PyPI (Trusted Publishing)
+        uses: pypa/gh-action-pypi-publish@release/v1
         with:
-          python-version: "3.x"
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install setuptools wheel
-      - name: Build package
-        run: |
-          python setup.py sdist bdist_wheel # Could also be python -m build
-      - name: Publish package distributions to PyPI
-        uses: pypa/gh-action-pypi-publish@release/v1
+          packages-dir: dist
```
```diff
@@ -1,34 +1,60 @@
 name: Update sites rating and statistics
 
 on:
-  pull_request:
-    branches: [ dev ]
-    types: [opened, synchronize]
+  push:
+    branches: [ main ]
 
+concurrency:
+  group: update-sites-${{ github.ref }}
+  cancel-in-progress: true
+
 jobs:
   build:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repository
-        uses: actions/checkout@v2.3.2
+        uses: actions/checkout@v4
         with:
-          ref: ${{ github.event.pull_request.head.sha }}
+          ref: main
           fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository.
 
-      - name: build application
+      - name: Install system dependencies
+        run: |
+          sudo apt-get update && sudo apt-get install -y libcairo2-dev
+
+      - name: Build application
         run: |
           pip3 install .
           python3 ./utils/update_site_data.py --empty-only
 
-      - name: Commit and push changes
+      - name: Regenerate db_meta.json
+        run: python3 utils/generate_db_meta.py
+
+      - name: Remove ambiguous main tag
+        run: git tag -d main || true
+
+      - name: Check for meaningful changes
+        id: check
         run: |
-          git config --global user.name "Maigret autoupdate"
-          git config --global user.email "soxoj@protonmail.com"
-          echo `git name-rev ${{ github.event.pull_request.head.sha }} --name-only`
-          export BRANCH=`git name-rev ${{ github.event.pull_request.head.sha }} --name-only | sed 's/remotes\/origin\///'`
-          echo $BRANCH
-          git remote -v
-          git checkout $BRANCH
-          git add sites.md
-          git commit -m "Updated site list and statistics"
-          git push origin $BRANCH
+          REAL_CHANGES=$(git diff --unified=0 sites.md | grep '^[+-][^+-]' | grep -v 'The list was updated at' | wc -l)
+          if [ "$REAL_CHANGES" -gt 0 ]; then
+            echo "has_changes=true" >> $GITHUB_OUTPUT
+          else
+            echo "has_changes=false" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Delete existing PR branch
+        if: steps.check.outputs.has_changes == 'true'
+        run: git push origin --delete auto/update-sites-list || true
+
+      - name: Create Pull Request
+        if: steps.check.outputs.has_changes == 'true'
+        uses: peter-evans/create-pull-request@v7
+        with:
+          token: ${{ secrets.GITHUB_TOKEN }}
+          commit-message: "Updated site list and statistics"
+          title: "Automated Sites List Update"
+          body: "Automated changes to sites.md based on new Alexa rankings/statistics."
+          branch: "auto/update-sites-list"
+          base: main
+          delete-branch: true
```
+2 -1

```diff
@@ -42,4 +42,5 @@ settings.json
 
 # other
 *.egg-info
 build
+LLM
```
+9 -7

```diff
@@ -1,16 +1,18 @@
-FROM python:3.10-slim
+FROM python:3.11-slim
 LABEL maintainer="Soxoj <soxoj@protonmail.com>"
 WORKDIR /app
 RUN pip install --no-cache-dir --upgrade pip
 RUN apt-get update && \
     apt-get install --no-install-recommends -y \
-    gcc \
-    musl-dev \
-    libxml2 \
+    build-essential \
+    python3-dev \
+    pkg-config \
+    libcairo2-dev \
     libxml2-dev \
-    libxslt-dev \
-    && \
-    rm -rf /var/lib/apt/lists/* /tmp/*
+    libxslt1-dev \
+    && rm -rf /var/lib/apt/lists/* /tmp/*
 COPY . .
 RUN YARL_NO_EXTENSIONS=1 python3 -m pip install --no-cache-dir .
+# For production use, set FLASK_HOST to a specific IP address for security
+ENV FLASK_HOST=0.0.0.0
 ENTRYPOINT ["maigret"]
```
```diff
@@ -1,7 +1,6 @@
 MIT License
 
-Copyright (c) 2019 Sherlock Project
-Copyright (c) 2020-2021 Soxoj
+Copyright (c) 2020-2026 Soxoj
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
```
```diff
@@ -1,4 +0,0 @@
-include LICENSE
-include README.md
-include requirements.txt
-include maigret/resources/*
```
````diff
@@ -25,7 +25,7 @@
 
 <i>The Commissioner Jules Maigret is a fictional French police detective, created by Georges Simenon. His investigation method is based on understanding the personality of different people and their interactions.</i>
 
-<b>👉👉👉 [Online Telegram bot](https://t.me/osint_maigret_bot)</b>
+<b>👉👉👉 [Online Telegram bot](https://t.me/maigret_search_bot) | 🏢 [Commercial use & API](#commercial-use)</b>
 
 ## About
 
@@ -53,7 +53,7 @@ See the full description of Maigret features [in the documentation](https://maig
 
 ## Installation
 
-‼️ Maigret is available online via [official Telegram bot](https://t.me/osint_maigret_bot). Consider using it if you don't want to install anything.
+‼️ Maigret is available online via [official Telegram bot](https://t.me/maigret_search_bot). Consider using it if you don't want to install anything.
 
 ### Windows
 
@@ -112,6 +112,10 @@ docker run -v /mydir:/app/reports soxoj/maigret:latest username --html
 docker build -t maigret .
 ```
 
+### Troubleshooting
+
+If you encounter build errors during installation, check the [troubleshooting guide](https://maigret.readthedocs.io/en/latest/installation.html#troubleshooting).
+
 ## Usage examples
 
 ```bash
@@ -192,6 +196,16 @@ The authors and developers of this tool bear no responsibility for any misuse or
 If you have any questions, suggestions, or feedback, please feel free to [open an issue](https://github.com/soxoj/maigret/issues), create a [GitHub discussion](https://github.com/soxoj/maigret/discussions), or contact the author directly via [Telegram](https://t.me/soxoj).
 
+## Commercial Use
+
+If you need a **daily updated database** of supported sites or an **API for username checks**, feel free to reach out:
+
+📧 [maigret@soxoj.com](mailto:maigret@soxoj.com)
+
+Available options:
+- Up-to-date site database - regularly maintained and updated list of 5K+ sites, delivered daily
+- Username check API - programmatic access to Maigret's search capabilities for integration into your products
+
 ## SOWEL classification
 
 This tool uses the following OSINT techniques:
 
````
```diff
@@ -31,14 +31,32 @@ two-letter country codes (**not a language!**). E.g. photo, dating, sport; jp, u
 Multiple tags can be associated with one site. **Warning**: tags markup is
 not stable now. Read more :doc:`in the separate section <tags>`.
 
+``--exclude-tags`` - Exclude sites with specific tags from the search
+(blacklist). E.g. ``--exclude-tags porn,dating`` will skip all sites
+tagged with ``porn`` or ``dating``. Can be combined with ``--tags`` to
+include certain categories while excluding others. Read more
+:doc:`in the separate section <tags>`.
+
 ``-n``, ``--max-connections`` - Allowed number of concurrent connections
 **(default: 100)**.
 
 ``-a``, ``--all-sites`` - Use all sites for scan **(default: top 500)**.
 
-``--top-sites`` - Count of sites for scan ranked by Alexa Top
+``--top-sites`` - Count of sites for scan ranked by Majestic Million
 **(default: top 500)**.
 
+**Mirrors:** After the top *N* sites by Majestic Million rank are chosen (respecting
+``--tags``, ``--use-disabled-sites``, etc.), Maigret may add extra sites
+whose database field ``source`` names a **parent platform** that itself falls
+in the Majestic Million top *N* when ranking **including disabled** sites. For example,
+if ``Twitter`` ranks in the first 500 by Majestic Million, a mirror such as ``memory.lol``
+(with ``source: Twitter``) is included even though it has no rank and would
+otherwise be cut off. The same applies to Instagram-related mirrors (e.g.
+Picuki) when ``Instagram`` is in that parent top *N* by rank—even if the
+official ``Instagram`` entry is disabled and not scanned by default, its
+mirrors can still be pulled in. The final list is the ranked top *N* plus
+these mirrors (no fixed upper bound on mirror count).
+
 ``--timeout`` - Time (in seconds) to wait for responses from sites
 **(default: 30)**. A longer timeout will be more likely to get results
 from slow sites. On the other hand, this may cause a long delay to
```
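A minimal sketch of the mirror-selection rule described in the hunk above, assuming each site entry carries `alexaRank`, `source`, and `disabled` fields as documented elsewhere in this compare (the real implementation lives in Maigret's database code):

```python
def select_sites(sites: list[dict], top_n: int) -> list[dict]:
    """Pick the ranked top-N sites, then pull in mirrors of top-N parents."""
    # Rank by Majestic Million position (stored as alexaRank for compatibility);
    # unranked sites sort last.
    ranked = sorted(sites, key=lambda s: s.get("alexaRank") or float("inf"))

    # The parent top-N is computed INCLUDING disabled sites.
    parent_top = {s["name"] for s in ranked[:top_n]}

    # The scan list itself excludes disabled sites.
    chosen = [s for s in ranked if not s.get("disabled")][:top_n]
    chosen_names = {s["name"] for s in chosen}

    # Add mirrors: sites whose `source` names a parent platform in the parent top-N.
    for s in sites:
        if s.get("source") in parent_top and s["name"] not in chosen_names:
            chosen.append(s)

    return chosen
```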
```diff
@@ -88,6 +106,9 @@ username).
 ``-J``, ``--json`` - Generate a JSON report of specific type: simple,
 ndjson (one report per username). E.g. ``--json ndjson``
 
+``-M``, ``--md`` - Generate a Markdown report (general report on all
+usernames). See :ref:`markdown-report` below.
+
 ``-fo``, ``--folderoutput`` - Results will be saved to this folder,
 ``results`` by default. Will be created if doesn’t exist.
 
```
```diff
@@ -112,16 +133,60 @@ Other operations modes
 
 ``--version`` - Display version information and dependencies.
 
-``--self-check`` - Do self-checking for sites and database and disable
-non-working ones **for current search session** by default. It’s useful
-for testing new internet connection (it depends on provider/hosting on
-which sites there will be censorship stub or captcha display). After
-checking Maigret asks if you want to save updates, answering y/Y will
-rewrite the local database.
+``--self-check`` - Do self-checking for sites and database. Each site is
+tested by looking up its known-claimed and known-unclaimed usernames and
+verifying that the results match expectations. Individual site failures
+(network errors, unexpected exceptions, etc.) are caught and logged
+without stopping the overall process, so the check always runs to
+completion. After checking, Maigret reports a summary of issues found.
+If any sites were disabled (see ``--auto-disable``), Maigret asks if you
+want to save updates; answering y/Y will rewrite the local database.
+
+``--auto-disable`` - Used with ``--self-check``: automatically disable
+sites that fail checks (incorrect detection of claimed/unclaimed
+usernames, connection errors, or unexpected exceptions). Without this
+flag, ``--self-check`` only **reports** issues without modifying the
+database.
+
+``--diagnose`` - Used with ``--self-check``: print detailed diagnosis
+information for each failing site, including the check type, the list
+of issues found, and recommendations (e.g. suggesting a different
+``checkType``).
 
 ``--submit URL`` - Do an automatic analysis of the given account URL or
 site main page URL to determine the site engine and methods to check
 account presence. After checking Maigret asks if you want to add the
 site, answering y/Y will rewrite the local database.
+
+.. _markdown-report:
+
+Markdown report (LLM-friendly)
+------------------------------
+
+The ``--md`` / ``-M`` flag generates a Markdown report designed for both human reading and analysis by AI assistants (ChatGPT, Claude, etc.).
+
+.. code-block:: console
+
+    maigret username --md
+
+The report includes:
+
+- **Summary** with aggregated personal data (all fullnames, locations, bios found across accounts), country tags, website tags, first/last seen timestamps.
+- **Per-account sections** with profile URL, site tags, and all extracted fields (username, bio, follower count, linked accounts, etc.).
+- **Possible false positives** disclaimer explaining that accounts may belong to different people.
+- **Ethical use** notice about applicable data protection laws.
+
+**Using with AI tools:**
+
+The Markdown format is optimized for LLM context windows. You can feed the report directly to an AI assistant for follow-up analysis:
+
+.. code-block:: console
+
+    # Generate the report
+    maigret johndoe --md
+
+    # Feed it to an AI tool
+    cat reports/report_johndoe.md | llm "Analyze this OSINT report and summarize key findings"
+
+The structured Markdown with per-site sections makes it easy for AI tools to extract relationships, cross-reference identities, and identify patterns across accounts.
```
```diff
@@ -22,8 +22,16 @@ The supported methods (``checkType`` values in ``data.json``) are:
 - ``status_code`` - checks that status code of the response is 2XX
 - ``response_url`` - check if there is not redirect and the response is 2XX
 
+.. note::
+   Maigret natively treats specific anti-bot HTTP status codes (like LinkedIn's ``HTTP 999``) as a standard "Not Found/Available" signal instead of throwing an infrastructure Server Error, gracefully preventing false positives.
+
 See the details of check mechanisms in the `checking.py <https://github.com/soxoj/maigret/blob/main/maigret/checking.py#L339>`_ file.
 
+.. note::
+   Maigret now uses the **Majestic Million** dataset for site popularity sorting instead of the discontinued Alexa Rank API. For backward compatibility with existing configurations and parsers, the ranking field in `data.json` and internal site models remains named ``alexaRank`` and ``alexa_rank``.
+
+**Mirrors and ``--top-sites``:** When you limit scans with ``--top-sites N``, Maigret also includes *mirror* sites (entries whose ``source`` field points at a parent platform such as Twitter or Instagram) if that parent would appear in the Majestic Million top *N* when disabled sites are considered for ranking. See the **Mirrors** paragraph under ``--top-sites`` in :doc:`command-line-options`.
+
 Testing
 -------
 
```
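A condensed sketch of how the three documented `checkType` values map to verdicts (illustrative only; the authoritative logic is `process_site_result` in the checking.py diff near the end of this compare):

```python
def verdict(check_type: str, status_code: int, html: str, final_url: str,
            presence_strs: list[str], absence_strs: list[str],
            expected_url_prefix: str) -> str:
    """Return 'claimed' or 'available' under the three documented check types."""
    if check_type == "message":
        # Presence/absence markers in the page body decide the result.
        if any(s in html for s in absence_strs):
            return "available"
        if not presence_strs or any(s in html for s in presence_strs):
            return "claimed"
        return "available"
    if check_type == "status_code":
        # A 2XX response means the account page exists.
        return "claimed" if 200 <= status_code < 300 else "available"
    if check_type == "response_url":
        # With redirects disallowed, staying on the profile URL with 2XX means claimed.
        ok = 200 <= status_code < 300 and final_url.startswith(expected_url_prefix)
        return "claimed" if ok else "available"
    raise ValueError(f"unknown checkType: {check_type}")
```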
```diff
@@ -61,6 +69,21 @@ Use the following commands to check Maigret:
     make speed
 
 
+Site naming conventions
+-----------------------------------------------
+
+Site names are the keys in ``data.json`` and appear in user-facing reports. Follow these rules:
+
+- **Title Case** by default: ``Product Hunt``, ``Hacker News``.
+- **Lowercase** only if the brand itself is written that way: ``kofi``, ``note``, ``hi5``.
+- **No domain suffix** (``calendly.com`` → ``Calendly``), unless the domain is part of the recognized brand name: ``last.fm``, ``VC.ru``, ``Archive.org``.
+- **No full UPPERCASE** unless the brand is an acronym: ``VK``, ``CNET``, ``ICQ``, ``IFTTT``.
+- **No** ``www.`` **or** ``https://`` **prefix** in the name.
+- **Spaces** are allowed when the brand uses them: ``Star Citizen``, ``Google Maps``.
+- **{username} templates** in names are acceptable: ``{username}.tilda.ws``.
+
+When in doubt, check how the service refers to itself on its homepage.
+
 How to fix false-positives
 -----------------------------------------------
@@ -112,6 +135,57 @@ There are few options for sites data.json helpful in various cases:
 - ``headers`` - a dictionary of additional headers to be sent to the site
 - ``requestHeadOnly`` - set to ``true`` if it's enough to make a HEAD request to the site
 - ``regexCheck`` - a regex to check if the username is valid, in case of frequent false-positives
+- ``requestMethod`` - set the HTTP method to use (e.g., ``POST``). By default, Maigret natively defaults to GET or HEAD.
+- ``requestPayload`` - a dictionary with the JSON payload to send for POST requests (e.g., ``{"username": "{username}"}``), extremely useful for parsing GraphQL or modern JSON APIs.
+- ``protection`` - a list of protection types detected on the site (see below).
```
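To make `requestMethod` and `requestPayload` concrete, here is a hypothetical entry and the username substitution it implies. The site name, endpoint, and markers are invented for illustration; the substitution mirrors the payload-formatting code in the checking.py diff at the end of this compare:

```python
# Hypothetical data.json entry, expressed as a Python dict:
entry = {
    "ExampleAPI": {
        "url": "https://example.com/u/{username}",
        "urlProbe": "https://example.com/api/user_lookup",
        "checkType": "message",
        "requestMethod": "POST",
        "requestPayload": {"username": "{username}"},
        "presenseStrs": ["\"id\":"],
        "absenceStrs": ["not found"],
    }
}

# At request time Maigret substitutes the username into string payload
# values, roughly like this:
payload = {
    k: v.format(username="soxoj") if isinstance(v, str) else v
    for k, v in entry["ExampleAPI"]["requestPayload"].items()
}
assert payload == {"username": "soxoj"}
```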
```diff
+
+``protection`` (site protection tracking)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``protection`` field records what kind of anti-bot protection a site uses. Maigret reads this field and automatically applies the appropriate bypass mechanism.
+
+Supported values:
+
+- ``tls_fingerprint`` — the site fingerprints the TLS handshake (JA3/JA4) and blocks non-browser clients. Maigret automatically uses ``curl_cffi`` with Chrome browser emulation to bypass this. Requires the ``curl_cffi`` package (included as a dependency). Examples: Instagram, NPM, Codepen, Kickstarter, Letterboxd.
+- ``ip_reputation`` — the site blocks requests from datacenter/cloud IPs regardless of headers or TLS. Cannot be bypassed automatically; run Maigret from a regular internet connection (not a datacenter) or use a proxy (``--proxy``). Examples: Reddit, Patreon, Figma.
+- ``js_challenge`` — the site serves a JavaScript challenge page (e.g. "Just a moment...") that cannot be solved without a browser. Maigret detects challenge signatures and returns UNKNOWN instead of a false positive.
+
+Example:
+
+.. code-block:: json
+
+    "Instagram": {
+        "url": "https://www.instagram.com/{username}/",
+        "checkType": "message",
+        "presenseStrs": ["\"routePath\":\"\\/"],
+        "absenceStrs": ["\"routePath\":null"],
+        "protection": ["tls_fingerprint"]
+    }
```
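The selection logic that consumes this field appears in the checking.py diff at the end of this compare; condensed:

```python
def select_checker(site, options, logger):
    """Condensed from make_site_result in checking.py (full version in the code diff below)."""
    needs_impersonation = 'tls_fingerprint' in site.protection
    if needs_impersonation and CURL_CFFI_AVAILABLE:
        # Emulate a Chrome TLS fingerprint to pass JA3/JA4 checks.
        return CurlCffiChecker(logger=logger, browser_emulate='chrome')
    if needs_impersonation:
        logger.warning(
            f"Site {site.name} requires TLS impersonation (curl_cffi) but it's not installed."
        )
    # Regular checker otherwise (and as the fallback above).
    return options["checkers"][site.protocol]
```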
```diff
+``urlProbe`` (optional profile probe URL)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By default Maigret performs the HTTP request to the same URL as ``url`` (the public profile link pattern).
+
+If you set ``urlProbe`` in ``data.json``, Maigret **fetches** that URL for the presence check (API, GraphQL, JSON endpoint, etc.), while **reports and ``url_user``** still use ``url`` — the human-readable profile page users should open.
+
+Placeholders: ``{username}``, ``{urlMain}``, ``{urlSubpath}`` (same as for ``url``). Example: GitHub uses ``url`` ``https://github.com/{username}`` and ``urlProbe`` ``https://api.github.com/users/{username}``; Picsart uses the web profile ``https://picsart.com/u/{username}`` and probes ``https://api.picsart.com/users/show/{username}.json``.
+
+Implementation: ``make_site_result`` in `checking.py <https://github.com/soxoj/maigret/blob/main/maigret/checking.py>`_.
```
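A small sketch of the resulting URL split (simplified; `make_site_result` handles more placeholders and edge cases):

```python
def build_check_urls(site: dict, username: str) -> tuple[str, str]:
    """Return (url_user, url_probe): the reported profile URL and the fetched URL."""
    url_user = site["url"].format(username=username)
    # If urlProbe is set, fetch it for the presence check; otherwise fetch url itself.
    probe_template = site.get("urlProbe") or site["url"]
    url_probe = probe_template.format(username=username)
    return url_user, url_probe


# Per the GitHub example above:
# build_check_urls({"url": "https://github.com/{username}",
#                   "urlProbe": "https://api.github.com/users/{username}"}, "soxoj")
# -> ("https://github.com/soxoj", "https://api.github.com/users/soxoj")
```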
```diff
+Site check fixes using LLM
+--------------------------
+
+.. note::
+   The ``LLM/`` directory at the root of the repository contains detailed instructions for editing site checks (in Markdown format): checklist, full guide to ``checkType`` / ``data.json`` / ``urlProbe``, handling false positives, searching for public JSON APIs, and the proposal log for ``socid_extractor``.
+
+Main files:
+
+- `site-checks-playbook.md <https://github.com/soxoj/maigret/blob/main/LLM/site-checks-playbook.md>`_ — short checklist
+- `site-checks-guide.md <https://github.com/soxoj/maigret/blob/main/LLM/site-checks-guide.md>`_ — detailed guide
+- `socid_extractor_improvements.log <https://github.com/soxoj/maigret/blob/main/LLM/socid_extractor_improvements.log>`_ — template and entries for identity extractor improvements
+
+These files should be kept up-to-date whenever changes are made to the check logic in the code or in ``data.json``.
+
 .. _activation-mechanism:
```
```diff
@@ -170,6 +170,35 @@ Maigret will do retries of the requests with temporary errors got (connection fa
 
 One attempt by default, can be changed with option ``--retries N``.
 
+Database self-check
+-------------------
+
+Maigret includes a self-check mode (``--self-check``) that validates every site
+in the database by looking up its known-claimed and known-unclaimed usernames
+and verifying that the detection results match expectations.
+
+The self-check is **error-resilient**: if an individual site check raises an
+unexpected exception (e.g. a network error or a parsing failure), the error is
+caught, logged, and recorded as an issue — the remaining sites continue to be
+checked without interruption. This means the process always runs to completion,
+even when checking hundreds of sites with ``-a --self-check``.
+
+Use ``--auto-disable`` together with ``--self-check`` to automatically disable
+sites that fail checks. Without it, issues are only reported. Use ``--diagnose``
+to print detailed per-site diagnosis including the check type, specific issues,
+and recommendations.
+
+.. code-block:: console
+
+    # Report-only mode (no changes to the database)
+    maigret --self-check
+
+    # Automatically disable failing sites and save updates
+    maigret -a --self-check --auto-disable
+
+    # Show detailed diagnosis for each failing site
+    maigret -a --self-check --diagnose
+
 Archives and mirrors checking
 -----------------------------
 
```
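In outline, the error resilience amounts to a per-site try/except that records failures instead of raising. A sketch, not the actual implementation:

```python
async def self_check_all(sites, check_site, logger):
    """Run check_site over every site; collect issues instead of aborting."""
    issues: dict[str, list[str]] = {}
    for site in sites:
        try:
            result = await check_site(site)
            if result.issues:
                issues[site.name] = result.issues
        except Exception as e:
            # An unexpected per-site failure is logged and recorded,
            # and the loop moves on to the next site.
            logger.warning(f"Self-check failed for {site.name}: {e}", exc_info=True)
            issues.setdefault(site.name, []).append(f"Unexpected: {e}")
    return issues
```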
```diff
@@ -90,3 +90,39 @@ Docker
 
     # manual build
     docker build -t maigret .
+
+Troubleshooting
+---------------
+
+If you encounter build errors during installation such as ``cannot find ft2build.h``
+or errors related to ``reportlab`` / ``_renderPM``, you need to install system-level
+dependencies required to compile native extensions.
+
+**Debian/Ubuntu/Kali:**
+
+.. code-block:: bash
+
+    sudo apt install -y libfreetype6-dev libjpeg-dev libffi-dev
+
+**Fedora/RHEL/CentOS:**
+
+.. code-block:: bash
+
+    sudo dnf install -y freetype-devel libjpeg-devel libffi-devel
+
+**Arch Linux:**
+
+.. code-block:: bash
+
+    sudo pacman -S freetype2 libjpeg-turbo libffi
+
+**macOS (Homebrew):**
+
+.. code-block:: bash
+
+    brew install freetype
+
+After installing the system dependencies, retry the maigret installation.
+
+If you continue to have issues, consider using Docker instead, which includes all
+necessary dependencies.
```
```diff
@@ -27,3 +27,77 @@ Missing any of these files is not an error.
 If the next settings file contains already known option,
 this option will be rewrited. So it is possible to make
 custom configuration for different users and directories.
```
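In effect the settings files form an overlay: later files win key by key. A sketch assuming two candidate paths for illustration (the actual search list is defined by Maigret's settings code):

```python
import json
from pathlib import Path

# Candidate settings files, lowest to highest priority; missing files are skipped.
SETTINGS_PATHS = [
    Path("~/.maigret/settings.json").expanduser(),  # assumed location
    Path("settings.json"),                          # assumed location
]


def load_settings() -> dict:
    merged: dict = {}
    for path in SETTINGS_PATHS:
        if path.exists():
            # A key present in a later file rewrites the earlier value.
            merged.update(json.loads(path.read_text(encoding="utf-8")))
    return merged
```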
```diff
+
+.. _database-auto-update:
+
+Database auto-update
+--------------------
+
+Maigret ships with a bundled site database, but it gets outdated between releases. To keep the database current, Maigret automatically checks for updates on startup.
+
+**How it works:**
+
+1. On startup, Maigret checks if more than 24 hours have passed since the last update check.
+2. If so, it fetches a lightweight metadata file (~200 bytes) from GitHub to see if a newer database is available.
+3. If a newer, compatible database exists, Maigret downloads it to ``~/.maigret/data.json`` and uses it instead of the bundled copy.
+4. If the download fails or the new database is incompatible with your Maigret version, the bundled database is used as a fallback.
+
+The downloaded database has **higher priority** than the bundled one — it replaces, not overlays.
+
+**Status messages** are printed only when an action occurs:
+
+.. code-block:: text
+
+    [*] DB auto-update: checking for updates...
+    [+] DB auto-update: database updated successfully (3180 sites)
+    [*] DB auto-update: database is up to date (3157 sites)
+    [!] DB auto-update: latest database requires maigret >= 0.6.0, you have 0.5.0
+
+**Forcing an update:**
+
+Use the ``--force-update`` flag to check for updates immediately, ignoring the check interval:
+
+.. code-block:: console
+
+    maigret username --force-update
+
+The update happens at startup, then the search continues normally with the freshly downloaded database.
+
+**Disabling auto-update:**
+
+Use the ``--no-autoupdate`` flag to skip the update check entirely:
+
+.. code-block:: console
+
+    maigret username --no-autoupdate
+
+Or set it permanently in ``~/.maigret/settings.json``:
+
+.. code-block:: json
+
+    {
+        "no_autoupdate": true
+    }
+
+This is recommended for **Docker containers**, **CI pipelines**, and **air-gapped environments**.
+
+**Configuration options** (in ``settings.json``):
+
+.. list-table::
+   :header-rows: 1
+   :widths: 35 15 50
+
+   * - Setting
+     - Default
+     - Description
+   * - ``no_autoupdate``
+     - ``false``
+     - Disable auto-update entirely
+   * - ``autoupdate_check_interval_hours``
+     - ``24``
+     - How often to check for updates (in hours)
+   * - ``db_update_meta_url``
+     - GitHub raw URL
+     - URL of the metadata file (for custom mirrors)
+
+**Using a custom database** with ``--db`` always skips auto-update — you are explicitly choosing your data source.
```
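A condensed sketch of the startup decision described above (the metadata field names and file locations here are assumptions; the real updater also validates the download):

```python
import json
import time
import urllib.request
from pathlib import Path

LOCAL_DB = Path("~/.maigret/data.json").expanduser()
STAMP = Path("~/.maigret/last_update_check").expanduser()  # assumed stamp file
CHECK_INTERVAL = 24 * 3600  # autoupdate_check_interval_hours, in seconds


def maybe_update_db(meta_url: str, current_version: str, force: bool = False) -> None:
    if not force and STAMP.exists() and time.time() - STAMP.stat().st_mtime < CHECK_INTERVAL:
        return  # checked recently, skip
    STAMP.touch()
    # 1. Fetch the lightweight metadata file (~200 bytes).
    meta = json.loads(urllib.request.urlopen(meta_url).read())
    # 2. Skip databases requiring a newer Maigret (fall back to the bundled copy).
    #    Naive string comparison for brevity; real code should parse versions.
    if meta.get("requires_maigret", "0") > current_version:
        return
    # 3. Download the newer database over the local copy.
    LOCAL_DB.write_bytes(urllib.request.urlopen(meta["db_url"]).read())
```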
+22 -1

```diff
@@ -10,7 +10,12 @@ The use of tags allows you to select a subset of the sites from big Maigret DB f
 
 There are several types of tags:
 
-1. **Country codes**: ``us``, ``jp``, ``br``... (`ISO 3166-1 alpha-2 <https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2>`_). These tags reflect the site language and regional origin of its users and are then used to locate the owner of a username. If the regional origin is difficult to establish or a site is positioned as worldwide, `no country code is given`. There could be multiple country code tags for one site.
+1. **Country codes**: ``us``, ``jp``, ``br``... (`ISO 3166-1 alpha-2 <https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2>`_). A country tag means that having an account on the site implies a connection to that country — either origin or residence. The goal is attribution, not perfect accuracy.
+
+- **Global sites** (GitHub, YouTube, Reddit, Medium, etc.) get **no country tag** — an account there says nothing about where a person is from.
+- **Regional/local sites** where an account implies a specific country **must** have a country tag: ``VK`` → ``ru``, ``Naver`` → ``kr``, ``Zhihu`` → ``cn``.
+- Multiple country tags are allowed when a service is used predominantly in a few countries (e.g. ``Xing`` → ``de``, ``eu``).
+- Do **not** assign country tags based on traffic statistics alone — a site popular in India by traffic is not "Indian" if it is used globally.
 
 2. **Site engines**. Most of them are forum engines now: ``uCoz``, ``vBulletin``, ``XenForo`` et al. Full list of engines stored in the Maigret database.
 
@@ -23,3 +28,19 @@ Usage
 ``--tags coding`` -- search on sites related to software development.
 
 ``--tags ucoz`` -- search on uCoz sites only (mostly CIS countries)
+
+Blacklisting (excluding) tags
+------------------------------
+You can exclude sites with certain tags from the search using ``--exclude-tags``:
+
+``--exclude-tags porn,dating`` -- skip all sites tagged with ``porn`` or ``dating``.
+
+``--exclude-tags ru`` -- skip all Russian sites.
+
+You can combine ``--tags`` and ``--exclude-tags`` to fine-tune your search:
+
+``--tags forum --exclude-tags ru`` -- search on forum sites, but skip Russian ones.
+
+In the web interface, the tag cloud supports three states per tag:
+click once to **include** (green), click again to **exclude** (dark/strikethrough),
+and click once more to return to **neutral** (red).
```
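Combined include/exclude filtering reduces to a set test per site. A sketch assuming tags are stored as a list per site entry:

```python
def filter_by_tags(sites: list[dict], include: set[str], exclude: set[str]) -> list[dict]:
    """Keep sites matching any --tags value and no --exclude-tags value."""
    selected = []
    for site in sites:
        tags = set(site.get("tags", []))
        if include and not (tags & include):
            continue  # --tags given, but no requested tag matches
        if tags & exclude:
            continue  # at least one blacklisted tag matches
        selected.append(site)
    return selected


# --tags forum --exclude-tags ru:
# filter_by_tags(sites, include={"forum"}, exclude={"ru"})
```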
```diff
@@ -13,7 +13,7 @@ Use Cases
 ---------
 
 
-1. Search for accounts with username ``machine42`` on top 500 sites (by default, according to Alexa rank) from the Maigret DB.
+1. Search for accounts with username ``machine42`` on top 500 sites (by default, according to Majestic Million rank) from the Maigret DB.
 
 .. code-block:: console
 
@@ -33,7 +33,7 @@ Use Cases
 If you experience many false positives, you can do the following:
 
 - Install the last development version of Maigret from GitHub
-- Run Maigret with ``--self-check`` flag and agree on disabling of problematic sites
+- Run Maigret with ``--self-check --auto-disable`` flag and agree on disabling of problematic sites
 
 3. Search for accounts with username ``machine42`` and generate HTML and PDF reports.
 
```
+3 -14

```diff
@@ -30,17 +30,6 @@ class ParsingActivator:
         jwt_token = r.json()["jwt"]
         site.headers["Authorization"] = "jwt " + jwt_token
 
-    @staticmethod
-    def spotify(site, logger, cookies={}):
-        headers = dict(site.headers)
-        if "Authorization" in headers:
-            del headers["Authorization"]
-        import requests
-
-        r = requests.get(site.activation["url"])
-        bearer_token = r.json()["accessToken"]
-        site.headers["authorization"] = f"Bearer {bearer_token}"
-
     @staticmethod
     def weibo(site, logger):
         headers = dict(site.headers)
@@ -54,7 +43,7 @@ class ParsingActivator:
         logger.debug(
             f"1 stage: {'success' if r.status_code == 302 else 'no 302 redirect, fail!'}"
         )
-        location = r.headers.get("Location")
+        location = r.headers.get("Location", "")
 
         # 2 stage: go to passport visitor page
         headers["Referer"] = location
@@ -84,9 +73,9 @@ def import_aiohttp_cookies(cookiestxt_filename):
     cookies = CookieJar()
 
     cookies_list = []
-    for domain in cookies_obj._cookies.values():
+    for domain in cookies_obj._cookies.values():  # type: ignore[attr-defined]
         for key, cookie in list(domain.values())[0].items():
-            c = Morsel()
+            c: Morsel = Morsel()
             c.set(key, cookie.value, cookie.value)
             c["domain"] = cookie.domain
             c["path"] = cookie.path
```
+379
-106
@@ -6,7 +6,7 @@ import random
|
||||
import re
|
||||
import ssl
|
||||
import sys
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from urllib.parse import quote
|
||||
|
||||
# Third party imports
|
||||
@@ -15,7 +15,7 @@ from alive_progress import alive_bar
|
||||
from aiohttp import ClientSession, TCPConnector, http_exceptions
|
||||
from aiohttp.client_exceptions import ClientConnectorError, ServerDisconnectedError
|
||||
from python_socks import _errors as proxy_errors
|
||||
from socid_extractor import extract
|
||||
from socid_extractor import extract # type: ignore[import-not-found]
|
||||
|
||||
try:
|
||||
from mock import Mock
|
||||
@@ -61,30 +61,49 @@ class SimpleAiohttpChecker(CheckerBase):
|
||||
self.headers = None
|
||||
self.allow_redirects = True
|
||||
self.timeout = 0
|
||||
self.allow_redirects = True
|
||||
self.timeout = 0
|
||||
self.method = 'get'
|
||||
self.payload = None
|
||||
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
|
||||
self.url = url
|
||||
self.headers = headers
|
||||
self.allow_redirects = allow_redirects
|
||||
self.timeout = timeout
|
||||
self.method = method
|
||||
self.payload = payload
|
||||
return None
|
||||
|
||||
async def close(self):
|
||||
pass
|
||||
|
||||
async def _make_request(
|
||||
self, session, url, headers, allow_redirects, timeout, method, logger
|
||||
) -> Tuple[str, int, Optional[CheckError]]:
|
||||
self, session, url, headers, allow_redirects, timeout, method, logger, payload=None
|
||||
) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
try:
|
||||
request_method = session.get if method == 'get' else session.head
|
||||
async with request_method(
|
||||
url=url,
|
||||
headers=headers,
|
||||
allow_redirects=allow_redirects,
|
||||
timeout=timeout,
|
||||
) as response:
|
||||
if method.lower() == 'get':
|
||||
request_method = session.get
|
||||
elif method.lower() == 'post':
|
||||
request_method = session.post
|
||||
elif method.lower() == 'head':
|
||||
request_method = session.head
|
||||
else:
|
||||
request_method = session.get
|
||||
|
||||
kwargs = {
|
||||
'url': url,
|
||||
'headers': headers,
|
||||
'allow_redirects': allow_redirects,
|
||||
'timeout': timeout,
|
||||
}
|
||||
if payload and method.lower() == 'post':
|
||||
if headers and headers.get('Content-Type') == 'application/x-www-form-urlencoded':
|
||||
kwargs['data'] = payload
|
||||
else:
|
||||
kwargs['json'] = payload
|
||||
|
||||
async with request_method(**kwargs) as response:
|
||||
status_code = response.status
|
||||
response_content = await response.content.read()
|
||||
charset = response.charset or "utf-8"
|
||||
@@ -117,15 +136,21 @@ class SimpleAiohttpChecker(CheckerBase):
|
||||
logger.debug(e, exc_info=True)
|
||||
return None, 0, CheckError("Unexpected", str(e))
|
||||
|
||||
async def check(self) -> Tuple[str, int, Optional[CheckError]]:
|
||||
async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
from aiohttp_socks import ProxyConnector
|
||||
|
||||
# Use a real SSL context instead of ssl=False to avoid TLS fingerprinting
|
||||
# blocks by Cloudflare and similar WAFs. Certificate verification is
|
||||
# disabled to handle sites with invalid/expired certs.
|
||||
ssl_context = ssl.create_default_context()
|
||||
ssl_context.check_hostname = False
|
||||
ssl_context.verify_mode = ssl.CERT_NONE
|
||||
|
||||
connector = (
|
||||
ProxyConnector.from_url(self.proxy)
|
||||
if self.proxy
|
||||
else TCPConnector(ssl=False)
|
||||
else TCPConnector(ssl=ssl_context)
|
||||
)
|
||||
connector.verify_ssl = False
|
||||
|
||||
async with ClientSession(
|
||||
connector=connector,
|
||||
@@ -141,6 +166,7 @@ class SimpleAiohttpChecker(CheckerBase):
|
||||
self.timeout,
|
||||
self.method,
|
||||
self.logger,
|
||||
self.payload,
|
||||
)
|
||||
|
||||
if error and str(error) == "Invalid proxy response":
|
||||
@@ -165,11 +191,11 @@ class AiodnsDomainResolver(CheckerBase):
|
||||
self.logger = kwargs.get('logger', Mock())
|
||||
self.resolver = aiodns.DNSResolver(loop=loop)
|
||||
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
|
||||
self.url = url
|
||||
return None
|
||||
|
||||
async def check(self) -> Tuple[str, int, Optional[CheckError]]:
|
||||
async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
status = 404
|
||||
error = None
|
||||
text = ''
|
||||
@@ -187,14 +213,84 @@ class AiodnsDomainResolver(CheckerBase):
|
||||
return text, status, error
|
||||
|
||||
|
||||
try:
|
||||
from curl_cffi.requests import AsyncSession as CurlCffiAsyncSession
|
||||
|
||||
CURL_CFFI_AVAILABLE = True
|
||||
except ImportError:
|
||||
CURL_CFFI_AVAILABLE = False
|
||||
|
||||
|
||||
class CurlCffiChecker(CheckerBase):
|
||||
"""Checker using curl_cffi to emulate browser TLS fingerprint and bypass WAF."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.logger = kwargs.get('logger', Mock())
|
||||
self.browser_emulate = kwargs.get('browser_emulate', 'chrome')
|
||||
self.url = None
|
||||
self.headers = None
|
||||
self.allow_redirects = True
|
||||
self.timeout = 0
|
||||
self.method = 'get'
|
||||
self.payload = None
|
||||
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
|
||||
self.url = url
|
||||
self.headers = headers
|
||||
self.allow_redirects = allow_redirects
|
||||
self.timeout = timeout
|
||||
self.method = method
|
||||
self.payload = payload
|
||||
return None
|
||||
|
||||
async def close(self):
|
||||
pass
|
||||
|
||||
async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
try:
|
||||
async with CurlCffiAsyncSession() as session:
|
||||
kwargs = {
|
||||
'url': self.url,
|
||||
'headers': self.headers,
|
||||
'allow_redirects': self.allow_redirects,
|
||||
'timeout': self.timeout if self.timeout else 10,
|
||||
'impersonate': self.browser_emulate,
|
||||
}
|
||||
if self.payload and self.method.lower() == 'post':
|
||||
kwargs['json'] = self.payload
|
||||
|
||||
if self.method.lower() == 'post':
|
||||
response = await session.post(**kwargs)
|
||||
elif self.method.lower() == 'head':
|
||||
response = await session.head(**kwargs)
|
||||
else:
|
||||
response = await session.get(**kwargs)
|
||||
|
||||
status_code = response.status_code
|
||||
decoded_content = response.text
|
||||
|
||||
self.logger.debug(decoded_content)
|
||||
|
||||
error = CheckError("Connection lost") if status_code == 0 else None
|
||||
return decoded_content, status_code, error
|
||||
|
||||
except asyncio.TimeoutError as e:
|
||||
return None, 0, CheckError("Request timeout", str(e))
|
||||
except KeyboardInterrupt:
|
||||
return None, 0, CheckError("Interrupted")
|
||||
except Exception as e:
|
||||
self.logger.debug(e, exc_info=True)
|
||||
return None, 0, CheckError("Unexpected", str(e))
|
||||
|
||||
|
||||
class CheckerMock:
|
||||
def __init__(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
|
||||
return None
|
||||
|
||||
async def check(self) -> Tuple[str, int, Optional[CheckError]]:
|
||||
async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
await asyncio.sleep(0)
|
||||
return '', 0, None
|
||||
|
||||
@@ -220,6 +316,11 @@ def detect_error_page(
|
||||
if status_code == 403 and not ignore_403:
|
||||
return CheckError("Access denied", "403 status code, use proxy/vpn")
|
||||
|
||||
elif status_code == 999:
|
||||
# LinkedIn anti-bot / HTTP 999 workaround. It shouldn't trigger an infrastructure
|
||||
# Server Error because it represents a valid "Not Found / Blocked" state for the username.
|
||||
pass
|
||||
|
||||
elif status_code >= 500:
|
||||
return CheckError("Server", f"{status_code} status code")
|
||||
|
||||
@@ -307,6 +408,12 @@ def process_site_result(
|
||||
|
||||
if html_text:
|
||||
if not presense_flags:
|
||||
if check_type == "message" and logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
"Site %s uses checkType message with empty presenseStrs; "
|
||||
"presence is treated as true for any page.",
|
||||
site.name,
|
||||
)
|
||||
is_presense_detected = True
|
||||
site.stats["presense_flag"] = None
|
||||
else:
|
||||
@@ -349,7 +456,7 @@ def process_site_result(
|
||||
result = build_result(MaigretCheckStatus.CLAIMED)
|
||||
else:
|
||||
result = build_result(MaigretCheckStatus.AVAILABLE)
|
||||
elif check_type in "status_code":
|
||||
elif check_type == "status_code":
|
||||
# Checks if the status code of the response is 2XX
|
||||
if 200 <= status_code < 300:
|
||||
result = build_result(MaigretCheckStatus.CLAIMED)
|
||||
@@ -432,8 +539,18 @@ def make_site_result(
|
||||
# workaround to prevent slash errors
|
||||
url = re.sub("(?<!:)/+", "/", url)
|
||||
|
||||
# always clearweb_checker for now
|
||||
checker = options["checkers"][site.protocol]
|
||||
# Select checker: use curl_cffi for sites requiring TLS impersonation
|
||||
needs_impersonation = 'tls_fingerprint' in site.protection
|
||||
if needs_impersonation and CURL_CFFI_AVAILABLE:
|
||||
checker = CurlCffiChecker(logger=logger, browser_emulate='chrome')
|
||||
elif needs_impersonation and not CURL_CFFI_AVAILABLE:
|
||||
logger.warning(
|
||||
f"Site {site.name} requires TLS impersonation (curl_cffi) but it's not installed. "
|
||||
"Install with: pip install curl_cffi"
|
||||
)
|
||||
checker = options["checkers"][site.protocol]
|
||||
else:
|
||||
checker = options["checkers"][site.protocol]
|
||||
|
||||
# site check is disabled
|
||||
if site.disabled and not options['forced']:
|
||||
@@ -488,7 +605,9 @@ def make_site_result(
|
||||
for k, v in site.get_params.items():
|
||||
url_probe += f"&{k}={v}"
|
||||
|
||||
if site.check_type == "status_code" and site.request_head_only:
|
||||
if site.request_method:
|
||||
request_method = site.request_method.lower()
|
||||
elif site.check_type == "status_code" and site.request_head_only:
|
||||
# In most cases when we are detecting by status code,
|
||||
# it is not necessary to get the entire body: we can
|
||||
# detect fine with just the HEAD response.
|
||||
@@ -499,6 +618,15 @@ def make_site_result(
|
||||
# not respond properly unless we request the whole page.
|
||||
request_method = 'get'
|
||||
|
||||
payload = None
|
||||
if site.request_payload:
|
||||
payload = {}
|
||||
for k, v in site.request_payload.items():
|
||||
if isinstance(v, str):
|
||||
payload[k] = v.format(username=username)
|
||||
else:
|
||||
payload[k] = v
|
||||
|
||||
if site.check_type == "response_url":
|
||||
# Site forwards request to a different URL if username not
|
||||
# found. Disallow the redirect so we can capture the
|
||||
@@ -515,6 +643,7 @@ def make_site_result(
|
||||
headers=headers,
|
||||
allow_redirects=allow_redirects,
|
||||
timeout=options['timeout'],
|
||||
payload=payload,
|
||||
)
|
||||
|
||||
# Store future request object in the results object
|
||||
@@ -541,6 +670,39 @@ async def check_site_for_username(
|
||||
return site.name, default_result
|
||||
|
||||
response = await checker.check()
|
||||
html_text = response[0] if response and response[0] else ""
|
||||
|
||||
# Retry once after token-style activation (e.g. Twitter guest token refresh).
|
||||
act = site.activation
|
||||
if act and html_text:
|
||||
marks = act.get("marks") or []
|
||||
if marks and any(m in html_text for m in marks):
|
||||
method = act["method"]
|
||||
try:
|
||||
activate_fun = getattr(ParsingActivator(), method)
|
||||
activate_fun(site, logger)
|
||||
except AttributeError as e:
|
||||
logger.warning(
|
||||
f"Activation method {method} for site {site.name} not found!",
|
||||
exc_info=True,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed activation {method} for site {site.name}: {str(e)}",
|
||||
exc_info=True,
|
||||
)
|
||||
else:
|
||||
merged = dict(checker.headers or {})
|
||||
merged.update(site.headers)
|
||||
checker.prepare(
|
||||
url=checker.url,
|
||||
headers=merged,
|
||||
allow_redirects=checker.allow_redirects,
|
||||
timeout=checker.timeout,
|
||||
method=checker.method,
|
||||
payload=getattr(checker, 'payload', None),
|
||||
)
|
||||
response = await checker.check()
|
||||
|
||||
response_result = process_site_result(
|
||||
response, query_notify, logger, default_result, site
|
||||
@@ -723,7 +885,7 @@ async def maigret(
|
||||
with alive_bar(
|
||||
len(tasks_dict), title="Searching", force_tty=True, disable=no_progressbar
|
||||
) as progress:
|
||||
async for result in executor.run(tasks_dict.values()):
|
||||
async for result in executor.run(list(tasks_dict.values())): # type: ignore[arg-type]
|
||||
cur_results.append(result)
|
||||
progress()
|
||||
|
||||
@@ -788,91 +950,160 @@ async def site_self_check(
     i2p_proxy=None,
     skip_errors=False,
     cookies=None,
+    auto_disable=False,
+    diagnose=False,
 ):
-    changes = {
+    """
+    Self-check a site configuration.
+
+    Args:
+        auto_disable: If True, automatically disable sites that fail checks.
+                      If False (default), only report issues without disabling.
+        diagnose: If True, print detailed diagnosis information.
+    """
+    changes: Dict[str, Any] = {
         "disabled": False,
+        "issues": [],
+        "recommendations": [],
     }

-    check_data = [
-        (site.username_claimed, MaigretCheckStatus.CLAIMED),
-        (site.username_unclaimed, MaigretCheckStatus.AVAILABLE),
-    ]
+    try:
+        check_data = [
+            (site.username_claimed, MaigretCheckStatus.CLAIMED),
+            (site.username_unclaimed, MaigretCheckStatus.AVAILABLE),
+        ]

-    logger.info(f"Checking {site.name}...")
+        logger.info(f"Checking {site.name}...")

-    for username, status in check_data:
-        async with semaphore:
-            results_dict = await maigret(
-                username=username,
-                site_dict={site.name: site},
-                logger=logger,
-                timeout=30,
-                id_type=site.type,
-                forced=True,
-                no_progressbar=True,
-                retries=1,
-                proxy=proxy,
-                tor_proxy=tor_proxy,
-                i2p_proxy=i2p_proxy,
-                cookies=cookies,
-            )
+        results_cache = {}

-        # don't disable entries with other ids types
-        # TODO: make normal checking
-        if site.name not in results_dict:
-            logger.info(results_dict)
-            changes["disabled"] = True
-            continue

-        logger.debug(results_dict)

-        result = results_dict[site.name]["status"]

-        if result.error and 'Cannot connect to host' in result.error.desc:
-            changes["disabled"] = True

-        site_status = result.status

-        if site_status != status:
-            if site_status == MaigretCheckStatus.UNKNOWN:
-                msgs = site.absence_strs
-                etype = site.check_type
-                logger.warning(
-                    f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}"
+        for username, status in check_data:
+            async with semaphore:
+                results_dict = await maigret(
+                    username=username,
+                    site_dict={site.name: site},
+                    logger=logger,
+                    timeout=30,
+                    id_type=site.type,
+                    forced=True,
+                    no_progressbar=True,
+                    retries=1,
+                    proxy=proxy,
+                    tor_proxy=tor_proxy,
+                    i2p_proxy=i2p_proxy,
+                    cookies=cookies,
                 )
-                # don't disable sites after the error
-                # meaning that the site could be available, but returned error for the check
-                # e.g. many sites protected by cloudflare and available in general
-                if skip_errors:
-                    pass
-                # don't disable in case of available username
-                elif status == MaigretCheckStatus.CLAIMED:

+            # don't disable entries with other ids types
+            # TODO: make normal checking
+            if site.name not in results_dict:
+                logger.info(results_dict)
+                changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)")
+                if auto_disable:
+                    changes["disabled"] = True
+                continue

+            logger.debug(results_dict)

+            result = results_dict[site.name]["status"]
+            results_cache[username] = results_dict[site.name]

+            if result.error and 'Cannot connect to host' in result.error.desc:
+                changes["issues"].append("Cannot connect to host")
+                if auto_disable:
+                    changes["disabled"] = True
-            elif status == MaigretCheckStatus.CLAIMED:
-                logger.warning(
-                    f"Not found `{username}` in {site.name}, must be claimed"
-                )
-                logger.info(results_dict[site.name])
-                changes["disabled"] = True
-            else:
-                logger.warning(f"Found `{username}` in {site.name}, must be available")
-                logger.info(results_dict[site.name])
-                changes["disabled"] = True

-    logger.info(f"Site {site.name} checking is finished")
+            site_status = result.status

-    if changes["disabled"] != site.disabled:
-        site.disabled = changes["disabled"]
-        logger.info(f"Switching property 'disabled' for {site.name} to {site.disabled}")
-        db.update_site(site)
-        if not silent:
-            action = "Disabled" if site.disabled else "Enabled"
-            print(f"{action} site {site.name}...")
+            if site_status != status:
+                if site_status == MaigretCheckStatus.UNKNOWN:
+                    msgs = site.absence_strs
+                    etype = site.check_type
+                    error_msg = f"Error checking {username}: {result.context}"
+                    changes["issues"].append(error_msg)
+                    logger.warning(
+                        f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}"
+                    )
+                    # don't disable sites after the error
+                    # meaning that the site could be available, but returned error for the check
+                    # e.g. many sites protected by cloudflare and available in general
+                    if skip_errors:
+                        pass
+                    # don't disable in case of available username
+                    elif status == MaigretCheckStatus.CLAIMED and auto_disable:
+                        changes["disabled"] = True
+                elif status == MaigretCheckStatus.CLAIMED:
+                    changes["issues"].append(f"Claimed user '{username}' not detected as claimed")
+                    logger.warning(
+                        f"Not found `{username}` in {site.name}, must be claimed"
+                    )
+                    logger.info(results_dict[site.name])
+                    if auto_disable:
+                        changes["disabled"] = True
+                else:
+                    changes["issues"].append(f"Unclaimed user '{username}' detected as claimed")
+                    logger.warning(f"Found `{username}` in {site.name}, must be available")
+                    logger.info(results_dict[site.name])
+                    if auto_disable:
+                        changes["disabled"] = True

-    # remove service tag "unchecked"
-    if "unchecked" in site.tags:
-        site.tags.remove("unchecked")
-        db.update_site(site)
+        logger.info(f"Site {site.name} checking is finished")

+        # Generate recommendations based on issues
+        if changes["issues"] and len(results_cache) == 2:
+            claimed_result = results_cache.get(site.username_claimed, {})
+            unclaimed_result = results_cache.get(site.username_unclaimed, {})
+
+            claimed_http = claimed_result.get("http_status")
+            unclaimed_http = unclaimed_result.get("http_status")
+
+            if claimed_http and unclaimed_http:
+                if claimed_http != unclaimed_http and site.check_type != "status_code":
+                    changes["recommendations"].append(
+                        f"Consider checkType: status_code (HTTP {claimed_http} vs {unclaimed_http})"
+                    )
+
+        # Print diagnosis if requested
+        if diagnose and changes["issues"]:
+            print(f"\n--- {site.name} DIAGNOSIS ---")
+            print(f" Check type: {site.check_type}")
+            print(" Issues:")
+            for issue in changes["issues"]:
+                print(f" - {issue}")
+            if changes["recommendations"]:
+                print(" Recommendations:")
+                for rec in changes["recommendations"]:
+                    print(f" -> {rec}")
+
+        # Only modify site if auto_disable is enabled
+        if auto_disable and changes["disabled"] != site.disabled:
+            site.disabled = changes["disabled"]
+            logger.info(f"Switching property 'disabled' for {site.name} to {site.disabled}")
+            db.update_site(site)
+            if not silent:
+                action = "Disabled" if site.disabled else "Enabled"
+                print(f"{action} site {site.name}...")
+        elif changes["issues"] and not silent and not diagnose:
+            # Report issues without disabling
+            print(f"Issues found in {site.name}: {len(changes['issues'])} (not auto-disabled)")
+
+        # remove service tag "unchecked"
+        if "unchecked" in site.tags:
+            site.tags.remove("unchecked")
+            db.update_site(site)
+
+    except Exception as e:
+        logger.warning(
+            f"Self-check of {site.name} failed with unexpected error: {e}",
+            exc_info=True,
+        )
+        changes["issues"].append(f"Unexpected error: {e}")
+        if auto_disable and not site.disabled:
+            changes["disabled"] = True
+            site.disabled = True
+            db.update_site(site)
+            if not silent:
+                print(f"Disabled site {site.name} (unexpected error)...")

     return changes
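The structure returned by site_self_check, and consumed by self_check below, follows directly from the hunk above. A representative value for a failing site might look like this (the usernames and HTTP codes are illustrative):

    changes = {
        "disabled": False,   # only flipped to True when auto_disable is enabled
        "issues": [
            "Claimed user 'alice' not detected as claimed",
            "Cannot connect to host",
        ],
        "recommendations": [
            "Consider checkType: status_code (HTTP 200 vs 404)",
        ],
    }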
@@ -886,10 +1117,25 @@ async def self_check(
     proxy=None,
     tor_proxy=None,
     i2p_proxy=None,
-) -> bool:
+    auto_disable=False,
+    diagnose=False,
+    no_progressbar=False,
+) -> dict:
+    """
+    Run self-check on sites.
+
+    Args:
+        auto_disable: If True, automatically disable sites that fail checks.
+                      If False (default), only report issues without disabling.
+        diagnose: If True, print detailed diagnosis for each failing site.
+
+    Returns:
+        dict with 'needs_update' bool and 'results' list of check results
+    """
     sem = asyncio.Semaphore(max_connections)
     tasks = []
     all_sites = site_data
+    all_results = []

     def disabled_count(lst):
         return len(list(filter(lambda x: x.disabled, lst)))

@@ -901,15 +1147,29 @@ async def self_check(

     for _, site in all_sites.items():
         check_coro = site_self_check(
-            site, logger, sem, db, silent, proxy, tor_proxy, i2p_proxy, skip_errors=True
+            site, logger, sem, db, silent, proxy, tor_proxy, i2p_proxy,
+            skip_errors=True, auto_disable=auto_disable, diagnose=diagnose
         )
         future = asyncio.ensure_future(check_coro)
-        tasks.append(future)
+        tasks.append((site.name, future))

     if tasks:
-        with alive_bar(len(tasks), title='Self-checking', force_tty=True) as progress:
-            for f in asyncio.as_completed(tasks):
-                await f
+        with alive_bar(len(tasks), title='Self-checking', force_tty=True, disable=no_progressbar) as progress:
+            for site_name, f in tasks:
+                try:
+                    result = await f
+                except Exception as e:
+                    logger.warning(
+                        f"Self-check task for {site_name} raised unexpected error: {e}",
+                        exc_info=True,
+                    )
+                    result = {
+                        "disabled": False,
+                        "issues": [f"Unexpected error: {e}"],
+                        "recommendations": [],
+                    }
+                result['site_name'] = site_name
+                all_results.append(result)
                 progress()  # Update the progress bar

     unchecked_new_count = len(

@@ -918,7 +1178,10 @@ async def self_check(
     disabled_new_count = disabled_count(all_sites.values())
     total_disabled = disabled_new_count - disabled_old_count

-    if total_disabled:
+    # Count issues
+    total_issues = sum(1 for r in all_results if r.get('issues'))
+
+    if auto_disable and total_disabled:
         if total_disabled >= 0:
             message = "Disabled"
         else:

@@ -930,11 +1193,21 @@ async def self_check(
             f"{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. "
             "Run with `--info` flag to get more information"
         )
+    elif total_issues and not silent:
+        print(f"\nFound issues in {total_issues} sites (auto-disable is OFF)")
+        print("Use --auto-disable to automatically disable failing sites")
+        print("Use --diagnose to see detailed diagnosis for each site")

     if unchecked_new_count != unchecked_old_count:
         print(f"Unchecked sites verified: {unchecked_old_count - unchecked_new_count}")

-    return total_disabled != 0 or unchecked_new_count != unchecked_old_count
+    needs_update = total_disabled != 0 or unchecked_new_count != unchecked_old_count
+
+    return {
+        'needs_update': needs_update,
+        'results': all_results,
+        'total_issues': total_issues,
+    }


 def extract_ids_data(html_text, logger, site) -> Dict:

@@ -953,7 +1226,7 @@ def parse_usernames(extracted_ids_data, logger) -> Dict:
         elif "usernames" in k:
             try:
                 tree = ast.literal_eval(v)
-                if type(tree) == list:
+                if isinstance(tree, list):
                     for n in tree:
                         new_usernames[n] = "username"
             except Exception as e:
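The `type(tree) == list` to `isinstance(tree, list)` change repeated in these hunks is behavior-preserving for plain lists; `isinstance` additionally accepts list subclasses and is the idiomatic check. A quick illustration:

    class TaggedList(list):
        pass

    tree = TaggedList(["alice", "bob"])
    print(type(tree) == list)      # False: exact-type comparison rejects subclasses
    print(isinstance(tree, list))  # True: subclasses of list are accepted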
@@ -0,0 +1,330 @@
"""
Database auto-update logic for maigret.

Checks a lightweight meta file to determine if a newer site database is available,
downloads it if compatible, and caches it locally in ~/.maigret/.
"""

import hashlib
import json
import logging
import os
import os.path as path
import tempfile
from datetime import datetime, timezone
from typing import Optional

import requests
from colorama import Fore, Style

from .__version__ import __version__

logger = logging.getLogger("maigret")

_use_color = True


def _print_info(msg: str) -> None:
    text = f"[*] {msg}"
    if _use_color:
        print(Style.BRIGHT + Fore.GREEN + text + Style.RESET_ALL)
    else:
        print(text)


def _print_success(msg: str) -> None:
    text = f"[+] {msg}"
    if _use_color:
        print(Style.BRIGHT + Fore.GREEN + text + Style.RESET_ALL)
    else:
        print(text)


def _print_warning(msg: str) -> None:
    text = f"[!] {msg}"
    if _use_color:
        print(Style.BRIGHT + Fore.YELLOW + text + Style.RESET_ALL)
    else:
        print(text)


DEFAULT_META_URL = (
    "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json"
)
DEFAULT_CHECK_INTERVAL_HOURS = 24
MAIGRET_HOME = path.expanduser("~/.maigret")
CACHED_DB_PATH = path.join(MAIGRET_HOME, "data.json")
STATE_PATH = path.join(MAIGRET_HOME, "autoupdate_state.json")
BUNDLED_DB_PATH = path.join(path.dirname(path.realpath(__file__)), "resources", "data.json")


def _parse_version(version_str: str) -> tuple:
    """Parse a version string like '0.5.0' into a comparable tuple (0, 5, 0)."""
    try:
        return tuple(int(x) for x in version_str.strip().split("."))
    except (ValueError, AttributeError):
        return (0, 0, 0)


def _ensure_maigret_home() -> None:
    os.makedirs(MAIGRET_HOME, exist_ok=True)


def _load_state() -> dict:
    try:
        with open(STATE_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError, OSError):
        return {}


def _save_state(state: dict) -> None:
    _ensure_maigret_home()
    tmp_path = STATE_PATH + ".tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(state, f, indent=2, ensure_ascii=False)
        os.replace(tmp_path, STATE_PATH)
    except OSError:
        try:
            os.unlink(tmp_path)
        except OSError:
            pass


def _needs_check(state: dict, interval_hours: int) -> bool:
    last_check = state.get("last_check_at")
    if not last_check:
        return True
    try:
        last_dt = datetime.fromisoformat(last_check.replace("Z", "+00:00"))
        elapsed = (datetime.now(timezone.utc) - last_dt).total_seconds() / 3600
        return elapsed >= interval_hours
    except (ValueError, TypeError):
        return True


def _fetch_meta(meta_url: str, timeout: int = 10) -> Optional[dict]:
    try:
        response = requests.get(meta_url, timeout=timeout)
        if response.status_code == 200:
            return response.json()
    except Exception:
        pass
    return None


def _is_version_compatible(meta: dict) -> bool:
    min_ver = meta.get("min_maigret_version", "0.0.0")
    return _parse_version(__version__) >= _parse_version(min_ver)


def _is_update_available(meta: dict, state: dict) -> bool:
    if not path.isfile(CACHED_DB_PATH):
        return True
    remote_date = meta.get("updated_at", "")
    cached_date = state.get("last_meta", {}).get("updated_at", "")
    return remote_date > cached_date


def _download_and_verify(data_url: str, expected_sha256: str, timeout: int = 60) -> Optional[str]:
    _ensure_maigret_home()
    tmp_fd, tmp_path = tempfile.mkstemp(dir=MAIGRET_HOME, suffix=".json")
    try:
        response = requests.get(data_url, timeout=timeout)
        if response.status_code != 200:
            return None

        content = response.content
        actual_sha256 = hashlib.sha256(content).hexdigest()
        if actual_sha256 != expected_sha256:
            _print_warning("DB auto-update: SHA-256 mismatch, download rejected")
            return None

        # Validate JSON structure
        data = json.loads(content)
        if not all(k in data for k in ("sites", "engines", "tags")):
            _print_warning("DB auto-update: invalid database structure")
            return None

        os.write(tmp_fd, content)
        os.close(tmp_fd)
        tmp_fd = None
        os.replace(tmp_path, CACHED_DB_PATH)
        return CACHED_DB_PATH
    except Exception:
        return None
    finally:
        if tmp_fd is not None:
            os.close(tmp_fd)
            try:
                os.unlink(tmp_path)
            except OSError:
                pass


def _best_local() -> str:
    """Return cached DB if it exists and is valid, otherwise bundled."""
    if path.isfile(CACHED_DB_PATH):
        try:
            with open(CACHED_DB_PATH, "r", encoding="utf-8") as f:
                data = json.load(f)
            if "sites" in data:
                return CACHED_DB_PATH
        except (json.JSONDecodeError, OSError):
            pass
    return BUNDLED_DB_PATH


def _now_iso() -> str:
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def resolve_db_path(
    db_file_arg: str,
    no_autoupdate: bool = False,
    meta_url: str = DEFAULT_META_URL,
    check_interval_hours: int = DEFAULT_CHECK_INTERVAL_HOURS,
    color: bool = True,
) -> str:
    """
    Determine which database file to use, potentially downloading an update.

    Returns the path to the database file that should be loaded.
    """
    global _use_color
    _use_color = color

    default_db_name = "resources/data.json"

    # User specified a custom DB — skip auto-update
    is_url = db_file_arg.startswith("http://") or db_file_arg.startswith("https://")
    is_default = db_file_arg == default_db_name
    if is_url:
        return db_file_arg
    if not is_default:
        return path.join(path.dirname(path.realpath(__file__)), db_file_arg)

    # Auto-update disabled
    if no_autoupdate:
        return _best_local()

    # Check interval
    _ensure_maigret_home()
    state = _load_state()
    if not _needs_check(state, check_interval_hours):
        return _best_local()

    # Time to check
    _print_info("DB auto-update: checking for updates...")
    meta = _fetch_meta(meta_url)
    if meta is None:
        _print_warning("DB auto-update: could not reach update server, using local database")
        state["last_check_at"] = _now_iso()
        _save_state(state)
        return _best_local()

    # Version compatibility
    if not _is_version_compatible(meta):
        min_ver = meta.get("min_maigret_version", "?")
        _print_warning(
            f"DB auto-update: latest database requires maigret >= {min_ver}, "
            f"you have {__version__}. Please upgrade with: pip install -U maigret"
        )
        state["last_check_at"] = _now_iso()
        _save_state(state)
        return _best_local()

    # Check if update available
    if not _is_update_available(meta, state):
        sites_count = meta.get("sites_count", "?")
        _print_info(f"DB auto-update: database is up to date ({sites_count} sites)")
        state["last_check_at"] = _now_iso()
        state["last_meta"] = meta
        _save_state(state)
        return _best_local()

    # Download update
    new_count = meta.get("sites_count", "?")
    old_count = state.get("last_meta", {}).get("sites_count")
    if old_count:
        _print_info(f"DB auto-update: downloading updated database ({new_count} sites, was {old_count})...")
    else:
        _print_info(f"DB auto-update: downloading database ({new_count} sites)...")

    data_url = meta.get("data_url", "")
    expected_sha = meta.get("data_sha256", "")
    result = _download_and_verify(data_url, expected_sha)

    if result is None:
        _print_warning("DB auto-update: download failed, using local database")
        state["last_check_at"] = _now_iso()
        _save_state(state)
        return _best_local()

    _print_success(f"DB auto-update: database updated successfully ({new_count} sites)")
    state["last_check_at"] = _now_iso()
    state["last_meta"] = meta
    state["cached_db_sha256"] = expected_sha
    _save_state(state)
    return CACHED_DB_PATH


def force_update(
    meta_url: str = DEFAULT_META_URL,
    color: bool = True,
) -> bool:
    """
    Force check for database updates and download if available.

    Returns True if database was updated, False otherwise.
    """
    global _use_color
    _use_color = color

    _ensure_maigret_home()

    _print_info("DB update: checking for updates...")
    meta = _fetch_meta(meta_url)
    if meta is None:
        _print_warning("DB update: could not reach update server")
        return False

    if not _is_version_compatible(meta):
        min_ver = meta.get("min_maigret_version", "?")
        _print_warning(
            f"DB update: latest database requires maigret >= {min_ver}, "
            f"you have {__version__}. Please upgrade with: pip install -U maigret"
        )
        return False

    state = _load_state()
    new_count = meta.get("sites_count", "?")
    old_count = state.get("last_meta", {}).get("sites_count")

    if not _is_update_available(meta, state):
        _print_info(f"DB update: database is already up to date ({new_count} sites)")
        state["last_check_at"] = _now_iso()
        state["last_meta"] = meta
        _save_state(state)
        return False

    if old_count:
        _print_info(f"DB update: downloading updated database ({new_count} sites, was {old_count})...")
    else:
        _print_info(f"DB update: downloading database ({new_count} sites)...")

    data_url = meta.get("data_url", "")
    expected_sha = meta.get("data_sha256", "")
    result = _download_and_verify(data_url, expected_sha)

    if result is None:
        _print_warning("DB update: download failed")
        return False

    _print_success(f"DB update: database updated successfully ({new_count} sites)")
    state["last_check_at"] = _now_iso()
    state["last_meta"] = meta
    state["cached_db_sha256"] = expected_sha
    _save_state(state)
    return True
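Given the keys written by resolve_db_path and force_update (last_check_at, last_meta, cached_db_sha256), the cached ~/.maigret/autoupdate_state.json would look roughly like this after a successful update; the timestamp is illustrative and the nested object mirrors the db_meta.json shown later in this diff:

    {
        "last_check_at": "2026-04-08T10:00:00Z",
        "last_meta": {
            "version": 1,
            "updated_at": "2026-04-07T16:18:18Z",
            "sites_count": 3155,
            "min_maigret_version": "0.5.0",
            "data_sha256": "279fb90280814cd11dcd711b1b8e6c6a99fefea4ce6ef05c9d64dced6ac795c0",
            "data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
        },
        "cached_db_sha256": "279fb90280814cd11dcd711b1b8e6c6a99fefea4ce6ef05c9d64dced6ac795c0"
    }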
@@ -32,6 +32,9 @@ COMMON_ERRORS = {
     '<title>Attention Required! | Cloudflare</title>': CheckError(
         'Captcha', 'Cloudflare'
     ),
+    '<title>Just a moment</title>': CheckError(
+        'Bot protection', 'Cloudflare challenge page'
+    ),
     'Please stand by, while we are checking your browser': CheckError(
         'Bot protection', 'Cloudflare'
     ),

@@ -55,6 +58,8 @@ COMMON_ERRORS = {
         'Censorship', 'MGTS'
     ),
     'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
+    '<title>Client Challenge</title>': CheckError('Bot protection', 'Anti-bot challenge'),
+    '<title>DDoS-Guard</title>': CheckError('Bot protection', 'DDoS-Guard'),
     'Сайт заблокирован хостинг-провайдером': CheckError(
         'Site-specific', 'Site is disabled (Beget)'
     ),
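These entries are plain substring markers keyed on fragments of protection pages. Conceptually, detection is a scan of the response body against the dict keys; a minimal sketch of that lookup, with the helper name hypothetical (the real scan lives elsewhere in the checking module):

    def detect_common_error(html_text: str):
        # Return the first CheckError whose marker substring occurs in the page.
        for marker, check_error in COMMON_ERRORS.items():
            if marker in html_text:
                return check_error
        return None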
@@ -103,7 +103,7 @@ class AsyncioProgressbarQueueExecutor(AsyncExecutor):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.workers_count = kwargs.get('in_parallel', 10)
-        self.queue = asyncio.Queue(self.workers_count)
+        self.queue: asyncio.Queue = asyncio.Queue(self.workers_count)
         self.timeout = kwargs.get('timeout')
         # Pass a progress function; alive_bar by default
         self.progress_func = kwargs.get('progress_func', alive_bar)

@@ -184,10 +184,10 @@ class AsyncioQueueGeneratorExecutor:
     # Deprecated: will be removed soon, don't use it
     def __init__(self, *args, **kwargs):
         self.workers_count = kwargs.get('in_parallel', 10)
-        self.queue = asyncio.Queue()
+        self.queue: asyncio.Queue = asyncio.Queue()
         self.timeout = kwargs.get('timeout')
         self.logger = kwargs['logger']
-        self._results = asyncio.Queue()
+        self._results: asyncio.Queue = asyncio.Queue()
         self._stop_signal = object()

     async def worker(self):

@@ -209,7 +209,7 @@ class AsyncioQueueGeneratorExecutor:
                 result = kwargs.get('default')
                 await self._results.put(result)
             except Exception as e:
-                self.logger.error(f"Error in worker: {e}")
+                self.logger.error(f"Error in worker: {e}", exc_info=True)
             finally:
                 self.queue.task_done()
+103
-10
@@ -13,7 +13,7 @@ from argparse import ArgumentParser, RawDescriptionHelpFormatter
 from typing import List, Tuple
 import os.path as path

-from socid_extractor import extract, parse
+from socid_extractor import extract, parse  # type: ignore[import-not-found]

 from .__version__ import __version__
 from .checking import (

@@ -37,6 +37,7 @@ from .report import (
     get_plaintext_report,
     sort_report_by_data_points,
     save_graph_report,
+    save_markdown_report,
 )
 from .sites import MaigretDatabase
 from .submit import Submitter

@@ -75,7 +76,7 @@ def extract_ids_from_page(url, logger, timeout=5) -> dict:
         elif 'usernames' in k:
             try:
                 tree = ast.literal_eval(v)
-                if type(tree) == list:
+                if isinstance(tree, list):
                     for n in tree:
                         results[n] = 'username'
             except Exception as e:

@@ -201,6 +202,20 @@ def setup_arguments_parser(settings: Settings):
         default=settings.sites_db_path,
         help="Load Maigret database from a JSON file or HTTP web resource.",
     )
+    parser.add_argument(
+        "--no-autoupdate",
+        action="store_true",
+        dest="no_autoupdate",
+        default=settings.no_autoupdate,
+        help="Disable automatic database updates on startup.",
+    )
+    parser.add_argument(
+        "--force-update",
+        action="store_true",
+        dest="force_update",
+        default=False,
+        help="Force check for database updates and download if available.",
+    )
     parser.add_argument(
         "--cookies-jar-file",
         metavar="COOKIE_FILE",

@@ -277,6 +292,12 @@ def setup_arguments_parser(settings: Settings):
     filter_group.add_argument(
         "--tags", dest="tags", default='', help="Specify tags of sites (see `--stats`)."
     )
+    filter_group.add_argument(
+        "--exclude-tags",
+        dest="exclude_tags",
+        default='',
+        help="Specify tags to exclude from search (blacklist).",
+    )
     filter_group.add_argument(
         "--site",
         action="append",

@@ -316,7 +337,19 @@ def setup_arguments_parser(settings: Settings):
         "--self-check",
         action="store_true",
         default=settings.self_check_enabled,
-        help="Do self check for sites and database and disable non-working ones.",
+        help="Do self check for sites and database. Use --auto-disable to disable failing sites.",
     )
+    modes_group.add_argument(
+        "--auto-disable",
+        action="store_true",
+        default=False,
+        help="With --self-check: automatically disable sites that fail checks.",
+    )
+    modes_group.add_argument(
+        "--diagnose",
+        action="store_true",
+        default=False,
+        help="With --self-check: print detailed diagnosis for each failing site.",
+    )
     modes_group.add_argument(
         "--stats",

@@ -433,6 +466,14 @@ def setup_arguments_parser(settings: Settings):
         default=settings.pdf_report,
         help="Generate a PDF report (general report on all usernames).",
     )
+    report_group.add_argument(
+        "-M",
+        "--md",
+        action="store_true",
+        dest="md",
+        default=settings.md_report,
+        help="Generate a Markdown report (general report on all usernames).",
+    )
     report_group.add_argument(
         "-G",
         "--graph",

@@ -520,9 +561,26 @@ async def main():
     if args.tags:
         args.tags = list(set(str(args.tags).split(',')))

-    db_file = args.db_file \
-        if (args.db_file.startswith("http://") or args.db_file.startswith("https://")) \
-        else path.join(path.dirname(path.realpath(__file__)), args.db_file)
+    if args.exclude_tags:
+        args.exclude_tags = list(set(str(args.exclude_tags).split(',')))
+    else:
+        args.exclude_tags = []
+
+    from .db_updater import resolve_db_path, force_update, BUNDLED_DB_PATH
+
+    if args.force_update:
+        force_update(
+            meta_url=settings.db_update_meta_url,
+            color=not args.no_color,
+        )
+
+    db_file = resolve_db_path(
+        db_file_arg=args.db_file,
+        no_autoupdate=args.no_autoupdate or args.force_update,
+        meta_url=settings.db_update_meta_url,
+        check_interval_hours=settings.autoupdate_check_interval_hours,
+        color=not args.no_color,
+    )

     if args.top_sites == 0 or args.all_sites:
         args.top_sites = sys.maxsize

@@ -537,10 +595,19 @@ async def main():
     )

     # Create object with all information about sites we are aware of.
-    db = MaigretDatabase().load_from_path(db_file)
+    try:
+        db = MaigretDatabase().load_from_path(db_file)
+    except Exception as e:
+        logger.warning(f"Failed to load database from {db_file}: {e}")
+        if db_file != BUNDLED_DB_PATH:
+            logger.warning("Falling back to bundled database")
+            db = MaigretDatabase().load_from_path(BUNDLED_DB_PATH)
+        else:
+            raise
     get_top_sites_for_id = lambda x: db.ranked_sites_dict(
         top=args.top_sites,
         tags=args.tags,
+        excluded_tags=args.exclude_tags,
         names=args.site_list,
         disabled=args.use_disabled_sites,
         id_type=x,

@@ -566,7 +633,7 @@ async def main():
     query_notify.success(
         f'Maigret sites database self-check started for {len(site_data)} sites...'
     )
-    is_need_update = await self_check(
+    check_result = await self_check(
         db,
         site_data,
         logger,

@@ -574,7 +641,13 @@ async def main():
         max_connections=args.connections,
         tor_proxy=args.tor_proxy,
         i2p_proxy=args.i2p_proxy,
+        auto_disable=args.auto_disable,
+        diagnose=args.diagnose,
+        no_progressbar=args.no_progressbar,
     )
+
+    is_need_update = check_result.get('needs_update', False)

     if is_need_update:
         if input('Do you want to save changes permanently? [Yn]\n').lower() in (
             'y',

@@ -611,7 +684,10 @@ async def main():
     port = (
         args.web if args.web else 5000
     )  # args.web is either the specified port or 5000 by default
-    app.run(port=port)
+
+    # Host configuration: secure by default, but allow override via environment
+    host = os.getenv('FLASK_HOST', '127.0.0.1')
+    app.run(host=host, port=port)
     return

     if usernames == {}:

@@ -736,7 +812,7 @@ async def main():

     # reporting for all the result
     if general_results:
-        if args.html or args.pdf:
+        if args.html or args.pdf or args.md:
             query_notify.warning('Generating report info...')
             report_context = generate_report_context(general_results)
             # determine main username

@@ -756,6 +832,23 @@ async def main():
             save_pdf_report(filename, report_context)
             query_notify.warning(f'PDF report on all usernames saved in {filename}')

+        if args.md:
+            username = username.replace('/', '_')
+            filename = report_filepath_tpl.format(username=username, postfix='.md')
+            run_flags = []
+            if args.tags:
+                run_flags.append(f"--tags {args.tags}")
+            if args.site_list:
+                run_flags.append(f"--site {','.join(args.site_list)}")
+            if args.all_sites:
+                run_flags.append("--all-sites")
+            run_info = {
+                "sites_count": sum(len(d) for _, _, d in general_results),
+                "flags": " ".join(run_flags) if run_flags else None,
+            }
+            save_markdown_report(filename, report_context, run_info=run_info)
+            query_notify.warning(f'Markdown report on all usernames saved in {filename}')
+
         if args.graph:
             username = username.replace('/', '_')
             filename = report_filepath_tpl.format(
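Taken together, the new arguments wired in above allow runs like the following; the flag spellings come from the argparse definitions in this diff, while the username and tag values are illustrative:

    maigret alice --no-autoupdate             # skip the startup DB update check
    maigret --force-update --self-check --auto-disable --diagnose
    maigret alice --exclude-tags dating,news  # blacklist sites by tag
    maigret alice -M                          # save a Markdown report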
+1
-1
@@ -174,7 +174,7 @@ class QueryNotifyPrint(QueryNotify):
         else:
             return self.make_simple_terminal_notify(*args)

-    def start(self, message, id_type):
+    def start(self, message=None, id_type="username"):
        """Notify Start.

        Will print the title to the standard output.
+151
-12
@@ -7,7 +7,7 @@ import os
 from datetime import datetime
 from typing import Dict, Any

-import xmind
+import xmind  # type: ignore[import-untyped]
 from dateutil.tz import gettz
 from dateutil.parser import parse as parse_datetime_str
 from jinja2 import Template

@@ -79,7 +79,7 @@ def save_pdf_report(filename: str, context: dict):
     filled_template = template.render(**context)

     # moved here to speed up the launch of Maigret
-    from xhtml2pdf import pisa
+    from xhtml2pdf import pisa  # type: ignore[import-untyped]

     with open(filename, "w+b") as f:
         pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css)

@@ -91,9 +91,9 @@ def save_json_report(filename: str, username: str, results: dict, report_type: s


 class MaigretGraph:
-    other_params = {'size': 10, 'group': 3}
-    site_params = {'size': 15, 'group': 2}
-    username_params = {'size': 20, 'group': 1}
+    other_params: dict = {'size': 10, 'group': 3}
+    site_params: dict = {'size': 15, 'group': 2}
+    username_params: dict = {'size': 20, 'group': 1}

     def __init__(self, graph):
         self.G = graph

@@ -121,12 +121,12 @@ class MaigretGraph:
 def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
     import networkx as nx

-    G = nx.Graph()
+    G: Any = nx.Graph()
     graph = MaigretGraph(G)

     base_site_nodes = {}
     site_account_nodes = {}
-    processed_values = {}  # Track processed values to avoid duplicates
+    processed_values: Dict[str, Any] = {}  # Track processed values to avoid duplicates

     for username, id_type, results in username_results:
         # Add username node, using normalized version directly if different

@@ -239,7 +239,7 @@ def save_graph_report(filename: str, username_results: list, db: MaigretDatabase
     G.remove_nodes_from(single_degree_sites)

     # Generate interactive visualization
-    from pyvis.network import Network
+    from pyvis.network import Network  # type: ignore[import-untyped]

     nt = Network(notebook=True, height="750px", width="100%")
     nt.from_nx(G)

@@ -257,6 +257,144 @@ def get_plaintext_report(context: dict) -> str:
     return output.strip()


+def _md_format_value(value) -> str:
+    """Format a value for Markdown output, detecting links."""
+    if isinstance(value, list):
+        return ", ".join(str(v) for v in value)
+    s = str(value)
+    if s.startswith("http://") or s.startswith("https://"):
+        return f"[{s}]({s})"
+    return s
+
+
+def save_markdown_report(filename: str, context: dict, run_info: dict = None):
+    username = context.get("username", "unknown")
+    generated_at = context.get("generated_at", "")
+    brief = context.get("brief", "")
+    countries = context.get("countries_tuple_list", [])
+    interests = context.get("interests_tuple_list", [])
+    first_seen = context.get("first_seen")
+    results = context.get("results", [])
+
+    # Collect ALL values for key fields across all accounts
+    all_fields: Dict[str, list] = {}
+    last_seen = None
+    for _, _, data in results:
+        for _, v in data.items():
+            if not v.get("found") or v.get("is_similar"):
+                continue
+            ids_data = v.get("ids_data", {})
+            # Map multiple source fields to unified output fields
+            field_sources = {
+                "fullname": ("fullname", "name"),
+                "location": ("location", "country", "city", "country_code", "locale", "region"),
+                "gender": ("gender",),
+                "bio": ("bio", "about", "description"),
+            }
+            for out_field, source_keys in field_sources.items():
+                for src in source_keys:
+                    val = ids_data.get(src)
+                    if val:
+                        all_fields.setdefault(out_field, [])
+                        val_str = str(val)
+                        if val_str not in all_fields[out_field]:
+                            all_fields[out_field].append(val_str)
+            # Track last_seen
+            for ts_field in ("last_online", "latest_activity_at", "updated_at"):
+                ts = ids_data.get(ts_field)
+                if ts and (last_seen is None or str(ts) > str(last_seen)):
+                    last_seen = ts
+
+    lines = []
+    lines.append(f"# Report by searching on username \"{username}\"\n")
+
+    # Generated line with run info
+    gen_line = f"Generated at {generated_at} by [Maigret](https://github.com/soxoj/maigret)"
+    if run_info:
+        parts = []
+        if run_info.get("sites_count"):
+            parts.append(f"{run_info['sites_count']} sites checked")
+        if run_info.get("flags"):
+            parts.append(f"flags: `{run_info['flags']}`")
+        if parts:
+            gen_line += f" ({', '.join(parts)})"
+    lines.append(f"{gen_line}\n")
+
+    # Summary
+    lines.append("## Summary\n")
+    lines.append(f"{brief}\n")
+
+    if all_fields:
+        lines.append("**Information extracted from accounts:**\n")
+        for field, values in all_fields.items():
+            title = CaseConverter.snake_to_title(field)
+            lines.append(f"- {title}: {'; '.join(values)}")
+        lines.append("")
+
+    if countries:
+        geo = ", ".join(f"{code} (x{count})" for code, count in countries)
+        lines.append(f"**Country tags:** {geo}\n")
+
+    if interests:
+        tags = ", ".join(f"{tag} (x{count})" for tag, count in interests)
+        lines.append(f"**Website tags:** {tags}\n")
+
+    if first_seen:
+        lines.append(f"**First seen:** {first_seen}")
+    if last_seen:
+        lines.append(f"**Last seen:** {last_seen}")
+    if first_seen or last_seen:
+        lines.append("")
+
+    # Accounts found
+    lines.append("## Accounts found\n")
+
+    for u, id_type, data in results:
+        for site_name, v in data.items():
+            if not v.get("found") or v.get("is_similar"):
+                continue
+
+            lines.append(f"### {site_name}\n")
+            lines.append(f"- **URL:** [{v.get('url_user', '')}]({v.get('url_user', '')})")
+
+            tags = v.get("status") and v["status"].tags or []
+            if tags:
+                lines.append(f"- **Tags:** {', '.join(tags)}")
+            lines.append("")
+
+            ids_data = v.get("ids_data", {})
+            if ids_data:
+                for field, value in ids_data.items():
+                    if field == "image":
+                        continue
+                    title = CaseConverter.snake_to_title(field)
+                    lines.append(f"- {title}: {_md_format_value(value)}")
+
+                lines.append("")
+
+    # Possible false positives
+    lines.append("## Possible false positives\n")
+    lines.append(
+        f"This report was generated by searching for accounts matching the username `{username}`. "
+        f"Accounts listed above may belong to different people who happen to use the same "
+        f"or similar username. Results without extracted personal information could contain "
+        f"some false positive findings. Always verify findings before drawing conclusions.\n"
+    )

+    # Ethical use
+    lines.append("## Ethical use\n")
+    lines.append(
+        "This report is a result of a technical collection of publicly available information "
+        "from online accounts and does not constitute personal data processing. If you intend "
+        "to use this data for personal data processing or collection purposes, ensure your use "
+        "complies with applicable laws and regulations in your jurisdiction (such as GDPR, "
+        "CCPA, and similar).\n"
+    )
+
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write("\n".join(lines))
+
+
 """
 REPORTS GENERATING
 """

@@ -353,11 +491,12 @@ def generate_report_context(username_results: list):
         if k in ["country", "locale"]:
             try:
                 if is_country_tag(k):
-                    tag = pycountry.countries.get(alpha_2=v).alpha_2.lower()
+                    country = pycountry.countries.get(alpha_2=v)
+                    tag = country.alpha_2.lower()  # type: ignore[union-attr]
                 else:
                     tag = pycountry.countries.search_fuzzy(v)[
                         0
-                    ].alpha_2.lower()
+                    ].alpha_2.lower()  # type: ignore[attr-defined]
                 # TODO: move countries to another struct
                 tags[tag] = tags.get(tag, 0) + 1
             except Exception as e:

@@ -513,8 +652,8 @@ def add_xmind_subtopic(userlink, k, v, supposed_data):


 def design_xmind_sheet(sheet, username, results):
-    alltags = {}
-    supposed_data = {}
+    alltags: Dict[str, Any] = {}
+    supposed_data: Dict[str, Any] = {}

     sheet.setTitle("%s Analysis" % (username))
     root_topic1 = sheet.getRootTopic()
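From the lines.append calls in save_markdown_report, the generated report follows this outline; the concrete names, dates, and tags here are illustrative:

    # Report by searching on username "alice"

    Generated at 2026-04-08 10:00 by [Maigret](https://github.com/soxoj/maigret) (500 sites checked, flags: `--all-sites`)

    ## Summary
    **Information extracted from accounts:**
    - Fullname: Alice Example
    **Country tags:** us (x2)

    ## Accounts found
    ### GitHub
    - **URL:** [https://github.com/alice](https://github.com/alice)
    - **Tags:** coding, us

    ## Possible false positives
    ...

    ## Ethical use
    ...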
+24204
-24672
File diff suppressed because it is too large
@@ -0,0 +1,8 @@
{
    "version": 1,
    "updated_at": "2026-04-07T16:18:18Z",
    "sites_count": 3155,
    "min_maigret_version": "0.5.0",
    "data_sha256": "279fb90280814cd11dcd711b1b8e6c6a99fefea4ce6ef05c9d64dced6ac795c0",
    "data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
}
@@ -54,5 +54,9 @@
     "graph_report": false,
     "pdf_report": false,
     "html_report": false,
-    "web_interface_port": 5000
+    "md_report": false,
+    "web_interface_port": 5000,
+    "no_autoupdate": false,
+    "db_update_meta_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json",
+    "autoupdate_check_interval_hours": 24
 }
+5
-1
@@ -5,7 +5,7 @@ from typing import List

 SETTINGS_FILES_PATHS = [
     path.join(path.dirname(path.realpath(__file__)), "resources/settings.json"),
-    '~/.maigret/settings.json',
+    path.expanduser('~/.maigret/settings.json'),
     path.join(os.getcwd(), 'settings.json'),
 ]

@@ -42,7 +42,11 @@ class Settings:
     pdf_report: bool
     html_report: bool
     graph_report: bool
+    md_report: bool
     web_interface_port: int
+    no_autoupdate: bool
+    db_update_meta_url: str
+    autoupdate_check_interval_hours: int

     # submit mode settings
     presence_strings: list
+73
-9
@@ -65,6 +65,10 @@ class MaigretSite:
     url_probe = None
     # Type of check to perform
     check_type = ""
+    # HTTP request method (GET, POST, HEAD, etc.)
+    request_method = ""
+    # HTTP request payload (for POST, PUT, etc.)
+    request_payload: Dict[str, Any] = {}
     # Whether to only send HEAD requests (GET by default)
     request_head_only = ""
     # GET parameters to include in requests

@@ -88,10 +92,12 @@ class MaigretSite:
     # Alexa traffic rank
     alexa_rank = None
     # Source (in case a site is a mirror of another site)
-    source = None
+    source: Optional[str] = None

     # URL protocol (http/https)
     protocol = ''
+    # Protection types detected on this site (e.g. ["tls_fingerprint", "ddos_guard"])
+    protection: List[str] = []

     def __init__(self, name, information):
         self.name = name

@@ -137,6 +143,8 @@ class MaigretSite:
         'regex_check',
         'url_probe',
         'check_type',
+        'request_method',
+        'request_payload',
         'request_head_only',
         'get_params',
         'presense_strs',

@@ -167,7 +175,7 @@ class MaigretSite:
             self.__dict__[CaseConverter.camel_to_snake(group)],
         )

-        self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check)
+        self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check or "")

     def detect_username(self, url: str) -> Optional[str]:
         if self.url_regexp:

@@ -318,6 +326,7 @@ class MaigretDatabase:
         reverse=False,
         top=sys.maxsize,
         tags=[],
+        excluded_tags=[],
         names=[],
         disabled=True,
         id_type="username",

@@ -325,19 +334,30 @@ class MaigretDatabase:
         """
         Ranking and filtering of the sites list

+        When ``top`` is limited (not "all sites"), **mirrors** may be appended after
+        the Alexa-ranked slice. A mirror is any filtered site with a non-empty
+        ``source`` field equal to the name of a site that appears in the first
+        ``top`` positions of a **parent ranking** that includes disabled sites.
+        Thus mirrors such as third-party viewers (e.g. for Twitter or Instagram)
+        are still scanned when their parent platform ranks highly, even if the
+        official site is disabled and omitted from the main list.
+
         Args:
             reverse (bool, optional): Reverse the sorting order. Defaults to False.
             top (int, optional): Maximum number of sites to return. Defaults to sys.maxsize.
-            tags (list, optional): List of tags to filter sites by. Defaults to empty list.
+            tags (list, optional): List of tags to filter sites by (whitelist). Defaults to empty list.
+            excluded_tags (list, optional): List of tags to exclude sites by (blacklist). Defaults to empty list.
             names (list, optional): List of site names (or urls, see MaigretSite.__eq__) to filter by. Defaults to empty list.
             disabled (bool, optional): Whether to include disabled sites. Defaults to True.
             id_type (str, optional): Type of identifier to filter by. Defaults to "username".

         Returns:
-            dict: Dictionary of filtered and ranked sites, with site names as keys and MaigretSite objects as values
+            dict: Dictionary of filtered and ranked sites (base top slice plus mirrors),
+                  with site names as keys and MaigretSite objects as values
         """
         normalized_names = list(map(str.lower, names))
         normalized_tags = list(map(str.lower, tags))
+        normalized_excluded_tags = list(map(str.lower, excluded_tags))

         is_name_ok = lambda x: x.name.lower() in normalized_names
         is_source_ok = lambda x: x.source and x.source.lower() in normalized_names

@@ -351,6 +371,22 @@ class MaigretDatabase:
         )
         is_id_type_ok = lambda x: x.type == id_type

+        is_excluded_by_tag = lambda x: set(
+            map(str.lower, x.tags)
+        ).intersection(set(normalized_excluded_tags))
+        is_excluded_by_engine = lambda x: (
+            isinstance(x.engine, str)
+            and x.engine.lower() in normalized_excluded_tags
+        )
+        is_excluded_by_protocol = lambda x: (
+            x.protocol and x.protocol in normalized_excluded_tags
+        )
+        is_not_excluded = lambda x: not excluded_tags or not (
+            is_excluded_by_tag(x)
+            or is_excluded_by_engine(x)
+            or is_excluded_by_protocol(x)
+        )
+
         filter_tags_engines_fun = (
             lambda x: not tags
             or is_engine_ok(x)

@@ -361,6 +397,7 @@ class MaigretDatabase:

         filter_fun = (
             lambda x: filter_tags_engines_fun(x)
+            and is_not_excluded(x)
             and filter_names_fun(x)
             and is_disabled_needed(x)
             and is_id_type_ok(x)

@@ -371,6 +408,33 @@ class MaigretDatabase:
         sorted_list = sorted(
             filtered_list, key=lambda x: x.alexa_rank, reverse=reverse
         )[:top]

+        # Mirrors: sites whose `source` matches a parent platform that ranks in the
+        # top `top` by Alexa when disabled entries are included in the ranking pool
+        # (so e.g. Instagram can be a parent for Picuki even if Instagram is disabled).
+        if top < sys.maxsize and sorted_list:
+            filter_fun_ranking_parents = (
+                lambda x: filter_tags_engines_fun(x)
+                and is_not_excluded(x)
+                and filter_names_fun(x)
+                and is_id_type_ok(x)
+            )
+            ranking_pool = [s for s in self.sites if filter_fun_ranking_parents(s)]
+            sorted_parents = sorted(
+                ranking_pool, key=lambda x: x.alexa_rank, reverse=reverse
+            )[:top]
+            parent_names_lower = {s.name.lower() for s in sorted_parents}
+            base_names = {s.name for s in sorted_list}
+
+            def is_mirror(s) -> bool:
+                if not s.source or s.name in base_names:
+                    return False
+                return s.source.lower() in parent_names_lower
+
+            mirrors = [s for s in filtered_list if is_mirror(s)]
+            mirrors.sort(key=lambda x: (x.alexa_rank, x.name))
+            sorted_list = list(sorted_list) + mirrors
+
         return {site.name: site for site in sorted_list}

     @property

@@ -400,9 +464,9 @@ class MaigretDatabase:
             "tags": self._tags,
         }

-        json_data = json.dumps(db_data, indent=4)
+        json_data = json.dumps(db_data, indent=4, ensure_ascii=False)

-        with open(filename, "w") as f:
+        with open(filename, "w", encoding="utf-8") as f:
             f.write(json_data)

         return self

@@ -502,7 +566,7 @@ class MaigretDatabase:

     def get_scan_stats(self, sites_dict):
         sites = sites_dict or self.sites_dict
-        found_flags = {}
+        found_flags: Dict[str, int] = {}
         for _, s in sites.items():
             if "presense_flag" in s.stats:
                 flag = s.stats["presense_flag"]

@@ -523,8 +587,8 @@ class MaigretDatabase:
     def get_db_stats(self, is_markdown=False):
         # Initialize counters
         sites_dict = self.sites_dict
-        urls = {}
-        tags = {}
+        urls: Dict[str, int] = {}
+        tags: Dict[str, int] = {}
         disabled_count = 0
         message_checks_one_factor = 0
         status_checks = 0
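A usage sketch of the extended filtering above; the tag names are illustrative, and the keyword arguments match the ranked_sites_dict signature in this hunk:

    # Top 500 username sites, excluding anything tagged (or engine-named) "ucoz";
    # mirrors of top-ranked parents (e.g. third-party Instagram viewers) are
    # appended after the base slice, per the docstring above.
    sites = db.ranked_sites_dict(
        top=500,
        tags=["photo"],           # whitelist (illustrative tag)
        excluded_tags=["ucoz"],   # blacklist of tags/engines/protocols
        disabled=False,
        id_type="username",
    )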
+41
-28
@@ -6,8 +6,7 @@ import logging
 from typing import Any, Dict, List, Optional, Tuple

 from aiohttp import ClientSession, TCPConnector
 from aiohttp_socks import ProxyConnector
-import cloudscraper
+import cloudscraper  # type: ignore[import-untyped]
 from colorama import Fore, Style

 from .activation import import_aiohttp_cookies

@@ -68,8 +67,10 @@ class Submitter:
         else:
             cookie_jar = import_aiohttp_cookies(args.cookie_file)

-        connector = ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=False)
-        connector.verify_ssl = False
+        ssl_context = __import__('ssl').create_default_context()
+        ssl_context.check_hostname = False
+        ssl_context.verify_mode = __import__('ssl').CERT_NONE
+        connector = ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=ssl_context)
         self.session = ClientSession(
             connector=connector, trust_env=True, cookie_jar=cookie_jar
         )

@@ -88,7 +89,9 @@ class Submitter:
         alexa_rank = 0

         try:
-            alexa_rank = int(root.find('.//REACH').attrib['RANK'])
+            reach_elem = root.find('.//REACH')
+            if reach_elem is not None:
+                alexa_rank = int(reach_elem.attrib['RANK'])
         except Exception:
             pass

@@ -127,7 +130,7 @@ class Submitter:

     async def detect_known_engine(
         self, url_exists, url_mainpage, session, follow_redirects, headers
-    ) -> [List[MaigretSite], str]:
+    ) -> Tuple[List[MaigretSite], str]:

         session = session or self.session
         resp_text, _ = await self.get_html_response_to_compare(

@@ -191,8 +194,9 @@ class Submitter:
     # TODO: replace with checking.py/SimpleAiohttpChecker call
     @staticmethod
     async def get_html_response_to_compare(
-        url: str, session: ClientSession = None, redirects=False, headers: Dict = None
+        url: str, session: Optional[ClientSession] = None, redirects=False, headers: Optional[Dict] = None
     ):
+        assert session is not None, "session must not be None"
         async with session.get(
             url, allow_redirects=redirects, headers=headers
         ) as response:

@@ -211,10 +215,10 @@ class Submitter:
         username: str,
         url_exists: str,
         cookie_filename="",  # TODO: use cookies
-        session: ClientSession = None,
+        session: Optional[ClientSession] = None,
         follow_redirects=False,
-        headers: dict = None,
-    ) -> Tuple[List[str], List[str], str, str]:
+        headers: Optional[dict] = None,
+    ) -> Tuple[Optional[List[str]], Optional[List[str]], str, str]:

         random_username = generate_random_username()
         url_of_non_existing_account = url_exists.lower().replace(

@@ -269,11 +273,8 @@ class Submitter:
         tokens_a = set(re.split(f'[{self.SEPARATORS}]', first_html_response))
         tokens_b = set(re.split(f'[{self.SEPARATORS}]', second_html_response))

-        a_minus_b = tokens_a.difference(tokens_b)
-        b_minus_a = tokens_b.difference(tokens_a)
-
-        a_minus_b = list(map(lambda x: x.strip('\\'), a_minus_b))
-        b_minus_a = list(map(lambda x: x.strip('\\'), b_minus_a))
+        a_minus_b: List[str] = [x.strip('\\') for x in tokens_a.difference(tokens_b)]
+        b_minus_a: List[str] = [x.strip('\\') for x in tokens_b.difference(tokens_a)]

         # Filter out strings containing usernames
         a_minus_b = [s for s in a_minus_b if username.lower() not in s.lower()]

@@ -378,7 +379,7 @@ class Submitter:
         ).strip()

         if field in ['tags', 'presense_strs', 'absence_strs']:
-            new_value = list(map(str.strip, new_value.split(',')))
+            new_value = list(map(str.strip, new_value.split(',')))  # type: ignore[assignment]

         if new_value:
             setattr(site, field, new_value)

@@ -409,8 +410,13 @@ class Submitter:
         self.logger.info('Domain is %s', domain_raw)

         # check for existence
+        domain_re = re.compile(
+            r'://(www\.)?' + re.escape(domain_raw) + r'(/|$)'
+        )
         matched_sites = list(
-            filter(lambda x: domain_raw in x.url_main + x.url, self.db.sites)
+            filter(
+                lambda x: domain_re.search(x.url_main + x.url), self.db.sites
+            )
         )

         if matched_sites:

@@ -419,12 +425,12 @@ class Submitter:
                 f"{Fore.YELLOW}[!] Sites with domain \"{domain_raw}\" already exists in the Maigret database!{Style.RESET_ALL}"
             )

-            status = lambda s: "(disabled)" if s.disabled else ""
+            site_status = lambda s: "(disabled)" if s.disabled else ""
             url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
             print(
                 "\n".join(
                     [
-                        f"{site.name} {status(site)}{url_block(site)}"
+                        f"{site.name} {site_status(site)}{url_block(site)}"
                         for site in matched_sites
                     ]
                 )

@@ -448,9 +454,14 @@ class Submitter:
             old_site = next(
                 (site for site in matched_sites if site.name == site_name), None
             )
-            print(
-                f'{Fore.GREEN}[+] We will update site "{old_site.name}" in case of success.{Style.RESET_ALL}'
-            )
+            if old_site is None:
+                print(
+                    f'{Fore.RED}[!] Site "{site_name}" not found in the matched list. Proceeding without updating an existing site.{Style.RESET_ALL}'
+                )
+            else:
+                print(
+                    f'{Fore.GREEN}[+] We will update site "{old_site.name}" in case of success.{Style.RESET_ALL}'
+                )

         # Check if the site check is ordinary or not
         if old_site and (old_site.url_probe or old_site.activation):

@@ -487,7 +498,7 @@ class Submitter:
         )

         print('Detecting site engine, please wait...')
-        sites = []
+        sites: List[MaigretSite] = []
         text = None
         try:
             sites, text = await self.detect_known_engine(

@@ -500,7 +511,7 @@ class Submitter:
         except KeyboardInterrupt:
             print('Engine detect process is interrupted.')

-        if 'cloudflare' in text.lower():
+        if text and 'cloudflare' in text.lower():
             print(
                 'Cloudflare protection detected. I will use cloudscraper for further work'
             )

@@ -563,6 +574,8 @@ class Submitter:
                 found = True
                 break

+        assert chosen_site is not None, "No sites to check"
+
         if not found:
             print(
                 f"{Fore.RED}[!] The check for site '{chosen_site.name}' failed!{Style.RESET_ALL}"

@@ -621,8 +634,8 @@ class Submitter:
         # chosen_site.alexa_rank = rank

         self.logger.info(chosen_site.json)
-        site_data = chosen_site.strip_engine_data()
-        self.logger.info(site_data.json)
+        stripped_site = chosen_site.strip_engine_data()
+        self.logger.info(stripped_site.json)

         if old_site:
             # Update old site with new values and log changes

@@ -641,7 +654,7 @@ class Submitter:

         for field, display_name in fields_to_check.items():
             old_value = getattr(old_site, field)
-            new_value = getattr(site_data, field)
+            new_value = getattr(stripped_site, field)
             if field == 'tags' and not new_tags:
                 continue
             if str(old_value) != str(new_value):

@@ -651,7 +664,7 @@ class Submitter:
             old_site.__dict__[field] = new_value

         # update the site
-        final_site = old_site if old_site else site_data
+        final_site = old_site if old_site else stripped_site
         self.db.update_site(final_site)

         # save the db in file
+5
-2
@@ -71,7 +71,10 @@ class URLMatcher:


def ascii_data_display(data: str) -> Any:
    return ast.literal_eval(data)
    try:
        return ast.literal_eval(data)
    except (ValueError, SyntaxError):
        return data
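With the try/except in place, malformed values degrade gracefully instead of raising. A small demo of both paths (the patched function is repeated inline here so the snippet is self-contained):

    import ast
    from typing import Any

    def ascii_data_display(data: str) -> Any:
        try:
            return ast.literal_eval(data)
        except (ValueError, SyntaxError):
            return data

    print(ascii_data_display("['a', 'b']"))  # -> ['a', 'b'] (parsed Python literal)
    print(ascii_data_display("plain text"))  # -> 'plain text' (returned unchanged)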


def get_dict_ascii_tree(items, prepend="", new_line=True):
@@ -86,7 +89,7 @@ def get_dict_ascii_tree(items, prepend="", new_line=True):
    new_result + new_line if num != len(items) - 1 else last_result + new_line
)

if type(item) == tuple:
if isinstance(item, tuple):
    field_name, field_value = item
    if field_value.startswith("['"):
is_last_item = num == len(items) - 1
+20
-8
@@ -13,20 +13,22 @@ import os
import asyncio
from datetime import datetime
from threading import Thread
from typing import Any, Dict
import maigret
import maigret.settings
from maigret.sites import MaigretDatabase
from maigret.report import generate_report_context

app = Flask(__name__)
app.secret_key = 'your-secret-key-here'
# Use environment variable for secret key, generate random one if not set
app.secret_key = os.getenv('FLASK_SECRET_KEY', os.urandom(24).hex())

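A note on the fallback: os.urandom(24).hex() mints a fresh key on every start, so sessions and flashed messages will not survive a restart unless FLASK_SECRET_KEY is set explicitly. One way to generate a stable value once (a sketch; any sufficiently long random hex string will do):

    import secrets

    # run once, then export the output as FLASK_SECRET_KEY for the service
    print(secrets.token_hex(32))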
# add background job tracking
background_jobs = {}
background_jobs: Dict[str, Any] = {}
job_results = {}

# Configuration
app.config["MAIGRET_DB_FILE"] = os.path.join('maigret', 'resources', 'data.json')
app.config["MAIGRET_DB_FILE"] = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'resources', 'data.json')
app.config["COOKIES_FILE"] = "cookies.txt"
app.config["UPLOAD_FOLDER"] = 'uploads'
app.config["REPORTS_FOLDER"] = os.path.abspath('/tmp/maigret_reports')
@@ -48,12 +50,14 @@ async def maigret_search(username, options):
top_sites = 999999999  # effectively all

tags = options.get('tags', [])
excluded_tags = options.get('excluded_tags', [])
site_list = options.get('site_list', [])
logger.info(f"Filtering sites by tags: {tags}")
logger.info(f"Filtering sites by tags: {tags}, excluded: {excluded_tags}")

sites = db.ranked_sites_dict(
    top=top_sites,
    tags=tags,
    excluded_tags=excluded_tags,
    names=site_list,
    disabled=False,
    id_type='username',
@@ -224,7 +228,8 @@ def search():

# Get selected tags - ensure it's a list
selected_tags = request.form.getlist('tags')
logging.info(f"Selected tags: {selected_tags}")
excluded_tags = request.form.getlist('excluded_tags')
logging.info(f"Selected tags: {selected_tags}, Excluded tags: {excluded_tags}")

options = {
    'top_sites': request.form.get('top_sites') or '500',
@@ -239,13 +244,14 @@ def search():
    'i2p_proxy': request.form.get('i2p_proxy', None) or None,
    'permute': 'permute' in request.form,
    'tags': selected_tags,  # Pass selected tags as a list
    'excluded_tags': excluded_tags,  # Pass excluded tags as a list
    'site_list': [
        s.strip() for s in request.form.get('site', '').split(',') if s.strip()
    ],
}

logging.info(
    f"Starting search for usernames: {usernames} with tags: {selected_tags}"
    f"Starting search for usernames: {usernames} with tags: {selected_tags}, excluded: {excluded_tags}"
)

# Start background job
@@ -255,7 +261,7 @@ def search():
    target=process_search_task, args=(usernames, options, timestamp)
),
}
background_jobs[timestamp]['thread'].start()
background_jobs[timestamp]['thread'].start()  # type: ignore[union-attr]

return redirect(url_for('status', timestamp=timestamp))

@@ -338,4 +344,10 @@ if __name__ == '__main__':
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
debug_mode = os.getenv('FLASK_DEBUG', 'False').lower() in ['true', '1', 't']
app.run(debug=debug_mode)

# Host configuration: secure by default
# Use 127.0.0.1 for local development, 0.0.0.0 only if explicitly set
host = os.getenv('FLASK_HOST', '127.0.0.1')
port = int(os.getenv('FLASK_PORT', '5000'))

app.run(host=host, port=port, debug=debug_mode)
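A quick way to exercise the new excluded_tags plumbing end to end is Flask's test client. A sketch under stated assumptions: the form field names ('tags', 'excluded_tags', 'top_sites', 'site') come from the diff above, while the '/search' route path, the 'username' field name, and the app's import path are guesses here:

    from web.app import app  # hypothetical module path

    client = app.test_client()
    resp = client.post('/search', data={   # assumed route path
        'username': 'someuser',            # assumed field name
        'top_sites': '500',
        'tags': ['forum'],                 # include (whitelist)
        'excluded_tags': ['ru'],           # exclude (blacklist)
    })
    # on success the handler starts a background job and redirects to the status page
    print(resp.status_code)  # expected 302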
@@ -28,6 +28,11 @@
    background-color: #28a745;
}

.tag.excluded {
    background-color: #343a40;
    text-decoration: line-through;
}

.tag:hover {
    transform: translateY(-2px);
    box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
@@ -168,7 +173,16 @@
</div>

<div class="mb-3">
    <label class="form-label">Tags (click to select)</label>
    <label class="form-label">Tags (click to cycle: include → exclude → neutral)</label>
    <div class="mb-2">
        <small class="text-muted">
            <span style="display:inline-block;width:12px;height:12px;background:#28a745;border-radius:50%;"></span> Included (whitelist)

            <span style="display:inline-block;width:12px;height:12px;background:#343a40;border-radius:50%;"></span> Excluded (blacklist)

            <span style="display:inline-block;width:12px;height:12px;background:#dc3545;border-radius:50%;"></span> Neutral
        </small>
    </div>
    <div class="tag-cloud" id="tagCloud"></div>
    <select multiple class="hidden-select" id="tags" name="tags">
        <option value="gaming">Gaming</option>
@@ -230,6 +244,89 @@
        <option value="q&a">Q&A</option>
        <option value="crypto">Crypto</option>
        <option value="ai">AI</option>
        <!-- Country tags -->
        <option value="ae" data-group="country">AE - United Arab Emirates</option>
        <option value="ao" data-group="country">AO - Angola</option>
        <option value="ar" data-group="country">AR - Argentina</option>
        <option value="at" data-group="country">AT - Austria</option>
        <option value="au" data-group="country">AU - Australia</option>
        <option value="az" data-group="country">AZ - Azerbaijan</option>
        <option value="bd" data-group="country">BD - Bangladesh</option>
        <option value="be" data-group="country">BE - Belgium</option>
        <option value="bg" data-group="country">BG - Bulgaria</option>
        <option value="br" data-group="country">BR - Brazil</option>
        <option value="by" data-group="country">BY - Belarus</option>
        <option value="ca" data-group="country">CA - Canada</option>
        <option value="ch" data-group="country">CH - Switzerland</option>
        <option value="cl" data-group="country">CL - Chile</option>
        <option value="cn" data-group="country">CN - China</option>
        <option value="co" data-group="country">CO - Colombia</option>
        <option value="cr" data-group="country">CR - Costa Rica</option>
        <option value="cz" data-group="country">CZ - Czechia</option>
        <option value="de" data-group="country">DE - Germany</option>
        <option value="dk" data-group="country">DK - Denmark</option>
        <option value="dz" data-group="country">DZ - Algeria</option>
        <option value="ee" data-group="country">EE - Estonia</option>
        <option value="eg" data-group="country">EG - Egypt</option>
        <option value="es" data-group="country">ES - Spain</option>
        <option value="eu" data-group="country">EU - European Union</option>
        <option value="fi" data-group="country">FI - Finland</option>
        <option value="fr" data-group="country">FR - France</option>
        <option value="gb" data-group="country">GB - United Kingdom</option>
        <option value="global" data-group="country">🌍 Global</option>
        <option value="gr" data-group="country">GR - Greece</option>
        <option value="hk" data-group="country">HK - Hong Kong</option>
        <option value="hr" data-group="country">HR - Croatia</option>
        <option value="hu" data-group="country">HU - Hungary</option>
        <option value="id" data-group="country">ID - Indonesia</option>
        <option value="ie" data-group="country">IE - Ireland</option>
        <option value="il" data-group="country">IL - Israel</option>
        <option value="in" data-group="country">IN - India</option>
        <option value="ir" data-group="country">IR - Iran</option>
        <option value="it" data-group="country">IT - Italy</option>
        <option value="jp" data-group="country">JP - Japan</option>
        <option value="kg" data-group="country">KG - Kyrgyzstan</option>
        <option value="kr" data-group="country">KR - Korea</option>
        <option value="kz" data-group="country">KZ - Kazakhstan</option>
        <option value="la" data-group="country">LA - Laos</option>
        <option value="lk" data-group="country">LK - Sri Lanka</option>
        <option value="lt" data-group="country">LT - Lithuania</option>
        <option value="ma" data-group="country">MA - Morocco</option>
        <option value="md" data-group="country">MD - Moldova</option>
        <option value="mg" data-group="country">MG - Madagascar</option>
        <option value="mk" data-group="country">MK - North Macedonia</option>
        <option value="mx" data-group="country">MX - Mexico</option>
        <option value="ng" data-group="country">NG - Nigeria</option>
        <option value="nl" data-group="country">NL - Netherlands</option>
        <option value="no" data-group="country">NO - Norway</option>
        <option value="ph" data-group="country">PH - Philippines</option>
        <option value="pk" data-group="country">PK - Pakistan</option>
        <option value="pl" data-group="country">PL - Poland</option>
        <option value="pt" data-group="country">PT - Portugal</option>
        <option value="re" data-group="country">RE - Réunion</option>
        <option value="ro" data-group="country">RO - Romania</option>
        <option value="rs" data-group="country">RS - Serbia</option>
        <option value="ru" data-group="country">RU - Russia</option>
        <option value="sa" data-group="country">SA - Saudi Arabia</option>
        <option value="sd" data-group="country">SD - Sudan</option>
        <option value="se" data-group="country">SE - Sweden</option>
        <option value="sg" data-group="country">SG - Singapore</option>
        <option value="sk" data-group="country">SK - Slovakia</option>
        <option value="sv" data-group="country">SV - El Salvador</option>
        <option value="th" data-group="country">TH - Thailand</option>
        <option value="tn" data-group="country">TN - Tunisia</option>
        <option value="tr" data-group="country">TR - Türkiye</option>
        <option value="tw" data-group="country">TW - Taiwan</option>
        <option value="ua" data-group="country">UA - Ukraine</option>
        <option value="uk" data-group="country">UK - United Kingdom</option>
        <option value="us" data-group="country">US - United States</option>
        <option value="uz" data-group="country">UZ - Uzbekistan</option>
        <option value="ve" data-group="country">VE - Venezuela</option>
        <option value="vi" data-group="country">VI - Virgin Islands</option>
        <option value="vn" data-group="country">VN - Viet Nam</option>
        <option value="za" data-group="country">ZA - South Africa</option>
    </select>
    <select multiple class="hidden-select" id="excludedTags" name="excluded_tags">
    </select>
</div>
</div>
@@ -292,26 +389,66 @@
}

document.addEventListener('DOMContentLoaded', function () {
    // Tag cloud functionality
    // Tag cloud functionality with include/exclude (whitelist/blacklist) support
    const tagCloud = document.getElementById('tagCloud');
    const hiddenSelect = document.getElementById('tags');
    const excludedSelect = document.getElementById('excludedTags');
    const allTags = Array.from(hiddenSelect.options).map(opt => ({
        value: opt.value,
        label: opt.text
        label: opt.text,
        group: opt.dataset.group || 'category'
    }));

    function updateTagSelects() {
        // Clear and repopulate hidden selects based on tag states
        Array.from(hiddenSelect.options).forEach(opt => opt.selected = false);
        // Clear excluded select
        excludedSelect.innerHTML = '';

        document.querySelectorAll('#tagCloud .tag').forEach(tagEl => {
            const val = tagEl.dataset.value;
            if (tagEl.classList.contains('selected')) {
                const option = Array.from(hiddenSelect.options).find(opt => opt.value === val);
                if (option) option.selected = true;
            } else if (tagEl.classList.contains('excluded')) {
                const opt = document.createElement('option');
                opt.value = val;
                opt.selected = true;
                excludedSelect.appendChild(opt);
            }
        });
    }

    let lastGroup = '';
    allTags.forEach(tag => {
        if (tag.group !== lastGroup && tag.group === 'country') {
            const separator = document.createElement('div');
            separator.style.cssText = 'width:100%;margin:8px 0 4px;padding:4px 0;border-top:1px solid rgba(0,0,0,0.15);font-size:13px;color:#666;';
            separator.textContent = 'Countries';
            tagCloud.appendChild(separator);
        }
        lastGroup = tag.group;

        const tagElement = document.createElement('span');
        tagElement.className = 'tag';
        tagElement.textContent = tag.label;
        tagElement.dataset.value = tag.value;

        tagElement.addEventListener('click', function () {
            const isSelected = this.classList.toggle('selected');
            const option = Array.from(hiddenSelect.options).find(opt => opt.value === tag.value);
            if (option) {
                option.selected = isSelected;
        // Single click cycles: neutral -> included -> excluded -> neutral
        tagElement.addEventListener('click', function (e) {
            e.preventDefault();
            if (this.classList.contains('selected')) {
                // included -> excluded
                this.classList.remove('selected');
                this.classList.add('excluded');
            } else if (this.classList.contains('excluded')) {
                // excluded -> neutral
                this.classList.remove('excluded');
            } else {
                // neutral -> included
                this.classList.add('selected');
            }
            updateTagSelects();
        });

        tagCloud.appendChild(tagElement);

+1778
-1191
(Generated file; diff suppressed because it is too large)
@@ -1,5 +1,5 @@
maigret @ https://github.com/soxoj/maigret/archive/refs/heads/main.zip
pefile==2023.2.7 # do not bump while pyinstaller is 6.11.1, there is a conflict
psutil==6.1.1
pyinstaller==6.11.1
psutil==7.2.2
pyinstaller==6.19.0
pywin32-ctypes==0.2.3

+14
-12
@@ -31,30 +31,31 @@ classifiers = [
# Install with dev dependencies:
# poetry install --with dev
python = "^3.10"
aiodns = "^3.0.0"
aiodns = ">=3,<5"
aiohttp = "^3.12.14"
aiohttp-socks = "^0.10.1"
aiohttp-socks = ">=0.10.1,<0.12.0"
arabic-reshaper = "^3.0.0"
async-timeout = "^5.0.1"
attrs = "^25.3.0"
certifi = "^2025.6.15"
chardet = "^5.0.0"
attrs = ">=25.3,<27.0"
certifi = ">=2025.6.15,<2027.0.0"
chardet = ">=5,<8"
colorama = "^0.4.6"
future = "^1.0.0"
future-annotations= "^1.0.0"
html5lib = "^1.1"
idna = "^3.4"
Jinja2 = "^3.1.6"
lxml = "^5.3.0"
lxml = ">=6.0.2,<7.0"
MarkupSafe = "^3.0.2"
mock = "^5.1.0"
multidict = "^6.6.3"
pycountry = "^24.6.1"
pycountry = ">=24.6.1,<27.0.0"
PyPDF2 = "^3.0.1"
PySocks = "^1.7.1"
python-bidi = "^0.6.3"
requests = "^2.32.4"
requests-futures = "^1.0.2"
requests-toolbelt = "^1.0.0"
six = "^1.17.0"
socid-extractor = "^0.0.27"
soupsieve = "^2.6"
@@ -73,24 +74,25 @@ cloudscraper = "^1.2.71"
flask = {extras = ["async"], version = "^3.1.1"}
asgiref = "^3.9.1"
platformdirs = "^4.3.8"
curl-cffi = ">=0.14,<1.0"


[tool.poetry.group.dev.dependencies]
# How to add a new dev dependency: poetry add black --group dev
# Install dev dependencies with: poetry install --with dev
flake8 = "^7.1.1"
pytest = "^8.3.4"
pytest = ">=8.3.4,<10.0.0"
pytest-asyncio = "^1.0.0"
pytest-cov = "^6.0.0"
pytest-cov = ">=6,<8"
pytest-httpserver = "^1.0.0"
pytest-rerunfailures = "^15.1"
pytest-rerunfailures = ">=15.1,<17.0"
reportlab = "^4.4.3"
mypy = "^1.14.1"
tuna = "^0.5.11"
coverage = "^7.9.2"
black = "^25.1.0"
black = ">=25.1,<27.0"

[tool.poetry.scripts]
# Run with: poetry run maigret <username>
maigret = "maigret.maigret:run"
update_sitesmd = "utils.update_site_data:main"
update_sitesmd = "utils.update_site_data:main"

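Since the caret pins above were widened into explicit ranges, it can be handy to check what a given range actually admits; the packaging library evaluates the same specifier syntax (a sketch, assuming packaging is installed; pip vendors it, while Poetry uses its own resolver):

    from packaging.specifiers import SpecifierSet

    spec = SpecifierSet(">=3,<5")  # the new aiodns constraint
    print("4.2.0" in spec)         # True  - now acceptable
    print("5.0.0" in spec)         # False - still excluded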
@@ -5,11 +5,13 @@ from typing import Dict, Any

DEFAULT_ARGS: Dict[str, Any] = {
    'all_sites': False,
    'auto_disable': False,
    'connections': 100,
    'cookie_file': None,
    'csv': False,
    'db_file': 'resources/data.json',
    'debug': False,
    'diagnose': False,
    'disable_extracting': False,
    'disable_recursive_search': False,
    'folderoutput': 'reports',
@@ -34,6 +36,7 @@ DEFAULT_ARGS: Dict[str, Any] = {
    'site_list': [],
    'stats': False,
    'tags': '',
    'exclude_tags': '',
    'timeout': 30,
    'tor_proxy': 'socks5://127.0.0.1:9050',
    'i2p_proxy': 'http://127.0.0.1:4444',
@@ -45,6 +48,9 @@ DEFAULT_ARGS: Dict[str, Any] = {
    'web': None,
    'with_domains': False,
    'xmind': False,
    'md': False,
    'no_autoupdate': False,
    'force_update': False,
}


@@ -103,3 +109,34 @@ def test_args_multiple_sites(argparser):

    for arg in vars(args):
        assert getattr(args, arg) == want_args[arg]


def test_args_exclude_tags(argparser):
    args = argparser.parse_args('--exclude-tags porn,dating username'.split())

    want_args = dict(DEFAULT_ARGS)
    want_args.update(
        {
            'exclude_tags': 'porn,dating',
            'username': ['username'],
        }
    )

    for arg in vars(args):
        assert getattr(args, arg) == want_args[arg]


def test_args_tags_with_exclude_tags(argparser):
    args = argparser.parse_args('--tags coding --exclude-tags porn username'.split())

    want_args = dict(DEFAULT_ARGS)
    want_args.update(
        {
            'tags': 'coding',
            'exclude_tags': 'porn',
            'username': ['username'],
        }
    )

    for arg in vars(args):
        assert getattr(args, arg) == want_args[arg]

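In end-user terms, the flags exercised by these tests combine like so (per the database tests later in this diff, exclusion wins when a site carries both an included and an excluded tag):

    maigret --tags coding --exclude-tags porn,dating username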
@@ -4,6 +4,30 @@
from maigret.utils import is_country_tag


TOP_SITES_ALEXA_RANK_LIMIT = 50

KNOWN_SOCIAL_DOMAINS = [
    "facebook.com",
    "instagram.com",
    "twitter.com",
    "tiktok.com",
    "vk.com",
    "reddit.com",
    "pinterest.com",
    "snapchat.com",
    "linkedin.com",
    "tumblr.com",
    "threads.net",
    "bsky.app",
    "myspace.com",
    "weibo.com",
    "mastodon.social",
    "gab.com",
    "minds.com",
    "clubhouse.com",
]


@pytest.mark.slow
def test_tags_validity(default_db):
    unknown_tags = set()
@@ -19,3 +43,62 @@ def test_tags_validity(default_db):
    # if you see "unchecked" tag error, please, do
    # maigret --db `pwd`/maigret/resources/data.json --self-check --tag unchecked --use-disabled-sites
    assert unknown_tags == set()


@pytest.mark.slow
def test_top_sites_have_category_tag(default_db):
    """Top sites by alexaRank must have at least one category tag (not just country codes)."""
    sites_ranked = sorted(
        [s for s in default_db.sites if s.alexa_rank],
        key=lambda s: s.alexa_rank,
    )[:TOP_SITES_ALEXA_RANK_LIMIT]

    missing_category = []
    for site in sites_ranked:
        category_tags = [t for t in site.tags if not is_country_tag(t)]
        if not category_tags:
            missing_category.append(f"{site.name} (rank {site.alexa_rank})")

    assert missing_category == [], (
        f"{len(missing_category)} top-{TOP_SITES_ALEXA_RANK_LIMIT} sites have no category tag: "
        + ", ".join(missing_category[:20])
    )


@pytest.mark.slow
def test_no_unused_tags_in_registry(default_db):
    """Every tag in the registry should be used by at least one site."""
    all_used_tags = set()
    for site in default_db.sites:
        for tag in site.tags:
            if not is_country_tag(tag):
                all_used_tags.add(tag)

    registered_tags = set(default_db._tags)
    unused = registered_tags - all_used_tags

    assert unused == set(), f"Tags registered but not used by any site: {unused}"


@pytest.mark.slow
def test_social_networks_have_social_tag(default_db):
    """Known social network domains must have the 'social' tag."""
    from urllib.parse import urlparse

    missing_social = []
    for site in default_db.sites:
        url = site.url_main or ""
        try:
            hostname = urlparse(url).hostname or ""
        except Exception:
            continue
        for domain in KNOWN_SOCIAL_DOMAINS:
            if hostname == domain or hostname.endswith("." + domain):
                if "social" not in site.tags:
                    missing_social.append(f"{site.name} ({domain})")
                break

    assert missing_social == [], (
        f"{len(missing_social)} known social networks missing 'social' tag: "
        + ", ".join(missing_social)
    )

@@ -0,0 +1,233 @@
"""Tests for the database auto-update system."""

import json
import os
import hashlib
from datetime import datetime, timezone, timedelta
from unittest.mock import patch, MagicMock

import pytest

from maigret.db_updater import (
    _parse_version,
    _needs_check,
    _is_version_compatible,
    _is_update_available,
    _load_state,
    _save_state,
    _best_local,
    _now_iso,
    resolve_db_path,
    force_update,
    CACHED_DB_PATH,
    BUNDLED_DB_PATH,
    STATE_PATH,
    MAIGRET_HOME,
)


def test_parse_version():
    assert _parse_version("0.5.0") == (0, 5, 0)
    assert _parse_version("1.2.3") == (1, 2, 3)
    assert _parse_version("bad") == (0, 0, 0)
    assert _parse_version("") == (0, 0, 0)

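A minimal _parse_version consistent with these assertions might look like the following (a sketch, not necessarily the module's actual code; inputs with more or fewer than three components are left unspecified by the tests):

    def _parse_version(version: str) -> tuple:
        # "0.5.0" -> (0, 5, 0); anything unparseable -> (0, 0, 0)
        try:
            parts = tuple(int(p) for p in version.split("."))
        except ValueError:
            return (0, 0, 0)
        return parts if len(parts) == 3 else (0, 0, 0)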
def test_needs_check_no_state():
    assert _needs_check({}, 24) is True


def test_needs_check_recent():
    state = {"last_check_at": _now_iso()}
    assert _needs_check(state, 24) is False


def test_needs_check_expired():
    old_time = (datetime.now(timezone.utc) - timedelta(hours=25)).strftime("%Y-%m-%dT%H:%M:%SZ")
    state = {"last_check_at": old_time}
    assert _needs_check(state, 24) is True


def test_needs_check_corrupt():
    state = {"last_check_at": "not-a-date"}
    assert _needs_check(state, 24) is True


def test_version_compatible():
    with patch("maigret.db_updater.__version__", "0.5.0"):
        assert _is_version_compatible({"min_maigret_version": "0.5.0"}) is True
        assert _is_version_compatible({"min_maigret_version": "0.4.0"}) is True
        assert _is_version_compatible({"min_maigret_version": "0.6.0"}) is False
        assert _is_version_compatible({}) is True  # missing field = compatible


def test_update_available_no_cache(tmp_path):
    with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "nonexistent.json")):
        assert _is_update_available({"updated_at": "2026-01-01T00:00:00Z"}, {}) is True


def test_update_available_newer(tmp_path):
    cache = tmp_path / "data.json"
    cache.write_text("{}")
    with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
        state = {"last_meta": {"updated_at": "2026-01-01T00:00:00Z"}}
        meta = {"updated_at": "2026-02-01T00:00:00Z"}
        assert _is_update_available(meta, state) is True


def test_update_available_same(tmp_path):
    cache = tmp_path / "data.json"
    cache.write_text("{}")
    with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
        state = {"last_meta": {"updated_at": "2026-01-01T00:00:00Z"}}
        meta = {"updated_at": "2026-01-01T00:00:00Z"}
        assert _is_update_available(meta, state) is False


def test_load_state_missing(tmp_path):
    with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "missing.json")):
        assert _load_state() == {}


def test_load_state_corrupt(tmp_path):
    corrupt = tmp_path / "state.json"
    corrupt.write_text("not json{{{")
    with patch("maigret.db_updater.STATE_PATH", str(corrupt)):
        assert _load_state() == {}


def test_save_and_load_state(tmp_path):
    state_file = tmp_path / "state.json"
    with patch("maigret.db_updater.STATE_PATH", str(state_file)):
        with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
            _save_state({"last_check_at": "2026-01-01T00:00:00Z"})
            loaded = _load_state()
            assert loaded["last_check_at"] == "2026-01-01T00:00:00Z"


def test_best_local_with_valid_cache(tmp_path):
    cache = tmp_path / "data.json"
    cache.write_text('{"sites": {}, "engines": {}, "tags": []}')
    with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
        assert _best_local() == str(cache)


def test_best_local_with_corrupt_cache(tmp_path):
    cache = tmp_path / "data.json"
    cache.write_text("not json")
    with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
        assert _best_local() == BUNDLED_DB_PATH


def test_best_local_no_cache(tmp_path):
    with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
        assert _best_local() == BUNDLED_DB_PATH


def test_resolve_db_path_custom_url():
    result = resolve_db_path("https://example.com/db.json")
    assert result == "https://example.com/db.json"


def test_resolve_db_path_custom_file():
    result = resolve_db_path("custom/path.json")
    assert result.endswith("custom/path.json")


def test_resolve_db_path_no_autoupdate(tmp_path):
    with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
        result = resolve_db_path("resources/data.json", no_autoupdate=True)
        assert result == BUNDLED_DB_PATH


def test_resolve_db_path_no_autoupdate_with_cache(tmp_path):
    cache = tmp_path / "data.json"
    cache.write_text('{"sites": {}, "engines": {}, "tags": []}')
    with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
        result = resolve_db_path("resources/data.json", no_autoupdate=True)
        assert result == str(cache)


@patch("maigret.db_updater._fetch_meta")
def test_resolve_db_path_network_failure(mock_fetch, tmp_path):
    mock_fetch.return_value = None
    with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
        with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
            with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
                result = resolve_db_path("resources/data.json")
                assert result == BUNDLED_DB_PATH


# --- force_update tests ---


@patch("maigret.db_updater._fetch_meta")
def test_force_update_network_failure(mock_fetch, tmp_path):
    mock_fetch.return_value = None
    with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
        with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
            assert force_update() is False


@patch("maigret.db_updater._fetch_meta")
def test_force_update_incompatible_version(mock_fetch, tmp_path):
    mock_fetch.return_value = {"min_maigret_version": "99.0.0", "sites_count": 100}
    with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
        with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
            assert force_update() is False


@patch("maigret.db_updater._download_and_verify")
@patch("maigret.db_updater._fetch_meta")
def test_force_update_success(mock_fetch, mock_download, tmp_path):
    mock_fetch.return_value = {
        "min_maigret_version": "0.1.0",
        "sites_count": 3200,
        "updated_at": "2099-01-01T00:00:00Z",
        "data_url": "https://example.com/data.json",
        "data_sha256": "abc123",
    }
    mock_download.return_value = str(tmp_path / "data.json")
    with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
        with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
            with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
                assert force_update() is True
                state = _load_state()
                assert state["last_meta"]["sites_count"] == 3200


@patch("maigret.db_updater._fetch_meta")
def test_force_update_already_up_to_date(mock_fetch, tmp_path):
    cache = tmp_path / "data.json"
    cache.write_text('{"sites": {}, "engines": {}, "tags": []}')
    state_file = tmp_path / "state.json"
    state_file.write_text(json.dumps({
        "last_check_at": _now_iso(),
        "last_meta": {"updated_at": "2026-01-01T00:00:00Z", "sites_count": 3000},
    }))
    mock_fetch.return_value = {
        "min_maigret_version": "0.1.0",
        "sites_count": 3000,
        "updated_at": "2026-01-01T00:00:00Z",
    }
    with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
        with patch("maigret.db_updater.STATE_PATH", str(state_file)):
            with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
                assert force_update() is False


@patch("maigret.db_updater._download_and_verify")
@patch("maigret.db_updater._fetch_meta")
def test_force_update_download_fails(mock_fetch, mock_download, tmp_path):
    mock_fetch.return_value = {
        "min_maigret_version": "0.1.0",
        "sites_count": 3200,
        "updated_at": "2099-01-01T00:00:00Z",
        "data_url": "https://example.com/data.json",
        "data_sha256": "abc123",
    }
    mock_download.return_value = None
    with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
        with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
            with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
                assert force_update() is False
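Pieced together from the mocks above, the updater's state file (STATE_PATH under MAIGRET_HOME) holds roughly this shape; the two top-level fields are the ones the tests pin down, anything beyond that is an inference:

    {
        "last_check_at": "2026-01-01T00:00:00Z",
        "last_meta": {
            "updated_at": "2026-01-01T00:00:00Z",
            "sites_count": 3000
        }
    }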
@@ -36,7 +36,7 @@ def test_notify_about_errors():
    },
}

results = notify_about_errors(results, query_notify=None, show_statistics=True)
notifications = notify_about_errors(results, query_notify=None, show_statistics=True)

# Check the output
expected_output = [
@@ -55,4 +55,4 @@ def test_notify_about_errors():
    ('Access denied: 25.0%', '!'),
    ('You can see detailed site check errors with a flag `--print-errors`', '-'),
]
assert results == expected_output
assert notifications == expected_output

+10
-9
@@ -3,6 +3,7 @@
import pytest
import asyncio
import logging
from typing import Any, List, Tuple, Callable, Dict
from maigret.executors import (
    AsyncioSimpleExecutor,
    AsyncioProgressbarExecutor,
@@ -21,7 +22,7 @@ async def func(n):

@pytest.mark.asyncio
async def test_simple_asyncio_executor():
    tasks = [(func, [n], {}) for n in range(10)]
    tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]
    executor = AsyncioSimpleExecutor(logger=logger)
    assert await executor.run(tasks) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    assert executor.execution_time > 0.2
@@ -30,7 +31,7 @@ async def test_simple_asyncio_executor():

@pytest.mark.asyncio
async def test_asyncio_progressbar_executor():
    tasks = [(func, [n], {}) for n in range(10)]
    tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]

    executor = AsyncioProgressbarExecutor(logger=logger)
    # no guarantees for the results order
@@ -41,7 +42,7 @@ async def test_asyncio_progressbar_executor():

@pytest.mark.asyncio
async def test_asyncio_progressbar_semaphore_executor():
    tasks = [(func, [n], {}) for n in range(10)]
    tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]

    executor = AsyncioProgressbarSemaphoreExecutor(logger=logger, in_parallel=5)
    # no guarantees for the results order
@@ -53,7 +54,7 @@ async def test_asyncio_progressbar_semaphore_executor():
@pytest.mark.slow
@pytest.mark.asyncio
async def test_asyncio_progressbar_queue_executor():
    tasks = [(func, [n], {}) for n in range(10)]
    tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]

    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=2)
    assert await executor.run(tasks) == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8]
@@ -81,22 +82,22 @@ async def test_asyncio_progressbar_queue_executor():

@pytest.mark.asyncio
async def test_asyncio_queue_generator_executor():
    tasks = [(func, [n], {}) for n in range(10)]
    tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]

    executor = AsyncioQueueGeneratorExecutor(logger=logger, in_parallel=2)
    results = [result async for result in executor.run(tasks)]
    results = [result async for result in executor.run(tasks)]  # type: ignore[arg-type]
    assert results == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8]
    assert executor.execution_time > 0.5
    assert executor.execution_time < 0.6

    executor = AsyncioQueueGeneratorExecutor(logger=logger, in_parallel=3)
    results = [result async for result in executor.run(tasks)]
    results = [result async for result in executor.run(tasks)]  # type: ignore[arg-type]
    assert results == [0, 3, 1, 4, 6, 2, 7, 9, 5, 8]
    assert executor.execution_time > 0.4
    assert executor.execution_time < 0.5

    executor = AsyncioQueueGeneratorExecutor(logger=logger, in_parallel=5)
    results = [result async for result in executor.run(tasks)]
    results = [result async for result in executor.run(tasks)]  # type: ignore[arg-type]
    assert results in (
        [0, 3, 6, 1, 4, 7, 9, 2, 5, 8],
        [0, 3, 6, 1, 4, 9, 7, 2, 5, 8],
@@ -105,7 +106,7 @@ async def test_asyncio_queue_generator_executor():
    assert executor.execution_time < 0.4

    executor = AsyncioQueueGeneratorExecutor(logger=logger, in_parallel=10)
    results = [result async for result in executor.run(tasks)]
    results = [result async for result in executor.run(tasks)]  # type: ignore[arg-type]
    assert results == [0, 3, 6, 9, 1, 4, 7, 2, 5, 8]
    assert executor.execution_time > 0.2
    assert executor.execution_time < 0.3

+87
-3
@@ -2,6 +2,7 @@

import asyncio
import copy
from unittest.mock import patch

import pytest
from mock import Mock
@@ -11,7 +12,8 @@ from maigret.maigret import (
    extract_ids_from_page,
    extract_ids_from_results,
)
from maigret.sites import MaigretSite
from maigret.checking import site_self_check
from maigret.sites import MaigretSite, MaigretDatabase
from maigret.result import MaigretCheckResult, MaigretCheckStatus
from tests.conftest import RESULTS_EXAMPLE

@@ -27,7 +29,9 @@ async def test_self_check_db(test_db):
    assert test_db.sites_dict['ValidActive'].disabled is False
    assert test_db.sites_dict['InvalidInactive'].disabled is True

    await self_check(test_db, test_db.sites_dict, logger, silent=False)
    await self_check(
        test_db, test_db.sites_dict, logger, silent=False, auto_disable=True
    )

    assert test_db.sites_dict['InvalidActive'].disabled is True
    assert test_db.sites_dict['ValidInactive'].disabled is False
@@ -35,6 +39,86 @@ async def test_self_check_db(test_db):
    assert test_db.sites_dict['InvalidInactive'].disabled is True


@pytest.mark.slow
@pytest.mark.asyncio
async def test_self_check_no_progressbar(test_db):
    """Verify that no_progressbar=True disables the alive_bar in self_check."""
    logger = Mock()

    with patch('maigret.checking.alive_bar') as mock_alive_bar:
        mock_bar = Mock()
        mock_alive_bar.return_value.__enter__ = Mock(return_value=mock_bar)
        mock_alive_bar.return_value.__exit__ = Mock(return_value=False)

        await self_check(
            test_db, test_db.sites_dict, logger, silent=True,
            no_progressbar=True,
        )

        # First call is the self-check progress bar; subsequent calls are
        # from inner search() invocations.
        self_check_call = mock_alive_bar.call_args_list[0]
        _, kwargs = self_check_call
        assert kwargs.get('title') == 'Self-checking'
        assert kwargs.get('disable') is True


@pytest.mark.slow
@pytest.mark.asyncio
async def test_self_check_progressbar_enabled_by_default(test_db):
    """Verify that alive_bar is enabled by default (no_progressbar=False)."""
    logger = Mock()

    with patch('maigret.checking.alive_bar') as mock_alive_bar:
        mock_bar = Mock()
        mock_alive_bar.return_value.__enter__ = Mock(return_value=mock_bar)
        mock_alive_bar.return_value.__exit__ = Mock(return_value=False)

        await self_check(
            test_db, test_db.sites_dict, logger, silent=True,
        )

        self_check_call = mock_alive_bar.call_args_list[0]
        _, kwargs = self_check_call
        assert kwargs.get('title') == 'Self-checking'
        assert kwargs.get('disable') is False


@pytest.mark.asyncio
async def test_site_self_check_handles_exception(test_db):
    """Verify that site_self_check catches unexpected exceptions and returns a valid result."""
    logger = Mock()
    sem = asyncio.Semaphore(1)
    site = test_db.sites_dict['ValidActive']

    with patch('maigret.checking.maigret', side_effect=RuntimeError("test crash")):
        result = await site_self_check(site, logger, sem, test_db)

    assert isinstance(result, dict)
    assert "issues" in result
    assert len(result["issues"]) > 0
    assert any("Unexpected error" in issue for issue in result["issues"])


@pytest.mark.asyncio
async def test_self_check_handles_task_exception(test_db):
    """Verify that self_check continues when individual site checks raise exceptions."""
    logger = Mock()

    with patch('maigret.checking.maigret', side_effect=RuntimeError("test crash")):
        result = await self_check(
            test_db, test_db.sites_dict, logger, silent=True,
            no_progressbar=True,
        )

    assert isinstance(result, dict)
    assert 'results' in result
    assert len(result['results']) == len(test_db.sites_dict)
    for r in result['results']:
        assert 'site_name' in r
        assert 'issues' in r


@pytest.mark.slow
@pytest.mark.skip(reason="broken, fixme")
def test_maigret_results(test_db):
@@ -110,7 +194,7 @@ def test_extract_ids_from_page(test_db):


def test_extract_ids_from_results(test_db):
    TEST_EXAMPLE = copy.deepcopy(RESULTS_EXAMPLE)
    TEST_EXAMPLE: dict = copy.deepcopy(RESULTS_EXAMPLE)
    TEST_EXAMPLE['Reddit']['ids_usernames'] = {'test1': 'yandex_public_id'}
    TEST_EXAMPLE['Reddit']['ids_links'] = ['https://www.reddit.com/user/test2']


@@ -6,7 +6,7 @@ import os
import pytest
from io import StringIO

import xmind
import xmind  # type: ignore[import-untyped]
from jinja2 import Template

from maigret.report import (

@@ -0,0 +1,53 @@
import unittest
from unittest.mock import patch, mock_open

from maigret.settings import Settings


class TestSettings(unittest.TestCase):
    @patch('json.load')
    @patch('builtins.open', new_callable=mock_open)
    def test_settings_cascade_and_override(self, mock_file, mock_json_load):
        file1_data = {"timeout": 10, "retries_count": 3, "proxy_url": "http://proxy1"}
        file2_data = {"timeout": 20, "recursive_search": True}
        file3_data = {"proxy_url": "http://proxy3", "print_not_found": False}

        mock_json_load.side_effect = [file1_data, file2_data, file3_data]

        settings = Settings()
        paths = ['file1.json', 'file2.json', 'file3.json']

        was_inited, msg = settings.load(paths)

        self.assertTrue(was_inited)
        self.assertEqual(settings.retries_count, 3)
        self.assertEqual(settings.timeout, 20)
        self.assertTrue(settings.recursive_search)
        self.assertEqual(settings.proxy_url, "http://proxy3")
        self.assertFalse(settings.print_not_found)

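The cascade semantics this test pins down (later files override earlier ones key by key, while keys absent from later files keep their earlier values) could be implemented with a loop along these lines; a sketch under that assumption, omitting the (was_inited, msg) return and the error handling tested below:

    import json

    def load(self, paths):
        # later files win per key; untouched keys keep earlier values
        for path in paths:
            with open(path) as f:
                for key, value in json.load(f).items():
                    setattr(self, key, value)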
    @patch('builtins.open')
    def test_settings_file_not_found(self, mock_open_func):
        mock_open_func.side_effect = FileNotFoundError()

        settings = Settings()
        paths = ['nonexistent.json']

        was_inited, msg = settings.load(paths)

        self.assertFalse(was_inited)
        self.assertIn('None of the default settings files found', msg)

    @patch('json.load')
    @patch('builtins.open', new_callable=mock_open)
    def test_settings_invalid_json(self, mock_file, mock_json_load):
        mock_json_load.side_effect = ValueError("Expecting value")

        settings = Settings()
        paths = ['invalid.json']

        was_inited, msg = settings.load(paths)

        self.assertFalse(was_inited)
        self.assertIsInstance(msg, ValueError)
        self.assertIn('Problem with parsing json contents', str(msg))
+94
-1
@@ -1,8 +1,10 @@
"""Maigret Database test functions"""

from typing import Any, Dict

from maigret.sites import MaigretDatabase, MaigretSite

EXAMPLE_DB = {
EXAMPLE_DB: Dict[str, Any] = {
    'engines': {
        "XenForo": {
            "presenseStrs": ["XenForo"],
@@ -182,6 +184,97 @@ def test_ranked_sites_dict_id_type():
    assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1


def test_ranked_sites_dict_excluded_tags():
    db = MaigretDatabase()
    db.update_site(MaigretSite('3', {'alexaRank': 1000, 'engine': 'ucoz'}))
    db.update_site(MaigretSite('1', {'alexaRank': 2, 'tags': ['forum']}))
    db.update_site(MaigretSite('2', {'alexaRank': 10, 'tags': ['ru', 'forum']}))

    # excluding by tag
    assert list(db.ranked_sites_dict(excluded_tags=['ru']).keys()) == ['1', '3']
    assert list(db.ranked_sites_dict(excluded_tags=['forum']).keys()) == ['3']

    # excluding by engine
    assert list(db.ranked_sites_dict(excluded_tags=['ucoz']).keys()) == ['1', '2']

    # combining include and exclude tags
    assert list(db.ranked_sites_dict(tags=['forum'], excluded_tags=['ru']).keys()) == ['1']

    # excluding non-existent tag has no effect
    assert list(db.ranked_sites_dict(excluded_tags=['nonexistent']).keys()) == ['1', '2', '3']

    # exclude all
    assert list(db.ranked_sites_dict(excluded_tags=['forum', 'ucoz']).keys()) == []

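Putting the parameters together, this is the same filtering the web UI invokes; all keyword arguments below appear elsewhere in this diff, and db is assumed to be a loaded MaigretDatabase:

    sites = db.ranked_sites_dict(
        top=500,
        tags=['forum'],            # whitelist
        excluded_tags=['ru'],      # blacklist; wins when a site matches both
        disabled=False,
        id_type='username',
    )
    print(len(sites), list(sites)[:5])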
def test_ranked_sites_dict_excluded_tags_with_top():
    """Excluded tags should also prevent mirrors from being included."""
    db = MaigretDatabase()
    db.update_site(
        MaigretSite('Parent', {'alexaRank': 1, 'tags': ['forum'], 'type': 'username'})
    )
    db.update_site(
        MaigretSite('Mirror', {'alexaRank': 999999, 'source': 'Parent', 'tags': ['forum'], 'type': 'username'})
    )
    db.update_site(
        MaigretSite('Other', {'alexaRank': 2, 'tags': ['coding'], 'type': 'username'})
    )

    # Without exclusion, mirror should be included
    result = db.ranked_sites_dict(top=1, id_type='username')
    assert 'Parent' in result
    assert 'Mirror' in result

    # With exclusion of 'forum', both Parent and Mirror should be excluded
    result = db.ranked_sites_dict(top=2, excluded_tags=['forum'], id_type='username')
    assert 'Parent' not in result
    assert 'Mirror' not in result
    assert 'Other' in result


def test_ranked_sites_dict_mirrors_disabled_parent():
    """Mirror is included when parent ranks in top N but parent is disabled."""
    db = MaigretDatabase()
    db.update_site(
        MaigretSite(
            'ParentPlatform',
            {'alexaRank': 5, 'disabled': True, 'type': 'username'},
        )
    )
    db.update_site(
        MaigretSite(
            'OtherSite',
            {'alexaRank': 100, 'type': 'username'},
        )
    )
    db.update_site(
        MaigretSite(
            'MirrorSite',
            {
                'alexaRank': 99999999,
                'source': 'ParentPlatform',
                'type': 'username',
            },
        )
    )

    result = db.ranked_sites_dict(top=1, disabled=False, id_type='username')
    assert list(result.keys()) == ['OtherSite', 'MirrorSite']


def test_ranked_sites_dict_mirrors_no_extra_without_parent_in_top():
    db = MaigretDatabase()
    db.update_site(MaigretSite('A', {'alexaRank': 1, 'type': 'username'}))
    db.update_site(
        MaigretSite(
            'B',
            {'alexaRank': 2, 'source': 'NotInDb', 'type': 'username'},
        )
    )

    assert list(db.ranked_sites_dict(top=1, id_type='username').keys()) == ['A']


def test_get_url_template():
    site = MaigretSite(
        "test",

+86
-3
@@ -1,8 +1,10 @@
import re

import pytest
from unittest.mock import MagicMock, patch
from maigret.submit import Submitter
from aiohttp import ClientSession
from maigret.sites import MaigretDatabase
from maigret.sites import MaigretDatabase, MaigretSite
import logging


@@ -26,7 +28,7 @@ async def test_detect_known_engine(test_db, local_test_db):
    url_exists = "https://devforum.zoom.us/u/adam"
    url_mainpage = "https://devforum.zoom.us/"
    # Mock extract_username_dialog to return "adam"
    submitter.extract_username_dialog = MagicMock(return_value="adam")
    submitter.extract_username_dialog = MagicMock(return_value="adam")  # type: ignore[method-assign]

    sites, resp_text = await submitter.detect_known_engine(
        url_exists, url_mainpage, session=None, follow_redirects=False, headers=None
@@ -109,7 +111,7 @@ async def test_check_features_manually_success(settings):

@pytest.mark.slow
@pytest.mark.asyncio
async def test_check_features_manually_success(settings):
async def test_check_features_manually_cloudflare(settings):
    # Setup
    db = MaigretDatabase()
    logger = logging.getLogger("test_logger")
@@ -275,3 +277,84 @@ async def test_dialog_adds_site_negative(settings):
    await submitter.close()

    assert result is False


def test_domain_matching_exact():
    """Test that domain matching uses proper boundary checks, not substring matching.

    x.com should NOT match sites like 500px.com, mix.com, etc.
    """
    domain_raw = "x.com"
    domain_re = re.compile(
        r'://(www\.)?' + re.escape(domain_raw) + r'(/|$)'
    )

    # These should NOT match x.com
    non_matching = [
        MaigretSite("500px", {"url": "https://500px.com/p/{username}", "urlMain": "https://500px.com/"}),
        MaigretSite("Mix", {"url": "https://mix.com/{username}", "urlMain": "https://mix.com"}),
        MaigretSite("Screwfix", {"url": "{urlMain}{urlSubpath}/members/?username={username}", "urlMain": "https://community.screwfix.com"}),
        MaigretSite("Wix", {"url": "https://{username}.wix.com", "urlMain": "https://wix.com/"}),
        MaigretSite("1x", {"url": "https://1x.com/{username}", "urlMain": "https://1x.com"}),
        MaigretSite("Roblox", {"url": "https://www.roblox.com/user.aspx?username={username}", "urlMain": "https://www.roblox.com/"}),
    ]

    for site in non_matching:
        assert not domain_re.search(site.url_main + site.url), \
            f"x.com should NOT match site {site.name} ({site.url_main})"


def test_domain_matching_positive():
    """Test that domain matching correctly matches the exact domain."""
    domain_raw = "x.com"
    domain_re = re.compile(
        r'://(www\.)?' + re.escape(domain_raw) + r'(/|$)'
    )

    # These SHOULD match x.com
    matching = [
        MaigretSite("X", {"url": "https://x.com/{username}", "urlMain": "https://x.com"}),
        MaigretSite("X-www", {"url": "https://www.x.com/{username}", "urlMain": "https://www.x.com"}),
    ]

    for site in matching:
        assert domain_re.search(site.url_main + site.url), \
            f"x.com SHOULD match site {site.name} ({site.url_main})"


def test_dialog_nonexistent_site_name_no_crash():
    """Test that entering a site name not in the matched list doesn't crash.

    This tests the fix for: AttributeError: 'NoneType' object has no attribute 'name'
    The old_site should be None when user enters a name not in matched_sites,
    and the code should handle it gracefully.
    """
    # Simulate the logic that was crashing
    matched_sites = [
        MaigretSite("ValidActive", {"url": "https://example.com/{username}", "urlMain": "https://example.com"}),
        MaigretSite("InvalidActive", {"url": "https://example.com/alt/{username}", "urlMain": "https://example.com"}),
    ]
    site_name = "NonExistentSite"

    old_site = next(
        (site for site in matched_sites if site.name == site_name), None
    )

    # This is what the old code did - it would crash here
    assert old_site is None

    # The fix: check before accessing .name
    if old_site is None:
        result = "not found"
    else:
        result = old_site.name

    assert result == "not found"

    # And when site_name IS in matched_sites, it should work
    site_name = "ValidActive"
    old_site = next(
        (site for site in matched_sites if site.name == site_name), None
    )
    assert old_site is not None
    assert old_site.name == "ValidActive"

@@ -0,0 +1,63 @@
"""Tests for the Twitter / X site entry and GraphQL probe."""

import re

import pytest
import requests

from maigret.sites import MaigretSite


def _twitter_site(site: MaigretSite) -> None:
    assert site.name == "Twitter"
    assert site.disabled is False
    assert site.check_type == "message"
    assert site.url_probe and "{username}" in site.url_probe
    assert "UserByScreenName" in site.url_probe or "graphql" in site.url_probe
    assert site.regex_check
    assert re.fullmatch(site.regex_check, site.username_claimed)
    assert re.fullmatch(site.regex_check, site.username_unclaimed)
    assert site.absence_strs
    assert site.activation.get("method") == "twitter"
    assert site.activation.get("url")
    assert "authorization" in {k.lower() for k in site.headers.keys()}


def test_twitter_site_entry_config(default_db):
    """Twitter entry in data.json must define probe URL, regex, and activation."""
    site = default_db.sites_dict["Twitter"]
    assert isinstance(site, MaigretSite)
    _twitter_site(site)


@pytest.mark.slow
def test_twitter_graphql_probe_claimed_vs_unclaimed(default_db):
    """
    Live check: guest activation + UserByScreenName GraphQL returns a user for
    usernameClaimed and no user for usernameUnclaimed (same flow as urlProbe).
    """
    site = default_db.sites_dict["Twitter"]
    _twitter_site(site)

    headers = dict(site.headers)
    headers.pop("x-guest-token", None)

    act = requests.post(site.activation["url"], headers=headers, timeout=45)
    assert act.status_code == 200, act.text[:500]
    body = act.json()
    assert "guest_token" in body
    headers["x-guest-token"] = body["guest_token"]

    def fetch(username: str) -> dict:
        url = site.url_probe.format(username=username)
        resp = requests.get(url, headers=headers, timeout=45)
        resp.raise_for_status()
        return resp.json()

    claimed_json = fetch(site.username_claimed)
    assert "data" in claimed_json
    assert claimed_json["data"].get("user") is not None

    unclaimed_json = fetch(site.username_unclaimed)
    data = unclaimed_json.get("data") or {}
    assert data == {} or data.get("user") is None
@@ -0,0 +1,480 @@
#!/usr/bin/env python3
"""
Mass site checking utility for Maigret development.
Check top-N sites from data.json and generate a report.

Usage:
    python utils/check_top_n.py --top 100                # Check top 100 sites
    python utils/check_top_n.py --top 50 --parallel 10   # Check with 10 parallel requests
    python utils/check_top_n.py --top 100 --output report.json
    python utils/check_top_n.py --top 100 --fix          # Auto-fix simple issues
"""

import argparse
import asyncio
import json
import sys
import time
from collections import defaultdict
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Add parent dir for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

try:
    import aiohttp
except ImportError:
    print("aiohttp not installed. Run: pip install aiohttp")
    sys.exit(1)


class Colors:
    RED = "\033[91m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    BLUE = "\033[94m"
    CYAN = "\033[96m"
    RESET = "\033[0m"
    BOLD = "\033[1m"


def color(text: str, c: str) -> str:
    return f"{c}{text}{Colors.RESET}"


@dataclass
class SiteCheckResult:
    """Result of checking a single site."""
    site_name: str
    alexa_rank: int
    disabled: bool
    check_type: str

    # Status
    status: str = "unknown"  # working, broken, timeout, error, anti_bot, disabled

    # HTTP results
    claimed_http_status: Optional[int] = None
    unclaimed_http_status: Optional[int] = None
    claimed_error: Optional[str] = None
    unclaimed_error: Optional[str] = None

    # Issues detected
    issues: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)

    # Recommendations
    recommendations: List[str] = field(default_factory=list)

    # Timing
    check_time_ms: int = 0


DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}


async def check_url(url: str, headers: dict, timeout: int = 15) -> dict:
    """Quick URL check returning status and basic info."""
    result = {
        "status": None,
        "final_url": None,
        "content_length": 0,
        "error": None,
        "error_type": None,
        "content": None,
        "markers": {},
    }

    try:
        connector = aiohttp.TCPConnector(ssl=False)
        timeout_obj = aiohttp.ClientTimeout(total=timeout)

        async with aiohttp.ClientSession(connector=connector, timeout=timeout_obj) as session:
            async with session.get(url, headers=headers, allow_redirects=True) as resp:
                result["status"] = resp.status
                result["final_url"] = str(resp.url)

                try:
                    text = await resp.text()
                    result["content_length"] = len(text)
                    result["content"] = text

                    text_lower = text.lower()
                    result["markers"] = {
                        "404_text": any(m in text_lower for m in ["not found", "404", "doesn't exist"]),
                        "captcha": any(m in text_lower for m in ["captcha", "recaptcha", "challenge"]),
                        "cloudflare": "cloudflare" in text_lower,
                        "login": any(m in text_lower for m in ["log in", "login", "sign in"]),
                    }
                except Exception as e:
                    result["error"] = f"Content error: {e}"
                    result["error_type"] = "content"

    except asyncio.TimeoutError:
        result["error"] = "Timeout"
        result["error_type"] = "timeout"
    except aiohttp.ClientError as e:
        result["error"] = str(e)
        result["error_type"] = "client"
    except Exception as e:
        result["error"] = str(e)
        result["error_type"] = "unknown"

    return result

async def check_site(site_name: str, config: dict, timeout: int = 15) -> SiteCheckResult:
|
||||
"""Check a single site and return detailed result."""
|
||||
start_time = time.time()
|
||||
|
||||
result = SiteCheckResult(
|
||||
site_name=site_name,
|
||||
alexa_rank=config.get("alexaRank", 999999),
|
||||
disabled=config.get("disabled", False),
|
||||
check_type=config.get("checkType", "status_code"),
|
||||
)
|
||||
|
||||
# Skip disabled sites
|
||||
if result.disabled:
|
||||
result.status = "disabled"
|
||||
return result
|
||||
|
||||
# Build URL
|
||||
url_template = config.get("url", "")
|
||||
url_main = config.get("urlMain", "")
|
||||
url_subpath = config.get("urlSubpath", "")
|
||||
url_template = url_template.replace("{urlMain}", url_main).replace("{urlSubpath}", url_subpath)
|
||||
|
||||
claimed = config.get("usernameClaimed")
|
||||
unclaimed = config.get("usernameUnclaimed", "noonewouldeverusethis7")
|
||||
|
||||
if not claimed:
|
||||
result.status = "error"
|
||||
result.issues.append("No usernameClaimed defined")
|
||||
return result
|
||||
|
||||
# Prepare headers
|
||||
headers = DEFAULT_HEADERS.copy()
|
||||
if config.get("headers"):
|
||||
headers.update(config["headers"])
|
||||
|
||||
# Check both URLs
|
||||
url_claimed = url_template.replace("{username}", claimed)
|
||||
url_unclaimed = url_template.replace("{username}", unclaimed)
|
||||
|
||||
try:
|
||||
claimed_result, unclaimed_result = await asyncio.gather(
|
||||
check_url(url_claimed, headers, timeout),
|
||||
check_url(url_unclaimed, headers, timeout),
|
||||
)
|
||||
except Exception as e:
|
||||
result.status = "error"
|
||||
result.issues.append(f"Check failed: {e}")
|
||||
return result
|
||||
|
||||
result.claimed_http_status = claimed_result["status"]
|
||||
result.unclaimed_http_status = unclaimed_result["status"]
|
||||
result.claimed_error = claimed_result.get("error")
|
||||
result.unclaimed_error = unclaimed_result.get("error")
|
||||
|
||||
# Categorize result
|
||||
if claimed_result["error_type"] == "timeout" or unclaimed_result["error_type"] == "timeout":
|
||||
result.status = "timeout"
|
||||
result.issues.append("Request timeout")
|
||||
|
||||
elif claimed_result["status"] == 403 or claimed_result["status"] == 429:
|
||||
result.status = "anti_bot"
|
||||
result.issues.append(f"Anti-bot protection (HTTP {claimed_result['status']})")
|
||||
|
||||
elif claimed_result.get("markers", {}).get("captcha"):
|
||||
result.status = "anti_bot"
|
||||
result.issues.append("Captcha detected")
|
||||
|
||||
elif claimed_result.get("markers", {}).get("cloudflare"):
|
||||
result.status = "anti_bot"
|
||||
result.warnings.append("Cloudflare protection detected")
|
||||
|
||||
elif claimed_result["error"] or unclaimed_result["error"]:
|
||||
result.status = "error"
|
||||
if claimed_result["error"]:
|
||||
result.issues.append(f"Claimed error: {claimed_result['error']}")
|
||||
if unclaimed_result["error"]:
|
||||
result.issues.append(f"Unclaimed error: {unclaimed_result['error']}")
|
||||
|
||||
else:
|
||||
# Validate check type
|
||||
check_type = config.get("checkType", "status_code")
|
||||
|
||||
if check_type == "status_code":
|
||||
if claimed_result["status"] == unclaimed_result["status"]:
|
||||
result.status = "broken"
|
||||
result.issues.append(f"Same status code ({claimed_result['status']}) for both")
|
||||
# Suggest fix
|
||||
if claimed_result["final_url"] != unclaimed_result["final_url"]:
|
||||
result.recommendations.append("Switch to checkType: response_url")
|
||||
else:
|
||||
result.status = "working"
|
||||
|
||||
elif check_type == "response_url":
|
||||
if claimed_result["final_url"] == unclaimed_result["final_url"]:
|
||||
result.status = "broken"
|
||||
result.issues.append("Same final URL for both")
|
||||
if claimed_result["status"] != unclaimed_result["status"]:
|
||||
result.recommendations.append("Switch to checkType: status_code")
|
||||
else:
|
||||
result.status = "working"
|
||||
|
||||
elif check_type == "message":
|
||||
presense_strs = config.get("presenseStrs", [])
|
||||
absence_strs = config.get("absenceStrs", [])
|
||||
|
||||
claimed_content = claimed_result.get("content", "") or ""
|
||||
unclaimed_content = unclaimed_result.get("content", "") or ""
|
||||
|
||||
presense_ok = not presense_strs or any(s in claimed_content for s in presense_strs)
|
||||
absence_claimed = absence_strs and any(s in claimed_content for s in absence_strs)
|
||||
absence_unclaimed = absence_strs and any(s in unclaimed_content for s in absence_strs)
|
||||
|
||||
if presense_strs and not presense_ok:
|
||||
result.status = "broken"
|
||||
result.issues.append(f"presenseStrs not found: {presense_strs}")
|
||||
# Check if status_code would work
|
||||
if claimed_result["status"] != unclaimed_result["status"]:
|
||||
result.recommendations.append(f"Switch to checkType: status_code ({claimed_result['status']} vs {unclaimed_result['status']})")
|
||||
elif absence_claimed:
|
||||
result.status = "broken"
|
||||
result.issues.append(f"absenceStrs found in claimed page")
|
||||
elif absence_strs and not absence_unclaimed:
|
||||
result.status = "broken"
|
||||
result.warnings.append("absenceStrs not found in unclaimed page")
|
||||
else:
|
||||
result.status = "working"
|
||||
|
||||
else:
|
||||
result.status = "unknown"
|
||||
result.warnings.append(f"Unknown checkType: {check_type}")
|
||||
|
||||
result.check_time_ms = int((time.time() - start_time) * 1000)
|
||||
return result
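
# A minimal sketch of driving check_site() standalone (hypothetical site entry;
# the keys mirror those the function reads from data.json):
#
#     example = {
#         "url": "{urlMain}/user/{username}",
#         "urlMain": "https://example.com",
#         "checkType": "status_code",
#         "usernameClaimed": "john",
#         "alexaRank": 1234,
#     }
#     result = asyncio.run(check_site("Example", example, timeout=10))
#     print(result.status, result.issues, result.recommendations)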


def load_sites(db_path: Path) -> Dict[str, dict]:
    """Load all sites from data.json."""
    with open(db_path) as f:
        data = json.load(f)
    return data.get("sites", {})


def get_top_sites(sites: Dict[str, dict], n: int) -> List[Tuple[str, dict]]:
    """Get top N sites by Alexa rank."""
    ranked = []
    for name, config in sites.items():
        rank = config.get("alexaRank", 999999)
        ranked.append((name, config, rank))

    ranked.sort(key=lambda x: x[2])
    return [(name, config) for name, config, _ in ranked[:n]]


async def check_sites_batch(sites: List[Tuple[str, dict]], parallel: int = 5,
                            timeout: int = 15, progress_callback=None) -> List[SiteCheckResult]:
    """Check multiple sites with parallelism control."""
    semaphore = asyncio.Semaphore(parallel)

    async def check_with_semaphore(name, config, index):
        async with semaphore:
            if progress_callback:
                progress_callback(index, len(sites), name)
            return await check_site(name, config, timeout)

    tasks = [
        check_with_semaphore(name, config, i)
        for i, (name, config) in enumerate(sites)
    ]

    results = await asyncio.gather(*tasks)
    return results


def print_progress(current: int, total: int, site_name: str):
    """Print progress indicator."""
    pct = int(current / total * 100)
    bar_width = 30
    filled = int(bar_width * current / total)
    bar = "█" * filled + "░" * (bar_width - filled)
    print(f"\r[{bar}] {pct:3d}% ({current}/{total}) {site_name:<30}", end="", flush=True)


def generate_report(results: List[SiteCheckResult]) -> dict:
    """Generate a summary report from check results."""
    report = {
        "summary": {
            "total": len(results),
            "working": 0,
            "broken": 0,
            "disabled": 0,
            "timeout": 0,
            "anti_bot": 0,
            "error": 0,
            "unknown": 0,
        },
        "by_status": defaultdict(list),
        "issues": [],
        "recommendations": [],
    }

    for r in results:
        report["summary"][r.status] = report["summary"].get(r.status, 0) + 1
        report["by_status"][r.status].append(r.site_name)

        if r.issues:
            report["issues"].append({
                "site": r.site_name,
                "rank": r.alexa_rank,
                "issues": r.issues,
            })

        if r.recommendations:
            report["recommendations"].append({
                "site": r.site_name,
                "rank": r.alexa_rank,
                "recommendations": r.recommendations,
            })

    return report


def print_report(report: dict, results: List[SiteCheckResult]):
    """Print a formatted report to console."""
    summary = report["summary"]

    print(f"\n{'='*60}")
    print(f"{color('SITE CHECK REPORT', Colors.CYAN)}")
    print(f"{'='*60}\n")

    print(f"{color('SUMMARY:', Colors.BOLD)}")
    print(f"  Total sites checked: {summary['total']}")
    print(f"  {color('Working:', Colors.GREEN)} {summary['working']}")
    print(f"  {color('Broken:', Colors.RED)} {summary['broken']}")
    print(f"  {color('Disabled:', Colors.YELLOW)} {summary['disabled']}")
    print(f"  {color('Timeout:', Colors.YELLOW)} {summary['timeout']}")
    print(f"  {color('Anti-bot:', Colors.YELLOW)} {summary['anti_bot']}")
    print(f"  {color('Error:', Colors.RED)} {summary['error']}")

    # Broken sites
    if report["by_status"]["broken"]:
        print(f"\n{color('BROKEN SITES:', Colors.RED)}")
        for site in report["by_status"]["broken"][:20]:
            r = next(x for x in results if x.site_name == site)
            print(f"  - {site} (rank {r.alexa_rank}): {', '.join(r.issues)}")
        if len(report["by_status"]["broken"]) > 20:
            print(f"  ... and {len(report['by_status']['broken']) - 20} more")

    # Timeout sites
    if report["by_status"]["timeout"]:
        print(f"\n{color('TIMEOUT SITES:', Colors.YELLOW)}")
        for site in report["by_status"]["timeout"][:10]:
            print(f"  - {site}")
        if len(report["by_status"]["timeout"]) > 10:
            print(f"  ... and {len(report['by_status']['timeout']) - 10} more")

    # Anti-bot sites
    if report["by_status"]["anti_bot"]:
        print(f"\n{color('ANTI-BOT PROTECTED:', Colors.YELLOW)}")
        for site in report["by_status"]["anti_bot"][:10]:
            r = next(x for x in results if x.site_name == site)
            print(f"  - {site}: {', '.join(r.issues)}")
        if len(report["by_status"]["anti_bot"]) > 10:
            print(f"  ... and {len(report['by_status']['anti_bot']) - 10} more")

    # Recommendations
    if report["recommendations"]:
        print(f"\n{color('RECOMMENDATIONS:', Colors.CYAN)}")
        for rec in report["recommendations"][:15]:
            print(f"  {rec['site']} (rank {rec['rank']}):")
            for r in rec["recommendations"]:
                print(f"    -> {r}")
        if len(report["recommendations"]) > 15:
            print(f"  ... and {len(report['recommendations']) - 15} more")


async def main():
    parser = argparse.ArgumentParser(
        description="Mass site checking for Maigret",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--top", "-n", type=int, default=100,
                        help="Check top N sites by Alexa rank (default: 100)")
    parser.add_argument("--parallel", "-p", type=int, default=5,
                        help="Number of parallel requests (default: 5)")
    parser.add_argument("--timeout", "-t", type=int, default=15,
                        help="Request timeout in seconds (default: 15)")
    parser.add_argument("--output", "-o", help="Output JSON report to file")
    parser.add_argument("--include-disabled", action="store_true",
                        help="Include disabled sites in results")
    parser.add_argument("--only-broken", action="store_true",
                        help="Only show broken sites")
    parser.add_argument("--json", action="store_true",
                        help="Output as JSON only")

    args = parser.parse_args()

    # Load sites
    db_path = Path(__file__).parent.parent / "maigret" / "resources" / "data.json"
    if not db_path.exists():
        print(f"Database not found: {db_path}")
        sys.exit(1)

    sites = load_sites(db_path)
    top_sites = get_top_sites(sites, args.top)

    if not args.json:
        print(f"Checking top {len(top_sites)} sites (parallel={args.parallel}, timeout={args.timeout}s)...")
        print()

    # Run checks
    progress = print_progress if not args.json else None
    results = await check_sites_batch(top_sites, args.parallel, args.timeout, progress)

    if not args.json:
        print()  # Clear progress line

    # Filter results
    if not args.include_disabled:
        results = [r for r in results if r.status != "disabled"]
    if args.only_broken:
        results = [r for r in results if r.status in ("broken", "error", "timeout")]

    # Generate report
    report = generate_report(results)

    # Output
    if args.json:
        output = {
            "report": report,
            "results": [asdict(r) for r in results],
        }
        print(json.dumps(output, indent=2))
    else:
        print_report(report, results)

    # Save to file
    if args.output:
        output = {
            "report": report,
            "results": [asdict(r) for r in results],
        }
        with open(args.output, "w") as f:
            json.dump(output, f, indent=2)
        print(f"\nReport saved to: {args.output}")


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,223 @@
#!/usr/bin/env python3
"""
Probe likely false-positive sites among the top-N Alexa-ranked entries.

For each of K random *distinct* usernames taken from ``usernameClaimed`` fields in
the Maigret database, runs a clean ``maigret`` scan (``--top-sites N --json simple|ndjson``).
Sites that return CLAIMED in *every* run are reported: unrelated random claimed
handles are unlikely to all exist on the same third-party site, so such sites are
candidates for broken checks.
"""

from __future__ import annotations

import argparse
import json
import random
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path


def repo_root() -> Path:
    return Path(__file__).resolve().parent.parent


def load_username_claimed_pool(db_path: Path) -> list[str]:
    with db_path.open(encoding="utf-8") as f:
        data = json.load(f)
    sites = data.get("sites") or {}
    seen: set[str] = set()
    pool: list[str] = []
    for _name, site in sites.items():
        u = (site or {}).get("usernameClaimed")
        if not u or not isinstance(u, str):
            continue
        u = u.strip()
        if not u or u in seen:
            continue
        seen.add(u)
        pool.append(u)
    return pool


def run_maigret(
    *,
    username: str,
    db_path: Path,
    out_dir: Path,
    top_sites: int,
    json_format: str,
    quiet: bool,
) -> Path:
    """Run maigret subprocess; return path to the written JSON report."""
    safe = username.replace("/", "_")
    report_name = f"report_{safe}_{json_format}.json"
    report_path = out_dir / report_name

    cmd = [
        sys.executable,
        "-m",
        "maigret",
        username,
        "--db",
        str(db_path),
        "--top-sites",
        str(top_sites),
        "--json",
        json_format,
        "--folderoutput",
        str(out_dir),
        "--no-progressbar",
        "--no-color",
        "--no-recursion",
        "--no-extracting",
    ]
    sink = subprocess.DEVNULL if quiet else None
    proc = subprocess.run(
        cmd,
        cwd=str(repo_root()),
        text=True,
        stdout=sink,
        stderr=sink,
    )
    if proc.returncode != 0:
        raise RuntimeError(
            f"maigret exited with {proc.returncode} for username {username!r}"
        )
    if not report_path.is_file():
        raise FileNotFoundError(f"Expected report missing: {report_path}")
    return report_path


def claimed_sites_from_report(path: Path, json_format: str) -> set[str]:
    if json_format == "simple":
        with path.open(encoding="utf-8") as f:
            data = json.load(f)
        if not isinstance(data, dict):
            return set()
        return set(data.keys())
    # ndjson: one object per line, each has "sitename"
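    # Illustrative ndjson line (field set abridged; only "sitename" is consumed here):
    #   {"sitename": "GitHub", "status": "Claimed", "url_user": "https://github.com/soxoj"}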
    sites: set[str] = set()
    with path.open(encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            obj = json.loads(line)
            name = obj.get("sitename")
            if isinstance(name, str) and name:
                sites.add(name)
    return sites


def main() -> int:
    parser = argparse.ArgumentParser(
        description=(
            "Pick random distinct usernameClaimed values, run maigret --top-sites N "
            "with JSON reports, and list sites that claimed all of them (suspicious FP)."
        )
    )
    parser.add_argument(
        "--db",
        "-b",
        type=Path,
        default=repo_root() / "maigret" / "resources" / "data.json",
        help="Path to Maigret data.json (a temp copy is used for runs).",
    )
    parser.add_argument(
        "--top-sites",
        "-n",
        type=int,
        default=500,
        metavar="N",
        help="Value for maigret --top-sites (default: 500).",
    )
    parser.add_argument(
        "--samples",
        "-k",
        type=int,
        default=5,
        metavar="K",
        help="How many distinct random usernames to draw (default: 5).",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="RNG seed for reproducible username selection.",
    )
    parser.add_argument(
        "--json",
        dest="json_format",
        default="simple",
        choices=["simple", "ndjson"],
        help="JSON report type passed to maigret --json (default: simple).",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        default=False,
        help="Print maigret stdout/stderr (default: suppress child output).",
    )
    args = parser.parse_args()
    quiet = not args.verbose

    db_src = args.db.resolve()
    if not db_src.is_file():
        print(f"Database not found: {db_src}", file=sys.stderr)
        return 2

    pool = load_username_claimed_pool(db_src)
    if len(pool) < args.samples:
        print(
            f"Need at least {args.samples} distinct usernameClaimed entries, "
            f"found {len(pool)}.",
            file=sys.stderr,
        )
        return 2

    rng = random.Random(args.seed)
    picked = rng.sample(pool, args.samples)

    print(f"Database: {db_src}")
    print(f"--top-sites {args.top_sites}, {args.samples} random usernameClaimed:")
    for i, u in enumerate(picked, 1):
        print(f"  {i}. {u}")

    site_sets: list[set[str]] = []
    with tempfile.TemporaryDirectory(prefix="maigret_fp_probe_") as tmp:
        tmp_path = Path(tmp)
        db_work = tmp_path / "data.json"
        shutil.copyfile(db_src, db_work)

        for u in picked:
            print(f"\nRunning maigret for {u!r} ...", flush=True)
            report = run_maigret(
                username=u,
                db_path=db_work,
                out_dir=tmp_path,
                top_sites=args.top_sites,
                json_format=args.json_format,
                quiet=quiet,
            )
            sites = claimed_sites_from_report(report, args.json_format)
            site_sets.append(sites)
            print(f"  -> {len(sites)} positive site(s) in JSON", flush=True)

    always = set.intersection(*site_sets) if site_sets else set()
    print("\n--- Sites with CLAIMED in all runs (candidates for false positives) ---")
    if not always:
        print("(none)")
    else:
        for name in sorted(always):
            print(name)

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,59 @@
"""Generate db_meta.json from data.json for the auto-update system."""

import argparse
import hashlib
import json
import os.path as path
from datetime import datetime, timezone

RESOURCES_DIR = path.join(path.dirname(path.dirname(path.abspath(__file__))), "maigret", "resources")
DATA_JSON_PATH = path.join(RESOURCES_DIR, "data.json")
META_JSON_PATH = path.join(RESOURCES_DIR, "db_meta.json")
DEFAULT_DATA_URL = "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"


def get_current_version():
    version_file = path.join(path.dirname(path.dirname(path.abspath(__file__))), "maigret", "__version__.py")
    with open(version_file) as f:
        for line in f:
            if line.startswith("__version__"):
                return line.split("=")[1].strip().strip("'\"")
    return "0.0.0"


def main():
    parser = argparse.ArgumentParser(description="Generate db_meta.json from data.json")
    parser.add_argument("--min-version", default=None, help="Minimum compatible maigret version (default: current version)")
    parser.add_argument("--data-url", default=DEFAULT_DATA_URL, help="URL where data.json can be downloaded")
    args = parser.parse_args()

    min_version = args.min_version or get_current_version()

    with open(DATA_JSON_PATH, "rb") as f:
        raw = f.read()
    sha256 = hashlib.sha256(raw).hexdigest()

    data = json.loads(raw)
    sites_count = len(data.get("sites", {}))

    meta = {
        "version": 1,
        "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
        "sites_count": sites_count,
        "min_maigret_version": min_version,
        "data_sha256": sha256,
        "data_url": args.data_url,
    }

    with open(META_JSON_PATH, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=4, ensure_ascii=False)

    print(f"Generated {META_JSON_PATH}")
    print(f"  sites: {sites_count}")
    print(f"  sha256: {sha256[:16]}...")
    print(f"  min_version: {min_version}")


if __name__ == "__main__":
    main()
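
# Example db_meta.json this script would emit (timestamp, count, and hash are
# illustrative placeholders, not real values):
# {
#     "version": 1,
#     "updated_at": "2025-01-01T00:00:00Z",
#     "sites_count": 3000,
#     "min_maigret_version": "0.5.0",
#     "data_sha256": "9f86d081884c7d65...",
#     "data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
# }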
@@ -0,0 +1,808 @@
#!/usr/bin/env python3
"""
Site check utility for Maigret development.
Quickly test site availability, find valid usernames, and diagnose check issues.

Usage:
    python utils/site_check.py --site "SiteName" --check-claimed
    python utils/site_check.py --site "SiteName" --maigret           # Test via Maigret
    python utils/site_check.py --site "SiteName" --compare-methods   # aiohttp vs Maigret
    python utils/site_check.py --url "https://example.com/user/{username}" --test "john"
    python utils/site_check.py --site "SiteName" --find-user
    python utils/site_check.py --site "SiteName" --diagnose          # Full diagnosis
"""

import argparse
import asyncio
import json
import logging
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Add parent dir for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

try:
    import aiohttp
    from yarl import URL as YarlURL
except ImportError:
    print("aiohttp not installed. Run: pip install aiohttp")
    sys.exit(1)

# Maigret imports (optional, for --maigret mode)
MAIGRET_AVAILABLE = False
try:
    from maigret.sites import MaigretDatabase, MaigretSite
    from maigret.checking import (
        SimpleAiohttpChecker,
        check_site_for_username,
        process_site_result,
        make_site_result,
    )
    from maigret.notify import QueryNotifyPrint
    from maigret.result import QueryStatus
    MAIGRET_AVAILABLE = True
except ImportError:
    pass


DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}

COMMON_USERNAMES = ["blue", "test", "admin", "user", "john", "alex", "david", "mike", "chris", "dan"]


class Colors:
    """ANSI color codes for terminal output."""
    RED = "\033[91m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    BLUE = "\033[94m"
    MAGENTA = "\033[95m"
    CYAN = "\033[96m"
    RESET = "\033[0m"
    BOLD = "\033[1m"


def color(text: str, c: str) -> str:
    """Wrap text with color codes."""
    return f"{c}{text}{Colors.RESET}"


async def check_url_aiohttp(url: str, headers: dict = None, follow_redirects: bool = True,
                            timeout: int = 15, ssl_verify: bool = False,
                            method: str = "GET", payload: dict = None) -> dict:
    """Check a URL using aiohttp and return detailed response info.

    Args:
        method: HTTP method ("GET" or "POST").
        payload: JSON payload for POST requests (dict, will be serialized).
    """
    headers = headers or DEFAULT_HEADERS.copy()
    result = {
        "method": "aiohttp",
        "url": url,
        "status": None,
        "final_url": None,
        "redirects": [],
        "content_length": 0,
        "content": None,
        "title": None,
        "error": None,
        "error_type": None,
        "markers": {},
    }

    try:
        connector = aiohttp.TCPConnector(ssl=ssl_verify)
        timeout_obj = aiohttp.ClientTimeout(total=timeout)

        async with aiohttp.ClientSession(connector=connector, timeout=timeout_obj) as session:
            # Use encoded=True if URL contains percent-encoded chars to prevent double-encoding
            request_url = YarlURL(url, encoded=True) if '%' in url else url
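            # (e.g. a "%40" already present in the path would otherwise be
            #  re-quoted by yarl to "%2540")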
            request_kwargs = dict(headers=headers, allow_redirects=follow_redirects)
            if method.upper() == "POST" and payload is not None:
                request_kwargs["json"] = payload

            request_fn = session.post if method.upper() == "POST" else session.get
            async with request_fn(request_url, **request_kwargs) as resp:
                result["status"] = resp.status
                result["final_url"] = str(resp.url)

                # Get redirect history
                if resp.history:
                    result["redirects"] = [str(r.url) for r in resp.history]

                # Read content
                try:
                    text = await resp.text()
                    result["content_length"] = len(text)
                    result["content"] = text

                    # Extract title
                    title_match = re.search(r'<title>([^<]*)</title>', text, re.IGNORECASE)
                    if title_match:
                        result["title"] = title_match.group(1).strip()[:100]

                    # Check common markers
                    text_lower = text.lower()
                    markers = {
                        "404_text": any(m in text_lower for m in ["not found", "404", "doesn't exist", "does not exist"]),
                        "profile_markers": any(m in text_lower for m in ["profile", "user", "member", "account"]),
                        "error_markers": any(m in text_lower for m in ["error", "banned", "suspended", "blocked"]),
                        "login_required": any(m in text_lower for m in ["log in", "login", "sign in", "signin"]),
                        "captcha": any(m in text_lower for m in ["captcha", "recaptcha", "challenge", "verify you"]),
                        "cloudflare": "cloudflare" in text_lower or "cf-ray" in text_lower,
                        "rate_limit": any(m in text_lower for m in ["rate limit", "too many requests", "429"]),
                    }
                    result["markers"] = markers

                    # First 500 chars of body for inspection
                    result["body_preview"] = text[:500].replace("\n", " ").strip()

                except Exception as e:
                    result["error"] = f"Content read error: {e}"
                    result["error_type"] = "content_error"

    except asyncio.TimeoutError:
        result["error"] = "Timeout"
        result["error_type"] = "timeout"
    except aiohttp.ClientError as e:
        result["error"] = f"Client error: {e}"
        result["error_type"] = "client_error"
    except Exception as e:
        result["error"] = f"Error: {e}"
        result["error_type"] = "unknown"

    return result
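
# Sketch of standalone use (example.com is an illustrative URL, not a tested site):
#     info = asyncio.run(check_url_aiohttp("https://example.com/user/john"))
#     print(info["status"], info["final_url"], info["markers"])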


async def check_url_maigret(site: 'MaigretSite', username: str, logger=None) -> dict:
    """Check a URL using Maigret's checking mechanism."""
    if not MAIGRET_AVAILABLE:
        return {"error": "Maigret not available", "method": "maigret"}

    if logger is None:
        logger = logging.getLogger("site_check")
        logger.setLevel(logging.WARNING)

    result = {
        "method": "maigret",
        "url": None,
        "status": None,
        "status_str": None,
        "http_status": None,
        "final_url": None,
        "error": None,
        "error_type": None,
        "ids_data": None,
    }

    try:
        # Create query options
        options = {
            "parsing": False,
            "cookie_jar": None,
            "timeout": 15,
        }

        # Create a simple notifier
        class SilentNotify:
            def start(self, msg=None): pass
            def update(self, status, similar=False): pass
            def finish(self, msg=None, status=None): pass

        notifier = SilentNotify()

        # Run the check
        site_name, site_result = await check_site_for_username(
            site, username, options, logger, notifier
        )

        result["url"] = site_result.get("url_user")
        result["status"] = site_result.get("status")
        result["status_str"] = str(site_result.get("status"))
        result["http_status"] = site_result.get("http_status")
        result["ids_data"] = site_result.get("ids_data")

        # Check for errors
        status = site_result.get("status")
        if status and hasattr(status, 'error') and status.error:
            result["error"] = f"{status.error.type}: {status.error.desc}"
            result["error_type"] = str(status.error.type)

    except Exception as e:
        result["error"] = str(e)
        result["error_type"] = "exception"

    return result


async def find_valid_username(url_template: str, usernames: list = None, headers: dict = None) -> Optional[str]:
    """Try common usernames to find one that works."""
    usernames = usernames or COMMON_USERNAMES
    headers = headers or DEFAULT_HEADERS.copy()

    print(f"Testing {len(usernames)} usernames on {url_template}...")

    for username in usernames:
        url = url_template.replace("{username}", username)
        result = await check_url_aiohttp(url, headers)

        status = result["status"]
        markers = result.get("markers", {})

        # Good signs: 200 status, profile markers, no 404 text
        if status == 200 and not markers.get("404_text") and markers.get("profile_markers"):
            print(f"  {color('[+]', Colors.GREEN)} {username}: status={status}, has profile markers")
            return username
        elif status == 200 and not markers.get("404_text"):
            print(f"  {color('[?]', Colors.YELLOW)} {username}: status={status}, might work")
        else:
            print(f"  {color('[-]', Colors.RED)} {username}: status={status}")

    return None


async def compare_users_aiohttp(url_template: str, claimed: str, unclaimed: str = "noonewouldeverusethis7",
                                headers: dict = None) -> Tuple[dict, dict]:
    """Compare responses for claimed vs unclaimed usernames using aiohttp."""
    headers = headers or DEFAULT_HEADERS.copy()

    print(f"\n{'='*60}")
    print(f"Comparing: {color(claimed, Colors.GREEN)} vs {color(unclaimed, Colors.RED)}")
    print(f"URL template: {url_template}")
    print("Method: aiohttp")
    print(f"{'='*60}\n")

    url_claimed = url_template.replace("{username}", claimed)
    url_unclaimed = url_template.replace("{username}", unclaimed)

    result_claimed, result_unclaimed = await asyncio.gather(
        check_url_aiohttp(url_claimed, headers),
        check_url_aiohttp(url_unclaimed, headers)
    )

    def print_result(name, r, c):
        print(f"--- {color(name, c)} ---")
        print(f"  URL: {r['url']}")
        print(f"  Status: {color(str(r['status']), Colors.GREEN if r['status'] == 200 else Colors.RED)}")
        if r["redirects"]:
            print(f"  Redirects: {' -> '.join(r['redirects'])} -> {r['final_url']}")
        print(f"  Final URL: {r['final_url']}")
        print(f"  Content length: {r['content_length']}")
        print(f"  Title: {r['title']}")
        if r["error"]:
            print(f"  Error: {color(r['error'], Colors.RED)}")
        print(f"  Markers: {r['markers']}")
        print()

    print_result(f"CLAIMED ({claimed})", result_claimed, Colors.GREEN)
    print_result(f"UNCLAIMED ({unclaimed})", result_unclaimed, Colors.RED)

    # Analysis
    print(f"--- {color('ANALYSIS', Colors.CYAN)} ---")
    recommendations = []

    if result_claimed["status"] != result_unclaimed["status"]:
        print(f"  [!] Status codes differ: {result_claimed['status']} vs {result_unclaimed['status']}")
        recommendations.append(("status_code", f"Status codes: {result_claimed['status']} vs {result_unclaimed['status']}"))

    if result_claimed["final_url"] != result_unclaimed["final_url"]:
        print("  [!] Final URLs differ")
        recommendations.append(("response_url", "Final URLs differ"))

    if result_claimed["content_length"] != result_unclaimed["content_length"]:
        diff = abs(result_claimed["content_length"] - result_unclaimed["content_length"])
        print(f"  [!] Content length differs by {diff} bytes")
        recommendations.append(("message", f"Content differs by {diff} bytes"))

    if result_claimed["title"] != result_unclaimed["title"]:
        print("  [!] Titles differ:")
        print(f"      Claimed:   {result_claimed['title']}")
        print(f"      Unclaimed: {result_unclaimed['title']}")
        recommendations.append(("message", f"Titles differ: '{result_claimed['title']}' vs '{result_unclaimed['title']}'"))

    # Check for problems
    if result_claimed.get("markers", {}).get("captcha"):
        print(f"  {color('[WARN]', Colors.YELLOW)} Captcha detected on claimed page")
    if result_claimed.get("markers", {}).get("cloudflare"):
        print(f"  {color('[WARN]', Colors.YELLOW)} Cloudflare protection detected")
    if result_claimed.get("markers", {}).get("login_required"):
        print(f"  {color('[WARN]', Colors.YELLOW)} Login may be required")

    if recommendations:
        print(f"\n  {color('Recommended checkType:', Colors.BOLD)} {recommendations[0][0]}")
    else:
        print(f"  {color('[!]', Colors.RED)} No clear difference found - site may need special handling")

    return result_claimed, result_unclaimed


async def compare_methods(site: 'MaigretSite', claimed: str, unclaimed: str) -> dict:
    """Compare aiohttp vs Maigret results for the same site."""
    if not MAIGRET_AVAILABLE:
        print(color("Maigret not available for comparison", Colors.RED))
        return {}

    print(f"\n{'='*60}")
    print(f"{color('METHOD COMPARISON', Colors.CYAN)}: aiohttp vs Maigret")
    print(f"Site: {site.name}")
    print(f"Claimed: {claimed}, Unclaimed: {unclaimed}")
    print(f"{'='*60}\n")

    # Build URL template
    url_template = site.url
    url_template = url_template.replace("{urlMain}", site.url_main or "")
    url_template = url_template.replace("{urlSubpath}", getattr(site, 'url_subpath', '') or "")

    headers = DEFAULT_HEADERS.copy()
    if hasattr(site, 'headers') and site.headers:
        headers.update(site.headers)

    # Run all checks in parallel
    url_claimed = url_template.replace("{username}", claimed)
    url_unclaimed = url_template.replace("{username}", unclaimed)

    aiohttp_claimed, aiohttp_unclaimed, maigret_claimed, maigret_unclaimed = await asyncio.gather(
        check_url_aiohttp(url_claimed, headers),
        check_url_aiohttp(url_unclaimed, headers),
        check_url_maigret(site, claimed),
        check_url_maigret(site, unclaimed),
    )

    def status_icon(status):
        if status == 200:
            return color("200", Colors.GREEN)
        elif status == 404:
            return color("404", Colors.YELLOW)
        elif status and status >= 400:
            return color(str(status), Colors.RED)
        return str(status)

    def maigret_status_icon(status_str):
        if "Claimed" in str(status_str):
            return color("Claimed", Colors.GREEN)
        elif "Available" in str(status_str):
            return color("Available", Colors.YELLOW)
        else:
            return color(str(status_str), Colors.RED)

    print(f"{'Method':<12} {'Username':<25} {'HTTP Status':<12} {'Result':<20}")
    print("-" * 70)
    print(f"{'aiohttp':<12} {claimed:<25} {status_icon(aiohttp_claimed['status']):<20} {'OK' if not aiohttp_claimed['error'] else aiohttp_claimed['error'][:20]}")
    print(f"{'aiohttp':<12} {unclaimed:<25} {status_icon(aiohttp_unclaimed['status']):<20} {'OK' if not aiohttp_unclaimed['error'] else aiohttp_unclaimed['error'][:20]}")
    print(f"{'Maigret':<12} {claimed:<25} {status_icon(maigret_claimed.get('http_status')):<20} {maigret_status_icon(maigret_claimed.get('status_str'))}")
    print(f"{'Maigret':<12} {unclaimed:<25} {status_icon(maigret_unclaimed.get('http_status')):<20} {maigret_status_icon(maigret_unclaimed.get('status_str'))}")

    # Check for discrepancies
    print(f"\n--- {color('DISCREPANCY ANALYSIS', Colors.CYAN)} ---")
    issues = []

    if aiohttp_claimed['status'] != maigret_claimed.get('http_status'):
        issues.append(f"HTTP status mismatch for claimed: aiohttp={aiohttp_claimed['status']}, Maigret={maigret_claimed.get('http_status')}")

    if aiohttp_unclaimed['status'] != maigret_unclaimed.get('http_status'):
        issues.append(f"HTTP status mismatch for unclaimed: aiohttp={aiohttp_unclaimed['status']}, Maigret={maigret_unclaimed.get('http_status')}")

    # Check Maigret detection correctness
    claimed_detected = "Claimed" in str(maigret_claimed.get('status_str', ''))
    unclaimed_detected = "Available" in str(maigret_unclaimed.get('status_str', ''))

    if not claimed_detected:
        issues.append(f"Maigret did NOT detect claimed user '{claimed}' as Claimed")
    if not unclaimed_detected:
        issues.append(f"Maigret did NOT detect unclaimed user '{unclaimed}' as Available")

    if issues:
        for issue in issues:
            print(f"  {color('[!]', Colors.RED)} {issue}")
    else:
        print(f"  {color('[OK]', Colors.GREEN)} Both methods agree on results")

    return {
        "aiohttp_claimed": aiohttp_claimed,
        "aiohttp_unclaimed": aiohttp_unclaimed,
        "maigret_claimed": maigret_claimed,
        "maigret_unclaimed": maigret_unclaimed,
        "issues": issues,
    }


async def diagnose_site(site_config: dict, site_name: str) -> dict:
    """Full diagnosis of a site configuration."""
    print(f"\n{'='*60}")
    print(f"{color('FULL SITE DIAGNOSIS', Colors.CYAN)}: {site_name}")
    print(f"{'='*60}\n")

    diagnosis = {
        "site_name": site_name,
        "issues": [],
        "warnings": [],
        "recommendations": [],
        "working": False,
    }

    # 1. Config analysis
    print(f"--- {color('1. CONFIGURATION', Colors.BOLD)} ---")
    check_type = site_config.get("checkType", "status_code")
    url = site_config.get("url", "")
    url_main = site_config.get("urlMain", "")
    claimed = site_config.get("usernameClaimed")
    unclaimed = site_config.get("usernameUnclaimed", "noonewouldeverusethis7")
    disabled = site_config.get("disabled", False)

    print(f"  checkType: {check_type}")
    print(f"  URL: {url}")
    print(f"  urlMain: {url_main}")
    print(f"  usernameClaimed: {claimed}")
    print(f"  disabled: {disabled}")

    if disabled:
        diagnosis["issues"].append("Site is disabled")
        print(f"  {color('[!]', Colors.YELLOW)} Site is disabled")

    if not claimed:
        diagnosis["issues"].append("No usernameClaimed defined")
        print(f"  {color('[!]', Colors.RED)} No usernameClaimed defined")
        return diagnosis

    # Build full URL (display URL)
    url_template = url.replace("{urlMain}", url_main).replace("{urlSubpath}", site_config.get("urlSubpath", ""))

    # Build probe URL (what Maigret actually requests)
    url_probe = site_config.get("urlProbe", "")
    if url_probe:
        probe_template = url_probe.replace("{urlMain}", url_main).replace("{urlSubpath}", site_config.get("urlSubpath", ""))
    else:
        probe_template = url_template

    # Detect request method and payload
    request_method = site_config.get("requestMethod", "GET").upper()
    request_payload_template = site_config.get("requestPayload")

    headers = DEFAULT_HEADERS.copy()
    # For API probes (urlProbe, POST), use a neutral Accept header instead of text/html,
    # which can cause servers to return HTML instead of JSON
    if url_probe or request_method == "POST":
        headers["Accept"] = "*/*"
    if site_config.get("headers"):
        headers.update(site_config["headers"])

    if url_probe:
        print(f"  urlProbe: {url_probe}")
    if request_method != "GET":
        print(f"  requestMethod: {request_method}")
    if request_payload_template:
        print(f"  requestPayload: {request_payload_template}")

    # 2. Connectivity test
    print(f"\n--- {color('2. CONNECTIVITY TEST', Colors.BOLD)} ---")
    probe_claimed = probe_template.replace("{username}", claimed)
    probe_unclaimed = probe_template.replace("{username}", unclaimed)

    # Build payloads with username substituted
    payload_claimed = None
    payload_unclaimed = None
    if request_payload_template and request_method == "POST":
        payload_claimed = json.loads(
            json.dumps(request_payload_template).replace("{username}", claimed)
        )
        payload_unclaimed = json.loads(
            json.dumps(request_payload_template).replace("{username}", unclaimed)
        )

    result_claimed, result_unclaimed = await asyncio.gather(
        check_url_aiohttp(probe_claimed, headers, method=request_method, payload=payload_claimed),
        check_url_aiohttp(probe_unclaimed, headers, method=request_method, payload=payload_unclaimed)
    )

    print(f"  Claimed ({claimed}): status={result_claimed['status']}, error={result_claimed['error']}")
    print(f"  Unclaimed ({unclaimed}): status={result_unclaimed['status']}, error={result_unclaimed['error']}")

    # Check for common problems
    if result_claimed["error_type"] == "timeout":
        diagnosis["issues"].append("Timeout on claimed username")
    if result_unclaimed["error_type"] == "timeout":
        diagnosis["issues"].append("Timeout on unclaimed username")

    if result_claimed.get("markers", {}).get("cloudflare"):
        diagnosis["warnings"].append("Cloudflare protection detected")
    if result_claimed.get("markers", {}).get("captcha"):
        diagnosis["warnings"].append("Captcha detected")
    if result_claimed["status"] == 403:
        diagnosis["issues"].append("403 Forbidden - possible anti-bot protection")
    if result_claimed["status"] == 429:
        diagnosis["issues"].append("429 Rate Limited")

    # 3. Check type validation
    print(f"\n--- {color('3. CHECK TYPE VALIDATION', Colors.BOLD)} ---")

    if check_type == "status_code":
        if result_claimed["status"] == result_unclaimed["status"]:
            diagnosis["issues"].append(f"status_code check but same status ({result_claimed['status']}) for both")
            print(f"  {color('[FAIL]', Colors.RED)} Same status code for claimed and unclaimed: {result_claimed['status']}")
        else:
            print(f"  {color('[OK]', Colors.GREEN)} Status codes differ: {result_claimed['status']} vs {result_unclaimed['status']}")
            diagnosis["working"] = True

    elif check_type == "response_url":
        if result_claimed["final_url"] == result_unclaimed["final_url"]:
            diagnosis["issues"].append("response_url check but same final URL for both")
            print(f"  {color('[FAIL]', Colors.RED)} Same final URL for both")
        else:
            print(f"  {color('[OK]', Colors.GREEN)} Final URLs differ")
            diagnosis["working"] = True

    elif check_type == "message":
        presense_strs = site_config.get("presenseStrs", [])
        absence_strs = site_config.get("absenceStrs", [])

        print(f"  presenseStrs: {presense_strs}")
        print(f"  absenceStrs: {absence_strs}")

        claimed_content = result_claimed.get("content", "") or ""
        unclaimed_content = result_unclaimed.get("content", "") or ""

        # Check presenseStrs
        presense_found_claimed = any(s in claimed_content for s in presense_strs) if presense_strs else True
        presense_found_unclaimed = any(s in unclaimed_content for s in presense_strs) if presense_strs else True

        # Check absenceStrs
        absence_found_claimed = any(s in claimed_content for s in absence_strs) if absence_strs else False
        absence_found_unclaimed = any(s in unclaimed_content for s in absence_strs) if absence_strs else False

        print(f"  Claimed   - presenseStrs found: {presense_found_claimed}, absenceStrs found: {absence_found_claimed}")
        print(f"  Unclaimed - presenseStrs found: {presense_found_unclaimed}, absenceStrs found: {absence_found_unclaimed}")

        if presense_strs and not presense_found_claimed:
            diagnosis["issues"].append(f"presenseStrs {presense_strs} not found in claimed page")
            print(f"  {color('[FAIL]', Colors.RED)} presenseStrs not found in claimed page")
        if absence_strs and absence_found_claimed:
            diagnosis["issues"].append(f"absenceStrs {absence_strs} found in claimed page (should not be)")
            print(f"  {color('[FAIL]', Colors.RED)} absenceStrs found in claimed page")
        if absence_strs and not absence_found_unclaimed:
            diagnosis["warnings"].append("absenceStrs not found in unclaimed page")
            print(f"  {color('[WARN]', Colors.YELLOW)} absenceStrs not found in unclaimed page")

        # Check works if: claimed is detected as present AND unclaimed is detected as absent.
        # Presence detection: presenseStrs found (or empty = always true).
        # Absence detection: absenceStrs found in unclaimed (or empty = never, rely on presenseStrs only).
        # With only presenseStrs: works if found in claimed but NOT in unclaimed.
        # With only absenceStrs: works if found in unclaimed but NOT in claimed.
        # With both: standard combination.
        claimed_is_present = presense_found_claimed and not absence_found_claimed
        unclaimed_is_absent = (
            (absence_strs and absence_found_unclaimed) or
            (presense_strs and not presense_found_unclaimed)
        )
        if claimed_is_present and unclaimed_is_absent:
            print(f"  {color('[OK]', Colors.GREEN)} Message check should work correctly")
            diagnosis["working"] = True
|
||||
|
||||
# 4. Recommendations
|
||||
print(f"\n--- {color('4. RECOMMENDATIONS', Colors.BOLD)} ---")
|
||||
|
||||
if not diagnosis["working"]:
|
||||
# Suggest alternatives
|
||||
if result_claimed["status"] != result_unclaimed["status"]:
|
||||
diagnosis["recommendations"].append(f"Switch to checkType: status_code (status {result_claimed['status']} vs {result_unclaimed['status']})")
|
||||
if result_claimed["final_url"] != result_unclaimed["final_url"]:
|
||||
diagnosis["recommendations"].append("Switch to checkType: response_url")
|
||||
if result_claimed["title"] != result_unclaimed["title"]:
|
||||
diagnosis["recommendations"].append(f"Use title as marker: presenseStrs=['{result_claimed['title']}'] or absenceStrs=['{result_unclaimed['title']}']")
|
||||
|
||||
if diagnosis["recommendations"]:
|
||||
for rec in diagnosis["recommendations"]:
|
||||
print(f" -> {rec}")
|
||||
elif diagnosis["working"]:
|
||||
print(f" {color('Site appears to be working correctly', Colors.GREEN)}")
|
||||
else:
|
||||
print(f" {color('No clear fix found - site may need special handling or should be disabled', Colors.RED)}")
|
||||
|
||||
# Summary
|
||||
print(f"\n--- {color('SUMMARY', Colors.BOLD)} ---")
|
||||
if diagnosis["issues"]:
|
||||
print(f" Issues: {len(diagnosis['issues'])}")
|
||||
for issue in diagnosis["issues"]:
|
||||
print(f" - {issue}")
|
||||
if diagnosis["warnings"]:
|
||||
print(f" Warnings: {len(diagnosis['warnings'])}")
|
||||
for warn in diagnosis["warnings"]:
|
||||
print(f" - {warn}")
|
||||
print(f" Working: {color('YES', Colors.GREEN) if diagnosis['working'] else color('NO', Colors.RED)}")
|
||||
|
||||
return diagnosis
|
||||
|
||||
|
||||
def load_site_from_db(site_name: str) -> Tuple[Optional[dict], Optional['MaigretSite']]:
|
||||
"""Load site config from data.json. Returns (config_dict, MaigretSite or None)."""
|
||||
db_path = Path(__file__).parent.parent / "maigret" / "resources" / "data.json"
|
||||
|
||||
with open(db_path) as f:
|
||||
data = json.load(f)
|
||||
|
||||
config = None
|
||||
if site_name in data["sites"]:
|
||||
config = data["sites"][site_name]
|
||||
else:
|
||||
# Try case-insensitive search
|
||||
for name, cfg in data["sites"].items():
|
||||
if name.lower() == site_name.lower():
|
||||
config = cfg
|
||||
site_name = name
|
||||
break
|
||||
|
||||
if not config:
|
||||
return None, None
|
||||
|
||||
# Also load MaigretSite if available
|
||||
maigret_site = None
|
||||
if MAIGRET_AVAILABLE:
|
||||
try:
|
||||
db = MaigretDatabase().load_from_path(db_path)
|
||||
maigret_site = db.sites_dict.get(site_name)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return config, maigret_site
|
||||
|
||||
|
||||
async def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Site check utility for Maigret development",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
%(prog)s --site "VK" --check-claimed # Test site with aiohttp
|
||||
%(prog)s --site "VK" --maigret # Test site with Maigret
|
||||
%(prog)s --site "VK" --compare-methods # Compare aiohttp vs Maigret
|
||||
%(prog)s --site "VK" --diagnose # Full diagnosis
|
||||
%(prog)s --url "https://vk.com/{username}" --compare blue nobody123
|
||||
%(prog)s --site "VK" --find-user # Find a valid username
|
||||
"""
|
||||
)
|
||||
parser.add_argument("--site", "-s", help="Site name from data.json")
|
||||
parser.add_argument("--url", "-u", help="URL template with {username}")
|
||||
parser.add_argument("--test", "-t", help="Username to test")
|
||||
parser.add_argument("--compare", "-c", nargs=2, metavar=("CLAIMED", "UNCLAIMED"),
|
||||
help="Compare two usernames")
|
||||
parser.add_argument("--find-user", "-f", action="store_true",
|
||||
help="Find a valid username")
|
||||
parser.add_argument("--check-claimed", action="store_true",
|
||||
help="Check if claimed username still works (aiohttp)")
|
||||
parser.add_argument("--maigret", "-m", action="store_true",
|
||||
help="Test using Maigret's checker instead of aiohttp")
|
||||
parser.add_argument("--compare-methods", action="store_true",
|
||||
help="Compare aiohttp vs Maigret results")
|
||||
parser.add_argument("--diagnose", "-d", action="store_true",
|
||||
help="Full diagnosis of site configuration")
|
||||
parser.add_argument("--headers", help="Custom headers as JSON")
|
||||
parser.add_argument("--timeout", type=int, default=15, help="Request timeout in seconds")
|
||||
parser.add_argument("--json", action="store_true", help="Output results as JSON")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
url_template = None
|
||||
claimed = None
|
||||
unclaimed = "noonewouldeverusethis7"
|
||||
headers = DEFAULT_HEADERS.copy()
|
||||
site_config = None
|
||||
maigret_site = None
|
||||
|
||||
# Load from site name
|
||||
if args.site:
|
||||
site_config, maigret_site = load_site_from_db(args.site)
|
||||
if not site_config:
|
||||
print(f"Site '{args.site}' not found in database")
|
||||
sys.exit(1)
|
||||
|
||||
url_template = site_config.get("url", "")
|
||||
url_main = site_config.get("urlMain", "")
|
||||
url_subpath = site_config.get("urlSubpath", "")
|
||||
url_template = url_template.replace("{urlMain}", url_main).replace("{urlSubpath}", url_subpath)
|
||||
|
||||
claimed = site_config.get("usernameClaimed")
|
||||
unclaimed = site_config.get("usernameUnclaimed", unclaimed)
|
||||
|
||||
if site_config.get("headers"):
|
||||
headers.update(site_config["headers"])
|
||||
|
||||
if not args.json:
|
||||
print(f"Loaded site: {args.site}")
|
||||
print(f" URL: {url_template}")
|
||||
print(f" Claimed: {claimed}")
|
||||
print(f" CheckType: {site_config.get('checkType', 'unknown')}")
|
||||
print(f" Disabled: {site_config.get('disabled', False)}")
|
||||
|
||||
# Override with explicit URL
|
||||
if args.url:
|
||||
url_template = args.url
|
||||
|
||||
# Custom headers
|
||||
if args.headers:
|
||||
headers.update(json.loads(args.headers))
|
||||
|
||||
# Actions
|
||||
if args.diagnose:
|
||||
if not site_config:
|
||||
print("--diagnose requires --site")
|
||||
sys.exit(1)
|
||||
result = await diagnose_site(site_config, args.site)
|
||||
if args.json:
|
||||
print(json.dumps(result, indent=2, default=str))
|
||||
|
||||
elif args.compare_methods:
|
||||
if not maigret_site:
|
||||
if not MAIGRET_AVAILABLE:
|
||||
print("Maigret imports not available")
|
||||
else:
|
||||
print("Could not load MaigretSite object")
|
||||
sys.exit(1)
|
||||
result = await compare_methods(maigret_site, claimed, unclaimed)
|
||||
if args.json:
|
||||
print(json.dumps(result, indent=2, default=str))
|
||||
|
||||
elif args.maigret:
|
||||
if not maigret_site:
|
||||
if not MAIGRET_AVAILABLE:
|
||||
print("Maigret imports not available")
|
||||
else:
|
||||
print("Could not load MaigretSite object")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"\n--- Testing with Maigret ---")
|
||||
for username in [claimed, unclaimed]:
|
||||
result = await check_url_maigret(maigret_site, username)
|
||||
print(f" {username}: status={result.get('status_str')}, http={result.get('http_status')}, error={result.get('error')}")
|
||||
|
||||
elif args.find_user:
|
||||
if not url_template:
|
||||
print("--find-user requires --site or --url")
|
||||
sys.exit(1)
|
||||
result = await find_valid_username(url_template, headers=headers)
|
||||
if result:
|
||||
print(f"\n{color('Found valid username:', Colors.GREEN)} {result}")
|
||||
else:
|
||||
print(f"\n{color('No valid username found', Colors.RED)}")
|
||||
|
||||
elif args.compare:
|
||||
if not url_template:
|
||||
print("--compare requires --site or --url")
|
||||
sys.exit(1)
|
||||
result = await compare_users_aiohttp(url_template, args.compare[0], args.compare[1], headers)
|
||||
if args.json:
|
||||
# Remove content field for JSON output (too large)
|
||||
for r in result:
|
||||
if isinstance(r, dict) and "content" in r:
|
||||
del r["content"]
|
||||
print(json.dumps(result, indent=2, default=str))
|
||||
|
||||
elif args.check_claimed and claimed:
|
||||
result = await compare_users_aiohttp(url_template, claimed, unclaimed, headers)
|
||||
|
||||
elif args.test:
|
||||
if not url_template:
|
||||
print("--test requires --site or --url")
|
||||
sys.exit(1)
|
||||
url = url_template.replace("{username}", args.test)
|
||||
result = await check_url_aiohttp(url, headers, timeout=args.timeout)
|
||||
if "content" in result:
|
||||
del result["content"] # Too large for display
|
||||
print(json.dumps(result, indent=2, default=str))
|
||||
|
||||
else:
|
||||
# Default: check claimed username if available
|
||||
if url_template and claimed:
|
||||
await compare_users_aiohttp(url_template, claimed, unclaimed, headers)
|
||||
else:
|
||||
parser.print_help()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
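For quick orientation, here are a few example invocations of the CLI defined above. These are hedged sketches: the script's filename is not visible in this excerpt, so "site_diag.py" below is a placeholder, while the flags are exactly those registered above.

# Hypothetical usage; "site_diag.py" is a placeholder filename.
#   python site_diag.py --site GitHub --diagnose
#   python site_diag.py --site GitHub --maigret
#   python site_diag.py --site GitHub --check-claimed --timeout 30
#   python site_diag.py --url "https://example.com/{username}" --test alice --json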
+134 -39
@@ -4,6 +4,7 @@ This module generates the listing of supported sites in file `SITES.md`
 and pretty prints file with sites data.
 """
 import sys
+import socket
 import requests
 import logging
 import threading
@@ -24,36 +25,87 @@ RANKS.update({
     '100000000': '100M',
 })
 
-SEMAPHORE = threading.Semaphore(20)
-
-
-def get_rank(domain_to_query, site, print_errors=True):
-    with SEMAPHORE:
-        # Retrieve ranking data via alexa API
-        url = f"http://data.alexa.com/data?cli=10&url={domain_to_query}"
-        xml_data = requests.get(url).text
-        root = ET.fromstring(xml_data)
+import csv
+import io
+from urllib.parse import urlparse
 
-        try:
-            # Get ranking for this site.
-            site.alexa_rank = int(root.find('.//REACH').attrib['RANK'])
-            # country = root.find('.//COUNTRY')
-            # if not country is None and country.attrib:
-            #     country_code = country.attrib['CODE']
-            #     tags = set(site.tags)
-            #     if country_code:
-            #         tags.add(country_code.lower())
-            #     site.tags = sorted(list(tags))
-            #     if site.type != 'username':
-            #         site.disabled = False
-        except Exception as e:
-            if print_errors:
-                logging.error(e)
-                # We did not find the rank for some reason.
-                print(f"Error retrieving rank information for '{domain_to_query}'")
-                print(f"  Returned XML is |{xml_data}|")
-
-    return
+def fetch_majestic_million():
+    print("Fetching Majestic Million CSV (this may take a few seconds)...")
+    ranks = {}
+    url = "https://downloads.majestic.com/majestic_million.csv"
+    try:
+        response = requests.get(url, stream=True)
+        response.raise_for_status()
+
+        csv_file = io.StringIO(response.text)
+        reader = csv.reader(csv_file)
+        next(reader)  # skip the header row
+
+        for row in reader:
+            if not row or len(row) < 3:
+                continue
+            rank = int(row[0])
+            domain = row[2].lower()
+            ranks[domain] = rank
+    except Exception as e:
+        logging.error(f"Error fetching Majestic Million: {e}")
+
+    print(f"Loaded {len(ranks)} domains from Majestic Million.")
+    return ranks
+
+
+def get_base_domain(url):
+    try:
+        netloc = urlparse(url).netloc
+        if netloc.startswith('www.'):
+            netloc = netloc[4:]
+        return netloc.lower()
+    except Exception:
+        return ""
+
+
+def check_dns(domain, timeout=5):
+    """Check if a domain resolves via DNS. Returns True if it resolves."""
+    try:
+        socket.setdefaulttimeout(timeout)
+        socket.getaddrinfo(domain, None)
+        return True
+    except (socket.gaierror, socket.timeout, OSError):
+        return False
+
+
+def check_sites_dns(sites):
+    """Check DNS resolution for all sites. Returns a set of site names that failed."""
+    SKIP_TLDS = ('.onion', '.i2p')
+    domains = {}
+    for site in sites:
+        domain = get_base_domain(site.url_main)
+        if domain and not any(domain.endswith(tld) for tld in SKIP_TLDS):
+            domains.setdefault(domain, []).append(site)
+
+    failed_sites = set()
+    results = {}
+
+    def resolve(domain):
+        results[domain] = check_dns(domain)
+
+    threads = []
+    for domain in domains:
+        t = threading.Thread(target=resolve, args=(domain,))
+        threads.append(t)
+        t.start()
+
+    for t in threads:
+        t.join()
+
+    for domain, resolved in results.items():
+        if not resolved:
+            for site in domains[domain]:
+                failed_sites.add(site.name)
+            logging.warning(f"DNS resolution failed for {domain}")
+
+    return failed_sites
 
 
 def get_step_rank(rank):
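A note on check_sites_dns above: it starts one raw thread per unique domain with no upper bound, so a database with thousands of domains produces a short-lived burst of threads. Below is a bounded sketch using only the standard library; the helper name check_sites_dns_pooled and the worker cap are assumptions, not part of this patch, and the .onion/.i2p skip is omitted for brevity.

# Sketch only, not part of the patch: bounded DNS checks via a thread pool.
from concurrent.futures import ThreadPoolExecutor

def check_sites_dns_pooled(sites, max_workers=50):
    # Deduplicate domains; drop sites whose URL yields no usable host.
    domains = sorted({get_base_domain(s.url_main) for s in sites} - {""})
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        resolved = dict(zip(domains, pool.map(check_dns, domains)))
    # Report the names of sites whose domain failed to resolve.
    return {s.name for s in sites if resolved.get(get_base_domain(s.url_main)) is False}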
@@ -78,6 +130,8 @@ def main():
     parser.add_argument('--empty-only', help='update only sites without rating', action='store_true')
     parser.add_argument('--exclude-engine', help='do not update score with certain engine',
                         action="append", dest="exclude_engine_list", default=[])
+    parser.add_argument('--dns-check', help='disable sites whose domains do not resolve via DNS',
+                        action='store_true')
 
     pool = list()
 
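Assuming the script keeps its usual entrypoint (the filename sits outside this diff), the new flag would be exercised roughly as follows; "--with-rank" is inferred from the args.with_rank reference in the next hunk.

# Hypothetical invocations; the script path is not shown in this diff.
#   python sites.py --dns-check
#   python sites.py --dns-check --with-rank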
@@ -91,30 +145,51 @@ def main():
     with open("sites.md", "w") as site_file:
         site_file.write(f"""
 ## List of supported sites (search methods): total {len(sites_subset)}\n
-Rank data fetched from Alexa by domains.
+Rank data fetched from Majestic Million by domains.
 
 """)
 
+    if args.dns_check:
+        print("Checking DNS resolution for all site domains...")
+        failed = check_sites_dns(sites_subset)
+        disabled_count = 0
+        re_enabled_count = 0
+        for site in sites_subset:
+            if site.name in failed:
+                if not site.disabled:
+                    site.disabled = True
+                    disabled_count += 1
+                    print(f"  Disabled {site.name}: DNS does not resolve ({get_base_domain(site.url_main)})")
+            else:
+                if site.disabled:
+                    # Re-enable previously disabled site if DNS now resolves
+                    # (only if it was likely disabled due to DNS failure)
+                    pass
+        print(f"DNS check complete: {disabled_count} site(s) disabled, {len(failed)} domain(s) unresolvable.")
+
+    majestic_ranks = {}
+    if args.with_rank:
+        majestic_ranks = fetch_majestic_million()
+
     for site in sites_subset:
         if not args.with_rank:
             break
-        url_main = site.url_main
-
         if site.alexa_rank < sys.maxsize and args.empty_only:
             continue
         if args.exclude_engine_list and site.engine in args.exclude_engine_list:
             continue
-        site.alexa_rank = 0
-        th = threading.Thread(target=get_rank, args=(url_main, site,))
-        pool.append((site.name, url_main, th))
-        th.start()
-
-
+        domain = get_base_domain(site.url_main)
+
+        if domain in majestic_ranks:
+            site.alexa_rank = majestic_ranks[domain]
+        else:
+            site.alexa_rank = sys.maxsize
+
+    # In-memory matching complete, no threads to join
     if args.with_rank:
-        index = 1
-        for site_name, url_main, th in pool:
-            th.join()
-            sys.stdout.write("\r{0}".format(f"Updated {index} out of {len(sites_subset)} entries"))
-            sys.stdout.flush()
-            index = index + 1
+        print("Successfully updated ranks matching Majestic Million dataset.")
 
     sites_full_list = [(s, int(s.alexa_rank)) for s in sites_subset]
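For context on the matching above: fetch_majestic_million reads column 0 (the global rank) and column 2 (the domain) after skipping the CSV header row, and the loop then keys each site's base domain into that dict. A minimal sketch of the per-site lookup, reusing get_base_domain from this patch; the rank values are invented.

# Sketch of the in-memory matching; the rank values here are invented.
import sys

majestic_ranks = {"github.com": 12, "example.org": 845123}

def rank_for(url_main):
    domain = get_base_domain(url_main)  # "https://www.github.com/" -> "github.com"
    # Unranked domains get sys.maxsize so they sort to the bottom of the list.
    return majestic_ranks.get(domain, sys.maxsize)

assert rank_for("https://www.github.com/") == 12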
@@ -142,6 +217,26 @@ Rank data fetched from Alexa by domains.
     site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n')
     db.save_to_file(args.base_file)
 
+    # Regenerate db_meta.json to stay in sync with data.json
+    try:
+        import hashlib, json, os
+        db_data_raw = open(args.base_file, 'rb').read()
+        db_data_parsed = json.loads(db_data_raw)
+        meta = {
+            "version": 1,
+            "updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
+            "sites_count": len(db_data_parsed.get("sites", {})),
+            "min_maigret_version": "0.5.0",
+            "data_sha256": hashlib.sha256(db_data_raw).hexdigest(),
+            "data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json",
+        }
+        meta_path = os.path.join(os.path.dirname(args.base_file), "db_meta.json")
+        with open(meta_path, "w", encoding="utf-8") as mf:
+            json.dump(meta, mf, indent=4, ensure_ascii=False)
+        print(f"Updated {meta_path} ({meta['sites_count']} sites)")
+    except Exception as e:
+        print(f"Warning: could not regenerate db_meta.json: {e}")
+
     statistics_text = db.get_db_stats(is_markdown=True)
     site_file.write('## Statistics\n\n')
     site_file.write(statistics_text)
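Since db_meta.json now records a data_sha256 for data.json, a consumer can verify a downloaded database before trusting it. A minimal sketch, assuming the same file layout as the meta block above; verify_db is a hypothetical helper, not part of the patch.

# Sketch: verify data.json against the checksum recorded in db_meta.json.
import hashlib
import json

def verify_db(data_path="data.json", meta_path="db_meta.json"):
    raw = open(data_path, "rb").read()
    meta = json.load(open(meta_path, encoding="utf-8"))
    actual = hashlib.sha256(raw).hexdigest()
    if actual != meta["data_sha256"]:
        raise ValueError(f"Checksum mismatch: {actual} != {meta['data_sha256']}")
    return meta["sites_count"]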