mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-09 08:04:32 +00:00
Compare commits
222 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 7fd9bb3692 | |||
| 385f9f5bb3 | |||
| dc8751ac55 | |||
| 9303b1686d | |||
| aa80bd4232 | |||
| f5c4b1c35d | |||
| 5e24117e93 | |||
| 777e503e30 | |||
| c222c96aeb | |||
| b213f6e079 | |||
| 9354331874 | |||
| 8a82eb6ee6 | |||
| a61f3b32c4 | |||
| fbb8255518 | |||
| 9bad5d8269 | |||
| a8e7ab4540 | |||
| 6db1df2ddb | |||
| 23adc178ea | |||
| 6834483360 | |||
| 6ed8fdefcc | |||
| 3fd34afb77 | |||
| ad95302745 | |||
| 44a6c729e3 | |||
| 6d0a22b738 | |||
| abce3c9be4 | |||
| 269d50eedc | |||
| e8f4318e5d | |||
| 75289c78bf | |||
| eeb38ccdc0 | |||
| d136014576 | |||
| 5d502eaef6 | |||
| 9e8a701c54 | |||
| 7b67c61240 | |||
| 0e113c4592 | |||
| fb4e17be92 | |||
| adb19e5930 | |||
| 116fae3e0f | |||
| bf495cd57e | |||
| e49aa533df | |||
| 5aa7f6429b | |||
| a5d337b765 | |||
| 5aa0c908b0 | |||
| 51b452ad71 | |||
| fa1a4d1b4a | |||
| 184519b202 | |||
| a203eecbb2 | |||
| dde1cd5d78 | |||
| 547512519b | |||
| b333a2e2b2 | |||
| 2835ec71c7 | |||
| af67a6a3f3 | |||
| 4f737b5260 | |||
| 185e09e4ea | |||
| 5865e0f375 | |||
| 815c8cb2f3 | |||
| 656fe1df24 | |||
| 1c5dc5f152 | |||
| bc3d9faad9 | |||
| 5aae2ee005 | |||
| b145e7b26f | |||
| abd9aa57fe | |||
| 2e430e5039 | |||
| f5786f11ce | |||
| 3e56c95e16 | |||
| 28f35f9a4f | |||
| 79cea49526 | |||
| 2d94269656 | |||
| 829bda885a | |||
| eb541dcf51 | |||
| 4c97025a32 | |||
| 2775181a6a | |||
| b00ef1f5dd | |||
| d3f13ac295 | |||
| 479a614d1d | |||
| e0559e4320 | |||
| 00a9249229 | |||
| 005863c2e0 | |||
| e3aada6aef | |||
| 9b35fc1ab0 | |||
| 146bc0481b | |||
| 5930a3022e | |||
| b4482e0ba4 | |||
| 2c55501bc2 | |||
| 3ba07591a1 | |||
| a2d4373b68 | |||
| b960acec10 | |||
| b1a211c3cd | |||
| 56d0c9f2f1 | |||
| 01049b730d | |||
| 2c2d3409e2 | |||
| e81b50ef61 | |||
| 9ac0a65914 | |||
| 4f397fed1c | |||
| a17e0c7a13 | |||
| e84e394e6f | |||
| b8ada1c818 | |||
| 959b2be136 | |||
| 97cc4b46d9 | |||
| f3b741d283 | |||
| 33620853a1 | |||
| 19ed03a94d | |||
| 35372446e0 | |||
| 519bb46db6 | |||
| 227a25bfa1 | |||
| 5da4e78092 | |||
| e4d6b064df | |||
| f99091f5f7 | |||
| f26976f1dd | |||
| 83ae9c0133 | |||
| 93c4fdeba9 | |||
| 6ec3c47769 | |||
| 3dc3fe9371 | |||
| ebf8227bf1 | |||
| 5b7b28e683 | |||
| 0e95e2e3cc | |||
| 4cd1fccaa3 | |||
| 83a9dafe55 | |||
| b4147d2cd3 | |||
| aa591da913 | |||
| 2d4d3ba0cc | |||
| ec21bbe974 | |||
| 1a4190ee03 | |||
| fe60783a68 | |||
| 8aa0fab314 | |||
| 941a5171ae | |||
| 9a1bd8ffdb | |||
| 68f586fcca | |||
| e39476c4c7 | |||
| 6a7f778c80 | |||
| 7679f98e58 | |||
| c6dbc09ba5 | |||
| b8352c3406 | |||
| 8a02ad5ed7 | |||
| 8fda5776c6 | |||
| 2347bd2f7d | |||
| 229472f323 | |||
| 6acc22dd69 | |||
| 8af07b3889 | |||
| e9df40bdce | |||
| d5bef9e3ac | |||
| 25121754bd | |||
| 198c11b8d4 | |||
| bf9bc5a518 | |||
| 41e246f6a6 | |||
| 9f58fb27ad | |||
| b344a5d98a | |||
| d8b26181f1 | |||
| a60d96c7f2 | |||
| a3159b213b | |||
| 123ead4c03 | |||
| cd7571ef57 | |||
| d922f9be25 | |||
| 3b20b36609 | |||
| ba86981cf4 | |||
| 561ced647f | |||
| 7be3ee8240 | |||
| 48ca13dc4d | |||
| 7f94e86259 | |||
| c2ed1af4b4 | |||
| 648ba6e64c | |||
| 56815d8368 | |||
| b178e97d90 | |||
| a764198c2c | |||
| 2c4684e4a9 | |||
| 8713e1a63e | |||
| 55adc70d10 | |||
| 53fc83dbce | |||
| e8bd00f013 | |||
| a0ba853e64 | |||
| 54b4c7d2ab | |||
| 8791bca866 | |||
| fb26ccd1f6 | |||
| c22abdb834 | |||
| 0689470506 | |||
| 410d7568b7 | |||
| 7280033198 | |||
| 3c6af42916 | |||
| cdb896ba32 | |||
| 6bd047fda3 | |||
| e30cf353a6 | |||
| bd9e48de7c | |||
| aec4fef8db | |||
| 1da49bd208 | |||
| 6da39cf3d5 | |||
| f869eb49ca | |||
| bebadb0362 | |||
| 495eef6ad5 | |||
| e1c72bfb94 | |||
| deb13c9638 | |||
| 1e8e1acd58 | |||
| 5e88fd9ba8 | |||
| 6bc836d6c4 | |||
| 080611c8b9 | |||
| c3cf589aed | |||
| e01d5caae1 | |||
| d90d8a8ac9 | |||
| c3ce8a200b | |||
| 65ea5ceeb1 | |||
| bca1d4bfd8 | |||
| c9e38632ca | |||
| 5f8ce2da98 | |||
| bc6f7f831d | |||
| f95c71d009 | |||
| 974c93f327 | |||
| ed7b65e5ed | |||
| f76ea5d738 | |||
| 960b28d454 | |||
| 09eef6701a | |||
| 329ef27eff | |||
| ccb3b3bbd1 | |||
| b21ac36b27 | |||
| 0f7aa2c456 | |||
| c0e60e25b8 | |||
| 4195a3ca21 | |||
| 5b3b81b482 | |||
| 29d2c07a76 | |||
| 7ff2424de1 | |||
| fc1dd9380e | |||
| e423d72576 | |||
| 9bc6c3370c | |||
| b90cdb1981 | |||
| 21b35e3798 |
@@ -1,3 +1,10 @@
|
||||
#!/bin/sh
|
||||
echo 'Activating update_sitesmd hook script...'
|
||||
poetry run update_sitesmd
|
||||
poetry run update_sitesmd
|
||||
|
||||
echo 'Regenerating db_meta.json...'
|
||||
python3 utils/generate_db_meta.py
|
||||
|
||||
git add maigret/resources/db_meta.json
|
||||
git add maigret/resources/data.json
|
||||
git add sites.md
|
||||
|
||||
@@ -2,54 +2,69 @@ name: Package exe with PyInstaller - Windows
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main, dev ]
|
||||
branches: [main, dev]
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: PyInstaller Windows Build
|
||||
uses: JackMcKew/pyinstaller-action-windows@main
|
||||
with:
|
||||
path: pyinstaller
|
||||
# Wine Python (not Linux) runs PyInstaller; altgraph needs pkg_resources — reinstall setuptools after all deps.
|
||||
- name: Prepare requirements for Wine (setuptools last)
|
||||
run: |
|
||||
set -euo pipefail
|
||||
cp pyinstaller/requirements.txt pyinstaller/requirements-wine.txt
|
||||
{
|
||||
echo ""
|
||||
echo "# CI: setuptools last so pkg_resources exists for PyInstaller/altgraph in Wine"
|
||||
echo "setuptools==70.0.0"
|
||||
} >> pyinstaller/requirements-wine.txt
|
||||
|
||||
- name: Upload PyInstaller Binary to Workflow as Artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: maigret_standalone_win32
|
||||
path: pyinstaller/dist/windows
|
||||
- name: PyInstaller Windows Build
|
||||
uses: JackMcKew/pyinstaller-action-windows@main
|
||||
with:
|
||||
path: pyinstaller
|
||||
requirements: requirements-wine.txt
|
||||
|
||||
- name: Download PyInstaller Binary
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: maigret_standalone_win32
|
||||
- name: Upload PyInstaller Binary to Workflow as Artifact
|
||||
if: success()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: maigret_standalone_win32
|
||||
path: pyinstaller/dist/windows
|
||||
|
||||
- name: Create New Release and Upload PyInstaller Binary to Release
|
||||
uses: ncipollo/release-action@v1.14.0
|
||||
id: create_release
|
||||
with:
|
||||
allowUpdates: true
|
||||
draft: false
|
||||
prerelease: false
|
||||
artifactErrorsFailBuild: true
|
||||
makeLatest: true
|
||||
replacesArtifacts: true
|
||||
artifacts: maigret_standalone.exe
|
||||
name: Development Windows Release [${{ github.ref_name }}]
|
||||
tag: ${{ github.ref_name }}
|
||||
body: |
|
||||
This is a development release built from the **${{ github.ref_name }}** branch.
|
||||
- name: Download PyInstaller Binary
|
||||
if: success()
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: maigret_standalone_win32
|
||||
|
||||
Take into account that `dev` releases may be unstable.
|
||||
Please, use [the development release](https://github.com/soxoj/maigret/releases/tag/main) build from the **main** branch.
|
||||
- name: Create New Release and Upload PyInstaller Binary to Release
|
||||
if: success()
|
||||
uses: ncipollo/release-action@v1.14.0
|
||||
id: create_release
|
||||
with:
|
||||
allowUpdates: true
|
||||
draft: false
|
||||
prerelease: false
|
||||
artifactErrorsFailBuild: true
|
||||
makeLatest: true
|
||||
replacesArtifacts: true
|
||||
artifacts: maigret_standalone.exe
|
||||
name: Development Windows Release [${{ github.ref_name }}]
|
||||
tag: ${{ github.ref_name }}
|
||||
body: |
|
||||
This is a development release built from the **${{ github.ref_name }}** branch.
|
||||
|
||||
Instructions:
|
||||
- Download the attached file `maigret_standalone.exe` to get the Windows executable.
|
||||
- Video guide on how to run it: https://youtu.be/qIgwTZOmMmM
|
||||
- For detailed documentation, visit: https://maigret.readthedocs.io/en/latest/
|
||||
Take into account that `dev` releases may be unstable.
|
||||
Please, use [the development release](https://github.com/soxoj/maigret/releases/tag/main) build from the **main** branch.
|
||||
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ github.token }}
|
||||
Instructions:
|
||||
- Download the attached file `maigret_standalone.exe` to get the Windows executable.
|
||||
- Video guide on how to run it: https://youtu.be/qIgwTZOmMmM
|
||||
- For detailed documentation, visit: https://maigret.readthedocs.io/en/latest/
|
||||
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ github.token }}
|
||||
|
||||
@@ -13,7 +13,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: ["3.10", "3.11", "3.12"]
|
||||
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
||||
|
||||
steps:
|
||||
- name: Checkout
|
||||
@@ -22,6 +22,9 @@ jobs:
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
- name: Install system dependencies
|
||||
run: |
|
||||
sudo apt-get update && sudo apt-get install -y libcairo2-dev
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
@@ -33,7 +36,7 @@ jobs:
|
||||
poetry run coverage report --fail-under=60
|
||||
poetry run coverage html
|
||||
- name: Upload coverage report
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: htmlcov
|
||||
name: htmlcov-${{ strategy.job-index }}
|
||||
path: htmlcov
|
||||
@@ -1,8 +1,8 @@
|
||||
name: Upload Python Package to PyPI when a Release is Created
|
||||
name: Upload Python Package to PyPI when a Release is Published
|
||||
|
||||
on:
|
||||
release:
|
||||
types: [created]
|
||||
types: [published]
|
||||
|
||||
jobs:
|
||||
pypi-publish:
|
||||
@@ -22,9 +22,9 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install setuptools wheel
|
||||
pip install build
|
||||
- name: Build package
|
||||
run: |
|
||||
python setup.py sdist bdist_wheel # Could also be python -m build
|
||||
python -m build
|
||||
- name: Publish package distributions to PyPI
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
uses: pypa/gh-action-pypi-publish@release/v1
|
||||
|
||||
@@ -1,34 +1,60 @@
|
||||
name: Update sites rating and statistics
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: [ dev ]
|
||||
types: [opened, synchronize]
|
||||
push:
|
||||
branches: [ main ]
|
||||
|
||||
concurrency:
|
||||
group: update-sites-${{ github.ref }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2.3.2
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
ref: main
|
||||
fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository.
|
||||
|
||||
- name: build application
|
||||
- name: Install system dependencies
|
||||
run: |
|
||||
sudo apt-get update && sudo apt-get install -y libcairo2-dev
|
||||
|
||||
- name: Build application
|
||||
run: |
|
||||
pip3 install .
|
||||
python3 ./utils/update_site_data.py --empty-only
|
||||
|
||||
- name: Commit and push changes
|
||||
- name: Regenerate db_meta.json
|
||||
run: python3 utils/generate_db_meta.py
|
||||
|
||||
- name: Remove ambiguous main tag
|
||||
run: git tag -d main || true
|
||||
|
||||
- name: Check for meaningful changes
|
||||
id: check
|
||||
run: |
|
||||
git config --global user.name "Maigret autoupdate"
|
||||
git config --global user.email "soxoj@protonmail.com"
|
||||
echo `git name-rev ${{ github.event.pull_request.head.sha }} --name-only`
|
||||
export BRANCH=`git name-rev ${{ github.event.pull_request.head.sha }} --name-only | sed 's/remotes\/origin\///'`
|
||||
echo $BRANCH
|
||||
git remote -v
|
||||
git checkout $BRANCH
|
||||
git add sites.md
|
||||
git commit -m "Updated site list and statistics"
|
||||
git push origin $BRANCH
|
||||
REAL_CHANGES=$(git diff --unified=0 sites.md | grep '^[+-][^+-]' | grep -v 'The list was updated at' | wc -l)
|
||||
if [ "$REAL_CHANGES" -gt 0 ]; then
|
||||
echo "has_changes=true" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "has_changes=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Delete existing PR branch
|
||||
if: steps.check.outputs.has_changes == 'true'
|
||||
run: git push origin --delete auto/update-sites-list || true
|
||||
|
||||
- name: Create Pull Request
|
||||
if: steps.check.outputs.has_changes == 'true'
|
||||
uses: peter-evans/create-pull-request@v7
|
||||
with:
|
||||
token: ${{ secrets.GITHUB_TOKEN }}
|
||||
commit-message: "Updated site list and statistics"
|
||||
title: "Automated Sites List Update"
|
||||
body: "Automated changes to sites.md based on new Alexa rankings/statistics."
|
||||
branch: "auto/update-sites-list"
|
||||
base: main
|
||||
delete-branch: true
|
||||
+2
-1
@@ -42,4 +42,5 @@ settings.json
|
||||
|
||||
# other
|
||||
*.egg-info
|
||||
build
|
||||
build
|
||||
LLM
|
||||
|
||||
+249
-1
@@ -1,6 +1,254 @@
|
||||
# Changelog
|
||||
|
||||
## [Unreleased]
|
||||
## [0.5.0] - 2025-08-10
|
||||
* Site Supression by @C3n7ral051nt4g3ncy in https://github.com/soxoj/maigret/pull/627
|
||||
* Bump yarl from 1.7.2 to 1.8.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/626
|
||||
* Streaming sites by @soxoj in https://github.com/soxoj/maigret/pull/628
|
||||
* Mirrors by @fen0s in https://github.com/soxoj/maigret/pull/630
|
||||
* Added Instagram scrapers by @soxoj in https://github.com/soxoj/maigret/pull/633
|
||||
* Bump psutil from 5.9.1 to 5.9.2 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/624
|
||||
* Bump pypdf2 from 2.10.4 to 2.10.5 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/625
|
||||
* Invalid results fixes by @soxoj in https://github.com/soxoj/maigret/pull/634
|
||||
* Bump pytest-httpserver from 1.0.5 to 1.0.6 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/638
|
||||
* Bump pypdf2 from 2.10.5 to 2.10.8 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/641
|
||||
* Bump certifi from 2022.6.15 to 2022.9.14 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/644
|
||||
* Bump idna from 3.3 to 3.4 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/640
|
||||
* fix false positives from bot by @fen0s in https://github.com/soxoj/maigret/pull/663
|
||||
* Add pre commit hook by @fen0s in https://github.com/soxoj/maigret/pull/664
|
||||
* site deletion by @C3n7ral051nt4g3ncy in https://github.com/soxoj/maigret/pull/648
|
||||
* Changed docker run to interactive and remove on exit by @dr-BEat in https://github.com/soxoj/maigret/pull/675
|
||||
* Corrected grammar in README.md by @Trkzi-Omar in https://github.com/soxoj/maigret/pull/674
|
||||
* fix sites from issues by @fen0s in https://github.com/soxoj/maigret/pull/680
|
||||
* correct username in usage examples by @LeonGr in https://github.com/soxoj/maigret/pull/673
|
||||
* Update README.md by @johanburati in https://github.com/soxoj/maigret/pull/669
|
||||
* Fix typos by @LorenzoSapora in https://github.com/soxoj/maigret/pull/681
|
||||
* Build docker images for arm64 and amd64 by @krydos in https://github.com/soxoj/maigret/pull/687
|
||||
* Bump certifi from 2022.9.14 to 2022.9.24 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/652
|
||||
* Bump aiohttp from 3.8.1 to 3.8.3 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/651
|
||||
* Bump arabic-reshaper from 2.1.3 to 2.1.4 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/650
|
||||
* Update README.md, Repl.it -> Replit with new badge by @PeterDaveHello in https://github.com/soxoj/maigret/pull/692
|
||||
* Refactor Dockerfile with best practices by @PeterDaveHello in https://github.com/soxoj/maigret/pull/691
|
||||
* Improve README.md Installation section by @PeterDaveHello in https://github.com/soxoj/maigret/pull/690
|
||||
* Bump pytest-cov from 3.0.0 to 4.0.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/688
|
||||
* Bump stem from 1.8.0 to 1.8.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/689
|
||||
* Bump typing-extensions from 4.3.0 to 4.4.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/698
|
||||
* Typo fixes in error.py by @Ben-Chapman in https://github.com/soxoj/maigret/pull/711
|
||||
* Fixed docs about tags by @soxoj in https://github.com/soxoj/maigret/pull/715
|
||||
* Fixed lightstalking.com by @soxoj in https://github.com/soxoj/maigret/pull/716
|
||||
* Fixed YouTube by @soxoj in https://github.com/soxoj/maigret/pull/717
|
||||
* Bump pytest-asyncio from 0.19.0 to 0.20.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/732
|
||||
* Updated snapcraft yaml by @kz6fittycent in https://github.com/soxoj/maigret/pull/720
|
||||
* Bump colorama from 0.4.5 to 0.4.6 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/733
|
||||
* Bump pytest from 7.1.3 to 7.2.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/734
|
||||
* disable not working sites by @fen0s in https://github.com/soxoj/maigret/pull/739
|
||||
* disable broken sites by @fen0s in https://github.com/soxoj/maigret/pull/756
|
||||
* Bump cloudscraper from 1.2.64 to 1.2.66 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/769
|
||||
* fix opensea and shutterstock, disable a few dead sites by @fen0s in https://github.com/soxoj/maigret/pull/798
|
||||
* Fixed documentation URL by @soxoj in https://github.com/soxoj/maigret/pull/799
|
||||
* Small readme fix by @soxoj in https://github.com/soxoj/maigret/pull/857
|
||||
* docs spelling error by @Nadeem-05 in https://github.com/soxoj/maigret/pull/866
|
||||
* Fix Pinterest false positive by @therealchiendat in https://github.com/soxoj/maigret/pull/862
|
||||
* Added new Websites by @codyMar30 in https://github.com/soxoj/maigret/pull/838
|
||||
* Update "future" package to v0.18.3 by @PeterDaveHello in https://github.com/soxoj/maigret/pull/834
|
||||
* Bump certifi from 2022.9.24 to 2022.12.7 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/793
|
||||
* Update dependency - networkx from v2.5.1 to v2.6 by @PeterDaveHello in https://github.com/soxoj/maigret/pull/738
|
||||
* Bump reportlab from 3.6.11 to 3.6.12 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/735
|
||||
* Bump typing-extensions from 4.4.0 to 4.5.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/888
|
||||
* Bump psutil from 5.9.2 to 5.9.4 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/741
|
||||
* Bump attrs from 22.1.0 to 22.2.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/892
|
||||
* Bump multidict from 6.0.2 to 6.0.4 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/891
|
||||
* Fixed false positives, updated networkx dep, some lint fixes by @soxoj in https://github.com/soxoj/maigret/pull/894
|
||||
* Bump lxml from 4.9.1 to 4.9.2 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/900
|
||||
* Bump yarl from 1.8.1 to 1.8.2 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/899
|
||||
* Fixed false positives on Mastodon sites by @soxoj in https://github.com/soxoj/maigret/pull/901
|
||||
* Added valid regex for Mastodon instances (#848) by @soxoj in https://github.com/soxoj/maigret/pull/906
|
||||
* Fix missing Mastodon Regex on #906 by @therealchiendat in https://github.com/soxoj/maigret/pull/908
|
||||
* Bump tqdm from 4.64.1 to 4.65.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/905
|
||||
* Bump requests from 2.28.1 to 2.28.2 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/904
|
||||
* Bump psutil from 5.9.4 to 5.9.5 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/910
|
||||
* fix deployment of tests by @noraj in https://github.com/soxoj/maigret/pull/933
|
||||
* Added 26 ENS and similar domains with tag `crypto` by @soxoj in https://github.com/soxoj/maigret/pull/942
|
||||
* Bump requests from 2.28.2 to 2.31.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/957
|
||||
* Update wizard.py by @engNoori in https://github.com/soxoj/maigret/pull/1016
|
||||
* Improved search through UnstoppableDomains by @soxoj in https://github.com/soxoj/maigret/pull/1040
|
||||
* Added memory.lol (Twitter usernames archive) by @soxoj in https://github.com/soxoj/maigret/pull/1067
|
||||
* Disabled and fixed several sites by @soxoj in https://github.com/soxoj/maigret/pull/1132
|
||||
* Fixed some sites (again) by @soxoj in https://github.com/soxoj/maigret/pull/1133
|
||||
* fix(sec): upgrade reportlab to 3.6.13 by @realize096 in https://github.com/soxoj/maigret/pull/1051
|
||||
* Add compatibility with pytest >= 7.3.0 by @tjni in https://github.com/soxoj/maigret/pull/1117
|
||||
* Additionally fixed sites, win32 build fix by @soxoj in https://github.com/soxoj/maigret/pull/1148
|
||||
* Sites fixes 250823 by @soxoj in https://github.com/soxoj/maigret/pull/1149
|
||||
* Bump reportlab from 3.6.12 to 4.0.4 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1160
|
||||
* Bump certifi from 2022.12.7 to 2023.7.22 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1070
|
||||
* fix(sec): upgrade certifi to 2022.12.07 by @realize096 in https://github.com/soxoj/maigret/pull/1173
|
||||
* Bump cloudscraper from 1.2.66 to 1.2.71 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/914
|
||||
* Some sites fixed & cloudflare detection by @soxoj in https://github.com/soxoj/maigret/pull/1178
|
||||
* EasyInstaller because everyone likes saving time :) by @CatchySmile in https://github.com/soxoj/maigret/pull/1212
|
||||
* Tests fixes + last updates by @soxoj in https://github.com/soxoj/maigret/pull/1228
|
||||
* Bump pypdf2 from 2.10.8 to 3.0.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/815
|
||||
* Bump pyvis from 0.2.1 to 0.3.2 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/861
|
||||
* Bump xhtml2pdf from 0.2.8 to 0.2.11 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/935
|
||||
* Bump flake8 from 5.0.4 to 6.1.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1091
|
||||
* Bump aiohttp from 3.8.3 to 3.8.6 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1222
|
||||
* Specified pyinstaller version by @soxoj in https://github.com/soxoj/maigret/pull/1230
|
||||
* Pyinstaller fix by @soxoj in https://github.com/soxoj/maigret/pull/1231
|
||||
* Test pyinstaller on dev branch by @soxoj in https://github.com/soxoj/maigret/pull/1233
|
||||
* Update main from dev again by @soxoj in https://github.com/soxoj/maigret/pull/1234
|
||||
* Bump typing-extensions from 4.5.0 to 4.8.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1239
|
||||
* Bump pytest-rerunfailures from 10.2 to 12.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1237
|
||||
* Bump async-timeout from 4.0.2 to 4.0.3 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1238
|
||||
* Changed pyinstaller dir by @soxoj in https://github.com/soxoj/maigret/pull/1245
|
||||
* Bump tqdm from 4.65.0 to 4.66.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1235
|
||||
* Updating site checkers, disabling suspended sites by @MeowyPouncer in https://github.com/soxoj/maigret/pull/1266
|
||||
* Updated site statistics by @soxoj in https://github.com/soxoj/maigret/pull/1273
|
||||
* Compat RegataOS (Opensuse) by @Jeiel0rbit in https://github.com/soxoj/maigret/pull/1308
|
||||
* fix reddit by @hhhtylerw in https://github.com/soxoj/maigret/pull/1296
|
||||
* Added Telegram bot link by @soxoj in https://github.com/soxoj/maigret/pull/1321
|
||||
* Added SOWEL classification by @soxoj in https://github.com/soxoj/maigret/pull/1453
|
||||
* Bump jinja2 from 3.1.2 to 3.1.3 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1358
|
||||
* Fixed/Disabled sites. Update requirements.txt by @rly0nheart in https://github.com/soxoj/maigret/pull/1517
|
||||
* Fixed 4 sites, added 6 sites, disabled 27 sites by @rly0nheart in https://github.com/soxoj/maigret/pull/1536
|
||||
* Fixed 3 sites, disabed 3, added by @rly0nheart in https://github.com/soxoj/maigret/pull/1539
|
||||
* Bump socid-extractor from 0.0.24 to 0.0.26 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1546
|
||||
* Added code conventions to CONTRIBUTING.md by @Lord-Topa in https://github.com/soxoj/maigret/pull/1589
|
||||
* Readme by @Lord-Topa in https://github.com/soxoj/maigret/pull/1588
|
||||
* Update data.json by @ranlo in https://github.com/soxoj/maigret/pull/1559
|
||||
* Adding permutator feature for usernames by @balestek in https://github.com/soxoj/maigret/pull/1575
|
||||
* Alik.cz indirectly requests removal by @ppfeister in https://github.com/soxoj/maigret/pull/1671
|
||||
* Fixed 1 site, PyInstaller workflow, Google Colab example by @Ixve in https://github.com/soxoj/maigret/pull/1558
|
||||
* Bump soupsieve from 2.5 to 2.6 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1708
|
||||
* Added dev documentation, fixed some sites, removed GitHub issue links… by @soxoj in https://github.com/soxoj/maigret/pull/1869
|
||||
* Bump cryptography from 42.0.7 to 43.0.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1870
|
||||
* Bump requests-futures from 1.0.1 to 1.0.2 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1868
|
||||
* Bump werkzeug from 3.0.3 to 3.0.6 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1846
|
||||
* Added .readthedocs.yaml, fixed Pyinstaller and Docker workflows by @soxoj in https://github.com/soxoj/maigret/pull/1874
|
||||
* Added GitHub and BuyMeACoffee sponsorships by @soxoj in https://github.com/soxoj/maigret/pull/1875
|
||||
* Bump psutil from 5.9.5 to 6.1.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1839
|
||||
* Bump flake8 from 6.1.0 to 7.1.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1692
|
||||
* Bump future from 0.18.3 to 1.0.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1545
|
||||
* Bump urllib3 from 2.2.1 to 2.2.2 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1600
|
||||
* Bump certifi from 2023.11.17 to 2024.8.30 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1840
|
||||
* Fixed test for aiohttp 3.10 by @soxoj in https://github.com/soxoj/maigret/pull/1876
|
||||
* Bump aiohttp from 3.9.5 to 3.10.5 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1721
|
||||
* Added new badges to README by @soxoj in https://github.com/soxoj/maigret/pull/1877
|
||||
* Show detailed error statistics for `-v` by @soxoj in https://github.com/soxoj/maigret/pull/1879
|
||||
* Disabled unavailable sites by @soxoj in https://github.com/soxoj/maigret/pull/1880
|
||||
* Added 7 sites, implemented integration with Marple, docs update by @soxoj in https://github.com/soxoj/maigret/pull/1881
|
||||
* Bump pefile from 2022.5.30 to 2024.8.26 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1883
|
||||
* Bump lxml from 4.9.4 to 5.3.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1884
|
||||
* New sites added by @soxoj in https://github.com/soxoj/maigret/pull/1888
|
||||
* Improved self-check mode, added 15 sites by @soxoj in https://github.com/soxoj/maigret/pull/1887
|
||||
* Bump pyinstaller from 6.1 to 6.11.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1882
|
||||
* Bump pytest-asyncio from 0.23.7 to 0.23.8 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1885
|
||||
* Pyinstaller bump & pefile fix by @soxoj in https://github.com/soxoj/maigret/pull/1890
|
||||
* Bump python-bidi from 0.4.2 to 0.6.3 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1886
|
||||
* Sites checks fixes by @soxoj in https://github.com/soxoj/maigret/pull/1896
|
||||
* Parallel execution optimization by @soxoj in https://github.com/soxoj/maigret/pull/1897
|
||||
* Maigret bot support (custom progress function fixed) by @soxoj in https://github.com/soxoj/maigret/pull/1898
|
||||
* Bump markupsafe from 2.1.5 to 3.0.2 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1895
|
||||
* Retries set to 0 by default, refactored code of executor with progress by @soxoj in https://github.com/soxoj/maigret/pull/1899
|
||||
* Bump aiohttp-socks from 0.7.1 to 0.9.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1900
|
||||
* Bump pycountry from 23.12.11 to 24.6.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1903
|
||||
* Bump pytest-cov from 4.1.0 to 6.0.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1902
|
||||
* Bump pyvis from 0.2.1 to 0.3.2 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1893
|
||||
* Close http connections (#1595) by @soxoj in https://github.com/soxoj/maigret/pull/1905
|
||||
* New logo by @soxoj in https://github.com/soxoj/maigret/pull/1906
|
||||
* Fixed dateutil parsing error for CDT timezone by @soxoj in https://github.com/soxoj/maigret/pull/1907
|
||||
* Bump alive-progress from 2.4.1 to 3.2.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1910
|
||||
* Permutator output and documentation updates by @soxoj in https://github.com/soxoj/maigret/pull/1914
|
||||
* Bump aiohttp from 3.11.7 to 3.11.8 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1912
|
||||
* Bump async-timeout from 4.0.3 to 5.0.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1909
|
||||
* An recursive search animation in README has been updated by @soxoj in https://github.com/soxoj/maigret/pull/1915
|
||||
* Bump pytest-rerunfailures from 12.0 to 15.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1911
|
||||
* Bump attrs from 22.2.0 to 24.2.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1913
|
||||
* Sites fixes by @soxoj in https://github.com/soxoj/maigret/pull/1917
|
||||
* Update README.md by @soxoj in https://github.com/soxoj/maigret/pull/1919
|
||||
* Refactored sites module, updated documentation by @soxoj in https://github.com/soxoj/maigret/pull/1918
|
||||
* Bump aiohttp from 3.11.8 to 3.11.9 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1920
|
||||
* Bump pytest from 7.4.4 to 8.3.4 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1923
|
||||
* Bump yarl from 1.18.0 to 1.18.3 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1922
|
||||
* Bump pytest-asyncio from 0.23.8 to 0.24.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1925
|
||||
* Documentation update by @soxoj in https://github.com/soxoj/maigret/pull/1926
|
||||
* Bump mock from 4.0.3 to 5.1.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1921
|
||||
* Bump pywin32-ctypes from 0.2.1 to 0.2.3 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1924
|
||||
* Installation docs update by @soxoj in https://github.com/soxoj/maigret/pull/1927
|
||||
* Disabled Figma check by @soxoj in https://github.com/soxoj/maigret/pull/1928
|
||||
* Put Windows executable in Releases for each dev and main commit by @soxoj in https://github.com/soxoj/maigret/pull/1929
|
||||
* Updated PyInstaller workflow by @soxoj in https://github.com/soxoj/maigret/pull/1930
|
||||
* Documentation update by @soxoj in https://github.com/soxoj/maigret/pull/1931
|
||||
* Fixed Figma check and some bugs by @soxoj in https://github.com/soxoj/maigret/pull/1932
|
||||
* Bump six from 1.16.0 to 1.17.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1933
|
||||
* Activation mechanism documentation added by @soxoj in https://github.com/soxoj/maigret/pull/1935
|
||||
* Readme/docs update based on GH discussions by @soxoj in https://github.com/soxoj/maigret/pull/1936
|
||||
* Bump aiohttp from 3.11.9 to 3.11.10 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1937
|
||||
* Weibo site check fix, activation mechanism added by @soxoj in https://github.com/soxoj/maigret/pull/1938
|
||||
* Fixed Ebay and BongaCams checks by @soxoj in https://github.com/soxoj/maigret/pull/1939
|
||||
* Sites fixes by @soxoj in https://github.com/soxoj/maigret/pull/1940
|
||||
* Fixed Linktr and discourse.mozilla.org by @soxoj in https://github.com/soxoj/maigret/pull/1941
|
||||
* Refactored self-check method, code formatting, small lint fixes by @soxoj in https://github.com/soxoj/maigret/pull/1942
|
||||
* Refactoring, test coverage increased to 60% by @soxoj in https://github.com/soxoj/maigret/pull/1943
|
||||
* Added a test for submitter by @soxoj in https://github.com/soxoj/maigret/pull/1944
|
||||
* Update README.md by @soxoj in https://github.com/soxoj/maigret/pull/1949
|
||||
* Updated OP.GG checks by @soxoj in https://github.com/soxoj/maigret/pull/1950
|
||||
* Fixed ProductHunt check by @soxoj in https://github.com/soxoj/maigret/pull/1951
|
||||
* Improved check feature extraction function, added tests by @soxoj in https://github.com/soxoj/maigret/pull/1952
|
||||
* Submit improvements and site check fixes by @soxoj in https://github.com/soxoj/maigret/pull/1956
|
||||
* chore: update submit.py by @eltociear in https://github.com/soxoj/maigret/pull/1957
|
||||
* Fixed Gravatar parsing (socid_extractor) by @soxoj in https://github.com/soxoj/maigret/pull/1958
|
||||
* Site check fixes by @soxoj in https://github.com/soxoj/maigret/pull/1962
|
||||
* fix bad linux filename generation by @overcuriousity in https://github.com/soxoj/maigret/pull/1961
|
||||
* Bump pytest-asyncio from 0.24.0 to 0.25.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1963
|
||||
* Fixed flaky tests to check cookies by @soxoj in https://github.com/soxoj/maigret/pull/1965
|
||||
* Preparation of 0.5.0 alpha version by @soxoj in https://github.com/soxoj/maigret/pull/1966
|
||||
* Created web frontend launched via --web flag by @overcuriousity in https://github.com/soxoj/maigret/pull/1967
|
||||
* Bump certifi from 2024.8.30 to 2024.12.14 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1969
|
||||
* Bump attrs from 24.2.0 to 24.3.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1970
|
||||
* Added web interface docs by @soxoj in https://github.com/soxoj/maigret/pull/1972
|
||||
* Small docs and parameters fixes for web interface mode by @soxoj in https://github.com/soxoj/maigret/pull/1973
|
||||
* [ImgBot] Optimize images by @imgbot[bot] in https://github.com/soxoj/maigret/pull/1974
|
||||
* Improving the web interface by @overcuriousity in https://github.com/soxoj/maigret/pull/1975
|
||||
* make graph more meaningful by @overcuriousity in https://github.com/soxoj/maigret/pull/1977
|
||||
* Async generator-executor for site checks by @soxoj in https://github.com/soxoj/maigret/pull/1978
|
||||
* Bump aiohttp from 3.11.10 to 3.11.11 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1979
|
||||
* Bump psutil from 6.1.0 to 6.1.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1980
|
||||
* Bump aiohttp-socks from 0.9.1 to 0.10.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1985
|
||||
* Bump mypy from 1.13.0 to 1.14.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1983
|
||||
* Bump aiohttp-socks from 0.10.0 to 0.10.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1987
|
||||
* Bump jinja2 from 3.1.4 to 3.1.5 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1982
|
||||
* Bump coverage from 7.6.9 to 7.6.10 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1986
|
||||
* Bump pytest-asyncio from 0.25.0 to 0.25.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1989
|
||||
* Bump mypy from 1.14.0 to 1.14.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1988
|
||||
* Bump pytest-asyncio from 0.25.1 to 0.25.2 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/1990
|
||||
* docs: update usage-examples.rst by @eltociear in https://github.com/soxoj/maigret/pull/1996
|
||||
* upload-artifact action in python test workflow updated to v4 by @soxoj in https://github.com/soxoj/maigret/pull/2024
|
||||
* Pass db_file configuration to web interface by @pykereaper in https://github.com/soxoj/maigret/pull/2019
|
||||
* Fix usage of data.json files from web by @pykereaper in https://github.com/soxoj/maigret/pull/2020
|
||||
* Bump black from 24.10.0 to 25.1.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2001
|
||||
* Important Update Installer.bat by @CatchySmile in https://github.com/soxoj/maigret/pull/1994
|
||||
* Bump cryptography from 44.0.0 to 44.0.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2005
|
||||
* Bump jinja2 from 3.1.5 to 3.1.6 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2011
|
||||
* [#2010] Add 6 more websites to manage by @pylapp in https://github.com/soxoj/maigret/pull/2009
|
||||
* Bump flask from 3.1.0 to 3.1.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2028
|
||||
* Bump requests from 2.32.3 to 2.32.4 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2026
|
||||
* Bump pycares from 4.5.0 to 4.9.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2025
|
||||
* Bump pytest-asyncio from 0.25.2 to 0.26.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2016
|
||||
* Bump urllib3 from 2.2.3 to 2.5.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2027
|
||||
* Disable ICQ site by @Echo-Darlyson in https://github.com/soxoj/maigret/pull/1993
|
||||
* Bump attrs from 24.3.0 to 25.3.0 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2014
|
||||
* Bump certifi from 2024.12.14 to 2025.1.31 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2004
|
||||
* Bump typing-extensions from 4.12.2 to 4.14.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2038
|
||||
* Disable AskFM by @MR-VL in https://github.com/soxoj/maigret/pull/2037
|
||||
* Bump platformdirs from 4.3.6 to 4.3.8 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2033
|
||||
* Bump coverage from 7.6.10 to 7.9.2 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2039
|
||||
* Bump aiohttp from 3.11.11 to 3.12.14 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2041
|
||||
* Bump yarl from 1.18.3 to 1.20.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2032
|
||||
* Fixed test dialog_adds_site_negative by @soxoj in https://github.com/soxoj/maigret/pull/2107
|
||||
* Bump reportlab from 4.2.5 to 4.4.3 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2063
|
||||
* Bump asgiref from 3.8.1 to 3.9.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2040
|
||||
* Bump multidict from 6.1.0 to 6.6.3 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2034
|
||||
* Bump pytest-rerunfailures from 15.0 to 15.1 by @dependabot[bot] in https://github.com/soxoj/maigret/pull/2030
|
||||
|
||||
**Full Changelog**: https://github.com/soxoj/maigret/compare/v0.4.4...v0.5.0
|
||||
|
||||
## [0.4.4] - 2022-09-03
|
||||
* Fixed some false positives by @soxoj in https://github.com/soxoj/maigret/pull/433
|
||||
|
||||
+9
-7
@@ -1,16 +1,18 @@
|
||||
FROM python:3.10-slim
|
||||
FROM python:3.11-slim
|
||||
LABEL maintainer="Soxoj <soxoj@protonmail.com>"
|
||||
WORKDIR /app
|
||||
RUN pip install --no-cache-dir --upgrade pip
|
||||
RUN apt-get update && \
|
||||
apt-get install --no-install-recommends -y \
|
||||
gcc \
|
||||
musl-dev \
|
||||
libxml2 \
|
||||
build-essential \
|
||||
python3-dev \
|
||||
pkg-config \
|
||||
libcairo2-dev \
|
||||
libxml2-dev \
|
||||
libxslt-dev \
|
||||
&& \
|
||||
rm -rf /var/lib/apt/lists/* /tmp/*
|
||||
libxslt1-dev \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/*
|
||||
COPY . .
|
||||
RUN YARL_NO_EXTENSIONS=1 python3 -m pip install --no-cache-dir .
|
||||
# For production use, set FLASK_HOST to a specific IP address for security
|
||||
ENV FLASK_HOST=0.0.0.0
|
||||
ENTRYPOINT ["maigret"]
|
||||
|
||||
+71
-81
@@ -1,85 +1,61 @@
|
||||
@echo off
|
||||
|
||||
REM check if running as admin
|
||||
|
||||
goto check_Permissions
|
||||
|
||||
:check_Permissions
|
||||
echo Administrative permissions required. Detecting permissions...
|
||||
|
||||
net session >nul 2>&1
|
||||
if %errorLevel% == 0 (
|
||||
goto 1
|
||||
echo Success: Elevated permissions granted.
|
||||
) else (
|
||||
cls
|
||||
echo Failure: You MUST run this as administator, otherwise commands will fail.
|
||||
echo Failure: Requires elevated permissions.
|
||||
pause >nul
|
||||
)
|
||||
|
||||
pause >nul
|
||||
|
||||
|
||||
|
||||
REM Step 2: Check if Python and pip3 are installed
|
||||
python --version >nul 2>&1
|
||||
if %errorlevel% neq 0 (
|
||||
echo Python is not installed. Please install Python 3.8 or higher.
|
||||
pause
|
||||
exit /b
|
||||
)
|
||||
|
||||
pip3 --version >nul 2>&1
|
||||
if %errorlevel% neq 0 (
|
||||
echo pip3 is not installed. Please install pip3.
|
||||
pause
|
||||
exit /b
|
||||
)
|
||||
|
||||
REM Step 3: Check Python version
|
||||
python -c "import sys; exit(0) if sys.version_info >= (3,8) else exit(1)"
|
||||
if %errorlevel% neq 0 (
|
||||
echo Python version 3.8 or higher is required.
|
||||
pause
|
||||
exit /b
|
||||
)
|
||||
|
||||
|
||||
:1
|
||||
cls
|
||||
:::===============================================================
|
||||
::: ______ __ __ _ _
|
||||
::: | ____| | \/ | (_) | |
|
||||
::: | |__ __ _ ___ _ _ | \ / | __ _ _ __ _ _ __ ___| |_
|
||||
::: | __| / _` / __| | | | | |\/| |/ _` | |/ _` | '__/ _ \ __|
|
||||
::: | |___| (_| \__ \ |_| | | | | | (_| | | (_| | | | __/ |_
|
||||
::: |______\__,_|___/\__, | |_| |_|\__,_|_|\__, |_| \___|\__|
|
||||
::: __/ | __/ |
|
||||
::: |___/ |___/
|
||||
:::
|
||||
:::===============================================================
|
||||
echo.
|
||||
for /f "delims=: tokens=*" %%A in ('findstr /b ::: "%~f0"') do @echo(%%A
|
||||
echo.
|
||||
echo ----------------------------------------------------------------
|
||||
echo Python 3.8 or higher and pip3 required.
|
||||
echo ----------------------------------------------------------------
|
||||
echo Press [I] to begin installation.
|
||||
echo Press [R] If already installed.
|
||||
echo ----------------------------------------------------------------
|
||||
echo --------------------------------------------------------
|
||||
echo Python 3.8 or higher and pip3 required.
|
||||
echo --------------------------------------------------------
|
||||
echo Press [I] to begin installation.
|
||||
echo Press [R] If already installed.
|
||||
echo --------------------------------------------------------
|
||||
choice /c IR
|
||||
if %errorlevel%==1 goto install1
|
||||
if %errorlevel%==1 goto check_python
|
||||
if %errorlevel%==2 goto after
|
||||
|
||||
:check_python
|
||||
cls
|
||||
for /f "tokens=2 delims= " %%i in ('python --version 2^>nul') do (
|
||||
for /f "tokens=1,2 delims=." %%j in ("%%i") do (
|
||||
if %%j GEQ 3 (
|
||||
if %%k GEQ 8 (
|
||||
goto check_pip
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
echo Python 3.8 or higher is required. Please install it first.
|
||||
pause
|
||||
exit /b
|
||||
|
||||
:check_pip
|
||||
pip --version 2>nul | findstr /r /c:"pip" >nul
|
||||
if %errorlevel% neq 0 (
|
||||
echo pip is required. Please install it first.
|
||||
pause
|
||||
exit /b
|
||||
)
|
||||
goto install1
|
||||
|
||||
:install1
|
||||
cls
|
||||
echo ========================================================
|
||||
echo Maigret Installation Script
|
||||
echo Maigret Installation
|
||||
echo ========================================================
|
||||
echo.
|
||||
echo --------------------------------------------------------
|
||||
echo If your pip installation is outdated, it could cause
|
||||
echo cryptography to fail on installation.
|
||||
echo --------------------------------------------------------
|
||||
echo check for and install pip updates now?
|
||||
echo Check for and install pip 23.3.2 now?
|
||||
echo --------------------------------------------------------
|
||||
choice /c YN
|
||||
if %errorlevel%==1 goto install2
|
||||
@@ -87,42 +63,56 @@ if %errorlevel%==2 goto install3
|
||||
|
||||
:install2
|
||||
cls
|
||||
python -m pip install --upgrade pip
|
||||
goto:install3
|
||||
python -m pip install --upgrade pip==23.3.2
|
||||
if %errorlevel% neq 0 (
|
||||
echo Failed to update pip to version 23.3.2. Please check your installation.
|
||||
pause
|
||||
exit /b
|
||||
)
|
||||
goto install3
|
||||
|
||||
:install3
|
||||
cls
|
||||
echo ========================================================
|
||||
echo Maigret Installation Script
|
||||
echo Maigret Installation
|
||||
echo ========================================================
|
||||
echo.
|
||||
echo --------------------------------------------------------
|
||||
echo Install requirements and maigret?
|
||||
echo --------------------------------------------------------
|
||||
choice /c YN
|
||||
if %errorlevel%==1 goto install4
|
||||
if %errorlevel%==2 goto 1
|
||||
|
||||
:install4
|
||||
cls
|
||||
pip install .
|
||||
pip install maigret
|
||||
goto:after
|
||||
echo Installing Maigret...
|
||||
python -m pip install maigret
|
||||
if %errorlevel% neq 0 (
|
||||
echo Failed to install Maigret. Please check your installation.
|
||||
pause
|
||||
exit /b
|
||||
)
|
||||
echo.
|
||||
echo +------------------------------------------------------+
|
||||
echo Maigret installed successfully.
|
||||
echo +------------------------------------------------------+
|
||||
pause
|
||||
goto after
|
||||
|
||||
:after
|
||||
cls
|
||||
echo ========================================================
|
||||
echo Maigret Background Search
|
||||
echo Maigret Usage
|
||||
echo ========================================================
|
||||
echo.
|
||||
echo --------------------------------------------------------
|
||||
echo Please Enter Username / Email
|
||||
echo --------------------------------------------------------
|
||||
set /p input=
|
||||
maigret %input%
|
||||
echo +--------------------------------------------------------+
|
||||
echo To use Maigret, you can run the following command:
|
||||
echo.
|
||||
echo maigret [options] [username]
|
||||
echo.
|
||||
echo For example, to search for a username:
|
||||
echo.
|
||||
echo maigret example_username
|
||||
echo.
|
||||
echo For more options and usage details, refer to the Maigret documentation.
|
||||
echo.
|
||||
echo https://github.com/soxoj/maigret/blob/5b3b81b4822f6deb2e9c31eb95039907f25beb5e/README.md
|
||||
echo +--------------------------------------------------------+
|
||||
echo.
|
||||
cmd
|
||||
pause
|
||||
goto:after
|
||||
exit /b
|
||||
exit /b
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2019 Sherlock Project
|
||||
Copyright (c) 2020-2021 Soxoj
|
||||
Copyright (c) 2020-2026 Soxoj
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
|
||||
@@ -1,4 +0,0 @@
|
||||
include LICENSE
|
||||
include README.md
|
||||
include requirements.txt
|
||||
include maigret/resources/*
|
||||
@@ -25,7 +25,7 @@
|
||||
|
||||
<i>The Commissioner Jules Maigret is a fictional French police detective, created by Georges Simenon. His investigation method is based on understanding the personality of different people and their interactions.</i>
|
||||
|
||||
<b>👉👉👉 [Online Telegram bot](https://t.me/osint_maigret_bot)</b>
|
||||
<b>👉👉👉 [Online Telegram bot](https://t.me/maigret_search_bot) | 🏢 [Commercial use & API](#commercial-use)</b>
|
||||
|
||||
## About
|
||||
|
||||
@@ -53,7 +53,7 @@ See the full description of Maigret features [in the documentation](https://maig
|
||||
|
||||
## Installation
|
||||
|
||||
‼️ Maigret is available online via [official Telegram bot](https://t.me/osint_maigret_bot). Consider using it if you don't want to install anything.
|
||||
‼️ Maigret is available online via [official Telegram bot](https://t.me/maigret_search_bot). Consider using it if you don't want to install anything.
|
||||
|
||||
### Windows
|
||||
|
||||
@@ -112,6 +112,10 @@ docker run -v /mydir:/app/reports soxoj/maigret:latest username --html
|
||||
docker build -t maigret .
|
||||
```
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
If you encounter build errors during installation, check the [troubleshooting guide](https://maigret.readthedocs.io/en/latest/installation.html#troubleshooting).
|
||||
|
||||
## Usage examples
|
||||
|
||||
```bash
|
||||
@@ -192,6 +196,16 @@ The authors and developers of this tool bear no responsibility for any misuse or
|
||||
|
||||
If you have any questions, suggestions, or feedback, please feel free to [open an issue](https://github.com/soxoj/maigret/issues), create a [GitHub discussion](https://github.com/soxoj/maigret/discussions), or contact the author directly via [Telegram](https://t.me/soxoj).
|
||||
|
||||
## Commercial Use
|
||||
|
||||
If you need a **daily updated database** of supported sites or an **API for username checks**, feel free to reach out:
|
||||
|
||||
📧 [maigret@soxoj.com](mailto:maigret@soxoj.com)
|
||||
|
||||
Available options:
|
||||
- Up-to-date site database - regularly maintained and updated list of 5K+ sites, delivered daily
|
||||
- Username check API - programmatic access to Maigret's search capabilities for integration into your products
|
||||
|
||||
## SOWEL classification
|
||||
|
||||
This tool uses the following OSINT techniques:
|
||||
|
||||
@@ -31,14 +31,32 @@ two-letter country codes (**not a language!**). E.g. photo, dating, sport; jp, u
|
||||
Multiple tags can be associated with one site. **Warning**: tags markup is
|
||||
not stable now. Read more :doc:`in the separate section <tags>`.
|
||||
|
||||
``--exclude-tags`` - Exclude sites with specific tags from the search
|
||||
(blacklist). E.g. ``--exclude-tags porn,dating`` will skip all sites
|
||||
tagged with ``porn`` or ``dating``. Can be combined with ``--tags`` to
|
||||
include certain categories while excluding others. Read more
|
||||
:doc:`in the separate section <tags>`.
|
||||
|
||||
``-n``, ``--max-connections`` - Allowed number of concurrent connections
|
||||
**(default: 100)**.
|
||||
|
||||
``-a``, ``--all-sites`` - Use all sites for scan **(default: top 500)**.
|
||||
|
||||
``--top-sites`` - Count of sites for scan ranked by Alexa Top
|
||||
``--top-sites`` - Count of sites for scan ranked by Majestic Million
|
||||
**(default: top 500)**.
|
||||
|
||||
**Mirrors:** After the top *N* sites by Majestic Million rank are chosen (respecting
|
||||
``--tags``, ``--use-disabled-sites``, etc.), Maigret may add extra sites
|
||||
whose database field ``source`` names a **parent platform** that itself falls
|
||||
in the Majestic Million top *N* when ranking **including disabled** sites. For example,
|
||||
if ``Twitter`` ranks in the first 500 by Majestic Million, a mirror such as ``memory.lol``
|
||||
(with ``source: Twitter``) is included even though it has no rank and would
|
||||
otherwise be cut off. The same applies to Instagram-related mirrors (e.g.
|
||||
Picuki) when ``Instagram`` is in that parent top *N* by rank—even if the
|
||||
official ``Instagram`` entry is disabled and not scanned by default, its
|
||||
mirrors can still be pulled in. The final list is the ranked top *N* plus
|
||||
these mirrors (no fixed upper bound on mirror count).
|
||||
|
||||
``--timeout`` - Time (in seconds) to wait for responses from sites
|
||||
**(default: 30)**. A longer timeout will be more likely to get results
|
||||
from slow sites. On the other hand, this may cause a long delay to
|
||||
@@ -64,11 +82,63 @@ id types, sites will be filtered automatically.
|
||||
ids. Useful for repeated scanning with found known irrelevant usernames.
|
||||
|
||||
``--db`` - Load Maigret database from a JSON file or an online, valid,
|
||||
JSON file.
|
||||
JSON file. See :ref:`custom-database` below.
|
||||
|
||||
``--no-autoupdate`` - Disable the automatic database update check that
|
||||
runs at startup. The currently cached (or bundled) database is used
|
||||
as-is.
|
||||
|
||||
``--force-update`` - Force a database update check at startup, ignoring
|
||||
the usual check interval. Implies ``--no-autoupdate`` for the rest of
|
||||
the run after the explicit update finishes.
|
||||
|
||||
``--retries RETRIES`` - Count of attempts to restart temporarily failed
|
||||
requests.
|
||||
|
||||
.. _custom-database:
|
||||
|
||||
Using a custom sites database
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The ``--db`` flag accepts three forms:
|
||||
|
||||
1. **HTTP(S) URL** — fetched as-is, e.g.
|
||||
``--db https://example.com/my_db.json``.
|
||||
2. **Local file path** — absolute (``--db /tmp/private.json``) or
|
||||
relative to the current working directory
|
||||
(``--db LLM/maigret_private_db.json``).
|
||||
3. **Module-relative path** — kept for backwards compatibility, resolved
|
||||
against the installed ``maigret/`` package directory (e.g. the
|
||||
default ``resources/data.json``).
|
||||
|
||||
Resolution order for local paths: the path is first tried as given
|
||||
(absolute or cwd-relative); if that file does not exist, Maigret falls
|
||||
back to the legacy module-relative resolution. If neither location
|
||||
contains the file, Maigret exits with an error rather than silently
|
||||
loading the bundled database.
|
||||
|
||||
When ``--db`` points to a custom file, automatic database updates are
|
||||
skipped — the file is used exactly as provided.
|
||||
|
||||
On every run Maigret prints the database it actually loaded, for
|
||||
example::
|
||||
|
||||
[+] Using sites database: /path/to/maigret_private_db.json (6 sites)
|
||||
|
||||
If loading the requested database fails for any other reason (corrupt
|
||||
JSON, missing required keys, …), Maigret prints a warning, falls back
|
||||
to the bundled database, and reports the fallback explicitly::
|
||||
|
||||
[-] Falling back to bundled database: /…/maigret/resources/data.json
|
||||
[+] Using sites database: /…/maigret/resources/data.json (3154 sites)
|
||||
|
||||
A typical invocation against a private database, with auto-update
|
||||
disabled and all sites scanned, looks like::
|
||||
|
||||
python3 -m maigret username \
|
||||
--db LLM/maigret_private_db.json \
|
||||
--no-autoupdate -a
|
||||
|
||||
Reports
|
||||
-------
|
||||
|
||||
@@ -88,6 +158,9 @@ username).
|
||||
``-J``, ``--json`` - Generate a JSON report of specific type: simple,
|
||||
ndjson (one report per username). E.g. ``--json ndjson``
|
||||
|
||||
``-M``, ``--md`` - Generate a Markdown report (general report on all
|
||||
usernames). See :ref:`markdown-report` below.
|
||||
|
||||
``-fo``, ``--folderoutput`` - Results will be saved to this folder,
|
||||
``results`` by default. Will be created if doesn’t exist.
|
||||
|
||||
@@ -112,16 +185,60 @@ Other operations modes
|
||||
|
||||
``--version`` - Display version information and dependencies.
|
||||
|
||||
``--self-check`` - Do self-checking for sites and database and disable
|
||||
non-working ones **for current search session** by default. It’s useful
|
||||
for testing new internet connection (it depends on provider/hosting on
|
||||
which sites there will be censorship stub or captcha display). After
|
||||
checking Maigret asks if you want to save updates, answering y/Y will
|
||||
rewrite the local database.
|
||||
``--self-check`` - Do self-checking for sites and database. Each site is
|
||||
tested by looking up its known-claimed and known-unclaimed usernames and
|
||||
verifying that the results match expectations. Individual site failures
|
||||
(network errors, unexpected exceptions, etc.) are caught and logged
|
||||
without stopping the overall process, so the check always runs to
|
||||
completion. After checking, Maigret reports a summary of issues found.
|
||||
If any sites were disabled (see ``--auto-disable``), Maigret asks if you
|
||||
want to save updates; answering y/Y will rewrite the local database.
|
||||
|
||||
``--auto-disable`` - Used with ``--self-check``: automatically disable
|
||||
sites that fail checks (incorrect detection of claimed/unclaimed
|
||||
usernames, connection errors, or unexpected exceptions). Without this
|
||||
flag, ``--self-check`` only **reports** issues without modifying the
|
||||
database.
|
||||
|
||||
``--diagnose`` - Used with ``--self-check``: print detailed diagnosis
|
||||
information for each failing site, including the check type, the list
|
||||
of issues found, and recommendations (e.g. suggesting a different
|
||||
``checkType``).
|
||||
|
||||
``--submit URL`` - Do an automatic analysis of the given account URL or
|
||||
site main page URL to determine the site engine and methods to check
|
||||
account presence. After checking Maigret asks if you want to add the
|
||||
site, answering y/Y will rewrite the local database.
|
||||
|
||||
.. _markdown-report:
|
||||
|
||||
Markdown report (LLM-friendly)
|
||||
------------------------------
|
||||
|
||||
The ``--md`` / ``-M`` flag generates a Markdown report designed for both human reading and analysis by AI assistants (ChatGPT, Claude, etc.).
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret username --md
|
||||
|
||||
The report includes:
|
||||
|
||||
- **Summary** with aggregated personal data (all fullnames, locations, bios found across accounts), country tags, website tags, first/last seen timestamps.
|
||||
- **Per-account sections** with profile URL, site tags, and all extracted fields (username, bio, follower count, linked accounts, etc.).
|
||||
- **Possible false positives** disclaimer explaining that accounts may belong to different people.
|
||||
- **Ethical use** notice about applicable data protection laws.
|
||||
|
||||
**Using with AI tools:**
|
||||
|
||||
The Markdown format is optimized for LLM context windows. You can feed the report directly to an AI assistant for follow-up analysis:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
# Generate the report
|
||||
maigret johndoe --md
|
||||
|
||||
# Feed it to an AI tool
|
||||
cat reports/report_johndoe.md | llm "Analyze this OSINT report and summarize key findings"
|
||||
|
||||
The structured Markdown with per-site sections makes it easy for AI tools to extract relationships, cross-reference identities, and identify patterns across accounts.
|
||||
|
||||
|
||||
+2
-2
@@ -3,10 +3,10 @@
|
||||
# -- Project information
|
||||
|
||||
project = 'Maigret'
|
||||
copyright = '2024, soxoj'
|
||||
copyright = '2025, soxoj'
|
||||
author = 'soxoj'
|
||||
|
||||
release = '0.5.0a1'
|
||||
release = '0.5.0'
|
||||
version = '0.5'
|
||||
|
||||
# -- General configuration
|
||||
|
||||
@@ -22,8 +22,16 @@ The supported methods (``checkType`` values in ``data.json``) are:
|
||||
- ``status_code`` - checks that status code of the response is 2XX
|
||||
- ``response_url`` - check if there is not redirect and the response is 2XX
|
||||
|
||||
.. note::
|
||||
Maigret natively treats specific anti-bot HTTP status codes (like LinkedIn's ``HTTP 999``) as a standard "Not Found/Available" signal instead of throwing an infrastructure Server Error, gracefully preventing false positives.
|
||||
|
||||
See the details of check mechanisms in the `checking.py <https://github.com/soxoj/maigret/blob/main/maigret/checking.py#L339>`_ file.
|
||||
|
||||
.. note::
|
||||
Maigret now uses the **Majestic Million** dataset for site popularity sorting instead of the discontinued Alexa Rank API. For backward compatibility with existing configurations and parsers, the ranking field in `data.json` and internal site models remains named ``alexaRank`` and ``alexa_rank``.
|
||||
|
||||
**Mirrors and ``--top-sites``:** When you limit scans with ``--top-sites N``, Maigret also includes *mirror* sites (entries whose ``source`` field points at a parent platform such as Twitter or Instagram) if that parent would appear in the Majestic Million top *N* when disabled sites are considered for ranking. See the **Mirrors** paragraph under ``--top-sites`` in :doc:`command-line-options`.
|
||||
|
||||
Testing
|
||||
-------
|
||||
|
||||
@@ -61,6 +69,21 @@ Use the following commands to check Maigret:
|
||||
make speed
|
||||
|
||||
|
||||
Site naming conventions
|
||||
-----------------------------------------------
|
||||
|
||||
Site names are the keys in ``data.json`` and appear in user-facing reports. Follow these rules:
|
||||
|
||||
- **Title Case** by default: ``Product Hunt``, ``Hacker News``.
|
||||
- **Lowercase** only if the brand itself is written that way: ``kofi``, ``note``, ``hi5``.
|
||||
- **No domain suffix** (``calendly.com`` → ``Calendly``), unless the domain is part of the recognized brand name: ``last.fm``, ``VC.ru``, ``Archive.org``.
|
||||
- **No full UPPERCASE** unless the brand is an acronym: ``VK``, ``CNET``, ``ICQ``, ``IFTTT``.
|
||||
- **No** ``www.`` **or** ``https://`` **prefix** in the name.
|
||||
- **Spaces** are allowed when the brand uses them: ``Star Citizen``, ``Google Maps``.
|
||||
- **{username} templates** in names are acceptable: ``{username}.tilda.ws``.
|
||||
|
||||
When in doubt, check how the service refers to itself on its homepage.
|
||||
|
||||
How to fix false-positives
|
||||
-----------------------------------------------
|
||||
|
||||
@@ -112,6 +135,57 @@ There are few options for sites data.json helpful in various cases:
|
||||
- ``headers`` - a dictionary of additional headers to be sent to the site
|
||||
- ``requestHeadOnly`` - set to ``true`` if it's enough to make a HEAD request to the site
|
||||
- ``regexCheck`` - a regex to check if the username is valid, in case of frequent false-positives
|
||||
- ``requestMethod`` - set the HTTP method to use (e.g., ``POST``). By default, Maigret natively defaults to GET or HEAD.
|
||||
- ``requestPayload`` - a dictionary with the JSON payload to send for POST requests (e.g., ``{"username": "{username}"}``), extremely useful for parsing GraphQL or modern JSON APIs.
|
||||
- ``protection`` - a list of protection types detected on the site (see below).
|
||||
|
||||
``protection`` (site protection tracking)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The ``protection`` field records what kind of anti-bot protection a site uses. Maigret reads this field and automatically applies the appropriate bypass mechanism.
|
||||
|
||||
Supported values:
|
||||
|
||||
- ``tls_fingerprint`` — the site fingerprints the TLS handshake (JA3/JA4) and blocks non-browser clients. Maigret automatically uses ``curl_cffi`` with Chrome browser emulation to bypass this. Requires the ``curl_cffi`` package (included as a dependency). Examples: Instagram, NPM, Codepen, Kickstarter, Letterboxd.
|
||||
- ``ip_reputation`` — the site blocks requests from datacenter/cloud IPs regardless of headers or TLS. Cannot be bypassed automatically; run Maigret from a regular internet connection (not a datacenter) or use a proxy (``--proxy``). Examples: Reddit, Patreon, Figma.
|
||||
- ``js_challenge`` — the site serves a JavaScript challenge page (e.g. "Just a moment...") that cannot be solved without a browser. Maigret detects challenge signatures and returns UNKNOWN instead of a false positive.
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block:: json
|
||||
|
||||
"Instagram": {
|
||||
"url": "https://www.instagram.com/{username}/",
|
||||
"checkType": "message",
|
||||
"presenseStrs": ["\"routePath\":\"\\/"],
|
||||
"absenceStrs": ["\"routePath\":null"],
|
||||
"protection": ["tls_fingerprint"]
|
||||
}
|
||||
|
||||
``urlProbe`` (optional profile probe URL)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
By default Maigret performs the HTTP request to the same URL as ``url`` (the public profile link pattern).
|
||||
|
||||
If you set ``urlProbe`` in ``data.json``, Maigret **fetches** that URL for the presence check (API, GraphQL, JSON endpoint, etc.), while **reports and ``url_user``** still use ``url`` — the human-readable profile page users should open.
|
||||
|
||||
Placeholders: ``{username}``, ``{urlMain}``, ``{urlSubpath}`` (same as for ``url``). Example: GitHub uses ``url`` ``https://github.com/{username}`` and ``urlProbe`` ``https://api.github.com/users/{username}``; Picsart uses the web profile ``https://picsart.com/u/{username}`` and probes ``https://api.picsart.com/users/show/{username}.json``.
|
||||
|
||||
Implementation: ``make_site_result`` in `checking.py <https://github.com/soxoj/maigret/blob/main/maigret/checking.py>`_.
|
||||
|
||||
Site check fixes using LLM
|
||||
--------------------------
|
||||
|
||||
.. note::
|
||||
The ``LLM/`` directory at the root of the repository contains detailed instructions for editing site checks (in Markdown format): checklist, full guide to ``checkType`` / ``data.json`` / ``urlProbe``, handling false positives, searching for public JSON APIs, and the proposal log for ``socid_extractor``.
|
||||
|
||||
Main files:
|
||||
|
||||
- `site-checks-playbook.md <https://github.com/soxoj/maigret/blob/main/LLM/site-checks-playbook.md>`_ — short checklist
|
||||
- `site-checks-guide.md <https://github.com/soxoj/maigret/blob/main/LLM/site-checks-guide.md>`_ — detailed guide
|
||||
- `socid_extractor_improvements.log <https://github.com/soxoj/maigret/blob/main/LLM/socid_extractor_improvements.log>`_ — template and entries for identity extractor improvements
|
||||
|
||||
These files should be kept up-to-date whenever changes are made to the check logic in the code or in ``data.json``.
|
||||
|
||||
.. _activation-mechanism:
|
||||
|
||||
@@ -194,9 +268,10 @@ PyPi package.
|
||||
|
||||
2. Update Maigret version in three files manually:
|
||||
|
||||
- setup.py
|
||||
- pyproject.toml
|
||||
- maigret/__version__.py
|
||||
- docs/source/conf.py
|
||||
- docs/source/conf.py
|
||||
- snapcraft.yaml
|
||||
|
||||
3. Create a new empty text section in the beginning of the file `CHANGELOG.md` with a current date:
|
||||
|
||||
|
||||
@@ -170,6 +170,35 @@ Maigret will do retries of the requests with temporary errors got (connection fa
|
||||
|
||||
One attempt by default, can be changed with option ``--retries N``.
|
||||
|
||||
Database self-check
|
||||
-------------------
|
||||
|
||||
Maigret includes a self-check mode (``--self-check``) that validates every site
|
||||
in the database by looking up its known-claimed and known-unclaimed usernames
|
||||
and verifying that the detection results match expectations.
|
||||
|
||||
The self-check is **error-resilient**: if an individual site check raises an
|
||||
unexpected exception (e.g. a network error or a parsing failure), the error is
|
||||
caught, logged, and recorded as an issue — the remaining sites continue to be
|
||||
checked without interruption. This means the process always runs to completion,
|
||||
even when checking hundreds of sites with ``-a --self-check``.
|
||||
|
||||
Use ``--auto-disable`` together with ``--self-check`` to automatically disable
|
||||
sites that fail checks. Without it, issues are only reported. Use ``--diagnose``
|
||||
to print detailed per-site diagnosis including the check type, specific issues,
|
||||
and recommendations.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
# Report-only mode (no changes to the database)
|
||||
maigret --self-check
|
||||
|
||||
# Automatically disable failing sites and save updates
|
||||
maigret -a --self-check --auto-disable
|
||||
|
||||
# Show detailed diagnosis for each failing site
|
||||
maigret -a --self-check --diagnose
|
||||
|
||||
Archives and mirrors checking
|
||||
-----------------------------
|
||||
|
||||
|
||||
@@ -90,3 +90,39 @@ Docker
|
||||
|
||||
# manual build
|
||||
docker build -t maigret .
|
||||
|
||||
Troubleshooting
|
||||
---------------
|
||||
|
||||
If you encounter build errors during installation such as ``cannot find ft2build.h``
|
||||
or errors related to ``reportlab`` / ``_renderPM``, you need to install system-level
|
||||
dependencies required to compile native extensions.
|
||||
|
||||
**Debian/Ubuntu/Kali:**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo apt install -y libfreetype6-dev libjpeg-dev libffi-dev
|
||||
|
||||
**Fedora/RHEL/CentOS:**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo dnf install -y freetype-devel libjpeg-devel libffi-devel
|
||||
|
||||
**Arch Linux:**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo pacman -S freetype2 libjpeg-turbo libffi
|
||||
|
||||
**macOS (Homebrew):**
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
brew install freetype
|
||||
|
||||
After installing the system dependencies, retry the maigret installation.
|
||||
|
||||
If you continue to have issues, consider using Docker instead, which includes all
|
||||
necessary dependencies.
|
||||
|
||||
@@ -27,3 +27,77 @@ Missing any of these files is not an error.
|
||||
If the next settings file contains already known option,
|
||||
this option will be rewrited. So it is possible to make
|
||||
custom configuration for different users and directories.
|
||||
|
||||
.. _database-auto-update:
|
||||
|
||||
Database auto-update
|
||||
--------------------
|
||||
|
||||
Maigret ships with a bundled site database, but it gets outdated between releases. To keep the database current, Maigret automatically checks for updates on startup.
|
||||
|
||||
**How it works:**
|
||||
|
||||
1. On startup, Maigret checks if more than 24 hours have passed since the last update check.
|
||||
2. If so, it fetches a lightweight metadata file (~200 bytes) from GitHub to see if a newer database is available.
|
||||
3. If a newer, compatible database exists, Maigret downloads it to ``~/.maigret/data.json`` and uses it instead of the bundled copy.
|
||||
4. If the download fails or the new database is incompatible with your Maigret version, the bundled database is used as a fallback.
|
||||
|
||||
The downloaded database has **higher priority** than the bundled one — it replaces, not overlays.
|
||||
|
||||
**Status messages** are printed only when an action occurs:
|
||||
|
||||
.. code-block:: text
|
||||
|
||||
[*] DB auto-update: checking for updates...
|
||||
[+] DB auto-update: database updated successfully (3180 sites)
|
||||
[*] DB auto-update: database is up to date (3157 sites)
|
||||
[!] DB auto-update: latest database requires maigret >= 0.6.0, you have 0.5.0
|
||||
|
||||
**Forcing an update:**
|
||||
|
||||
Use the ``--force-update`` flag to check for updates immediately, ignoring the check interval:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret username --force-update
|
||||
|
||||
The update happens at startup, then the search continues normally with the freshly downloaded database.
|
||||
|
||||
**Disabling auto-update:**
|
||||
|
||||
Use the ``--no-autoupdate`` flag to skip the update check entirely:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret username --no-autoupdate
|
||||
|
||||
Or set it permanently in ``~/.maigret/settings.json``:
|
||||
|
||||
.. code-block:: json
|
||||
|
||||
{
|
||||
"no_autoupdate": true
|
||||
}
|
||||
|
||||
This is recommended for **Docker containers**, **CI pipelines**, and **air-gapped environments**.
|
||||
|
||||
**Configuration options** (in ``settings.json``):
|
||||
|
||||
.. list-table::
|
||||
:header-rows: 1
|
||||
:widths: 35 15 50
|
||||
|
||||
* - Setting
|
||||
- Default
|
||||
- Description
|
||||
* - ``no_autoupdate``
|
||||
- ``false``
|
||||
- Disable auto-update entirely
|
||||
* - ``autoupdate_check_interval_hours``
|
||||
- ``24``
|
||||
- How often to check for updates (in hours)
|
||||
* - ``db_update_meta_url``
|
||||
- GitHub raw URL
|
||||
- URL of the metadata file (for custom mirrors)
|
||||
|
||||
**Using a custom database** with ``--db`` always skips auto-update — you are explicitly choosing your data source.
|
||||
|
||||
+22
-1
@@ -10,7 +10,12 @@ The use of tags allows you to select a subset of the sites from big Maigret DB f
|
||||
|
||||
There are several types of tags:
|
||||
|
||||
1. **Country codes**: ``us``, ``jp``, ``br``... (`ISO 3166-1 alpha-2 <https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2>`_). These tags reflect the site language and regional origin of its users and are then used to locate the owner of a username. If the regional origin is difficult to establish or a site is positioned as worldwide, `no country code is given`. There could be multiple country code tags for one site.
|
||||
1. **Country codes**: ``us``, ``jp``, ``br``... (`ISO 3166-1 alpha-2 <https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2>`_). A country tag means that having an account on the site implies a connection to that country — either origin or residence. The goal is attribution, not perfect accuracy.
|
||||
|
||||
- **Global sites** (GitHub, YouTube, Reddit, Medium, etc.) get **no country tag** — an account there says nothing about where a person is from.
|
||||
- **Regional/local sites** where an account implies a specific country **must** have a country tag: ``VK`` → ``ru``, ``Naver`` → ``kr``, ``Zhihu`` → ``cn``.
|
||||
- Multiple country tags are allowed when a service is used predominantly in a few countries (e.g. ``Xing`` → ``de``, ``eu``).
|
||||
- Do **not** assign country tags based on traffic statistics alone — a site popular in India by traffic is not "Indian" if it is used globally.
|
||||
|
||||
2. **Site engines**. Most of them are forum engines now: ``uCoz``, ``vBulletin``, ``XenForo`` et al. Full list of engines stored in the Maigret database.
|
||||
|
||||
@@ -23,3 +28,19 @@ Usage
|
||||
``--tags coding`` -- search on sites related to software development.
|
||||
|
||||
``--tags ucoz`` -- search on uCoz sites only (mostly CIS countries)
|
||||
|
||||
Blacklisting (excluding) tags
|
||||
------------------------------
|
||||
You can exclude sites with certain tags from the search using ``--exclude-tags``:
|
||||
|
||||
``--exclude-tags porn,dating`` -- skip all sites tagged with ``porn`` or ``dating``.
|
||||
|
||||
``--exclude-tags ru`` -- skip all Russian sites.
|
||||
|
||||
You can combine ``--tags`` and ``--exclude-tags`` to fine-tune your search:
|
||||
|
||||
``--tags forum --exclude-tags ru`` -- search on forum sites, but skip Russian ones.
|
||||
|
||||
In the web interface, the tag cloud supports three states per tag:
|
||||
click once to **include** (green), click again to **exclude** (dark/strikethrough),
|
||||
and click once more to return to **neutral** (red).
|
||||
|
||||
@@ -5,7 +5,7 @@ Usage examples
|
||||
|
||||
You can use Maigret as:
|
||||
|
||||
- a command line tool: inital and a default mode
|
||||
- a command line tool: initial and a default mode
|
||||
- a `web interface <#web-interface>`_: view the graph with results and download all report formats on a single page
|
||||
- a library: integrate Maigret into your own project
|
||||
|
||||
@@ -13,7 +13,7 @@ Use Cases
|
||||
---------
|
||||
|
||||
|
||||
1. Search for accounts with username ``machine42`` on top 500 sites (by default, according to Alexa rank) from the Maigret DB.
|
||||
1. Search for accounts with username ``machine42`` on top 500 sites (by default, according to Majestic Million rank) from the Maigret DB.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
@@ -33,7 +33,7 @@ Use Cases
|
||||
If you experience many false positives, you can do the following:
|
||||
|
||||
- Install the last development version of Maigret from GitHub
|
||||
- Run Maigret with ``--self-check`` flag and agree on disabling of problematic sites
|
||||
- Run Maigret with ``--self-check --auto-disable`` flag and agree on disabling of problematic sites
|
||||
|
||||
3. Search for accounts with username ``machine42`` and generate HTML and PDF reports.
|
||||
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
"""Maigret version file"""
|
||||
|
||||
__version__ = '0.5.0a1'
|
||||
__version__ = '0.6.0'
|
||||
|
||||
+3
-14
@@ -30,17 +30,6 @@ class ParsingActivator:
|
||||
jwt_token = r.json()["jwt"]
|
||||
site.headers["Authorization"] = "jwt " + jwt_token
|
||||
|
||||
@staticmethod
|
||||
def spotify(site, logger, cookies={}):
|
||||
headers = dict(site.headers)
|
||||
if "Authorization" in headers:
|
||||
del headers["Authorization"]
|
||||
import requests
|
||||
|
||||
r = requests.get(site.activation["url"])
|
||||
bearer_token = r.json()["accessToken"]
|
||||
site.headers["authorization"] = f"Bearer {bearer_token}"
|
||||
|
||||
@staticmethod
|
||||
def weibo(site, logger):
|
||||
headers = dict(site.headers)
|
||||
@@ -54,7 +43,7 @@ class ParsingActivator:
|
||||
logger.debug(
|
||||
f"1 stage: {'success' if r.status_code == 302 else 'no 302 redirect, fail!'}"
|
||||
)
|
||||
location = r.headers.get("Location")
|
||||
location = r.headers.get("Location", "")
|
||||
|
||||
# 2 stage: go to passport visitor page
|
||||
headers["Referer"] = location
|
||||
@@ -84,9 +73,9 @@ def import_aiohttp_cookies(cookiestxt_filename):
|
||||
cookies = CookieJar()
|
||||
|
||||
cookies_list = []
|
||||
for domain in cookies_obj._cookies.values():
|
||||
for domain in cookies_obj._cookies.values(): # type: ignore[attr-defined]
|
||||
for key, cookie in list(domain.values())[0].items():
|
||||
c = Morsel()
|
||||
c: Morsel = Morsel()
|
||||
c.set(key, cookie.value, cookie.value)
|
||||
c["domain"] = cookie.domain
|
||||
c["path"] = cookie.path
|
||||
|
||||
+381
-141
@@ -6,7 +6,7 @@ import random
|
||||
import re
|
||||
import ssl
|
||||
import sys
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from urllib.parse import quote
|
||||
|
||||
# Third party imports
|
||||
@@ -15,7 +15,7 @@ from alive_progress import alive_bar
|
||||
from aiohttp import ClientSession, TCPConnector, http_exceptions
|
||||
from aiohttp.client_exceptions import ClientConnectorError, ServerDisconnectedError
|
||||
from python_socks import _errors as proxy_errors
|
||||
from socid_extractor import extract
|
||||
from socid_extractor import extract # type: ignore[import-not-found]
|
||||
|
||||
try:
|
||||
from mock import Mock
|
||||
@@ -48,15 +48,6 @@ SUPPORTED_IDS = (
|
||||
BAD_CHARS = "#"
|
||||
|
||||
|
||||
def is_cloudflare_bypass_active(value) -> bool:
|
||||
"""True if Cloudflare webgate URL rewrite should run (``--cloudflare-bypass`` or settings)."""
|
||||
if value is True:
|
||||
return True
|
||||
if isinstance(value, dict):
|
||||
return bool(value.get("enabled", False))
|
||||
return False
|
||||
|
||||
|
||||
class CheckerBase:
|
||||
pass
|
||||
|
||||
@@ -70,30 +61,49 @@ class SimpleAiohttpChecker(CheckerBase):
|
||||
self.headers = None
|
||||
self.allow_redirects = True
|
||||
self.timeout = 0
|
||||
self.allow_redirects = True
|
||||
self.timeout = 0
|
||||
self.method = 'get'
|
||||
self.payload = None
|
||||
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
|
||||
self.url = url
|
||||
self.headers = headers
|
||||
self.allow_redirects = allow_redirects
|
||||
self.timeout = timeout
|
||||
self.method = method
|
||||
self.payload = payload
|
||||
return None
|
||||
|
||||
async def close(self):
|
||||
pass
|
||||
|
||||
async def _make_request(
|
||||
self, session, url, headers, allow_redirects, timeout, method, logger
|
||||
) -> Tuple[str, int, Optional[CheckError]]:
|
||||
self, session, url, headers, allow_redirects, timeout, method, logger, payload=None
|
||||
) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
try:
|
||||
request_method = session.get if method == 'get' else session.head
|
||||
async with request_method(
|
||||
url=url,
|
||||
headers=headers,
|
||||
allow_redirects=allow_redirects,
|
||||
timeout=timeout,
|
||||
) as response:
|
||||
if method.lower() == 'get':
|
||||
request_method = session.get
|
||||
elif method.lower() == 'post':
|
||||
request_method = session.post
|
||||
elif method.lower() == 'head':
|
||||
request_method = session.head
|
||||
else:
|
||||
request_method = session.get
|
||||
|
||||
kwargs = {
|
||||
'url': url,
|
||||
'headers': headers,
|
||||
'allow_redirects': allow_redirects,
|
||||
'timeout': timeout,
|
||||
}
|
||||
if payload and method.lower() == 'post':
|
||||
if headers and headers.get('Content-Type') == 'application/x-www-form-urlencoded':
|
||||
kwargs['data'] = payload
|
||||
else:
|
||||
kwargs['json'] = payload
|
||||
|
||||
async with request_method(**kwargs) as response:
|
||||
status_code = response.status
|
||||
response_content = await response.content.read()
|
||||
charset = response.charset or "utf-8"
|
||||
@@ -126,15 +136,21 @@ class SimpleAiohttpChecker(CheckerBase):
|
||||
logger.debug(e, exc_info=True)
|
||||
return None, 0, CheckError("Unexpected", str(e))
|
||||
|
||||
async def check(self) -> Tuple[str, int, Optional[CheckError]]:
|
||||
async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
from aiohttp_socks import ProxyConnector
|
||||
|
||||
# Use a real SSL context instead of ssl=False to avoid TLS fingerprinting
|
||||
# blocks by Cloudflare and similar WAFs. Certificate verification is
|
||||
# disabled to handle sites with invalid/expired certs.
|
||||
ssl_context = ssl.create_default_context()
|
||||
ssl_context.check_hostname = False
|
||||
ssl_context.verify_mode = ssl.CERT_NONE
|
||||
|
||||
connector = (
|
||||
ProxyConnector.from_url(self.proxy)
|
||||
if self.proxy
|
||||
else TCPConnector(ssl=False)
|
||||
else TCPConnector(ssl=ssl_context)
|
||||
)
|
||||
connector.verify_ssl = False
|
||||
|
||||
async with ClientSession(
|
||||
connector=connector,
|
||||
@@ -150,6 +166,7 @@ class SimpleAiohttpChecker(CheckerBase):
|
||||
self.timeout,
|
||||
self.method,
|
||||
self.logger,
|
||||
self.payload,
|
||||
)
|
||||
|
||||
if error and str(error) == "Invalid proxy response":
|
||||
@@ -174,11 +191,11 @@ class AiodnsDomainResolver(CheckerBase):
|
||||
self.logger = kwargs.get('logger', Mock())
|
||||
self.resolver = aiodns.DNSResolver(loop=loop)
|
||||
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
|
||||
self.url = url
|
||||
return None
|
||||
|
||||
async def check(self) -> Tuple[str, int, Optional[CheckError]]:
|
||||
async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
status = 404
|
||||
error = None
|
||||
text = ''
|
||||
@@ -196,14 +213,84 @@ class AiodnsDomainResolver(CheckerBase):
|
||||
return text, status, error
|
||||
|
||||
|
||||
try:
|
||||
from curl_cffi.requests import AsyncSession as CurlCffiAsyncSession
|
||||
|
||||
CURL_CFFI_AVAILABLE = True
|
||||
except ImportError:
|
||||
CURL_CFFI_AVAILABLE = False
|
||||
|
||||
|
||||
class CurlCffiChecker(CheckerBase):
|
||||
"""Checker using curl_cffi to emulate browser TLS fingerprint and bypass WAF."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.logger = kwargs.get('logger', Mock())
|
||||
self.browser_emulate = kwargs.get('browser_emulate', 'chrome')
|
||||
self.url = None
|
||||
self.headers = None
|
||||
self.allow_redirects = True
|
||||
self.timeout = 0
|
||||
self.method = 'get'
|
||||
self.payload = None
|
||||
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
|
||||
self.url = url
|
||||
self.headers = headers
|
||||
self.allow_redirects = allow_redirects
|
||||
self.timeout = timeout
|
||||
self.method = method
|
||||
self.payload = payload
|
||||
return None
|
||||
|
||||
async def close(self):
|
||||
pass
|
||||
|
||||
async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
try:
|
||||
async with CurlCffiAsyncSession() as session:
|
||||
kwargs = {
|
||||
'url': self.url,
|
||||
'headers': self.headers,
|
||||
'allow_redirects': self.allow_redirects,
|
||||
'timeout': self.timeout if self.timeout else 10,
|
||||
'impersonate': self.browser_emulate,
|
||||
}
|
||||
if self.payload and self.method.lower() == 'post':
|
||||
kwargs['json'] = self.payload
|
||||
|
||||
if self.method.lower() == 'post':
|
||||
response = await session.post(**kwargs)
|
||||
elif self.method.lower() == 'head':
|
||||
response = await session.head(**kwargs)
|
||||
else:
|
||||
response = await session.get(**kwargs)
|
||||
|
||||
status_code = response.status_code
|
||||
decoded_content = response.text
|
||||
|
||||
self.logger.debug(decoded_content)
|
||||
|
||||
error = CheckError("Connection lost") if status_code == 0 else None
|
||||
return decoded_content, status_code, error
|
||||
|
||||
except asyncio.TimeoutError as e:
|
||||
return None, 0, CheckError("Request timeout", str(e))
|
||||
except KeyboardInterrupt:
|
||||
return None, 0, CheckError("Interrupted")
|
||||
except Exception as e:
|
||||
self.logger.debug(e, exc_info=True)
|
||||
return None, 0, CheckError("Unexpected", str(e))
|
||||
|
||||
|
||||
class CheckerMock:
|
||||
def __init__(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get', payload=None):
|
||||
return None
|
||||
|
||||
async def check(self) -> Tuple[str, int, Optional[CheckError]]:
|
||||
async def check(self) -> Tuple[Optional[str], int, Optional[CheckError]]:
|
||||
await asyncio.sleep(0)
|
||||
return '', 0, None
|
||||
|
||||
@@ -229,6 +316,11 @@ def detect_error_page(
|
||||
if status_code == 403 and not ignore_403:
|
||||
return CheckError("Access denied", "403 status code, use proxy/vpn")
|
||||
|
||||
elif status_code == 999:
|
||||
# LinkedIn anti-bot / HTTP 999 workaround. It shouldn't trigger an infrastructure
|
||||
# Server Error because it represents a valid "Not Found / Blocked" state for the username.
|
||||
pass
|
||||
|
||||
elif status_code >= 500:
|
||||
return CheckError("Server", f"{status_code} status code")
|
||||
|
||||
@@ -316,6 +408,12 @@ def process_site_result(
|
||||
|
||||
if html_text:
|
||||
if not presense_flags:
|
||||
if check_type == "message" and logger.isEnabledFor(logging.DEBUG):
|
||||
logger.debug(
|
||||
"Site %s uses checkType message with empty presenseStrs; "
|
||||
"presence is treated as true for any page.",
|
||||
site.name,
|
||||
)
|
||||
is_presense_detected = True
|
||||
site.stats["presense_flag"] = None
|
||||
else:
|
||||
@@ -358,7 +456,7 @@ def process_site_result(
|
||||
result = build_result(MaigretCheckStatus.CLAIMED)
|
||||
else:
|
||||
result = build_result(MaigretCheckStatus.AVAILABLE)
|
||||
elif check_type in "status_code":
|
||||
elif check_type == "status_code":
|
||||
# Checks if the status code of the response is 2XX
|
||||
if 200 <= status_code < 300:
|
||||
result = build_result(MaigretCheckStatus.CLAIMED)
|
||||
@@ -440,14 +538,19 @@ def make_site_result(
|
||||
|
||||
# workaround to prevent slash errors
|
||||
url = re.sub("(?<!:)/+", "/", url)
|
||||
url_probe = site.url_probe
|
||||
|
||||
if 'cloudflare' in site.tags and options.get("cloudflare_bypass"):
|
||||
url_probe = 'http://localhost:8000/html?url=' + url
|
||||
logger.info(f"Using cloudflare proxy for {site.name}")
|
||||
|
||||
# always clearweb_checker for now
|
||||
checker = options["checkers"][site.protocol]
|
||||
# Select checker: use curl_cffi for sites requiring TLS impersonation
|
||||
needs_impersonation = 'tls_fingerprint' in site.protection
|
||||
if needs_impersonation and CURL_CFFI_AVAILABLE:
|
||||
checker = CurlCffiChecker(logger=logger, browser_emulate='chrome')
|
||||
elif needs_impersonation and not CURL_CFFI_AVAILABLE:
|
||||
logger.warning(
|
||||
f"Site {site.name} requires TLS impersonation (curl_cffi) but it's not installed. "
|
||||
"Install with: pip install curl_cffi"
|
||||
)
|
||||
checker = options["checkers"][site.protocol]
|
||||
else:
|
||||
checker = options["checkers"][site.protocol]
|
||||
|
||||
# site check is disabled
|
||||
if site.disabled and not options['forced']:
|
||||
@@ -486,6 +589,7 @@ def make_site_result(
|
||||
else:
|
||||
# URL of user on site (if it exists)
|
||||
results_site["url_user"] = url
|
||||
url_probe = site.url_probe
|
||||
if url_probe is None:
|
||||
# Probe URL is normal one seen by people out on the web.
|
||||
url_probe = url
|
||||
@@ -501,7 +605,9 @@ def make_site_result(
|
||||
for k, v in site.get_params.items():
|
||||
url_probe += f"&{k}={v}"
|
||||
|
||||
if site.check_type == "status_code" and site.request_head_only:
|
||||
if site.request_method:
|
||||
request_method = site.request_method.lower()
|
||||
elif site.check_type == "status_code" and site.request_head_only:
|
||||
# In most cases when we are detecting by status code,
|
||||
# it is not necessary to get the entire body: we can
|
||||
# detect fine with just the HEAD response.
|
||||
@@ -512,6 +618,15 @@ def make_site_result(
|
||||
# not respond properly unless we request the whole page.
|
||||
request_method = 'get'
|
||||
|
||||
payload = None
|
||||
if site.request_payload:
|
||||
payload = {}
|
||||
for k, v in site.request_payload.items():
|
||||
if isinstance(v, str):
|
||||
payload[k] = v.format(username=username)
|
||||
else:
|
||||
payload[k] = v
|
||||
|
||||
if site.check_type == "response_url":
|
||||
# Site forwards request to a different URL if username not
|
||||
# found. Disallow the redirect so we can capture the
|
||||
@@ -528,6 +643,7 @@ def make_site_result(
|
||||
headers=headers,
|
||||
allow_redirects=allow_redirects,
|
||||
timeout=options['timeout'],
|
||||
payload=payload,
|
||||
)
|
||||
|
||||
# Store future request object in the results object
|
||||
@@ -554,6 +670,39 @@ async def check_site_for_username(
|
||||
return site.name, default_result
|
||||
|
||||
response = await checker.check()
|
||||
html_text = response[0] if response and response[0] else ""
|
||||
|
||||
# Retry once after token-style activation (e.g. Twitter guest token refresh).
|
||||
act = site.activation
|
||||
if act and html_text:
|
||||
marks = act.get("marks") or []
|
||||
if marks and any(m in html_text for m in marks):
|
||||
method = act["method"]
|
||||
try:
|
||||
activate_fun = getattr(ParsingActivator(), method)
|
||||
activate_fun(site, logger)
|
||||
except AttributeError as e:
|
||||
logger.warning(
|
||||
f"Activation method {method} for site {site.name} not found!",
|
||||
exc_info=True,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed activation {method} for site {site.name}: {str(e)}",
|
||||
exc_info=True,
|
||||
)
|
||||
else:
|
||||
merged = dict(checker.headers or {})
|
||||
merged.update(site.headers)
|
||||
checker.prepare(
|
||||
url=checker.url,
|
||||
headers=merged,
|
||||
allow_redirects=checker.allow_redirects,
|
||||
timeout=checker.timeout,
|
||||
method=checker.method,
|
||||
payload=getattr(checker, 'payload', None),
|
||||
)
|
||||
response = await checker.check()
|
||||
|
||||
response_result = process_site_result(
|
||||
response, query_notify, logger, default_result, site
|
||||
@@ -602,7 +751,6 @@ async def maigret(
|
||||
cookies=None,
|
||||
retries=0,
|
||||
check_domains=False,
|
||||
cloudflare_bypass=False,
|
||||
*args,
|
||||
**kwargs,
|
||||
) -> QueryResultWrapper:
|
||||
@@ -701,14 +849,12 @@ async def maigret(
|
||||
options["timeout"] = timeout
|
||||
options["id_type"] = id_type
|
||||
options["forced"] = forced
|
||||
options["cloudflare_bypass"] = is_cloudflare_bypass_active(cloudflare_bypass)
|
||||
|
||||
# results from analysis of all sites
|
||||
all_results: Dict[str, QueryResultWrapper] = {}
|
||||
|
||||
sites = list(site_dict.keys())
|
||||
|
||||
executor_limit = timeout + 0.5
|
||||
attempts = retries + 1
|
||||
while attempts:
|
||||
tasks_dict = {}
|
||||
@@ -723,11 +869,7 @@ async def maigret(
|
||||
sitename,
|
||||
'',
|
||||
MaigretCheckStatus.UNKNOWN,
|
||||
error=CheckError(
|
||||
'Request timeout',
|
||||
f'No response within {executor_limit:.1f}s per site '
|
||||
f'(increase --timeout or use --no-progressbar)',
|
||||
),
|
||||
error=CheckError('Request failed'),
|
||||
),
|
||||
}
|
||||
tasks_dict[sitename] = (
|
||||
@@ -743,7 +885,7 @@ async def maigret(
|
||||
with alive_bar(
|
||||
len(tasks_dict), title="Searching", force_tty=True, disable=no_progressbar
|
||||
) as progress:
|
||||
async for result in executor.run(tasks_dict.values()):
|
||||
async for result in executor.run(list(tasks_dict.values())): # type: ignore[arg-type]
|
||||
cur_results.append(result)
|
||||
progress()
|
||||
|
||||
@@ -808,93 +950,160 @@ async def site_self_check(
|
||||
i2p_proxy=None,
|
||||
skip_errors=False,
|
||||
cookies=None,
|
||||
cloudflare_bypass=False,
|
||||
auto_disable=False,
|
||||
diagnose=False,
|
||||
):
|
||||
changes = {
|
||||
"""
|
||||
Self-check a site configuration.
|
||||
|
||||
Args:
|
||||
auto_disable: If True, automatically disable sites that fail checks.
|
||||
If False (default), only report issues without disabling.
|
||||
diagnose: If True, print detailed diagnosis information.
|
||||
"""
|
||||
changes: Dict[str, Any] = {
|
||||
"disabled": False,
|
||||
"issues": [],
|
||||
"recommendations": [],
|
||||
}
|
||||
|
||||
check_data = [
|
||||
(site.username_claimed, MaigretCheckStatus.CLAIMED),
|
||||
(site.username_unclaimed, MaigretCheckStatus.AVAILABLE),
|
||||
]
|
||||
try:
|
||||
check_data = [
|
||||
(site.username_claimed, MaigretCheckStatus.CLAIMED),
|
||||
(site.username_unclaimed, MaigretCheckStatus.AVAILABLE),
|
||||
]
|
||||
|
||||
logger.info(f"Checking {site.name}...")
|
||||
logger.info(f"Checking {site.name}...")
|
||||
|
||||
for username, status in check_data:
|
||||
async with semaphore:
|
||||
results_dict = await maigret(
|
||||
username=username,
|
||||
site_dict={site.name: site},
|
||||
logger=logger,
|
||||
timeout=30,
|
||||
id_type=site.type,
|
||||
forced=True,
|
||||
no_progressbar=True,
|
||||
retries=1,
|
||||
proxy=proxy,
|
||||
tor_proxy=tor_proxy,
|
||||
i2p_proxy=i2p_proxy,
|
||||
cookies=cookies,
|
||||
cloudflare_bypass=cloudflare_bypass,
|
||||
)
|
||||
results_cache = {}
|
||||
|
||||
# don't disable entries with other ids types
|
||||
# TODO: make normal checking
|
||||
if site.name not in results_dict:
|
||||
logger.info(results_dict)
|
||||
changes["disabled"] = True
|
||||
continue
|
||||
|
||||
logger.debug(results_dict)
|
||||
|
||||
result = results_dict[site.name]["status"]
|
||||
|
||||
if result.error and 'Cannot connect to host' in result.error.desc:
|
||||
changes["disabled"] = True
|
||||
|
||||
site_status = result.status
|
||||
|
||||
if site_status != status:
|
||||
if site_status == MaigretCheckStatus.UNKNOWN:
|
||||
msgs = site.absence_strs
|
||||
etype = site.check_type
|
||||
logger.warning(
|
||||
f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}"
|
||||
for username, status in check_data:
|
||||
async with semaphore:
|
||||
results_dict = await maigret(
|
||||
username=username,
|
||||
site_dict={site.name: site},
|
||||
logger=logger,
|
||||
timeout=30,
|
||||
id_type=site.type,
|
||||
forced=True,
|
||||
no_progressbar=True,
|
||||
retries=1,
|
||||
proxy=proxy,
|
||||
tor_proxy=tor_proxy,
|
||||
i2p_proxy=i2p_proxy,
|
||||
cookies=cookies,
|
||||
)
|
||||
# don't disable sites after the error
|
||||
# meaning that the site could be available, but returned error for the check
|
||||
# e.g. many sites protected by cloudflare and available in general
|
||||
if skip_errors:
|
||||
pass
|
||||
# don't disable in case of available username
|
||||
elif status == MaigretCheckStatus.CLAIMED:
|
||||
|
||||
# don't disable entries with other ids types
|
||||
# TODO: make normal checking
|
||||
if site.name not in results_dict:
|
||||
logger.info(results_dict)
|
||||
changes["issues"].append(f"Site {site.name} not in results (wrong id_type?)")
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
continue
|
||||
|
||||
logger.debug(results_dict)
|
||||
|
||||
result = results_dict[site.name]["status"]
|
||||
results_cache[username] = results_dict[site.name]
|
||||
|
||||
if result.error and 'Cannot connect to host' in result.error.desc:
|
||||
changes["issues"].append("Cannot connect to host")
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
elif status == MaigretCheckStatus.CLAIMED:
|
||||
logger.warning(
|
||||
f"Not found `{username}` in {site.name}, must be claimed"
|
||||
)
|
||||
logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
else:
|
||||
logger.warning(f"Found `{username}` in {site.name}, must be available")
|
||||
logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
|
||||
logger.info(f"Site {site.name} checking is finished")
|
||||
site_status = result.status
|
||||
|
||||
if changes["disabled"] != site.disabled:
|
||||
site.disabled = changes["disabled"]
|
||||
logger.info(f"Switching property 'disabled' for {site.name} to {site.disabled}")
|
||||
db.update_site(site)
|
||||
if not silent:
|
||||
action = "Disabled" if site.disabled else "Enabled"
|
||||
print(f"{action} site {site.name}...")
|
||||
if site_status != status:
|
||||
if site_status == MaigretCheckStatus.UNKNOWN:
|
||||
msgs = site.absence_strs
|
||||
etype = site.check_type
|
||||
error_msg = f"Error checking {username}: {result.context}"
|
||||
changes["issues"].append(error_msg)
|
||||
logger.warning(
|
||||
f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}"
|
||||
)
|
||||
# don't disable sites after the error
|
||||
# meaning that the site could be available, but returned error for the check
|
||||
# e.g. many sites protected by cloudflare and available in general
|
||||
if skip_errors:
|
||||
pass
|
||||
# don't disable in case of available username
|
||||
elif status == MaigretCheckStatus.CLAIMED and auto_disable:
|
||||
changes["disabled"] = True
|
||||
elif status == MaigretCheckStatus.CLAIMED:
|
||||
changes["issues"].append(f"Claimed user '{username}' not detected as claimed")
|
||||
logger.warning(
|
||||
f"Not found `{username}` in {site.name}, must be claimed"
|
||||
)
|
||||
logger.info(results_dict[site.name])
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
else:
|
||||
changes["issues"].append(f"Unclaimed user '{username}' detected as claimed")
|
||||
logger.warning(f"Found `{username}` in {site.name}, must be available")
|
||||
logger.info(results_dict[site.name])
|
||||
if auto_disable:
|
||||
changes["disabled"] = True
|
||||
|
||||
# remove service tag "unchecked"
|
||||
if "unchecked" in site.tags:
|
||||
site.tags.remove("unchecked")
|
||||
db.update_site(site)
|
||||
logger.info(f"Site {site.name} checking is finished")
|
||||
|
||||
# Generate recommendations based on issues
|
||||
if changes["issues"] and len(results_cache) == 2:
|
||||
claimed_result = results_cache.get(site.username_claimed, {})
|
||||
unclaimed_result = results_cache.get(site.username_unclaimed, {})
|
||||
|
||||
claimed_http = claimed_result.get("http_status")
|
||||
unclaimed_http = unclaimed_result.get("http_status")
|
||||
|
||||
if claimed_http and unclaimed_http:
|
||||
if claimed_http != unclaimed_http and site.check_type != "status_code":
|
||||
changes["recommendations"].append(
|
||||
f"Consider checkType: status_code (HTTP {claimed_http} vs {unclaimed_http})"
|
||||
)
|
||||
|
||||
# Print diagnosis if requested
|
||||
if diagnose and changes["issues"]:
|
||||
print(f"\n--- {site.name} DIAGNOSIS ---")
|
||||
print(f" Check type: {site.check_type}")
|
||||
print(" Issues:")
|
||||
for issue in changes["issues"]:
|
||||
print(f" - {issue}")
|
||||
if changes["recommendations"]:
|
||||
print(" Recommendations:")
|
||||
for rec in changes["recommendations"]:
|
||||
print(f" -> {rec}")
|
||||
|
||||
# Only modify site if auto_disable is enabled
|
||||
if auto_disable and changes["disabled"] != site.disabled:
|
||||
site.disabled = changes["disabled"]
|
||||
logger.info(f"Switching property 'disabled' for {site.name} to {site.disabled}")
|
||||
db.update_site(site)
|
||||
if not silent:
|
||||
action = "Disabled" if site.disabled else "Enabled"
|
||||
print(f"{action} site {site.name}...")
|
||||
elif changes["issues"] and not silent and not diagnose:
|
||||
# Report issues without disabling
|
||||
print(f"Issues found in {site.name}: {len(changes['issues'])} (not auto-disabled)")
|
||||
|
||||
# remove service tag "unchecked"
|
||||
if "unchecked" in site.tags:
|
||||
site.tags.remove("unchecked")
|
||||
db.update_site(site)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Self-check of {site.name} failed with unexpected error: {e}",
|
||||
exc_info=True,
|
||||
)
|
||||
changes["issues"].append(f"Unexpected error: {e}")
|
||||
if auto_disable and not site.disabled:
|
||||
changes["disabled"] = True
|
||||
site.disabled = True
|
||||
db.update_site(site)
|
||||
if not silent:
|
||||
print(f"Disabled site {site.name} (unexpected error)...")
|
||||
|
||||
return changes
|
||||
|
||||
@@ -908,11 +1117,25 @@ async def self_check(
|
||||
proxy=None,
|
||||
tor_proxy=None,
|
||||
i2p_proxy=None,
|
||||
cloudflare_bypass=False,
|
||||
) -> bool:
|
||||
auto_disable=False,
|
||||
diagnose=False,
|
||||
no_progressbar=False,
|
||||
) -> dict:
|
||||
"""
|
||||
Run self-check on sites.
|
||||
|
||||
Args:
|
||||
auto_disable: If True, automatically disable sites that fail checks.
|
||||
If False (default), only report issues without disabling.
|
||||
diagnose: If True, print detailed diagnosis for each failing site.
|
||||
|
||||
Returns:
|
||||
dict with 'needs_update' bool and 'results' list of check results
|
||||
"""
|
||||
sem = asyncio.Semaphore(max_connections)
|
||||
tasks = []
|
||||
all_sites = site_data
|
||||
all_results = []
|
||||
|
||||
def disabled_count(lst):
|
||||
return len(list(filter(lambda x: x.disabled, lst)))
|
||||
@@ -924,25 +1147,29 @@ async def self_check(
|
||||
|
||||
for _, site in all_sites.items():
|
||||
check_coro = site_self_check(
|
||||
site,
|
||||
logger,
|
||||
sem,
|
||||
db,
|
||||
silent,
|
||||
proxy,
|
||||
tor_proxy,
|
||||
i2p_proxy,
|
||||
skip_errors=True,
|
||||
cookies=None,
|
||||
cloudflare_bypass=cloudflare_bypass,
|
||||
site, logger, sem, db, silent, proxy, tor_proxy, i2p_proxy,
|
||||
skip_errors=True, auto_disable=auto_disable, diagnose=diagnose
|
||||
)
|
||||
future = asyncio.ensure_future(check_coro)
|
||||
tasks.append(future)
|
||||
tasks.append((site.name, future))
|
||||
|
||||
if tasks:
|
||||
with alive_bar(len(tasks), title='Self-checking', force_tty=True) as progress:
|
||||
for f in asyncio.as_completed(tasks):
|
||||
await f
|
||||
with alive_bar(len(tasks), title='Self-checking', force_tty=True, disable=no_progressbar) as progress:
|
||||
for site_name, f in tasks:
|
||||
try:
|
||||
result = await f
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Self-check task for {site_name} raised unexpected error: {e}",
|
||||
exc_info=True,
|
||||
)
|
||||
result = {
|
||||
"disabled": False,
|
||||
"issues": [f"Unexpected error: {e}"],
|
||||
"recommendations": [],
|
||||
}
|
||||
result['site_name'] = site_name
|
||||
all_results.append(result)
|
||||
progress() # Update the progress bar
|
||||
|
||||
unchecked_new_count = len(
|
||||
@@ -951,7 +1178,10 @@ async def self_check(
|
||||
disabled_new_count = disabled_count(all_sites.values())
|
||||
total_disabled = disabled_new_count - disabled_old_count
|
||||
|
||||
if total_disabled:
|
||||
# Count issues
|
||||
total_issues = sum(1 for r in all_results if r.get('issues'))
|
||||
|
||||
if auto_disable and total_disabled:
|
||||
if total_disabled >= 0:
|
||||
message = "Disabled"
|
||||
else:
|
||||
@@ -963,11 +1193,21 @@ async def self_check(
|
||||
f"{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. "
|
||||
"Run with `--info` flag to get more information"
|
||||
)
|
||||
elif total_issues and not silent:
|
||||
print(f"\nFound issues in {total_issues} sites (auto-disable is OFF)")
|
||||
print("Use --auto-disable to automatically disable failing sites")
|
||||
print("Use --diagnose to see detailed diagnosis for each site")
|
||||
|
||||
if unchecked_new_count != unchecked_old_count:
|
||||
print(f"Unchecked sites verified: {unchecked_old_count - unchecked_new_count}")
|
||||
|
||||
return total_disabled != 0 or unchecked_new_count != unchecked_old_count
|
||||
needs_update = total_disabled != 0 or unchecked_new_count != unchecked_old_count
|
||||
|
||||
return {
|
||||
'needs_update': needs_update,
|
||||
'results': all_results,
|
||||
'total_issues': total_issues,
|
||||
}
|
||||
|
||||
|
||||
def extract_ids_data(html_text, logger, site) -> Dict:
|
||||
@@ -986,7 +1226,7 @@ def parse_usernames(extracted_ids_data, logger) -> Dict:
|
||||
elif "usernames" in k:
|
||||
try:
|
||||
tree = ast.literal_eval(v)
|
||||
if type(tree) == list:
|
||||
if isinstance(tree, list):
|
||||
for n in tree:
|
||||
new_usernames[n] = "username"
|
||||
except Exception as e:
|
||||
|
||||
@@ -0,0 +1,342 @@
|
||||
"""
|
||||
Database auto-update logic for maigret.
|
||||
|
||||
Checks a lightweight meta file to determine if a newer site database is available,
|
||||
downloads it if compatible, and caches it locally in ~/.maigret/.
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import os.path as path
|
||||
import tempfile
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from colorama import Fore, Style
|
||||
|
||||
from .__version__ import __version__
|
||||
|
||||
logger = logging.getLogger("maigret")
|
||||
|
||||
_use_color = True
|
||||
|
||||
|
||||
def _print_info(msg: str) -> None:
|
||||
text = f"[*] {msg}"
|
||||
if _use_color:
|
||||
print(Style.BRIGHT + Fore.GREEN + text + Style.RESET_ALL)
|
||||
else:
|
||||
print(text)
|
||||
|
||||
|
||||
def _print_success(msg: str) -> None:
|
||||
text = f"[+] {msg}"
|
||||
if _use_color:
|
||||
print(Style.BRIGHT + Fore.GREEN + text + Style.RESET_ALL)
|
||||
else:
|
||||
print(text)
|
||||
|
||||
|
||||
def _print_warning(msg: str) -> None:
|
||||
text = f"[!] {msg}"
|
||||
if _use_color:
|
||||
print(Style.BRIGHT + Fore.YELLOW + text + Style.RESET_ALL)
|
||||
else:
|
||||
print(text)
|
||||
|
||||
|
||||
DEFAULT_META_URL = (
|
||||
"https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json"
|
||||
)
|
||||
DEFAULT_CHECK_INTERVAL_HOURS = 24
|
||||
MAIGRET_HOME = path.expanduser("~/.maigret")
|
||||
CACHED_DB_PATH = path.join(MAIGRET_HOME, "data.json")
|
||||
STATE_PATH = path.join(MAIGRET_HOME, "autoupdate_state.json")
|
||||
BUNDLED_DB_PATH = path.join(path.dirname(path.realpath(__file__)), "resources", "data.json")
|
||||
|
||||
|
||||
def _parse_version(version_str: str) -> tuple:
|
||||
"""Parse a version string like '0.5.0' into a comparable tuple (0, 5, 0)."""
|
||||
try:
|
||||
return tuple(int(x) for x in version_str.strip().split("."))
|
||||
except (ValueError, AttributeError):
|
||||
return (0, 0, 0)
|
||||
|
||||
|
||||
def _ensure_maigret_home() -> None:
|
||||
os.makedirs(MAIGRET_HOME, exist_ok=True)
|
||||
|
||||
|
||||
def _load_state() -> dict:
|
||||
try:
|
||||
with open(STATE_PATH, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
except (FileNotFoundError, json.JSONDecodeError, OSError):
|
||||
return {}
|
||||
|
||||
|
||||
def _save_state(state: dict) -> None:
|
||||
_ensure_maigret_home()
|
||||
tmp_path = STATE_PATH + ".tmp"
|
||||
try:
|
||||
with open(tmp_path, "w", encoding="utf-8") as f:
|
||||
json.dump(state, f, indent=2, ensure_ascii=False)
|
||||
os.replace(tmp_path, STATE_PATH)
|
||||
except OSError:
|
||||
try:
|
||||
os.unlink(tmp_path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def _needs_check(state: dict, interval_hours: int) -> bool:
|
||||
last_check = state.get("last_check_at")
|
||||
if not last_check:
|
||||
return True
|
||||
try:
|
||||
last_dt = datetime.fromisoformat(last_check.replace("Z", "+00:00"))
|
||||
elapsed = (datetime.now(timezone.utc) - last_dt).total_seconds() / 3600
|
||||
return elapsed >= interval_hours
|
||||
except (ValueError, TypeError):
|
||||
return True
|
||||
|
||||
|
||||
def _fetch_meta(meta_url: str, timeout: int = 10) -> Optional[dict]:
|
||||
try:
|
||||
response = requests.get(meta_url, timeout=timeout)
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
except Exception:
|
||||
pass
|
||||
return None
|
||||
|
||||
|
||||
def _is_version_compatible(meta: dict) -> bool:
|
||||
min_ver = meta.get("min_maigret_version", "0.0.0")
|
||||
return _parse_version(__version__) >= _parse_version(min_ver)
|
||||
|
||||
|
||||
def _is_update_available(meta: dict, state: dict) -> bool:
|
||||
if not path.isfile(CACHED_DB_PATH):
|
||||
return True
|
||||
remote_date = meta.get("updated_at", "")
|
||||
cached_date = state.get("last_meta", {}).get("updated_at", "")
|
||||
return remote_date > cached_date
|
||||
|
||||
|
||||
def _download_and_verify(data_url: str, expected_sha256: str, timeout: int = 60) -> Optional[str]:
|
||||
_ensure_maigret_home()
|
||||
tmp_fd, tmp_path = tempfile.mkstemp(dir=MAIGRET_HOME, suffix=".json")
|
||||
try:
|
||||
response = requests.get(data_url, timeout=timeout)
|
||||
if response.status_code != 200:
|
||||
return None
|
||||
|
||||
content = response.content
|
||||
actual_sha256 = hashlib.sha256(content).hexdigest()
|
||||
if actual_sha256 != expected_sha256:
|
||||
_print_warning("DB auto-update: SHA-256 mismatch, download rejected")
|
||||
return None
|
||||
|
||||
# Validate JSON structure
|
||||
data = json.loads(content)
|
||||
if not all(k in data for k in ("sites", "engines", "tags")):
|
||||
_print_warning("DB auto-update: invalid database structure")
|
||||
return None
|
||||
|
||||
os.write(tmp_fd, content)
|
||||
os.close(tmp_fd)
|
||||
tmp_fd = None
|
||||
os.replace(tmp_path, CACHED_DB_PATH)
|
||||
return CACHED_DB_PATH
|
||||
except Exception:
|
||||
return None
|
||||
finally:
|
||||
if tmp_fd is not None:
|
||||
os.close(tmp_fd)
|
||||
try:
|
||||
os.unlink(tmp_path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def _best_local() -> str:
|
||||
"""Return cached DB if it exists and is valid, otherwise bundled."""
|
||||
if path.isfile(CACHED_DB_PATH):
|
||||
try:
|
||||
with open(CACHED_DB_PATH, "r", encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
if "sites" in data:
|
||||
return CACHED_DB_PATH
|
||||
except (json.JSONDecodeError, OSError):
|
||||
pass
|
||||
return BUNDLED_DB_PATH
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def resolve_db_path(
|
||||
db_file_arg: str,
|
||||
no_autoupdate: bool = False,
|
||||
meta_url: str = DEFAULT_META_URL,
|
||||
check_interval_hours: int = DEFAULT_CHECK_INTERVAL_HOURS,
|
||||
color: bool = True,
|
||||
) -> str:
|
||||
"""
|
||||
Determine which database file to use, potentially downloading an update.
|
||||
|
||||
Returns the path to the database file that should be loaded.
|
||||
"""
|
||||
global _use_color
|
||||
_use_color = color
|
||||
|
||||
default_db_name = "resources/data.json"
|
||||
|
||||
# User specified a custom DB — skip auto-update
|
||||
is_url = db_file_arg.startswith("http://") or db_file_arg.startswith("https://")
|
||||
is_default = db_file_arg == default_db_name
|
||||
if is_url:
|
||||
return db_file_arg
|
||||
if not is_default:
|
||||
# Try the path as-is (absolute or relative to cwd) first.
|
||||
if path.isfile(db_file_arg):
|
||||
return path.abspath(db_file_arg)
|
||||
# Fall back to legacy behavior: resolve relative to the maigret module dir.
|
||||
module_relative = path.join(path.dirname(path.realpath(__file__)), db_file_arg)
|
||||
if module_relative != db_file_arg and path.isfile(module_relative):
|
||||
return module_relative
|
||||
if module_relative != db_file_arg:
|
||||
raise FileNotFoundError(
|
||||
f"Custom database file not found: {db_file_arg!r} "
|
||||
f"(also tried {module_relative!r})"
|
||||
)
|
||||
raise FileNotFoundError(f"Custom database file not found: {db_file_arg!r}")
|
||||
|
||||
# Auto-update disabled
|
||||
if no_autoupdate:
|
||||
return _best_local()
|
||||
|
||||
# Check interval
|
||||
_ensure_maigret_home()
|
||||
state = _load_state()
|
||||
if not _needs_check(state, check_interval_hours):
|
||||
return _best_local()
|
||||
|
||||
# Time to check
|
||||
_print_info("DB auto-update: checking for updates...")
|
||||
meta = _fetch_meta(meta_url)
|
||||
if meta is None:
|
||||
_print_warning("DB auto-update: could not reach update server, using local database")
|
||||
state["last_check_at"] = _now_iso()
|
||||
_save_state(state)
|
||||
return _best_local()
|
||||
|
||||
# Version compatibility
|
||||
if not _is_version_compatible(meta):
|
||||
min_ver = meta.get("min_maigret_version", "?")
|
||||
_print_warning(
|
||||
f"DB auto-update: latest database requires maigret >= {min_ver}, "
|
||||
f"you have {__version__}. Please upgrade with: pip install -U maigret"
|
||||
)
|
||||
state["last_check_at"] = _now_iso()
|
||||
_save_state(state)
|
||||
return _best_local()
|
||||
|
||||
# Check if update available
|
||||
if not _is_update_available(meta, state):
|
||||
sites_count = meta.get("sites_count", "?")
|
||||
_print_info(f"DB auto-update: database is up to date ({sites_count} sites)")
|
||||
state["last_check_at"] = _now_iso()
|
||||
state["last_meta"] = meta
|
||||
_save_state(state)
|
||||
return _best_local()
|
||||
|
||||
# Download update
|
||||
new_count = meta.get("sites_count", "?")
|
||||
old_count = state.get("last_meta", {}).get("sites_count")
|
||||
if old_count:
|
||||
_print_info(f"DB auto-update: downloading updated database ({new_count} sites, was {old_count})...")
|
||||
else:
|
||||
_print_info(f"DB auto-update: downloading database ({new_count} sites)...")
|
||||
|
||||
data_url = meta.get("data_url", "")
|
||||
expected_sha = meta.get("data_sha256", "")
|
||||
result = _download_and_verify(data_url, expected_sha)
|
||||
|
||||
if result is None:
|
||||
_print_warning("DB auto-update: download failed, using local database")
|
||||
state["last_check_at"] = _now_iso()
|
||||
_save_state(state)
|
||||
return _best_local()
|
||||
|
||||
_print_success(f"DB auto-update: database updated successfully ({new_count} sites)")
|
||||
state["last_check_at"] = _now_iso()
|
||||
state["last_meta"] = meta
|
||||
state["cached_db_sha256"] = expected_sha
|
||||
_save_state(state)
|
||||
return CACHED_DB_PATH
|
||||
|
||||
|
||||
def force_update(
|
||||
meta_url: str = DEFAULT_META_URL,
|
||||
color: bool = True,
|
||||
) -> bool:
|
||||
"""
|
||||
Force check for database updates and download if available.
|
||||
|
||||
Returns True if database was updated, False otherwise.
|
||||
"""
|
||||
global _use_color
|
||||
_use_color = color
|
||||
|
||||
_ensure_maigret_home()
|
||||
|
||||
_print_info("DB update: checking for updates...")
|
||||
meta = _fetch_meta(meta_url)
|
||||
if meta is None:
|
||||
_print_warning("DB update: could not reach update server")
|
||||
return False
|
||||
|
||||
if not _is_version_compatible(meta):
|
||||
min_ver = meta.get("min_maigret_version", "?")
|
||||
_print_warning(
|
||||
f"DB update: latest database requires maigret >= {min_ver}, "
|
||||
f"you have {__version__}. Please upgrade with: pip install -U maigret"
|
||||
)
|
||||
return False
|
||||
|
||||
state = _load_state()
|
||||
new_count = meta.get("sites_count", "?")
|
||||
old_count = state.get("last_meta", {}).get("sites_count")
|
||||
|
||||
if not _is_update_available(meta, state):
|
||||
_print_info(f"DB update: database is already up to date ({new_count} sites)")
|
||||
state["last_check_at"] = _now_iso()
|
||||
state["last_meta"] = meta
|
||||
_save_state(state)
|
||||
return False
|
||||
|
||||
if old_count:
|
||||
_print_info(f"DB update: downloading updated database ({new_count} sites, was {old_count})...")
|
||||
else:
|
||||
_print_info(f"DB update: downloading database ({new_count} sites)...")
|
||||
|
||||
data_url = meta.get("data_url", "")
|
||||
expected_sha = meta.get("data_sha256", "")
|
||||
result = _download_and_verify(data_url, expected_sha)
|
||||
|
||||
if result is None:
|
||||
_print_warning("DB update: download failed")
|
||||
return False
|
||||
|
||||
_print_success(f"DB update: database updated successfully ({new_count} sites)")
|
||||
state["last_check_at"] = _now_iso()
|
||||
state["last_meta"] = meta
|
||||
state["cached_db_sha256"] = expected_sha
|
||||
_save_state(state)
|
||||
return True
|
||||
+7
-5
@@ -32,6 +32,9 @@ COMMON_ERRORS = {
|
||||
'<title>Attention Required! | Cloudflare</title>': CheckError(
|
||||
'Captcha', 'Cloudflare'
|
||||
),
|
||||
'<title>Just a moment</title>': CheckError(
|
||||
'Bot protection', 'Cloudflare challenge page'
|
||||
),
|
||||
'Please stand by, while we are checking your browser': CheckError(
|
||||
'Bot protection', 'Cloudflare'
|
||||
),
|
||||
@@ -55,6 +58,8 @@ COMMON_ERRORS = {
|
||||
'Censorship', 'MGTS'
|
||||
),
|
||||
'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
|
||||
'<title>Client Challenge</title>': CheckError('Bot protection', 'Anti-bot challenge'),
|
||||
'<title>DDoS-Guard</title>': CheckError('Bot protection', 'DDoS-Guard'),
|
||||
'Сайт заблокирован хостинг-провайдером': CheckError(
|
||||
'Site-specific', 'Site is disabled (Beget)'
|
||||
),
|
||||
@@ -136,10 +141,7 @@ def extract_and_group(search_res: QueryResultWrapper) -> List[Dict[str, Any]]:
|
||||
|
||||
|
||||
def notify_about_errors(
|
||||
search_results: QueryResultWrapper,
|
||||
query_notify,
|
||||
show_statistics=False,
|
||||
print_check_errors=False,
|
||||
search_results: QueryResultWrapper, query_notify, show_statistics=False
|
||||
) -> List[Tuple]:
|
||||
"""
|
||||
Prepare error notifications in search results, text + symbol,
|
||||
@@ -172,7 +174,7 @@ def notify_about_errors(
|
||||
text = f'{e["err"]}: {round(e["perc"],2)}%'
|
||||
results.append((text, '!'))
|
||||
|
||||
if was_errs_displayed and not print_check_errors:
|
||||
if was_errs_displayed:
|
||||
results.append(
|
||||
('You can see detailed site check errors with a flag `--print-errors`', '-')
|
||||
)
|
||||
|
||||
+4
-28
@@ -103,35 +103,13 @@ class AsyncioProgressbarQueueExecutor(AsyncExecutor):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.workers_count = kwargs.get('in_parallel', 10)
|
||||
self.queue = asyncio.Queue(self.workers_count)
|
||||
self.queue: asyncio.Queue = asyncio.Queue(self.workers_count)
|
||||
self.timeout = kwargs.get('timeout')
|
||||
# Pass a progress function; alive_bar by default
|
||||
self.progress_func = kwargs.get('progress_func', alive_bar)
|
||||
self.progress = None
|
||||
|
||||
# TODO: tests
|
||||
@staticmethod
|
||||
def _emit_timeout_notify(args, default_fallback):
|
||||
"""Print per-site line when wait_for kills check_site_for_username (no query_notify.update ran)."""
|
||||
if (
|
||||
not default_fallback
|
||||
or not isinstance(default_fallback, (tuple, list))
|
||||
or len(default_fallback) < 2
|
||||
):
|
||||
return
|
||||
_, results_dict = default_fallback[0], default_fallback[1]
|
||||
if not isinstance(results_dict, dict):
|
||||
return
|
||||
status = results_dict.get('status')
|
||||
if status is None:
|
||||
return
|
||||
if len(args) < 5:
|
||||
return
|
||||
query_notify = args[4]
|
||||
site = results_dict.get('site')
|
||||
similar = bool(getattr(site, 'similar_search', False)) if site is not None else False
|
||||
query_notify.update(status, similar)
|
||||
|
||||
async def increment_progress(self, count):
|
||||
"""Update progress by calling the provided progress function."""
|
||||
if self.progress:
|
||||
@@ -166,7 +144,6 @@ class AsyncioProgressbarQueueExecutor(AsyncExecutor):
|
||||
result = await asyncio.wait_for(query_task, timeout=self.timeout)
|
||||
except asyncio.TimeoutError:
|
||||
result = kwargs.get('default')
|
||||
self._emit_timeout_notify(args, result)
|
||||
|
||||
self.results.append(result)
|
||||
|
||||
@@ -207,10 +184,10 @@ class AsyncioQueueGeneratorExecutor:
|
||||
# Deprecated: will be removed soon, don't use it
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.workers_count = kwargs.get('in_parallel', 10)
|
||||
self.queue = asyncio.Queue()
|
||||
self.queue: asyncio.Queue = asyncio.Queue()
|
||||
self.timeout = kwargs.get('timeout')
|
||||
self.logger = kwargs['logger']
|
||||
self._results = asyncio.Queue()
|
||||
self._results: asyncio.Queue = asyncio.Queue()
|
||||
self._stop_signal = object()
|
||||
|
||||
async def worker(self):
|
||||
@@ -230,10 +207,9 @@ class AsyncioQueueGeneratorExecutor:
|
||||
result = await asyncio.wait_for(query_task, timeout=self.timeout)
|
||||
except asyncio.TimeoutError:
|
||||
result = kwargs.get('default')
|
||||
AsyncioProgressbarQueueExecutor._emit_timeout_notify(args, result)
|
||||
await self._results.put(result)
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error in worker: {e}")
|
||||
self.logger.error(f"Error in worker: {e}", exc_info=True)
|
||||
finally:
|
||||
self.queue.task_done()
|
||||
|
||||
|
||||
+125
-28
@@ -13,7 +13,7 @@ from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||
from typing import List, Tuple
|
||||
import os.path as path
|
||||
|
||||
from socid_extractor import extract, parse
|
||||
from socid_extractor import extract, parse # type: ignore[import-not-found]
|
||||
|
||||
from .__version__ import __version__
|
||||
from .checking import (
|
||||
@@ -37,6 +37,7 @@ from .report import (
|
||||
get_plaintext_report,
|
||||
sort_report_by_data_points,
|
||||
save_graph_report,
|
||||
save_markdown_report,
|
||||
)
|
||||
from .sites import MaigretDatabase
|
||||
from .submit import Submitter
|
||||
@@ -75,7 +76,7 @@ def extract_ids_from_page(url, logger, timeout=5) -> dict:
|
||||
elif 'usernames' in k:
|
||||
try:
|
||||
tree = ast.literal_eval(v)
|
||||
if type(tree) == list:
|
||||
if isinstance(tree, list):
|
||||
for n in tree:
|
||||
results[n] = 'username'
|
||||
except Exception as e:
|
||||
@@ -201,6 +202,20 @@ def setup_arguments_parser(settings: Settings):
|
||||
default=settings.sites_db_path,
|
||||
help="Load Maigret database from a JSON file or HTTP web resource.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-autoupdate",
|
||||
action="store_true",
|
||||
dest="no_autoupdate",
|
||||
default=settings.no_autoupdate,
|
||||
help="Disable automatic database updates on startup.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--force-update",
|
||||
action="store_true",
|
||||
dest="force_update",
|
||||
default=False,
|
||||
help="Force check for database updates and download if available.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cookies-jar-file",
|
||||
metavar="COOKIE_FILE",
|
||||
@@ -254,12 +269,6 @@ def setup_arguments_parser(settings: Settings):
|
||||
default=settings.domain_search,
|
||||
help="Enable (experimental) feature of checking domains on usernames.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cloudflare-bypass",
|
||||
action="store_true",
|
||||
default=settings.cloudflare_bypass,
|
||||
help="Enable Cloudflare bypass (edit settings.json to configure)",
|
||||
)
|
||||
|
||||
filter_group = parser.add_argument_group(
|
||||
'Site filtering', 'Options to set site search scope'
|
||||
@@ -283,6 +292,12 @@ def setup_arguments_parser(settings: Settings):
|
||||
filter_group.add_argument(
|
||||
"--tags", dest="tags", default='', help="Specify tags of sites (see `--stats`)."
|
||||
)
|
||||
filter_group.add_argument(
|
||||
"--exclude-tags",
|
||||
dest="exclude_tags",
|
||||
default='',
|
||||
help="Specify tags to exclude from search (blacklist).",
|
||||
)
|
||||
filter_group.add_argument(
|
||||
"--site",
|
||||
action="append",
|
||||
@@ -322,7 +337,19 @@ def setup_arguments_parser(settings: Settings):
|
||||
"--self-check",
|
||||
action="store_true",
|
||||
default=settings.self_check_enabled,
|
||||
help="Do self check for sites and database and disable non-working ones.",
|
||||
help="Do self check for sites and database. Use --auto-disable to disable failing sites.",
|
||||
)
|
||||
modes_group.add_argument(
|
||||
"--auto-disable",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="With --self-check: automatically disable sites that fail checks.",
|
||||
)
|
||||
modes_group.add_argument(
|
||||
"--diagnose",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="With --self-check: print detailed diagnosis for each failing site.",
|
||||
)
|
||||
modes_group.add_argument(
|
||||
"--stats",
|
||||
@@ -439,6 +466,14 @@ def setup_arguments_parser(settings: Settings):
|
||||
default=settings.pdf_report,
|
||||
help="Generate a PDF report (general report on all usernames).",
|
||||
)
|
||||
report_group.add_argument(
|
||||
"-M",
|
||||
"--md",
|
||||
action="store_true",
|
||||
dest="md",
|
||||
default=settings.md_report,
|
||||
help="Generate a Markdown report (general report on all usernames).",
|
||||
)
|
||||
report_group.add_argument(
|
||||
"-G",
|
||||
"--graph",
|
||||
@@ -499,15 +534,6 @@ async def main():
|
||||
log_level = logging.WARNING
|
||||
logger.setLevel(log_level)
|
||||
|
||||
if args.web is not None:
|
||||
from maigret.web.app import app
|
||||
|
||||
port = (
|
||||
args.web if args.web else 5000
|
||||
) # args.web is either the specified port or 5000 by default
|
||||
app.run(port=port)
|
||||
return
|
||||
|
||||
# Usernames initial list
|
||||
usernames = {
|
||||
u: args.id_type
|
||||
@@ -535,7 +561,30 @@ async def main():
|
||||
if args.tags:
|
||||
args.tags = list(set(str(args.tags).split(',')))
|
||||
|
||||
db_file = path.join(path.dirname(path.realpath(__file__)), args.db_file)
|
||||
if args.exclude_tags:
|
||||
args.exclude_tags = list(set(str(args.exclude_tags).split(',')))
|
||||
else:
|
||||
args.exclude_tags = []
|
||||
|
||||
from .db_updater import resolve_db_path, force_update, BUNDLED_DB_PATH
|
||||
|
||||
if args.force_update:
|
||||
force_update(
|
||||
meta_url=settings.db_update_meta_url,
|
||||
color=not args.no_color,
|
||||
)
|
||||
|
||||
try:
|
||||
db_file = resolve_db_path(
|
||||
db_file_arg=args.db_file,
|
||||
no_autoupdate=args.no_autoupdate or args.force_update,
|
||||
meta_url=settings.db_update_meta_url,
|
||||
check_interval_hours=settings.autoupdate_check_interval_hours,
|
||||
color=not args.no_color,
|
||||
)
|
||||
except FileNotFoundError as e:
|
||||
logger.error(str(e))
|
||||
sys.exit(2)
|
||||
|
||||
if args.top_sites == 0 or args.all_sites:
|
||||
args.top_sites = sys.maxsize
|
||||
@@ -550,10 +599,25 @@ async def main():
|
||||
)
|
||||
|
||||
# Create object with all information about sites we are aware of.
|
||||
db = MaigretDatabase().load_from_path(db_file)
|
||||
try:
|
||||
db = MaigretDatabase().load_from_path(db_file)
|
||||
query_notify.success(f'Using sites database: {db_file} ({len(db.sites)} sites)')
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to load database from {db_file}: {e}")
|
||||
if db_file != BUNDLED_DB_PATH:
|
||||
query_notify.warning(
|
||||
f'Falling back to bundled database: {BUNDLED_DB_PATH}'
|
||||
)
|
||||
db = MaigretDatabase().load_from_path(BUNDLED_DB_PATH)
|
||||
query_notify.success(
|
||||
f'Using sites database: {BUNDLED_DB_PATH} ({len(db.sites)} sites)'
|
||||
)
|
||||
else:
|
||||
raise
|
||||
get_top_sites_for_id = lambda x: db.ranked_sites_dict(
|
||||
top=args.top_sites,
|
||||
tags=args.tags,
|
||||
excluded_tags=args.exclude_tags,
|
||||
names=args.site_list,
|
||||
disabled=args.use_disabled_sites,
|
||||
id_type=x,
|
||||
@@ -579,7 +643,7 @@ async def main():
|
||||
query_notify.success(
|
||||
f'Maigret sites database self-check started for {len(site_data)} sites...'
|
||||
)
|
||||
is_need_update = await self_check(
|
||||
check_result = await self_check(
|
||||
db,
|
||||
site_data,
|
||||
logger,
|
||||
@@ -587,8 +651,13 @@ async def main():
|
||||
max_connections=args.connections,
|
||||
tor_proxy=args.tor_proxy,
|
||||
i2p_proxy=args.i2p_proxy,
|
||||
cloudflare_bypass=args.cloudflare_bypass,
|
||||
auto_disable=args.auto_disable,
|
||||
diagnose=args.diagnose,
|
||||
no_progressbar=args.no_progressbar,
|
||||
)
|
||||
|
||||
is_need_update = check_result.get('needs_update', False)
|
||||
|
||||
if is_need_update:
|
||||
if input('Do you want to save changes permanently? [Yn]\n').lower() in (
|
||||
'y',
|
||||
@@ -616,6 +685,21 @@ async def main():
|
||||
# Define one report filename template
|
||||
report_filepath_tpl = path.join(report_dir, 'report_{username}{postfix}')
|
||||
|
||||
# Web interface
|
||||
if args.web is not None:
|
||||
from maigret.web.app import app
|
||||
|
||||
app.config["MAIGRET_DB_FILE"] = db_file
|
||||
|
||||
port = (
|
||||
args.web if args.web else 5000
|
||||
) # args.web is either the specified port or 5000 by default
|
||||
|
||||
# Host configuration: secure by default, but allow override via environment
|
||||
host = os.getenv('FLASK_HOST', '127.0.0.1')
|
||||
app.run(host=host, port=port)
|
||||
return
|
||||
|
||||
if usernames == {}:
|
||||
# magic params to exit after init
|
||||
query_notify.warning('No usernames to check, exiting.')
|
||||
@@ -688,14 +772,10 @@ async def main():
|
||||
no_progressbar=args.no_progressbar,
|
||||
retries=args.retries,
|
||||
check_domains=args.with_domains,
|
||||
cloudflare_bypass=args.cloudflare_bypass,
|
||||
)
|
||||
|
||||
errs = errors.notify_about_errors(
|
||||
results,
|
||||
query_notify,
|
||||
show_statistics=args.verbose,
|
||||
print_check_errors=args.print_check_errors,
|
||||
results, query_notify, show_statistics=args.verbose
|
||||
)
|
||||
for e in errs:
|
||||
query_notify.warning(*e)
|
||||
@@ -742,7 +822,7 @@ async def main():
|
||||
|
||||
# reporting for all the result
|
||||
if general_results:
|
||||
if args.html or args.pdf:
|
||||
if args.html or args.pdf or args.md:
|
||||
query_notify.warning('Generating report info...')
|
||||
report_context = generate_report_context(general_results)
|
||||
# determine main username
|
||||
@@ -762,6 +842,23 @@ async def main():
|
||||
save_pdf_report(filename, report_context)
|
||||
query_notify.warning(f'PDF report on all usernames saved in {filename}')
|
||||
|
||||
if args.md:
|
||||
username = username.replace('/', '_')
|
||||
filename = report_filepath_tpl.format(username=username, postfix='.md')
|
||||
run_flags = []
|
||||
if args.tags:
|
||||
run_flags.append(f"--tags {args.tags}")
|
||||
if args.site_list:
|
||||
run_flags.append(f"--site {','.join(args.site_list)}")
|
||||
if args.all_sites:
|
||||
run_flags.append("--all-sites")
|
||||
run_info = {
|
||||
"sites_count": sum(len(d) for _, _, d in general_results),
|
||||
"flags": " ".join(run_flags) if run_flags else None,
|
||||
}
|
||||
save_markdown_report(filename, report_context, run_info=run_info)
|
||||
query_notify.warning(f'Markdown report on all usernames saved in {filename}')
|
||||
|
||||
if args.graph:
|
||||
username = username.replace('/', '_')
|
||||
filename = report_filepath_tpl.format(
|
||||
|
||||
+1
-1
@@ -174,7 +174,7 @@ class QueryNotifyPrint(QueryNotify):
|
||||
else:
|
||||
return self.make_simple_terminal_notify(*args)
|
||||
|
||||
def start(self, message, id_type):
|
||||
def start(self, message=None, id_type="username"):
|
||||
"""Notify Start.
|
||||
|
||||
Will print the title to the standard output.
|
||||
|
||||
+182
-23
@@ -7,7 +7,7 @@ import os
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
|
||||
import xmind
|
||||
import xmind # type: ignore[import-untyped]
|
||||
from dateutil.tz import gettz
|
||||
from dateutil.parser import parse as parse_datetime_str
|
||||
from jinja2 import Template
|
||||
@@ -79,7 +79,7 @@ def save_pdf_report(filename: str, context: dict):
|
||||
filled_template = template.render(**context)
|
||||
|
||||
# moved here to speed up the launch of Maigret
|
||||
from xhtml2pdf import pisa
|
||||
from xhtml2pdf import pisa # type: ignore[import-untyped]
|
||||
|
||||
with open(filename, "w+b") as f:
|
||||
pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css)
|
||||
@@ -91,9 +91,9 @@ def save_json_report(filename: str, username: str, results: dict, report_type: s
|
||||
|
||||
|
||||
class MaigretGraph:
|
||||
other_params = {'size': 10, 'group': 3}
|
||||
site_params = {'size': 15, 'group': 2}
|
||||
username_params = {'size': 20, 'group': 1}
|
||||
other_params: dict = {'size': 10, 'group': 3}
|
||||
site_params: dict = {'size': 15, 'group': 2}
|
||||
username_params: dict = {'size': 20, 'group': 1}
|
||||
|
||||
def __init__(self, graph):
|
||||
self.G = graph
|
||||
@@ -106,7 +106,7 @@ class MaigretGraph:
|
||||
params = dict(self.username_params)
|
||||
elif value.startswith('http'):
|
||||
params = dict(self.site_params)
|
||||
|
||||
|
||||
params['title'] = node_name
|
||||
if color:
|
||||
params['color'] = color
|
||||
@@ -121,12 +121,12 @@ class MaigretGraph:
|
||||
def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
|
||||
import networkx as nx
|
||||
|
||||
G = nx.Graph()
|
||||
G: Any = nx.Graph()
|
||||
graph = MaigretGraph(G)
|
||||
|
||||
base_site_nodes = {}
|
||||
site_account_nodes = {}
|
||||
processed_values = {} # Track processed values to avoid duplicates
|
||||
processed_values: Dict[str, Any] = {} # Track processed values to avoid duplicates
|
||||
|
||||
for username, id_type, results in username_results:
|
||||
# Add username node, using normalized version directly if different
|
||||
@@ -141,10 +141,12 @@ def save_graph_report(filename: str, username_results: list, db: MaigretDatabase
|
||||
if not status or status.status != MaigretCheckStatus.CLAIMED:
|
||||
continue
|
||||
|
||||
# base site node
|
||||
# base site node
|
||||
site_base_url = website_name
|
||||
if site_base_url not in base_site_nodes:
|
||||
base_site_nodes[site_base_url] = graph.add_node('site', site_base_url, color='#28a745') # Green color
|
||||
base_site_nodes[site_base_url] = graph.add_node(
|
||||
'site', site_base_url, color='#28a745'
|
||||
) # Green color
|
||||
|
||||
site_base_node_name = base_site_nodes[site_base_url]
|
||||
|
||||
@@ -152,7 +154,9 @@ def save_graph_report(filename: str, username_results: list, db: MaigretDatabase
|
||||
account_url = dictionary.get('url_user', f'{site_base_url}/{norm_username}')
|
||||
account_node_id = f"{site_base_url}: {account_url}"
|
||||
if account_node_id not in site_account_nodes:
|
||||
site_account_nodes[account_node_id] = graph.add_node('account', account_url)
|
||||
site_account_nodes[account_node_id] = graph.add_node(
|
||||
'account', account_url
|
||||
)
|
||||
|
||||
account_node_name = site_account_nodes[account_node_id]
|
||||
|
||||
@@ -162,13 +166,18 @@ def save_graph_report(filename: str, username_results: list, db: MaigretDatabase
|
||||
|
||||
def process_ids(parent_node, ids):
|
||||
for k, v in ids.items():
|
||||
if k.endswith('_count') or k.startswith('is_') or k.endswith('_at') or k in 'image':
|
||||
if (
|
||||
k.endswith('_count')
|
||||
or k.startswith('is_')
|
||||
or k.endswith('_at')
|
||||
or k in 'image'
|
||||
):
|
||||
continue
|
||||
|
||||
# Normalize value if string
|
||||
norm_v = v.lower() if isinstance(v, str) else v
|
||||
value_key = f"{k}:{norm_v}"
|
||||
|
||||
|
||||
if value_key in processed_values:
|
||||
ids_data_name = processed_values[value_key]
|
||||
else:
|
||||
@@ -187,7 +196,9 @@ def save_graph_report(filename: str, username_results: list, db: MaigretDatabase
|
||||
data_node_name = graph.add_node(vv, site_base_url)
|
||||
graph.link(list_node_name, data_node_name)
|
||||
|
||||
add_ids = {a: b for b, a in db.extract_ids_from_url(vv).items()}
|
||||
add_ids = {
|
||||
a: b for b, a in db.extract_ids_from_url(vv).items()
|
||||
}
|
||||
if add_ids:
|
||||
process_ids(data_node_name, add_ids)
|
||||
ids_data_name = list_node_name
|
||||
@@ -198,11 +209,17 @@ def save_graph_report(filename: str, username_results: list, db: MaigretDatabase
|
||||
if 'username' in k or k in SUPPORTED_IDS:
|
||||
new_username_key = f"username:{norm_v}"
|
||||
if new_username_key not in processed_values:
|
||||
new_username_node_name = graph.add_node('username', norm_v)
|
||||
processed_values[new_username_key] = new_username_node_name
|
||||
new_username_node_name = graph.add_node(
|
||||
'username', norm_v
|
||||
)
|
||||
processed_values[new_username_key] = (
|
||||
new_username_node_name
|
||||
)
|
||||
graph.link(ids_data_name, new_username_node_name)
|
||||
|
||||
add_ids = {k: v for v, k in db.extract_ids_from_url(v).items()}
|
||||
add_ids = {
|
||||
k: v for v, k in db.extract_ids_from_url(v).items()
|
||||
}
|
||||
if add_ids:
|
||||
process_ids(ids_data_name, add_ids)
|
||||
|
||||
@@ -216,11 +233,14 @@ def save_graph_report(filename: str, username_results: list, db: MaigretDatabase
|
||||
G.remove_nodes_from(nodes_to_remove)
|
||||
|
||||
# Remove site nodes with only one connection
|
||||
single_degree_sites = [n for n, deg in G.degree() if n.startswith("site:") and deg <= 1]
|
||||
single_degree_sites = [
|
||||
n for n, deg in G.degree() if n.startswith("site:") and deg <= 1
|
||||
]
|
||||
G.remove_nodes_from(single_degree_sites)
|
||||
|
||||
# Generate interactive visualization
|
||||
from pyvis.network import Network
|
||||
from pyvis.network import Network # type: ignore[import-untyped]
|
||||
|
||||
nt = Network(notebook=True, height="750px", width="100%")
|
||||
nt.from_nx(G)
|
||||
nt.show(filename)
|
||||
@@ -237,6 +257,144 @@ def get_plaintext_report(context: dict) -> str:
|
||||
return output.strip()
|
||||
|
||||
|
||||
def _md_format_value(value) -> str:
|
||||
"""Format a value for Markdown output, detecting links."""
|
||||
if isinstance(value, list):
|
||||
return ", ".join(str(v) for v in value)
|
||||
s = str(value)
|
||||
if s.startswith("http://") or s.startswith("https://"):
|
||||
return f"[{s}]({s})"
|
||||
return s
|
||||
|
||||
|
||||
def save_markdown_report(filename: str, context: dict, run_info: dict = None):
|
||||
username = context.get("username", "unknown")
|
||||
generated_at = context.get("generated_at", "")
|
||||
brief = context.get("brief", "")
|
||||
countries = context.get("countries_tuple_list", [])
|
||||
interests = context.get("interests_tuple_list", [])
|
||||
first_seen = context.get("first_seen")
|
||||
results = context.get("results", [])
|
||||
|
||||
# Collect ALL values for key fields across all accounts
|
||||
all_fields: Dict[str, list] = {}
|
||||
last_seen = None
|
||||
for _, _, data in results:
|
||||
for _, v in data.items():
|
||||
if not v.get("found") or v.get("is_similar"):
|
||||
continue
|
||||
ids_data = v.get("ids_data", {})
|
||||
# Map multiple source fields to unified output fields
|
||||
field_sources = {
|
||||
"fullname": ("fullname", "name"),
|
||||
"location": ("location", "country", "city", "country_code", "locale", "region"),
|
||||
"gender": ("gender",),
|
||||
"bio": ("bio", "about", "description"),
|
||||
}
|
||||
for out_field, source_keys in field_sources.items():
|
||||
for src in source_keys:
|
||||
val = ids_data.get(src)
|
||||
if val:
|
||||
all_fields.setdefault(out_field, [])
|
||||
val_str = str(val)
|
||||
if val_str not in all_fields[out_field]:
|
||||
all_fields[out_field].append(val_str)
|
||||
# Track last_seen
|
||||
for ts_field in ("last_online", "latest_activity_at", "updated_at"):
|
||||
ts = ids_data.get(ts_field)
|
||||
if ts and (last_seen is None or str(ts) > str(last_seen)):
|
||||
last_seen = ts
|
||||
|
||||
lines = []
|
||||
lines.append(f"# Report by searching on username \"{username}\"\n")
|
||||
|
||||
# Generated line with run info
|
||||
gen_line = f"Generated at {generated_at} by [Maigret](https://github.com/soxoj/maigret)"
|
||||
if run_info:
|
||||
parts = []
|
||||
if run_info.get("sites_count"):
|
||||
parts.append(f"{run_info['sites_count']} sites checked")
|
||||
if run_info.get("flags"):
|
||||
parts.append(f"flags: `{run_info['flags']}`")
|
||||
if parts:
|
||||
gen_line += f" ({', '.join(parts)})"
|
||||
lines.append(f"{gen_line}\n")
|
||||
|
||||
# Summary
|
||||
lines.append("## Summary\n")
|
||||
lines.append(f"{brief}\n")
|
||||
|
||||
if all_fields:
|
||||
lines.append("**Information extracted from accounts:**\n")
|
||||
for field, values in all_fields.items():
|
||||
title = CaseConverter.snake_to_title(field)
|
||||
lines.append(f"- {title}: {'; '.join(values)}")
|
||||
lines.append("")
|
||||
|
||||
if countries:
|
||||
geo = ", ".join(f"{code} (x{count})" for code, count in countries)
|
||||
lines.append(f"**Country tags:** {geo}\n")
|
||||
|
||||
if interests:
|
||||
tags = ", ".join(f"{tag} (x{count})" for tag, count in interests)
|
||||
lines.append(f"**Website tags:** {tags}\n")
|
||||
|
||||
if first_seen:
|
||||
lines.append(f"**First seen:** {first_seen}")
|
||||
if last_seen:
|
||||
lines.append(f"**Last seen:** {last_seen}")
|
||||
if first_seen or last_seen:
|
||||
lines.append("")
|
||||
|
||||
# Accounts found
|
||||
lines.append("## Accounts found\n")
|
||||
|
||||
for u, id_type, data in results:
|
||||
for site_name, v in data.items():
|
||||
if not v.get("found") or v.get("is_similar"):
|
||||
continue
|
||||
|
||||
lines.append(f"### {site_name}\n")
|
||||
lines.append(f"- **URL:** [{v.get('url_user', '')}]({v.get('url_user', '')})")
|
||||
|
||||
tags = v.get("status") and v["status"].tags or []
|
||||
if tags:
|
||||
lines.append(f"- **Tags:** {', '.join(tags)}")
|
||||
lines.append("")
|
||||
|
||||
ids_data = v.get("ids_data", {})
|
||||
if ids_data:
|
||||
for field, value in ids_data.items():
|
||||
if field == "image":
|
||||
continue
|
||||
title = CaseConverter.snake_to_title(field)
|
||||
lines.append(f"- {title}: {_md_format_value(value)}")
|
||||
|
||||
lines.append("")
|
||||
|
||||
# Possible false positives
|
||||
lines.append("## Possible false positives\n")
|
||||
lines.append(
|
||||
f"This report was generated by searching for accounts matching the username `{username}`. "
|
||||
f"Accounts listed above may belong to different people who happen to use the same "
|
||||
f"or similar username. Results without extracted personal information could contain "
|
||||
f"some false positive findings. Always verify findings before drawing conclusions.\n"
|
||||
)
|
||||
|
||||
# Ethical use
|
||||
lines.append("## Ethical use\n")
|
||||
lines.append(
|
||||
"This report is a result of a technical collection of publicly available information "
|
||||
"from online accounts and does not constitute personal data processing. If you intend "
|
||||
"to use this data for personal data processing or collection purposes, ensure your use "
|
||||
"complies with applicable laws and regulations in your jurisdiction (such as GDPR, "
|
||||
"CCPA, and similar).\n"
|
||||
)
|
||||
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
f.write("\n".join(lines))
|
||||
|
||||
|
||||
"""
|
||||
REPORTS GENERATING
|
||||
"""
|
||||
@@ -333,11 +491,12 @@ def generate_report_context(username_results: list):
|
||||
if k in ["country", "locale"]:
|
||||
try:
|
||||
if is_country_tag(k):
|
||||
tag = pycountry.countries.get(alpha_2=v).alpha_2.lower()
|
||||
country = pycountry.countries.get(alpha_2=v)
|
||||
tag = country.alpha_2.lower() # type: ignore[union-attr]
|
||||
else:
|
||||
tag = pycountry.countries.search_fuzzy(v)[
|
||||
0
|
||||
].alpha_2.lower()
|
||||
].alpha_2.lower() # type: ignore[attr-defined]
|
||||
# TODO: move countries to another struct
|
||||
tags[tag] = tags.get(tag, 0) + 1
|
||||
except Exception as e:
|
||||
@@ -493,8 +652,8 @@ def add_xmind_subtopic(userlink, k, v, supposed_data):
|
||||
|
||||
|
||||
def design_xmind_sheet(sheet, username, results):
|
||||
alltags = {}
|
||||
supposed_data = {}
|
||||
alltags: Dict[str, Any] = {}
|
||||
supposed_data: Dict[str, Any] = {}
|
||||
|
||||
sheet.setTitle("%s Analysis" % (username))
|
||||
root_topic1 = sheet.getRootTopic()
|
||||
|
||||
+24289
-24720
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,8 @@
|
||||
{
|
||||
"version": 1,
|
||||
"updated_at": "2026-04-10T10:28:14Z",
|
||||
"sites_count": 3150,
|
||||
"min_maigret_version": "0.6.0",
|
||||
"data_sha256": "72a493fef4eb8958fe8ed0c9b895841ec10c335f1b8e5e9b24b50784be6ad017",
|
||||
"data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
|
||||
}
|
||||
@@ -54,15 +54,9 @@
|
||||
"graph_report": false,
|
||||
"pdf_report": false,
|
||||
"html_report": false,
|
||||
"md_report": false,
|
||||
"web_interface_port": 5000,
|
||||
"cloudflare_bypass": {
|
||||
"enabled": true,
|
||||
"modules": [
|
||||
{
|
||||
"name": "chrome_webgate",
|
||||
"method": "url_rewrite",
|
||||
"url": "http://localhost:8000/html?url={url}&retries=1"
|
||||
}
|
||||
]
|
||||
}
|
||||
"no_autoupdate": false,
|
||||
"db_update_meta_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json",
|
||||
"autoupdate_check_interval_hours": 24
|
||||
}
|
||||
+5
-1
@@ -5,7 +5,7 @@ from typing import List
|
||||
|
||||
SETTINGS_FILES_PATHS = [
|
||||
path.join(path.dirname(path.realpath(__file__)), "resources/settings.json"),
|
||||
'~/.maigret/settings.json',
|
||||
path.expanduser('~/.maigret/settings.json'),
|
||||
path.join(os.getcwd(), 'settings.json'),
|
||||
]
|
||||
|
||||
@@ -42,7 +42,11 @@ class Settings:
|
||||
pdf_report: bool
|
||||
html_report: bool
|
||||
graph_report: bool
|
||||
md_report: bool
|
||||
web_interface_port: int
|
||||
no_autoupdate: bool
|
||||
db_update_meta_url: str
|
||||
autoupdate_check_interval_hours: int
|
||||
|
||||
# submit mode settings
|
||||
presence_strings: list
|
||||
|
||||
+98
-9
@@ -65,6 +65,10 @@ class MaigretSite:
|
||||
url_probe = None
|
||||
# Type of check to perform
|
||||
check_type = ""
|
||||
# HTTP request method (GET, POST, HEAD, etc.)
|
||||
request_method = ""
|
||||
# HTTP request payload (for POST, PUT, etc.)
|
||||
request_payload: Dict[str, Any] = {}
|
||||
# Whether to only send HEAD requests (GET by default)
|
||||
request_head_only = ""
|
||||
# GET parameters to include in requests
|
||||
@@ -88,10 +92,12 @@ class MaigretSite:
|
||||
# Alexa traffic rank
|
||||
alexa_rank = None
|
||||
# Source (in case a site is a mirror of another site)
|
||||
source = None
|
||||
source: Optional[str] = None
|
||||
|
||||
# URL protocol (http/https)
|
||||
protocol = ''
|
||||
# Protection types detected on this site (e.g. ["tls_fingerprint", "ddos_guard"])
|
||||
protection: List[str] = []
|
||||
|
||||
def __init__(self, name, information):
|
||||
self.name = name
|
||||
@@ -137,6 +143,8 @@ class MaigretSite:
|
||||
'regex_check',
|
||||
'url_probe',
|
||||
'check_type',
|
||||
'request_method',
|
||||
'request_payload',
|
||||
'request_head_only',
|
||||
'get_params',
|
||||
'presense_strs',
|
||||
@@ -167,7 +175,7 @@ class MaigretSite:
|
||||
self.__dict__[CaseConverter.camel_to_snake(group)],
|
||||
)
|
||||
|
||||
self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check)
|
||||
self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check or "")
|
||||
|
||||
def detect_username(self, url: str) -> Optional[str]:
|
||||
if self.url_regexp:
|
||||
@@ -318,6 +326,7 @@ class MaigretDatabase:
|
||||
reverse=False,
|
||||
top=sys.maxsize,
|
||||
tags=[],
|
||||
excluded_tags=[],
|
||||
names=[],
|
||||
disabled=True,
|
||||
id_type="username",
|
||||
@@ -325,19 +334,30 @@ class MaigretDatabase:
|
||||
"""
|
||||
Ranking and filtering of the sites list
|
||||
|
||||
When ``top`` is limited (not "all sites"), **mirrors** may be appended after
|
||||
the Alexa-ranked slice. A mirror is any filtered site with a non-empty
|
||||
``source`` field equal to the name of a site that appears in the first
|
||||
``top`` positions of a **parent ranking** that includes disabled sites.
|
||||
Thus mirrors such as third-party viewers (e.g. for Twitter or Instagram)
|
||||
are still scanned when their parent platform ranks highly, even if the
|
||||
official site is disabled and omitted from the main list.
|
||||
|
||||
Args:
|
||||
reverse (bool, optional): Reverse the sorting order. Defaults to False.
|
||||
top (int, optional): Maximum number of sites to return. Defaults to sys.maxsize.
|
||||
tags (list, optional): List of tags to filter sites by. Defaults to empty list.
|
||||
tags (list, optional): List of tags to filter sites by (whitelist). Defaults to empty list.
|
||||
excluded_tags (list, optional): List of tags to exclude sites by (blacklist). Defaults to empty list.
|
||||
names (list, optional): List of site names (or urls, see MaigretSite.__eq__) to filter by. Defaults to empty list.
|
||||
disabled (bool, optional): Whether to include disabled sites. Defaults to True.
|
||||
id_type (str, optional): Type of identifier to filter by. Defaults to "username".
|
||||
|
||||
Returns:
|
||||
dict: Dictionary of filtered and ranked sites, with site names as keys and MaigretSite objects as values
|
||||
dict: Dictionary of filtered and ranked sites (base top slice plus mirrors),
|
||||
with site names as keys and MaigretSite objects as values
|
||||
"""
|
||||
normalized_names = list(map(str.lower, names))
|
||||
normalized_tags = list(map(str.lower, tags))
|
||||
normalized_excluded_tags = list(map(str.lower, excluded_tags))
|
||||
|
||||
is_name_ok = lambda x: x.name.lower() in normalized_names
|
||||
is_source_ok = lambda x: x.source and x.source.lower() in normalized_names
|
||||
@@ -351,6 +371,22 @@ class MaigretDatabase:
|
||||
)
|
||||
is_id_type_ok = lambda x: x.type == id_type
|
||||
|
||||
is_excluded_by_tag = lambda x: set(
|
||||
map(str.lower, x.tags)
|
||||
).intersection(set(normalized_excluded_tags))
|
||||
is_excluded_by_engine = lambda x: (
|
||||
isinstance(x.engine, str)
|
||||
and x.engine.lower() in normalized_excluded_tags
|
||||
)
|
||||
is_excluded_by_protocol = lambda x: (
|
||||
x.protocol and x.protocol in normalized_excluded_tags
|
||||
)
|
||||
is_not_excluded = lambda x: not excluded_tags or not (
|
||||
is_excluded_by_tag(x)
|
||||
or is_excluded_by_engine(x)
|
||||
or is_excluded_by_protocol(x)
|
||||
)
|
||||
|
||||
filter_tags_engines_fun = (
|
||||
lambda x: not tags
|
||||
or is_engine_ok(x)
|
||||
@@ -361,6 +397,7 @@ class MaigretDatabase:
|
||||
|
||||
filter_fun = (
|
||||
lambda x: filter_tags_engines_fun(x)
|
||||
and is_not_excluded(x)
|
||||
and filter_names_fun(x)
|
||||
and is_disabled_needed(x)
|
||||
and is_id_type_ok(x)
|
||||
@@ -371,6 +408,33 @@ class MaigretDatabase:
|
||||
sorted_list = sorted(
|
||||
filtered_list, key=lambda x: x.alexa_rank, reverse=reverse
|
||||
)[:top]
|
||||
|
||||
# Mirrors: sites whose `source` matches a parent platform that ranks in the
|
||||
# top `top` by Alexa when disabled entries are included in the ranking pool
|
||||
# (so e.g. Instagram can be a parent for Picuki even if Instagram is disabled).
|
||||
if top < sys.maxsize and sorted_list:
|
||||
filter_fun_ranking_parents = (
|
||||
lambda x: filter_tags_engines_fun(x)
|
||||
and is_not_excluded(x)
|
||||
and filter_names_fun(x)
|
||||
and is_id_type_ok(x)
|
||||
)
|
||||
ranking_pool = [s for s in self.sites if filter_fun_ranking_parents(s)]
|
||||
sorted_parents = sorted(
|
||||
ranking_pool, key=lambda x: x.alexa_rank, reverse=reverse
|
||||
)[:top]
|
||||
parent_names_lower = {s.name.lower() for s in sorted_parents}
|
||||
base_names = {s.name for s in sorted_list}
|
||||
|
||||
def is_mirror(s) -> bool:
|
||||
if not s.source or s.name in base_names:
|
||||
return False
|
||||
return s.source.lower() in parent_names_lower
|
||||
|
||||
mirrors = [s for s in filtered_list if is_mirror(s)]
|
||||
mirrors.sort(key=lambda x: (x.alexa_rank, x.name))
|
||||
sorted_list = list(sorted_list) + mirrors
|
||||
|
||||
return {site.name: site for site in sorted_list}
|
||||
|
||||
@property
|
||||
@@ -400,9 +464,9 @@ class MaigretDatabase:
|
||||
"tags": self._tags,
|
||||
}
|
||||
|
||||
json_data = json.dumps(db_data, indent=4)
|
||||
json_data = json.dumps(db_data, indent=4, ensure_ascii=False)
|
||||
|
||||
with open(filename, "w") as f:
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
f.write(json_data)
|
||||
|
||||
return self
|
||||
@@ -502,7 +566,7 @@ class MaigretDatabase:
|
||||
|
||||
def get_scan_stats(self, sites_dict):
|
||||
sites = sites_dict or self.sites_dict
|
||||
found_flags = {}
|
||||
found_flags: Dict[str, int] = {}
|
||||
for _, s in sites.items():
|
||||
if "presense_flag" in s.stats:
|
||||
flag = s.stats["presense_flag"]
|
||||
@@ -523,8 +587,10 @@ class MaigretDatabase:
|
||||
def get_db_stats(self, is_markdown=False):
|
||||
# Initialize counters
|
||||
sites_dict = self.sites_dict
|
||||
urls = {}
|
||||
tags = {}
|
||||
urls: Dict[str, int] = {}
|
||||
tags: Dict[str, int] = {}
|
||||
engine_total: Dict[str, int] = {}
|
||||
engine_enabled: Dict[str, int] = {}
|
||||
disabled_count = 0
|
||||
message_checks_one_factor = 0
|
||||
status_checks = 0
|
||||
@@ -547,6 +613,14 @@ class MaigretDatabase:
|
||||
elif site.check_type == 'status_code':
|
||||
status_checks += 1
|
||||
|
||||
# Count engines
|
||||
if site.engine:
|
||||
engine_total[site.engine] = engine_total.get(site.engine, 0) + 1
|
||||
if not site.disabled:
|
||||
engine_enabled[site.engine] = (
|
||||
engine_enabled.get(site.engine, 0) + 1
|
||||
)
|
||||
|
||||
# Count tags
|
||||
if not site.tags:
|
||||
tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1
|
||||
@@ -583,11 +657,26 @@ class MaigretDatabase:
|
||||
f"Sites with probing: {', '.join(sorted(site_with_probing))}",
|
||||
f"Sites with activation: {', '.join(sorted(site_with_activation))}",
|
||||
self._format_top_items("profile URLs", urls, 20, is_markdown),
|
||||
self._format_engine_stats(engine_total, engine_enabled, is_markdown),
|
||||
self._format_top_items("tags", tags, 20, is_markdown, self._tags),
|
||||
]
|
||||
|
||||
return separator.join(output)
|
||||
|
||||
def _format_engine_stats(self, engine_total, engine_enabled, is_markdown):
|
||||
"""Format per-engine enabled/total counts, sorted by total descending."""
|
||||
output = "Sites by engine:\n"
|
||||
for engine, total in sorted(
|
||||
engine_total.items(), key=lambda x: x[1], reverse=True
|
||||
):
|
||||
enabled = engine_enabled.get(engine, 0)
|
||||
perc = round(100 * enabled / total, 1) if total else 0.0
|
||||
if is_markdown:
|
||||
output += f"- `{engine}`: {enabled}/{total} ({perc}%)\n"
|
||||
else:
|
||||
output += f"{enabled}/{total} ({perc}%)\t{engine}\n"
|
||||
return output
|
||||
|
||||
def _format_top_items(
|
||||
self, title, items_dict, limit, is_markdown, valid_items=None
|
||||
):
|
||||
|
||||
+41
-29
@@ -6,8 +6,7 @@ import logging
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from aiohttp import ClientSession, TCPConnector
|
||||
from aiohttp_socks import ProxyConnector
|
||||
import cloudscraper
|
||||
import cloudscraper # type: ignore[import-untyped]
|
||||
from colorama import Fore, Style
|
||||
|
||||
from .activation import import_aiohttp_cookies
|
||||
@@ -68,8 +67,10 @@ class Submitter:
|
||||
else:
|
||||
cookie_jar = import_aiohttp_cookies(args.cookie_file)
|
||||
|
||||
connector = ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=False)
|
||||
connector.verify_ssl = False
|
||||
ssl_context = __import__('ssl').create_default_context()
|
||||
ssl_context.check_hostname = False
|
||||
ssl_context.verify_mode = __import__('ssl').CERT_NONE
|
||||
connector = ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=ssl_context)
|
||||
self.session = ClientSession(
|
||||
connector=connector, trust_env=True, cookie_jar=cookie_jar
|
||||
)
|
||||
@@ -88,7 +89,9 @@ class Submitter:
|
||||
alexa_rank = 0
|
||||
|
||||
try:
|
||||
alexa_rank = int(root.find('.//REACH').attrib['RANK'])
|
||||
reach_elem = root.find('.//REACH')
|
||||
if reach_elem is not None:
|
||||
alexa_rank = int(reach_elem.attrib['RANK'])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@@ -110,7 +113,6 @@ class Submitter:
|
||||
cookies=self.args.cookie_file,
|
||||
# Don't skip errors in submit mode - we need check both false positives/true negatives
|
||||
skip_errors=False,
|
||||
cloudflare_bypass=getattr(self.args, 'cloudflare_bypass', False),
|
||||
)
|
||||
return changes
|
||||
|
||||
@@ -128,7 +130,7 @@ class Submitter:
|
||||
|
||||
async def detect_known_engine(
|
||||
self, url_exists, url_mainpage, session, follow_redirects, headers
|
||||
) -> [List[MaigretSite], str]:
|
||||
) -> Tuple[List[MaigretSite], str]:
|
||||
|
||||
session = session or self.session
|
||||
resp_text, _ = await self.get_html_response_to_compare(
|
||||
@@ -192,8 +194,9 @@ class Submitter:
|
||||
# TODO: replace with checking.py/SimpleAiohttpChecker call
|
||||
@staticmethod
|
||||
async def get_html_response_to_compare(
|
||||
url: str, session: ClientSession = None, redirects=False, headers: Dict = None
|
||||
url: str, session: Optional[ClientSession] = None, redirects=False, headers: Optional[Dict] = None
|
||||
):
|
||||
assert session is not None, "session must not be None"
|
||||
async with session.get(
|
||||
url, allow_redirects=redirects, headers=headers
|
||||
) as response:
|
||||
@@ -212,10 +215,10 @@ class Submitter:
|
||||
username: str,
|
||||
url_exists: str,
|
||||
cookie_filename="", # TODO: use cookies
|
||||
session: ClientSession = None,
|
||||
session: Optional[ClientSession] = None,
|
||||
follow_redirects=False,
|
||||
headers: dict = None,
|
||||
) -> Tuple[List[str], List[str], str, str]:
|
||||
headers: Optional[dict] = None,
|
||||
) -> Tuple[Optional[List[str]], Optional[List[str]], str, str]:
|
||||
|
||||
random_username = generate_random_username()
|
||||
url_of_non_existing_account = url_exists.lower().replace(
|
||||
@@ -270,11 +273,8 @@ class Submitter:
|
||||
tokens_a = set(re.split(f'[{self.SEPARATORS}]', first_html_response))
|
||||
tokens_b = set(re.split(f'[{self.SEPARATORS}]', second_html_response))
|
||||
|
||||
a_minus_b = tokens_a.difference(tokens_b)
|
||||
b_minus_a = tokens_b.difference(tokens_a)
|
||||
|
||||
a_minus_b = list(map(lambda x: x.strip('\\'), a_minus_b))
|
||||
b_minus_a = list(map(lambda x: x.strip('\\'), b_minus_a))
|
||||
a_minus_b: List[str] = [x.strip('\\') for x in tokens_a.difference(tokens_b)]
|
||||
b_minus_a: List[str] = [x.strip('\\') for x in tokens_b.difference(tokens_a)]
|
||||
|
||||
# Filter out strings containing usernames
|
||||
a_minus_b = [s for s in a_minus_b if username.lower() not in s.lower()]
|
||||
@@ -379,7 +379,7 @@ class Submitter:
|
||||
).strip()
|
||||
|
||||
if field in ['tags', 'presense_strs', 'absence_strs']:
|
||||
new_value = list(map(str.strip, new_value.split(',')))
|
||||
new_value = list(map(str.strip, new_value.split(','))) # type: ignore[assignment]
|
||||
|
||||
if new_value:
|
||||
setattr(site, field, new_value)
|
||||
@@ -410,8 +410,13 @@ class Submitter:
|
||||
self.logger.info('Domain is %s', domain_raw)
|
||||
|
||||
# check for existence
|
||||
domain_re = re.compile(
|
||||
r'://(www\.)?' + re.escape(domain_raw) + r'(/|$)'
|
||||
)
|
||||
matched_sites = list(
|
||||
filter(lambda x: domain_raw in x.url_main + x.url, self.db.sites)
|
||||
filter(
|
||||
lambda x: domain_re.search(x.url_main + x.url), self.db.sites
|
||||
)
|
||||
)
|
||||
|
||||
if matched_sites:
|
||||
@@ -420,12 +425,12 @@ class Submitter:
|
||||
f"{Fore.YELLOW}[!] Sites with domain \"{domain_raw}\" already exists in the Maigret database!{Style.RESET_ALL}"
|
||||
)
|
||||
|
||||
status = lambda s: "(disabled)" if s.disabled else ""
|
||||
site_status = lambda s: "(disabled)" if s.disabled else ""
|
||||
url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
|
||||
print(
|
||||
"\n".join(
|
||||
[
|
||||
f"{site.name} {status(site)}{url_block(site)}"
|
||||
f"{site.name} {site_status(site)}{url_block(site)}"
|
||||
for site in matched_sites
|
||||
]
|
||||
)
|
||||
@@ -449,9 +454,14 @@ class Submitter:
|
||||
old_site = next(
|
||||
(site for site in matched_sites if site.name == site_name), None
|
||||
)
|
||||
print(
|
||||
f'{Fore.GREEN}[+] We will update site "{old_site.name}" in case of success.{Style.RESET_ALL}'
|
||||
)
|
||||
if old_site is None:
|
||||
print(
|
||||
f'{Fore.RED}[!] Site "{site_name}" not found in the matched list. Proceeding without updating an existing site.{Style.RESET_ALL}'
|
||||
)
|
||||
else:
|
||||
print(
|
||||
f'{Fore.GREEN}[+] We will update site "{old_site.name}" in case of success.{Style.RESET_ALL}'
|
||||
)
|
||||
|
||||
# Check if the site check is ordinary or not
|
||||
if old_site and (old_site.url_probe or old_site.activation):
|
||||
@@ -488,7 +498,7 @@ class Submitter:
|
||||
)
|
||||
|
||||
print('Detecting site engine, please wait...')
|
||||
sites = []
|
||||
sites: List[MaigretSite] = []
|
||||
text = None
|
||||
try:
|
||||
sites, text = await self.detect_known_engine(
|
||||
@@ -501,7 +511,7 @@ class Submitter:
|
||||
except KeyboardInterrupt:
|
||||
print('Engine detect process is interrupted.')
|
||||
|
||||
if 'cloudflare' in text.lower():
|
||||
if text and 'cloudflare' in text.lower():
|
||||
print(
|
||||
'Cloudflare protection detected. I will use cloudscraper for further work'
|
||||
)
|
||||
@@ -564,6 +574,8 @@ class Submitter:
|
||||
found = True
|
||||
break
|
||||
|
||||
assert chosen_site is not None, "No sites to check"
|
||||
|
||||
if not found:
|
||||
print(
|
||||
f"{Fore.RED}[!] The check for site '{chosen_site.name}' failed!{Style.RESET_ALL}"
|
||||
@@ -622,8 +634,8 @@ class Submitter:
|
||||
# chosen_site.alexa_rank = rank
|
||||
|
||||
self.logger.info(chosen_site.json)
|
||||
site_data = chosen_site.strip_engine_data()
|
||||
self.logger.info(site_data.json)
|
||||
stripped_site = chosen_site.strip_engine_data()
|
||||
self.logger.info(stripped_site.json)
|
||||
|
||||
if old_site:
|
||||
# Update old site with new values and log changes
|
||||
@@ -642,7 +654,7 @@ class Submitter:
|
||||
|
||||
for field, display_name in fields_to_check.items():
|
||||
old_value = getattr(old_site, field)
|
||||
new_value = getattr(site_data, field)
|
||||
new_value = getattr(stripped_site, field)
|
||||
if field == 'tags' and not new_tags:
|
||||
continue
|
||||
if str(old_value) != str(new_value):
|
||||
@@ -652,7 +664,7 @@ class Submitter:
|
||||
old_site.__dict__[field] = new_value
|
||||
|
||||
# update the site
|
||||
final_site = old_site if old_site else site_data
|
||||
final_site = old_site if old_site else stripped_site
|
||||
self.db.update_site(final_site)
|
||||
|
||||
# save the db in file
|
||||
|
||||
+6
-3
@@ -8,7 +8,7 @@ from typing import Any
|
||||
|
||||
|
||||
DEFAULT_USER_AGENTS = [
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
|
||||
]
|
||||
|
||||
|
||||
@@ -71,7 +71,10 @@ class URLMatcher:
|
||||
|
||||
|
||||
def ascii_data_display(data: str) -> Any:
|
||||
return ast.literal_eval(data)
|
||||
try:
|
||||
return ast.literal_eval(data)
|
||||
except (ValueError, SyntaxError):
|
||||
return data
|
||||
|
||||
|
||||
def get_dict_ascii_tree(items, prepend="", new_line=True):
|
||||
@@ -86,7 +89,7 @@ def get_dict_ascii_tree(items, prepend="", new_line=True):
|
||||
new_result + new_line if num != len(items) - 1 else last_result + new_line
|
||||
)
|
||||
|
||||
if type(item) == tuple:
|
||||
if isinstance(item, tuple):
|
||||
field_name, field_value = item
|
||||
if field_value.startswith("['"):
|
||||
is_last_item = num == len(items) - 1
|
||||
|
||||
+61
-39
@@ -13,26 +13,25 @@ import os
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
from threading import Thread
|
||||
from typing import Any, Dict
|
||||
import maigret
|
||||
import maigret.settings
|
||||
from maigret.sites import MaigretDatabase
|
||||
from maigret.report import generate_report_context
|
||||
|
||||
app = Flask(__name__)
|
||||
app.secret_key = 'your-secret-key-here'
|
||||
# Use environment variable for secret key, generate random one if not set
|
||||
app.secret_key = os.getenv('FLASK_SECRET_KEY', os.urandom(24).hex())
|
||||
|
||||
#add background job tracking
|
||||
background_jobs = {}
|
||||
# add background job tracking
|
||||
background_jobs: Dict[str, Any] = {}
|
||||
job_results = {}
|
||||
|
||||
# Configuration
|
||||
MAIGRET_DB_FILE = os.path.join('maigret', 'resources', 'data.json')
|
||||
COOKIES_FILE = "cookies.txt"
|
||||
UPLOAD_FOLDER = 'uploads'
|
||||
REPORTS_FOLDER = os.path.abspath('/tmp/maigret_reports')
|
||||
|
||||
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
||||
os.makedirs(REPORTS_FOLDER, exist_ok=True)
|
||||
app.config["MAIGRET_DB_FILE"] = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'resources', 'data.json')
|
||||
app.config["COOKIES_FILE"] = "cookies.txt"
|
||||
app.config["UPLOAD_FOLDER"] = 'uploads'
|
||||
app.config["REPORTS_FOLDER"] = os.path.abspath('/tmp/maigret_reports')
|
||||
|
||||
|
||||
def setup_logger(log_level, name):
|
||||
@@ -44,24 +43,26 @@ def setup_logger(log_level, name):
|
||||
async def maigret_search(username, options):
|
||||
logger = setup_logger(logging.WARNING, 'maigret')
|
||||
try:
|
||||
db = MaigretDatabase().load_from_path(MAIGRET_DB_FILE)
|
||||
|
||||
top_sites = int(options.get('top_sites') or 500)
|
||||
db = MaigretDatabase().load_from_path(app.config["MAIGRET_DB_FILE"])
|
||||
|
||||
top_sites = int(options.get('top_sites') or 500)
|
||||
if options.get('all_sites'):
|
||||
top_sites = 999999999 # effectively all
|
||||
|
||||
|
||||
tags = options.get('tags', [])
|
||||
site_list= options.get('site_list', [])
|
||||
logger.info(f"Filtering sites by tags: {tags}")
|
||||
|
||||
excluded_tags = options.get('excluded_tags', [])
|
||||
site_list = options.get('site_list', [])
|
||||
logger.info(f"Filtering sites by tags: {tags}, excluded: {excluded_tags}")
|
||||
|
||||
sites = db.ranked_sites_dict(
|
||||
top=top_sites,
|
||||
tags=tags,
|
||||
excluded_tags=excluded_tags,
|
||||
names=site_list,
|
||||
disabled=False,
|
||||
id_type='username'
|
||||
id_type='username',
|
||||
)
|
||||
|
||||
|
||||
logger.info(f"Found {len(sites)} sites matching the tag criteria")
|
||||
|
||||
results = await maigret.search(
|
||||
@@ -70,9 +71,11 @@ async def maigret_search(username, options):
|
||||
timeout=int(options.get('timeout', 30)),
|
||||
logger=logger,
|
||||
id_type='username',
|
||||
cookies=COOKIES_FILE if options.get('use_cookies') else None,
|
||||
is_parsing_enabled=(not options.get('disable_extracting', False)),
|
||||
recursive_search_enabled=(not options.get('disable_recursive_search', False)),
|
||||
cookies=app.config["COOKIES_FILE"] if options.get('use_cookies') else None,
|
||||
is_parsing_enabled=(not options.get('disable_extracting', False)),
|
||||
recursive_search_enabled=(
|
||||
not options.get('disable_recursive_search', False)
|
||||
),
|
||||
check_domains=options.get('with_domains', False),
|
||||
proxy=options.get('proxy', None),
|
||||
tor_proxy=options.get('tor_proxy', None),
|
||||
@@ -104,14 +107,17 @@ def process_search_task(usernames, options, timestamp):
|
||||
search_multiple_usernames(usernames, options)
|
||||
)
|
||||
|
||||
session_folder = os.path.join(REPORTS_FOLDER, f"search_{timestamp}")
|
||||
os.makedirs(app.config["REPORTS_FOLDER"], exist_ok=True)
|
||||
session_folder = os.path.join(
|
||||
app.config["REPORTS_FOLDER"], f"search_{timestamp}"
|
||||
)
|
||||
os.makedirs(session_folder, exist_ok=True)
|
||||
|
||||
graph_path = os.path.join(session_folder, "combined_graph.html")
|
||||
maigret.report.save_graph_report(
|
||||
graph_path,
|
||||
general_results,
|
||||
MaigretDatabase().load_from_path(MAIGRET_DB_FILE),
|
||||
MaigretDatabase().load_from_path(app.config["MAIGRET_DB_FILE"]),
|
||||
)
|
||||
|
||||
individual_reports = []
|
||||
@@ -188,20 +194,20 @@ def process_search_task(usernames, options, timestamp):
|
||||
|
||||
@app.route('/')
|
||||
def index():
|
||||
#load site data for autocomplete
|
||||
db = MaigretDatabase().load_from_path(MAIGRET_DB_FILE)
|
||||
# load site data for autocomplete
|
||||
db = MaigretDatabase().load_from_path(app.config["MAIGRET_DB_FILE"])
|
||||
site_options = []
|
||||
|
||||
|
||||
for site in db.sites:
|
||||
#add main site name
|
||||
# add main site name
|
||||
site_options.append(site.name)
|
||||
#add URL if different from name
|
||||
# add URL if different from name
|
||||
if site.url_main and site.url_main not in site_options:
|
||||
site_options.append(site.url_main)
|
||||
|
||||
#sort and deduplicate
|
||||
|
||||
# sort and deduplicate
|
||||
site_options = sorted(set(site_options))
|
||||
|
||||
|
||||
return render_template('index.html', site_options=site_options)
|
||||
|
||||
|
||||
@@ -222,7 +228,8 @@ def search():
|
||||
|
||||
# Get selected tags - ensure it's a list
|
||||
selected_tags = request.form.getlist('tags')
|
||||
logging.info(f"Selected tags: {selected_tags}")
|
||||
excluded_tags = request.form.getlist('excluded_tags')
|
||||
logging.info(f"Selected tags: {selected_tags}, Excluded tags: {excluded_tags}")
|
||||
|
||||
options = {
|
||||
'top_sites': request.form.get('top_sites') or '500',
|
||||
@@ -237,10 +244,15 @@ def search():
|
||||
'i2p_proxy': request.form.get('i2p_proxy', None) or None,
|
||||
'permute': 'permute' in request.form,
|
||||
'tags': selected_tags, # Pass selected tags as a list
|
||||
'site_list': [s.strip() for s in request.form.get('site', '').split(',') if s.strip()],
|
||||
'excluded_tags': excluded_tags, # Pass excluded tags as a list
|
||||
'site_list': [
|
||||
s.strip() for s in request.form.get('site', '').split(',') if s.strip()
|
||||
],
|
||||
}
|
||||
|
||||
logging.info(f"Starting search for usernames: {usernames} with tags: {selected_tags}")
|
||||
logging.info(
|
||||
f"Starting search for usernames: {usernames} with tags: {selected_tags}, excluded: {excluded_tags}"
|
||||
)
|
||||
|
||||
# Start background job
|
||||
background_jobs[timestamp] = {
|
||||
@@ -249,10 +261,11 @@ def search():
|
||||
target=process_search_task, args=(usernames, options, timestamp)
|
||||
),
|
||||
}
|
||||
background_jobs[timestamp]['thread'].start()
|
||||
background_jobs[timestamp]['thread'].start() # type: ignore[union-attr]
|
||||
|
||||
return redirect(url_for('status', timestamp=timestamp))
|
||||
|
||||
|
||||
@app.route('/status/<timestamp>')
|
||||
def status(timestamp):
|
||||
logging.info(f"Status check for timestamp: {timestamp}")
|
||||
@@ -313,8 +326,11 @@ def results(session_id):
|
||||
@app.route('/reports/<path:filename>')
|
||||
def download_report(filename):
|
||||
try:
|
||||
file_path = os.path.normpath(os.path.join(REPORTS_FOLDER, filename))
|
||||
if not file_path.startswith(REPORTS_FOLDER):
|
||||
os.makedirs(app.config["REPORTS_FOLDER"], exist_ok=True)
|
||||
file_path = os.path.normpath(
|
||||
os.path.join(app.config["REPORTS_FOLDER"], filename)
|
||||
)
|
||||
if not file_path.startswith(app.config["REPORTS_FOLDER"]):
|
||||
raise Exception("Invalid file path")
|
||||
return send_file(file_path)
|
||||
except Exception as e:
|
||||
@@ -328,4 +344,10 @@ if __name__ == '__main__':
|
||||
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
|
||||
)
|
||||
debug_mode = os.getenv('FLASK_DEBUG', 'False').lower() in ['true', '1', 't']
|
||||
app.run(debug=debug_mode)
|
||||
|
||||
# Host configuration: secure by default
|
||||
# Use 127.0.0.1 for local development, 0.0.0.0 only if explicitly set
|
||||
host = os.getenv('FLASK_HOST', '127.0.0.1')
|
||||
port = int(os.getenv('FLASK_PORT', '5000'))
|
||||
|
||||
app.run(host=host, port=port, debug=debug_mode)
|
||||
|
||||
@@ -28,6 +28,11 @@
|
||||
background-color: #28a745;
|
||||
}
|
||||
|
||||
.tag.excluded {
|
||||
background-color: #343a40;
|
||||
text-decoration: line-through;
|
||||
}
|
||||
|
||||
.tag:hover {
|
||||
transform: translateY(-2px);
|
||||
box-shadow: 0 2px 5px rgba(0, 0, 0, 0.2);
|
||||
@@ -168,7 +173,16 @@
|
||||
</div>
|
||||
|
||||
<div class="mb-3">
|
||||
<label class="form-label">Tags (click to select)</label>
|
||||
<label class="form-label">Tags (click to cycle: include → exclude → neutral)</label>
|
||||
<div class="mb-2">
|
||||
<small class="text-muted">
|
||||
<span style="display:inline-block;width:12px;height:12px;background:#28a745;border-radius:50%;"></span> Included (whitelist)
|
||||
|
||||
<span style="display:inline-block;width:12px;height:12px;background:#343a40;border-radius:50%;"></span> Excluded (blacklist)
|
||||
|
||||
<span style="display:inline-block;width:12px;height:12px;background:#dc3545;border-radius:50%;"></span> Neutral
|
||||
</small>
|
||||
</div>
|
||||
<div class="tag-cloud" id="tagCloud"></div>
|
||||
<select multiple class="hidden-select" id="tags" name="tags">
|
||||
<option value="gaming">Gaming</option>
|
||||
@@ -230,6 +244,89 @@
|
||||
<option value="q&a">Q&A</option>
|
||||
<option value="crypto">Crypto</option>
|
||||
<option value="ai">AI</option>
|
||||
<!-- Country tags -->
|
||||
<option value="ae" data-group="country">AE - United Arab Emirates</option>
|
||||
<option value="ao" data-group="country">AO - Angola</option>
|
||||
<option value="ar" data-group="country">AR - Argentina</option>
|
||||
<option value="at" data-group="country">AT - Austria</option>
|
||||
<option value="au" data-group="country">AU - Australia</option>
|
||||
<option value="az" data-group="country">AZ - Azerbaijan</option>
|
||||
<option value="bd" data-group="country">BD - Bangladesh</option>
|
||||
<option value="be" data-group="country">BE - Belgium</option>
|
||||
<option value="bg" data-group="country">BG - Bulgaria</option>
|
||||
<option value="br" data-group="country">BR - Brazil</option>
|
||||
<option value="by" data-group="country">BY - Belarus</option>
|
||||
<option value="ca" data-group="country">CA - Canada</option>
|
||||
<option value="ch" data-group="country">CH - Switzerland</option>
|
||||
<option value="cl" data-group="country">CL - Chile</option>
|
||||
<option value="cn" data-group="country">CN - China</option>
|
||||
<option value="co" data-group="country">CO - Colombia</option>
|
||||
<option value="cr" data-group="country">CR - Costa Rica</option>
|
||||
<option value="cz" data-group="country">CZ - Czechia</option>
|
||||
<option value="de" data-group="country">DE - Germany</option>
|
||||
<option value="dk" data-group="country">DK - Denmark</option>
|
||||
<option value="dz" data-group="country">DZ - Algeria</option>
|
||||
<option value="ee" data-group="country">EE - Estonia</option>
|
||||
<option value="eg" data-group="country">EG - Egypt</option>
|
||||
<option value="es" data-group="country">ES - Spain</option>
|
||||
<option value="eu" data-group="country">EU - European Union</option>
|
||||
<option value="fi" data-group="country">FI - Finland</option>
|
||||
<option value="fr" data-group="country">FR - France</option>
|
||||
<option value="gb" data-group="country">GB - United Kingdom</option>
|
||||
<option value="global" data-group="country">🌍 Global</option>
|
||||
<option value="gr" data-group="country">GR - Greece</option>
|
||||
<option value="hk" data-group="country">HK - Hong Kong</option>
|
||||
<option value="hr" data-group="country">HR - Croatia</option>
|
||||
<option value="hu" data-group="country">HU - Hungary</option>
|
||||
<option value="id" data-group="country">ID - Indonesia</option>
|
||||
<option value="ie" data-group="country">IE - Ireland</option>
|
||||
<option value="il" data-group="country">IL - Israel</option>
|
||||
<option value="in" data-group="country">IN - India</option>
|
||||
<option value="ir" data-group="country">IR - Iran</option>
|
||||
<option value="it" data-group="country">IT - Italy</option>
|
||||
<option value="jp" data-group="country">JP - Japan</option>
|
||||
<option value="kg" data-group="country">KG - Kyrgyzstan</option>
|
||||
<option value="kr" data-group="country">KR - Korea</option>
|
||||
<option value="kz" data-group="country">KZ - Kazakhstan</option>
|
||||
<option value="la" data-group="country">LA - Laos</option>
|
||||
<option value="lk" data-group="country">LK - Sri Lanka</option>
|
||||
<option value="lt" data-group="country">LT - Lithuania</option>
|
||||
<option value="ma" data-group="country">MA - Morocco</option>
|
||||
<option value="md" data-group="country">MD - Moldova</option>
|
||||
<option value="mg" data-group="country">MG - Madagascar</option>
|
||||
<option value="mk" data-group="country">MK - North Macedonia</option>
|
||||
<option value="mx" data-group="country">MX - Mexico</option>
|
||||
<option value="ng" data-group="country">NG - Nigeria</option>
|
||||
<option value="nl" data-group="country">NL - Netherlands</option>
|
||||
<option value="no" data-group="country">NO - Norway</option>
|
||||
<option value="ph" data-group="country">PH - Philippines</option>
|
||||
<option value="pk" data-group="country">PK - Pakistan</option>
|
||||
<option value="pl" data-group="country">PL - Poland</option>
|
||||
<option value="pt" data-group="country">PT - Portugal</option>
|
||||
<option value="re" data-group="country">RE - Réunion</option>
|
||||
<option value="ro" data-group="country">RO - Romania</option>
|
||||
<option value="rs" data-group="country">RS - Serbia</option>
|
||||
<option value="ru" data-group="country">RU - Russia</option>
|
||||
<option value="sa" data-group="country">SA - Saudi Arabia</option>
|
||||
<option value="sd" data-group="country">SD - Sudan</option>
|
||||
<option value="se" data-group="country">SE - Sweden</option>
|
||||
<option value="sg" data-group="country">SG - Singapore</option>
|
||||
<option value="sk" data-group="country">SK - Slovakia</option>
|
||||
<option value="sv" data-group="country">SV - El Salvador</option>
|
||||
<option value="th" data-group="country">TH - Thailand</option>
|
||||
<option value="tn" data-group="country">TN - Tunisia</option>
|
||||
<option value="tr" data-group="country">TR - Türkiye</option>
|
||||
<option value="tw" data-group="country">TW - Taiwan</option>
|
||||
<option value="ua" data-group="country">UA - Ukraine</option>
|
||||
<option value="uk" data-group="country">UK - United Kingdom</option>
|
||||
<option value="us" data-group="country">US - United States</option>
|
||||
<option value="uz" data-group="country">UZ - Uzbekistan</option>
|
||||
<option value="ve" data-group="country">VE - Venezuela</option>
|
||||
<option value="vi" data-group="country">VI - Virgin Islands</option>
|
||||
<option value="vn" data-group="country">VN - Viet Nam</option>
|
||||
<option value="za" data-group="country">ZA - South Africa</option>
|
||||
</select>
|
||||
<select multiple class="hidden-select" id="excludedTags" name="excluded_tags">
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
@@ -292,26 +389,66 @@
|
||||
}
|
||||
|
||||
document.addEventListener('DOMContentLoaded', function () {
|
||||
// Tag cloud functionality
|
||||
// Tag cloud functionality with include/exclude (whitelist/blacklist) support
|
||||
const tagCloud = document.getElementById('tagCloud');
|
||||
const hiddenSelect = document.getElementById('tags');
|
||||
const excludedSelect = document.getElementById('excludedTags');
|
||||
const allTags = Array.from(hiddenSelect.options).map(opt => ({
|
||||
value: opt.value,
|
||||
label: opt.text
|
||||
label: opt.text,
|
||||
group: opt.dataset.group || 'category'
|
||||
}));
|
||||
|
||||
function updateTagSelects() {
|
||||
// Clear and repopulate hidden selects based on tag states
|
||||
Array.from(hiddenSelect.options).forEach(opt => opt.selected = false);
|
||||
// Clear excluded select
|
||||
excludedSelect.innerHTML = '';
|
||||
|
||||
document.querySelectorAll('#tagCloud .tag').forEach(tagEl => {
|
||||
const val = tagEl.dataset.value;
|
||||
if (tagEl.classList.contains('selected')) {
|
||||
const option = Array.from(hiddenSelect.options).find(opt => opt.value === val);
|
||||
if (option) option.selected = true;
|
||||
} else if (tagEl.classList.contains('excluded')) {
|
||||
const opt = document.createElement('option');
|
||||
opt.value = val;
|
||||
opt.selected = true;
|
||||
excludedSelect.appendChild(opt);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
let lastGroup = '';
|
||||
allTags.forEach(tag => {
|
||||
if (tag.group !== lastGroup && tag.group === 'country') {
|
||||
const separator = document.createElement('div');
|
||||
separator.style.cssText = 'width:100%;margin:8px 0 4px;padding:4px 0;border-top:1px solid rgba(0,0,0,0.15);font-size:13px;color:#666;';
|
||||
separator.textContent = 'Countries';
|
||||
tagCloud.appendChild(separator);
|
||||
}
|
||||
lastGroup = tag.group;
|
||||
|
||||
const tagElement = document.createElement('span');
|
||||
tagElement.className = 'tag';
|
||||
tagElement.textContent = tag.label;
|
||||
tagElement.dataset.value = tag.value;
|
||||
|
||||
tagElement.addEventListener('click', function () {
|
||||
const isSelected = this.classList.toggle('selected');
|
||||
const option = Array.from(hiddenSelect.options).find(opt => opt.value === tag.value);
|
||||
if (option) {
|
||||
option.selected = isSelected;
|
||||
// Single click cycles: neutral -> included -> excluded -> neutral
|
||||
tagElement.addEventListener('click', function (e) {
|
||||
e.preventDefault();
|
||||
if (this.classList.contains('selected')) {
|
||||
// included -> excluded
|
||||
this.classList.remove('selected');
|
||||
this.classList.add('excluded');
|
||||
} else if (this.classList.contains('excluded')) {
|
||||
// excluded -> neutral
|
||||
this.classList.remove('excluded');
|
||||
} else {
|
||||
// neutral -> included
|
||||
this.classList.add('selected');
|
||||
}
|
||||
updateTagSelects();
|
||||
});
|
||||
|
||||
tagCloud.appendChild(tagElement);
|
||||
|
||||
Generated
+1946
-1171
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,5 @@
|
||||
maigret @ https://github.com/soxoj/maigret/archive/refs/heads/main.zip
|
||||
pefile==2023.2.7 # do not bump while pyinstaller is 6.11.1, there is a conflict
|
||||
psutil==6.1.1
|
||||
pyinstaller==6.11.1
|
||||
psutil==7.2.2
|
||||
pyinstaller==6.19.0
|
||||
pywin32-ctypes==0.2.3
|
||||
|
||||
+30
-28
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.poetry]
|
||||
name = "maigret"
|
||||
version = "0.5.0a1"
|
||||
version = "0.6.0"
|
||||
description = "🕵️♂️ Collect a dossier on a person by username from thousands of sites."
|
||||
authors = ["Soxoj <soxoj@protonmail.com>"]
|
||||
readme = "README.md"
|
||||
@@ -31,66 +31,68 @@ classifiers = [
|
||||
# Install with dev dependencies:
|
||||
# poetry install --with dev
|
||||
python = "^3.10"
|
||||
aiodns = "^3.0.0"
|
||||
aiohttp = "^3.11.11"
|
||||
aiohttp-socks = "^0.9.1"
|
||||
aiodns = ">=3,<5"
|
||||
aiohttp = "^3.12.14"
|
||||
aiohttp-socks = ">=0.10.1,<0.12.0"
|
||||
arabic-reshaper = "^3.0.0"
|
||||
async-timeout = "^5.0.1"
|
||||
attrs = "^24.3.0"
|
||||
certifi = "^2024.12.14"
|
||||
chardet = "^5.0.0"
|
||||
attrs = ">=25.3,<27.0"
|
||||
certifi = ">=2025.6.15,<2027.0.0"
|
||||
chardet = ">=5,<8"
|
||||
colorama = "^0.4.6"
|
||||
future = "^1.0.0"
|
||||
future-annotations= "^1.0.0"
|
||||
html5lib = "^1.1"
|
||||
idna = "^3.4"
|
||||
Jinja2 = "^3.1.3"
|
||||
lxml = "^5.3.0"
|
||||
Jinja2 = "^3.1.6"
|
||||
lxml = ">=6.0.2,<7.0"
|
||||
MarkupSafe = "^3.0.2"
|
||||
mock = "^5.1.0"
|
||||
multidict = "^6.0.4"
|
||||
pycountry = "^24.6.1"
|
||||
multidict = "^6.6.3"
|
||||
pycountry = ">=24.6.1,<27.0.0"
|
||||
PyPDF2 = "^3.0.1"
|
||||
PySocks = "^1.7.1"
|
||||
python-bidi = "^0.6.3"
|
||||
requests = "^2.31.0"
|
||||
requests = "^2.32.4"
|
||||
requests-futures = "^1.0.2"
|
||||
requests-toolbelt = "^1.0.0"
|
||||
six = "^1.17.0"
|
||||
socid-extractor = "^0.0.27"
|
||||
socid-extractor = ">=0.0.27,<0.0.29"
|
||||
soupsieve = "^2.6"
|
||||
stem = "^1.8.1"
|
||||
torrequest = "^0.1.0"
|
||||
alive_progress = "^3.2.0"
|
||||
typing-extensions = "^4.8.0"
|
||||
typing-extensions = "^4.14.1"
|
||||
webencodings = "^0.5.1"
|
||||
xhtml2pdf = "^0.2.11"
|
||||
XMind = "^1.2.0"
|
||||
yarl = "^1.18.3"
|
||||
yarl = "^1.20.1"
|
||||
networkx = "^2.6.3"
|
||||
pyvis = "^0.3.2"
|
||||
reportlab = "^4.2.0"
|
||||
reportlab = "^4.4.3"
|
||||
cloudscraper = "^1.2.71"
|
||||
flask = {extras = ["async"], version = "^3.1.0"}
|
||||
asgiref = "^3.8.1"
|
||||
platformdirs = "^4.3.6"
|
||||
flask = {extras = ["async"], version = "^3.1.1"}
|
||||
asgiref = "^3.9.1"
|
||||
platformdirs = "^4.3.8"
|
||||
curl-cffi = ">=0.14,<1.0"
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
# How to add a new dev dependency: poetry add black --group dev
|
||||
# Install dev dependencies with: poetry install --with dev
|
||||
flake8 = "^7.1.1"
|
||||
pytest = "^8.3.4"
|
||||
pytest-asyncio = "^0.25.0"
|
||||
pytest-cov = "^6.0.0"
|
||||
pytest = ">=8.3.4,<10.0.0"
|
||||
pytest-asyncio = "^1.0.0"
|
||||
pytest-cov = ">=6,<8"
|
||||
pytest-httpserver = "^1.0.0"
|
||||
pytest-rerunfailures = "^15.0"
|
||||
reportlab = "^4.2.0"
|
||||
mypy = "^1.13.0"
|
||||
pytest-rerunfailures = ">=15.1,<17.0"
|
||||
reportlab = "^4.4.3"
|
||||
mypy = "^1.14.1"
|
||||
tuna = "^0.5.11"
|
||||
coverage = "^7.6.9"
|
||||
black = "^24.10.0"
|
||||
coverage = "^7.9.2"
|
||||
black = ">=25.1,<27.0"
|
||||
|
||||
[tool.poetry.scripts]
|
||||
# Run with: poetry run maigret <username>
|
||||
maigret = "maigret.maigret:run"
|
||||
update_sitesmd = "utils.update_site_data:main"
|
||||
update_sitesmd = "utils.update_site_data:main"
|
||||
|
||||
+1
-1
@@ -7,7 +7,7 @@ description: |
|
||||
|
||||
Currently supported more than 3000 sites, search is launched against 500 popular sites in descending order of popularity by default. Also supported checking of Tor sites, I2P sites, and domains (via DNS resolving).
|
||||
|
||||
version: 0.5.0a1
|
||||
version: 0.5.0
|
||||
license: MIT
|
||||
base: core22
|
||||
confinement: strict
|
||||
|
||||
+37
-20
@@ -5,11 +5,13 @@ from typing import Dict, Any
|
||||
|
||||
DEFAULT_ARGS: Dict[str, Any] = {
|
||||
'all_sites': False,
|
||||
'auto_disable': False,
|
||||
'connections': 100,
|
||||
'cookie_file': None,
|
||||
'csv': False,
|
||||
'db_file': 'resources/data.json',
|
||||
'debug': False,
|
||||
'diagnose': False,
|
||||
'disable_extracting': False,
|
||||
'disable_recursive_search': False,
|
||||
'folderoutput': 'reports',
|
||||
@@ -34,6 +36,7 @@ DEFAULT_ARGS: Dict[str, Any] = {
|
||||
'site_list': [],
|
||||
'stats': False,
|
||||
'tags': '',
|
||||
'exclude_tags': '',
|
||||
'timeout': 30,
|
||||
'tor_proxy': 'socks5://127.0.0.1:9050',
|
||||
'i2p_proxy': 'http://127.0.0.1:4444',
|
||||
@@ -45,17 +48,9 @@ DEFAULT_ARGS: Dict[str, Any] = {
|
||||
'web': None,
|
||||
'with_domains': False,
|
||||
'xmind': False,
|
||||
# Mirrors maigret/resources/settings.json (flag --cloudflare-bypass overrides with True)
|
||||
'cloudflare_bypass': {
|
||||
"enabled": True,
|
||||
"modules": [
|
||||
{
|
||||
"name": "chrome_webgate",
|
||||
"method": "url_rewrite",
|
||||
"url": "http://localhost:8000/html?url={url}&retries=1"
|
||||
}
|
||||
]
|
||||
}
|
||||
'md': False,
|
||||
'no_autoupdate': False,
|
||||
'force_update': False,
|
||||
}
|
||||
|
||||
|
||||
@@ -71,15 +66,6 @@ def test_args_search_mode(argparser):
|
||||
assert getattr(args, arg) == want_args[arg]
|
||||
|
||||
|
||||
def test_args_cloudflare_bypass_flag(argparser):
|
||||
args = argparser.parse_args('--cloudflare-bypass username'.split())
|
||||
|
||||
want_args = dict(DEFAULT_ARGS)
|
||||
want_args.update({'username': ['username'], 'cloudflare_bypass': True})
|
||||
|
||||
assert args == Namespace(**want_args)
|
||||
|
||||
|
||||
def test_args_search_mode_several_usernames(argparser):
|
||||
args = argparser.parse_args('username1 username2'.split())
|
||||
|
||||
@@ -123,3 +109,34 @@ def test_args_multiple_sites(argparser):
|
||||
|
||||
for arg in vars(args):
|
||||
assert getattr(args, arg) == want_args[arg]
|
||||
|
||||
|
||||
def test_args_exclude_tags(argparser):
|
||||
args = argparser.parse_args('--exclude-tags porn,dating username'.split())
|
||||
|
||||
want_args = dict(DEFAULT_ARGS)
|
||||
want_args.update(
|
||||
{
|
||||
'exclude_tags': 'porn,dating',
|
||||
'username': ['username'],
|
||||
}
|
||||
)
|
||||
|
||||
for arg in vars(args):
|
||||
assert getattr(args, arg) == want_args[arg]
|
||||
|
||||
|
||||
def test_args_tags_with_exclude_tags(argparser):
|
||||
args = argparser.parse_args('--tags coding --exclude-tags porn username'.split())
|
||||
|
||||
want_args = dict(DEFAULT_ARGS)
|
||||
want_args.update(
|
||||
{
|
||||
'tags': 'coding',
|
||||
'exclude_tags': 'porn',
|
||||
'username': ['username'],
|
||||
}
|
||||
)
|
||||
|
||||
for arg in vars(args):
|
||||
assert getattr(args, arg) == want_args[arg]
|
||||
|
||||
@@ -4,6 +4,30 @@ import pytest
|
||||
from maigret.utils import is_country_tag
|
||||
|
||||
|
||||
TOP_SITES_ALEXA_RANK_LIMIT = 50
|
||||
|
||||
KNOWN_SOCIAL_DOMAINS = [
|
||||
"facebook.com",
|
||||
"instagram.com",
|
||||
"twitter.com",
|
||||
"tiktok.com",
|
||||
"vk.com",
|
||||
"reddit.com",
|
||||
"pinterest.com",
|
||||
"snapchat.com",
|
||||
"linkedin.com",
|
||||
"tumblr.com",
|
||||
"threads.net",
|
||||
"bsky.app",
|
||||
"myspace.com",
|
||||
"weibo.com",
|
||||
"mastodon.social",
|
||||
"gab.com",
|
||||
"minds.com",
|
||||
"clubhouse.com",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_tags_validity(default_db):
|
||||
unknown_tags = set()
|
||||
@@ -19,3 +43,62 @@ def test_tags_validity(default_db):
|
||||
# if you see "unchecked" tag error, please, do
|
||||
# maigret --db `pwd`/maigret/resources/data.json --self-check --tag unchecked --use-disabled-sites
|
||||
assert unknown_tags == set()
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_top_sites_have_category_tag(default_db):
|
||||
"""Top sites by alexaRank must have at least one category tag (not just country codes)."""
|
||||
sites_ranked = sorted(
|
||||
[s for s in default_db.sites if s.alexa_rank],
|
||||
key=lambda s: s.alexa_rank,
|
||||
)[:TOP_SITES_ALEXA_RANK_LIMIT]
|
||||
|
||||
missing_category = []
|
||||
for site in sites_ranked:
|
||||
category_tags = [t for t in site.tags if not is_country_tag(t)]
|
||||
if not category_tags:
|
||||
missing_category.append(f"{site.name} (rank {site.alexa_rank})")
|
||||
|
||||
assert missing_category == [], (
|
||||
f"{len(missing_category)} top-{TOP_SITES_ALEXA_RANK_LIMIT} sites have no category tag: "
|
||||
+ ", ".join(missing_category[:20])
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_no_unused_tags_in_registry(default_db):
|
||||
"""Every tag in the registry should be used by at least one site."""
|
||||
all_used_tags = set()
|
||||
for site in default_db.sites:
|
||||
for tag in site.tags:
|
||||
if not is_country_tag(tag):
|
||||
all_used_tags.add(tag)
|
||||
|
||||
registered_tags = set(default_db._tags)
|
||||
unused = registered_tags - all_used_tags
|
||||
|
||||
assert unused == set(), f"Tags registered but not used by any site: {unused}"
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_social_networks_have_social_tag(default_db):
|
||||
"""Known social network domains must have the 'social' tag."""
|
||||
from urllib.parse import urlparse
|
||||
|
||||
missing_social = []
|
||||
for site in default_db.sites:
|
||||
url = site.url_main or ""
|
||||
try:
|
||||
hostname = urlparse(url).hostname or ""
|
||||
except Exception:
|
||||
continue
|
||||
for domain in KNOWN_SOCIAL_DOMAINS:
|
||||
if hostname == domain or hostname.endswith("." + domain):
|
||||
if "social" not in site.tags:
|
||||
missing_social.append(f"{site.name} ({domain})")
|
||||
break
|
||||
|
||||
assert missing_social == [], (
|
||||
f"{len(missing_social)} known social networks missing 'social' tag: "
|
||||
+ ", ".join(missing_social)
|
||||
)
|
||||
|
||||
@@ -0,0 +1,236 @@
|
||||
"""Tests for the database auto-update system."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import hashlib
|
||||
from datetime import datetime, timezone, timedelta
|
||||
from unittest.mock import patch, MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from maigret.db_updater import (
|
||||
_parse_version,
|
||||
_needs_check,
|
||||
_is_version_compatible,
|
||||
_is_update_available,
|
||||
_load_state,
|
||||
_save_state,
|
||||
_best_local,
|
||||
_now_iso,
|
||||
resolve_db_path,
|
||||
force_update,
|
||||
CACHED_DB_PATH,
|
||||
BUNDLED_DB_PATH,
|
||||
STATE_PATH,
|
||||
MAIGRET_HOME,
|
||||
)
|
||||
|
||||
|
||||
def test_parse_version():
|
||||
assert _parse_version("0.5.0") == (0, 5, 0)
|
||||
assert _parse_version("1.2.3") == (1, 2, 3)
|
||||
assert _parse_version("bad") == (0, 0, 0)
|
||||
assert _parse_version("") == (0, 0, 0)
|
||||
|
||||
|
||||
def test_needs_check_no_state():
|
||||
assert _needs_check({}, 24) is True
|
||||
|
||||
|
||||
def test_needs_check_recent():
|
||||
state = {"last_check_at": _now_iso()}
|
||||
assert _needs_check(state, 24) is False
|
||||
|
||||
|
||||
def test_needs_check_expired():
|
||||
old_time = (datetime.now(timezone.utc) - timedelta(hours=25)).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
state = {"last_check_at": old_time}
|
||||
assert _needs_check(state, 24) is True
|
||||
|
||||
|
||||
def test_needs_check_corrupt():
|
||||
state = {"last_check_at": "not-a-date"}
|
||||
assert _needs_check(state, 24) is True
|
||||
|
||||
|
||||
def test_version_compatible():
|
||||
with patch("maigret.db_updater.__version__", "0.5.0"):
|
||||
assert _is_version_compatible({"min_maigret_version": "0.5.0"}) is True
|
||||
assert _is_version_compatible({"min_maigret_version": "0.4.0"}) is True
|
||||
assert _is_version_compatible({"min_maigret_version": "0.6.0"}) is False
|
||||
assert _is_version_compatible({}) is True # missing field = compatible
|
||||
|
||||
|
||||
def test_update_available_no_cache(tmp_path):
|
||||
with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "nonexistent.json")):
|
||||
assert _is_update_available({"updated_at": "2026-01-01T00:00:00Z"}, {}) is True
|
||||
|
||||
|
||||
def test_update_available_newer(tmp_path):
|
||||
cache = tmp_path / "data.json"
|
||||
cache.write_text("{}")
|
||||
with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
|
||||
state = {"last_meta": {"updated_at": "2026-01-01T00:00:00Z"}}
|
||||
meta = {"updated_at": "2026-02-01T00:00:00Z"}
|
||||
assert _is_update_available(meta, state) is True
|
||||
|
||||
|
||||
def test_update_available_same(tmp_path):
|
||||
cache = tmp_path / "data.json"
|
||||
cache.write_text("{}")
|
||||
with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
|
||||
state = {"last_meta": {"updated_at": "2026-01-01T00:00:00Z"}}
|
||||
meta = {"updated_at": "2026-01-01T00:00:00Z"}
|
||||
assert _is_update_available(meta, state) is False
|
||||
|
||||
|
||||
def test_load_state_missing(tmp_path):
|
||||
with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "missing.json")):
|
||||
assert _load_state() == {}
|
||||
|
||||
|
||||
def test_load_state_corrupt(tmp_path):
|
||||
corrupt = tmp_path / "state.json"
|
||||
corrupt.write_text("not json{{{")
|
||||
with patch("maigret.db_updater.STATE_PATH", str(corrupt)):
|
||||
assert _load_state() == {}
|
||||
|
||||
|
||||
def test_save_and_load_state(tmp_path):
|
||||
state_file = tmp_path / "state.json"
|
||||
with patch("maigret.db_updater.STATE_PATH", str(state_file)):
|
||||
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
|
||||
_save_state({"last_check_at": "2026-01-01T00:00:00Z"})
|
||||
loaded = _load_state()
|
||||
assert loaded["last_check_at"] == "2026-01-01T00:00:00Z"
|
||||
|
||||
|
||||
def test_best_local_with_valid_cache(tmp_path):
|
||||
cache = tmp_path / "data.json"
|
||||
cache.write_text('{"sites": {}, "engines": {}, "tags": []}')
|
||||
with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
|
||||
assert _best_local() == str(cache)
|
||||
|
||||
|
||||
def test_best_local_with_corrupt_cache(tmp_path):
|
||||
cache = tmp_path / "data.json"
|
||||
cache.write_text("not json")
|
||||
with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
|
||||
assert _best_local() == BUNDLED_DB_PATH
|
||||
|
||||
|
||||
def test_best_local_no_cache(tmp_path):
|
||||
with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
|
||||
assert _best_local() == BUNDLED_DB_PATH
|
||||
|
||||
|
||||
def test_resolve_db_path_custom_url():
|
||||
result = resolve_db_path("https://example.com/db.json")
|
||||
assert result == "https://example.com/db.json"
|
||||
|
||||
|
||||
def test_resolve_db_path_custom_file(tmp_path):
|
||||
custom_db = tmp_path / "custom" / "path.json"
|
||||
custom_db.parent.mkdir(parents=True)
|
||||
custom_db.write_text("{}")
|
||||
result = resolve_db_path(str(custom_db))
|
||||
assert result.endswith("custom/path.json")
|
||||
|
||||
|
||||
def test_resolve_db_path_no_autoupdate(tmp_path):
|
||||
with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
|
||||
result = resolve_db_path("resources/data.json", no_autoupdate=True)
|
||||
assert result == BUNDLED_DB_PATH
|
||||
|
||||
|
||||
def test_resolve_db_path_no_autoupdate_with_cache(tmp_path):
|
||||
cache = tmp_path / "data.json"
|
||||
cache.write_text('{"sites": {}, "engines": {}, "tags": []}')
|
||||
with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
|
||||
result = resolve_db_path("resources/data.json", no_autoupdate=True)
|
||||
assert result == str(cache)
|
||||
|
||||
|
||||
@patch("maigret.db_updater._fetch_meta")
|
||||
def test_resolve_db_path_network_failure(mock_fetch, tmp_path):
|
||||
mock_fetch.return_value = None
|
||||
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
|
||||
with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
|
||||
with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
|
||||
result = resolve_db_path("resources/data.json")
|
||||
assert result == BUNDLED_DB_PATH
|
||||
|
||||
|
||||
# --- force_update tests ---
|
||||
|
||||
|
||||
@patch("maigret.db_updater._fetch_meta")
|
||||
def test_force_update_network_failure(mock_fetch, tmp_path):
|
||||
mock_fetch.return_value = None
|
||||
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
|
||||
with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
|
||||
assert force_update() is False
|
||||
|
||||
|
||||
@patch("maigret.db_updater._fetch_meta")
|
||||
def test_force_update_incompatible_version(mock_fetch, tmp_path):
|
||||
mock_fetch.return_value = {"min_maigret_version": "99.0.0", "sites_count": 100}
|
||||
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
|
||||
with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
|
||||
assert force_update() is False
|
||||
|
||||
|
||||
@patch("maigret.db_updater._download_and_verify")
|
||||
@patch("maigret.db_updater._fetch_meta")
|
||||
def test_force_update_success(mock_fetch, mock_download, tmp_path):
|
||||
mock_fetch.return_value = {
|
||||
"min_maigret_version": "0.1.0",
|
||||
"sites_count": 3200,
|
||||
"updated_at": "2099-01-01T00:00:00Z",
|
||||
"data_url": "https://example.com/data.json",
|
||||
"data_sha256": "abc123",
|
||||
}
|
||||
mock_download.return_value = str(tmp_path / "data.json")
|
||||
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
|
||||
with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
|
||||
with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
|
||||
assert force_update() is True
|
||||
state = _load_state()
|
||||
assert state["last_meta"]["sites_count"] == 3200
|
||||
|
||||
|
||||
@patch("maigret.db_updater._fetch_meta")
|
||||
def test_force_update_already_up_to_date(mock_fetch, tmp_path):
|
||||
cache = tmp_path / "data.json"
|
||||
cache.write_text('{"sites": {}, "engines": {}, "tags": []}')
|
||||
state_file = tmp_path / "state.json"
|
||||
state_file.write_text(json.dumps({
|
||||
"last_check_at": _now_iso(),
|
||||
"last_meta": {"updated_at": "2026-01-01T00:00:00Z", "sites_count": 3000},
|
||||
}))
|
||||
mock_fetch.return_value = {
|
||||
"min_maigret_version": "0.1.0",
|
||||
"sites_count": 3000,
|
||||
"updated_at": "2026-01-01T00:00:00Z",
|
||||
}
|
||||
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
|
||||
with patch("maigret.db_updater.STATE_PATH", str(state_file)):
|
||||
with patch("maigret.db_updater.CACHED_DB_PATH", str(cache)):
|
||||
assert force_update() is False
|
||||
|
||||
|
||||
@patch("maigret.db_updater._download_and_verify")
|
||||
@patch("maigret.db_updater._fetch_meta")
|
||||
def test_force_update_download_fails(mock_fetch, mock_download, tmp_path):
|
||||
mock_fetch.return_value = {
|
||||
"min_maigret_version": "0.1.0",
|
||||
"sites_count": 3200,
|
||||
"updated_at": "2099-01-01T00:00:00Z",
|
||||
"data_url": "https://example.com/data.json",
|
||||
"data_sha256": "abc123",
|
||||
}
|
||||
mock_download.return_value = None
|
||||
with patch("maigret.db_updater.MAIGRET_HOME", str(tmp_path)):
|
||||
with patch("maigret.db_updater.STATE_PATH", str(tmp_path / "state.json")):
|
||||
with patch("maigret.db_updater.CACHED_DB_PATH", str(tmp_path / "missing.json")):
|
||||
assert force_update() is False
|
||||
@@ -36,7 +36,7 @@ def test_notify_about_errors():
|
||||
},
|
||||
}
|
||||
|
||||
results = notify_about_errors(results, query_notify=None, show_statistics=True)
|
||||
notifications = notify_about_errors(results, query_notify=None, show_statistics=True)
|
||||
|
||||
# Check the output
|
||||
expected_output = [
|
||||
@@ -55,4 +55,4 @@ def test_notify_about_errors():
|
||||
('Access denied: 25.0%', '!'),
|
||||
('You can see detailed site check errors with a flag `--print-errors`', '-'),
|
||||
]
|
||||
assert results == expected_output
|
||||
assert notifications == expected_output
|
||||
|
||||
+11
-10
@@ -3,6 +3,7 @@
|
||||
import pytest
|
||||
import asyncio
|
||||
import logging
|
||||
from typing import Any, List, Tuple, Callable, Dict
|
||||
from maigret.executors import (
|
||||
AsyncioSimpleExecutor,
|
||||
AsyncioProgressbarExecutor,
|
||||
@@ -21,7 +22,7 @@ async def func(n):
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_simple_asyncio_executor():
|
||||
tasks = [(func, [n], {}) for n in range(10)]
|
||||
tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]
|
||||
executor = AsyncioSimpleExecutor(logger=logger)
|
||||
assert await executor.run(tasks) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
|
||||
assert executor.execution_time > 0.2
|
||||
@@ -30,18 +31,18 @@ async def test_simple_asyncio_executor():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_asyncio_progressbar_executor():
|
||||
tasks = [(func, [n], {}) for n in range(10)]
|
||||
tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]
|
||||
|
||||
executor = AsyncioProgressbarExecutor(logger=logger)
|
||||
# no guarantees for the results order
|
||||
assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
|
||||
assert executor.execution_time > 0.2
|
||||
assert executor.execution_time < 0.6
|
||||
assert executor.execution_time < 0.3
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_asyncio_progressbar_semaphore_executor():
|
||||
tasks = [(func, [n], {}) for n in range(10)]
|
||||
tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]
|
||||
|
||||
executor = AsyncioProgressbarSemaphoreExecutor(logger=logger, in_parallel=5)
|
||||
# no guarantees for the results order
|
||||
@@ -53,7 +54,7 @@ async def test_asyncio_progressbar_semaphore_executor():
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.asyncio
|
||||
async def test_asyncio_progressbar_queue_executor():
|
||||
tasks = [(func, [n], {}) for n in range(10)]
|
||||
tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]
|
||||
|
||||
executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=2)
|
||||
assert await executor.run(tasks) == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8]
|
||||
@@ -81,22 +82,22 @@ async def test_asyncio_progressbar_queue_executor():
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_asyncio_queue_generator_executor():
|
||||
tasks = [(func, [n], {}) for n in range(10)]
|
||||
tasks: List[Tuple[Callable, list, dict]] = [(func, [n], {}) for n in range(10)]
|
||||
|
||||
executor = AsyncioQueueGeneratorExecutor(logger=logger, in_parallel=2)
|
||||
results = [result async for result in executor.run(tasks)]
|
||||
results = [result async for result in executor.run(tasks)] # type: ignore[arg-type]
|
||||
assert results == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8]
|
||||
assert executor.execution_time > 0.5
|
||||
assert executor.execution_time < 0.6
|
||||
|
||||
executor = AsyncioQueueGeneratorExecutor(logger=logger, in_parallel=3)
|
||||
results = [result async for result in executor.run(tasks)]
|
||||
results = [result async for result in executor.run(tasks)] # type: ignore[arg-type]
|
||||
assert results == [0, 3, 1, 4, 6, 2, 7, 9, 5, 8]
|
||||
assert executor.execution_time > 0.4
|
||||
assert executor.execution_time < 0.5
|
||||
|
||||
executor = AsyncioQueueGeneratorExecutor(logger=logger, in_parallel=5)
|
||||
results = [result async for result in executor.run(tasks)]
|
||||
results = [result async for result in executor.run(tasks)] # type: ignore[arg-type]
|
||||
assert results in (
|
||||
[0, 3, 6, 1, 4, 7, 9, 2, 5, 8],
|
||||
[0, 3, 6, 1, 4, 9, 7, 2, 5, 8],
|
||||
@@ -105,7 +106,7 @@ async def test_asyncio_queue_generator_executor():
|
||||
assert executor.execution_time < 0.4
|
||||
|
||||
executor = AsyncioQueueGeneratorExecutor(logger=logger, in_parallel=10)
|
||||
results = [result async for result in executor.run(tasks)]
|
||||
results = [result async for result in executor.run(tasks)] # type: ignore[arg-type]
|
||||
assert results == [0, 3, 6, 9, 1, 4, 7, 2, 5, 8]
|
||||
assert executor.execution_time > 0.2
|
||||
assert executor.execution_time < 0.3
|
||||
|
||||
+87
-3
@@ -2,6 +2,7 @@
|
||||
|
||||
import asyncio
|
||||
import copy
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from mock import Mock
|
||||
@@ -11,7 +12,8 @@ from maigret.maigret import (
|
||||
extract_ids_from_page,
|
||||
extract_ids_from_results,
|
||||
)
|
||||
from maigret.sites import MaigretSite
|
||||
from maigret.checking import site_self_check
|
||||
from maigret.sites import MaigretSite, MaigretDatabase
|
||||
from maigret.result import MaigretCheckResult, MaigretCheckStatus
|
||||
from tests.conftest import RESULTS_EXAMPLE
|
||||
|
||||
@@ -27,7 +29,9 @@ async def test_self_check_db(test_db):
|
||||
assert test_db.sites_dict['ValidActive'].disabled is False
|
||||
assert test_db.sites_dict['InvalidInactive'].disabled is True
|
||||
|
||||
await self_check(test_db, test_db.sites_dict, logger, silent=False)
|
||||
await self_check(
|
||||
test_db, test_db.sites_dict, logger, silent=False, auto_disable=True
|
||||
)
|
||||
|
||||
assert test_db.sites_dict['InvalidActive'].disabled is True
|
||||
assert test_db.sites_dict['ValidInactive'].disabled is False
|
||||
@@ -35,6 +39,86 @@ async def test_self_check_db(test_db):
|
||||
assert test_db.sites_dict['InvalidInactive'].disabled is True
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.asyncio
|
||||
async def test_self_check_no_progressbar(test_db):
|
||||
"""Verify that no_progressbar=True disables the alive_bar in self_check."""
|
||||
logger = Mock()
|
||||
|
||||
with patch('maigret.checking.alive_bar') as mock_alive_bar:
|
||||
mock_bar = Mock()
|
||||
mock_alive_bar.return_value.__enter__ = Mock(return_value=mock_bar)
|
||||
mock_alive_bar.return_value.__exit__ = Mock(return_value=False)
|
||||
|
||||
await self_check(
|
||||
test_db, test_db.sites_dict, logger, silent=True,
|
||||
no_progressbar=True,
|
||||
)
|
||||
|
||||
# First call is the self-check progress bar; subsequent calls are
|
||||
# from inner search() invocations.
|
||||
self_check_call = mock_alive_bar.call_args_list[0]
|
||||
_, kwargs = self_check_call
|
||||
assert kwargs.get('title') == 'Self-checking'
|
||||
assert kwargs.get('disable') is True
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.asyncio
|
||||
async def test_self_check_progressbar_enabled_by_default(test_db):
|
||||
"""Verify that alive_bar is enabled by default (no_progressbar=False)."""
|
||||
logger = Mock()
|
||||
|
||||
with patch('maigret.checking.alive_bar') as mock_alive_bar:
|
||||
mock_bar = Mock()
|
||||
mock_alive_bar.return_value.__enter__ = Mock(return_value=mock_bar)
|
||||
mock_alive_bar.return_value.__exit__ = Mock(return_value=False)
|
||||
|
||||
await self_check(
|
||||
test_db, test_db.sites_dict, logger, silent=True,
|
||||
)
|
||||
|
||||
self_check_call = mock_alive_bar.call_args_list[0]
|
||||
_, kwargs = self_check_call
|
||||
assert kwargs.get('title') == 'Self-checking'
|
||||
assert kwargs.get('disable') is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_site_self_check_handles_exception(test_db):
|
||||
"""Verify that site_self_check catches unexpected exceptions and returns a valid result."""
|
||||
logger = Mock()
|
||||
sem = asyncio.Semaphore(1)
|
||||
site = test_db.sites_dict['ValidActive']
|
||||
|
||||
with patch('maigret.checking.maigret', side_effect=RuntimeError("test crash")):
|
||||
result = await site_self_check(site, logger, sem, test_db)
|
||||
|
||||
assert isinstance(result, dict)
|
||||
assert "issues" in result
|
||||
assert len(result["issues"]) > 0
|
||||
assert any("Unexpected error" in issue for issue in result["issues"])
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_self_check_handles_task_exception(test_db):
|
||||
"""Verify that self_check continues when individual site checks raise exceptions."""
|
||||
logger = Mock()
|
||||
|
||||
with patch('maigret.checking.maigret', side_effect=RuntimeError("test crash")):
|
||||
result = await self_check(
|
||||
test_db, test_db.sites_dict, logger, silent=True,
|
||||
no_progressbar=True,
|
||||
)
|
||||
|
||||
assert isinstance(result, dict)
|
||||
assert 'results' in result
|
||||
assert len(result['results']) == len(test_db.sites_dict)
|
||||
for r in result['results']:
|
||||
assert 'site_name' in r
|
||||
assert 'issues' in r
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.skip(reason="broken, fixme")
|
||||
def test_maigret_results(test_db):
|
||||
@@ -110,7 +194,7 @@ def test_extract_ids_from_page(test_db):
|
||||
|
||||
|
||||
def test_extract_ids_from_results(test_db):
|
||||
TEST_EXAMPLE = copy.deepcopy(RESULTS_EXAMPLE)
|
||||
TEST_EXAMPLE: dict = copy.deepcopy(RESULTS_EXAMPLE)
|
||||
TEST_EXAMPLE['Reddit']['ids_usernames'] = {'test1': 'yandex_public_id'}
|
||||
TEST_EXAMPLE['Reddit']['ids_links'] = ['https://www.reddit.com/user/test2']
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ import os
|
||||
import pytest
|
||||
from io import StringIO
|
||||
|
||||
import xmind
|
||||
import xmind # type: ignore[import-untyped]
|
||||
from jinja2 import Template
|
||||
|
||||
from maigret.report import (
|
||||
|
||||
@@ -0,0 +1,53 @@
|
||||
import unittest
|
||||
from unittest.mock import patch, mock_open
|
||||
|
||||
from maigret.settings import Settings
|
||||
|
||||
|
||||
class TestSettings(unittest.TestCase):
|
||||
@patch('json.load')
|
||||
@patch('builtins.open', new_callable=mock_open)
|
||||
def test_settings_cascade_and_override(self, mock_file, mock_json_load):
|
||||
file1_data = {"timeout": 10, "retries_count": 3, "proxy_url": "http://proxy1"}
|
||||
file2_data = {"timeout": 20, "recursive_search": True}
|
||||
file3_data = {"proxy_url": "http://proxy3", "print_not_found": False}
|
||||
|
||||
mock_json_load.side_effect = [file1_data, file2_data, file3_data]
|
||||
|
||||
settings = Settings()
|
||||
paths = ['file1.json', 'file2.json', 'file3.json']
|
||||
|
||||
was_inited, msg = settings.load(paths)
|
||||
|
||||
self.assertTrue(was_inited)
|
||||
self.assertEqual(settings.retries_count, 3)
|
||||
self.assertEqual(settings.timeout, 20)
|
||||
self.assertTrue(settings.recursive_search)
|
||||
self.assertEqual(settings.proxy_url, "http://proxy3")
|
||||
self.assertFalse(settings.print_not_found)
|
||||
|
||||
@patch('builtins.open')
|
||||
def test_settings_file_not_found(self, mock_open_func):
|
||||
mock_open_func.side_effect = FileNotFoundError()
|
||||
|
||||
settings = Settings()
|
||||
paths = ['nonexistent.json']
|
||||
|
||||
was_inited, msg = settings.load(paths)
|
||||
|
||||
self.assertFalse(was_inited)
|
||||
self.assertIn('None of the default settings files found', msg)
|
||||
|
||||
@patch('json.load')
|
||||
@patch('builtins.open', new_callable=mock_open)
|
||||
def test_settings_invalid_json(self, mock_file, mock_json_load):
|
||||
mock_json_load.side_effect = ValueError("Expecting value")
|
||||
|
||||
settings = Settings()
|
||||
paths = ['invalid.json']
|
||||
|
||||
was_inited, msg = settings.load(paths)
|
||||
|
||||
self.assertFalse(was_inited)
|
||||
self.assertIsInstance(msg, ValueError)
|
||||
self.assertIn('Problem with parsing json contents', str(msg))
|
||||
+94
-1
@@ -1,8 +1,10 @@
|
||||
"""Maigret Database test functions"""
|
||||
|
||||
from typing import Any, Dict
|
||||
|
||||
from maigret.sites import MaigretDatabase, MaigretSite
|
||||
|
||||
EXAMPLE_DB = {
|
||||
EXAMPLE_DB: Dict[str, Any] = {
|
||||
'engines': {
|
||||
"XenForo": {
|
||||
"presenseStrs": ["XenForo"],
|
||||
@@ -182,6 +184,97 @@ def test_ranked_sites_dict_id_type():
|
||||
assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
|
||||
|
||||
|
||||
def test_ranked_sites_dict_excluded_tags():
|
||||
db = MaigretDatabase()
|
||||
db.update_site(MaigretSite('3', {'alexaRank': 1000, 'engine': 'ucoz'}))
|
||||
db.update_site(MaigretSite('1', {'alexaRank': 2, 'tags': ['forum']}))
|
||||
db.update_site(MaigretSite('2', {'alexaRank': 10, 'tags': ['ru', 'forum']}))
|
||||
|
||||
# excluding by tag
|
||||
assert list(db.ranked_sites_dict(excluded_tags=['ru']).keys()) == ['1', '3']
|
||||
assert list(db.ranked_sites_dict(excluded_tags=['forum']).keys()) == ['3']
|
||||
|
||||
# excluding by engine
|
||||
assert list(db.ranked_sites_dict(excluded_tags=['ucoz']).keys()) == ['1', '2']
|
||||
|
||||
# combining include and exclude tags
|
||||
assert list(db.ranked_sites_dict(tags=['forum'], excluded_tags=['ru']).keys()) == ['1']
|
||||
|
||||
# excluding non-existent tag has no effect
|
||||
assert list(db.ranked_sites_dict(excluded_tags=['nonexistent']).keys()) == ['1', '2', '3']
|
||||
|
||||
# exclude all
|
||||
assert list(db.ranked_sites_dict(excluded_tags=['forum', 'ucoz']).keys()) == []
|
||||
|
||||
|
||||
def test_ranked_sites_dict_excluded_tags_with_top():
|
||||
"""Excluded tags should also prevent mirrors from being included."""
|
||||
db = MaigretDatabase()
|
||||
db.update_site(
|
||||
MaigretSite('Parent', {'alexaRank': 1, 'tags': ['forum'], 'type': 'username'})
|
||||
)
|
||||
db.update_site(
|
||||
MaigretSite('Mirror', {'alexaRank': 999999, 'source': 'Parent', 'tags': ['forum'], 'type': 'username'})
|
||||
)
|
||||
db.update_site(
|
||||
MaigretSite('Other', {'alexaRank': 2, 'tags': ['coding'], 'type': 'username'})
|
||||
)
|
||||
|
||||
# Without exclusion, mirror should be included
|
||||
result = db.ranked_sites_dict(top=1, id_type='username')
|
||||
assert 'Parent' in result
|
||||
assert 'Mirror' in result
|
||||
|
||||
# With exclusion of 'forum', both Parent and Mirror should be excluded
|
||||
result = db.ranked_sites_dict(top=2, excluded_tags=['forum'], id_type='username')
|
||||
assert 'Parent' not in result
|
||||
assert 'Mirror' not in result
|
||||
assert 'Other' in result
|
||||
|
||||
|
||||
def test_ranked_sites_dict_mirrors_disabled_parent():
|
||||
"""Mirror is included when parent ranks in top N but parent is disabled."""
|
||||
db = MaigretDatabase()
|
||||
db.update_site(
|
||||
MaigretSite(
|
||||
'ParentPlatform',
|
||||
{'alexaRank': 5, 'disabled': True, 'type': 'username'},
|
||||
)
|
||||
)
|
||||
db.update_site(
|
||||
MaigretSite(
|
||||
'OtherSite',
|
||||
{'alexaRank': 100, 'type': 'username'},
|
||||
)
|
||||
)
|
||||
db.update_site(
|
||||
MaigretSite(
|
||||
'MirrorSite',
|
||||
{
|
||||
'alexaRank': 99999999,
|
||||
'source': 'ParentPlatform',
|
||||
'type': 'username',
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
result = db.ranked_sites_dict(top=1, disabled=False, id_type='username')
|
||||
assert list(result.keys()) == ['OtherSite', 'MirrorSite']
|
||||
|
||||
|
||||
def test_ranked_sites_dict_mirrors_no_extra_without_parent_in_top():
|
||||
db = MaigretDatabase()
|
||||
db.update_site(MaigretSite('A', {'alexaRank': 1, 'type': 'username'}))
|
||||
db.update_site(
|
||||
MaigretSite(
|
||||
'B',
|
||||
{'alexaRank': 2, 'source': 'NotInDb', 'type': 'username'},
|
||||
)
|
||||
)
|
||||
|
||||
assert list(db.ranked_sites_dict(top=1, id_type='username').keys()) == ['A']
|
||||
|
||||
|
||||
def test_get_url_template():
|
||||
site = MaigretSite(
|
||||
"test",
|
||||
|
||||
+87
-4
@@ -1,8 +1,10 @@
|
||||
import re
|
||||
|
||||
import pytest
|
||||
from unittest.mock import MagicMock, patch
|
||||
from maigret.submit import Submitter
|
||||
from aiohttp import ClientSession
|
||||
from maigret.sites import MaigretDatabase
|
||||
from maigret.sites import MaigretDatabase, MaigretSite
|
||||
import logging
|
||||
|
||||
|
||||
@@ -26,7 +28,7 @@ async def test_detect_known_engine(test_db, local_test_db):
|
||||
url_exists = "https://devforum.zoom.us/u/adam"
|
||||
url_mainpage = "https://devforum.zoom.us/"
|
||||
# Mock extract_username_dialog to return "adam"
|
||||
submitter.extract_username_dialog = MagicMock(return_value="adam")
|
||||
submitter.extract_username_dialog = MagicMock(return_value="adam") # type: ignore[method-assign]
|
||||
|
||||
sites, resp_text = await submitter.detect_known_engine(
|
||||
url_exists, url_mainpage, session=None, follow_redirects=False, headers=None
|
||||
@@ -109,7 +111,7 @@ async def test_check_features_manually_success(settings):
|
||||
|
||||
@pytest.mark.slow
|
||||
@pytest.mark.asyncio
|
||||
async def test_check_features_manually_success(settings):
|
||||
async def test_check_features_manually_cloudflare(settings):
|
||||
# Setup
|
||||
db = MaigretDatabase()
|
||||
logger = logging.getLogger("test_logger")
|
||||
@@ -271,7 +273,88 @@ async def test_dialog_adds_site_negative(settings):
|
||||
]
|
||||
|
||||
with patch('builtins.input', side_effect=user_inputs):
|
||||
result = await submitter.dialog("https://icq.im/sokrat", None)
|
||||
result = await submitter.dialog("https://icq.com/sokrat", None)
|
||||
await submitter.close()
|
||||
|
||||
assert result is False
|
||||
|
||||
|
||||
def test_domain_matching_exact():
|
||||
"""Test that domain matching uses proper boundary checks, not substring matching.
|
||||
|
||||
x.com should NOT match sites like 500px.com, mix.com, etc.
|
||||
"""
|
||||
domain_raw = "x.com"
|
||||
domain_re = re.compile(
|
||||
r'://(www\.)?' + re.escape(domain_raw) + r'(/|$)'
|
||||
)
|
||||
|
||||
# These should NOT match x.com
|
||||
non_matching = [
|
||||
MaigretSite("500px", {"url": "https://500px.com/p/{username}", "urlMain": "https://500px.com/"}),
|
||||
MaigretSite("Mix", {"url": "https://mix.com/{username}", "urlMain": "https://mix.com"}),
|
||||
MaigretSite("Screwfix", {"url": "{urlMain}{urlSubpath}/members/?username={username}", "urlMain": "https://community.screwfix.com"}),
|
||||
MaigretSite("Wix", {"url": "https://{username}.wix.com", "urlMain": "https://wix.com/"}),
|
||||
MaigretSite("1x", {"url": "https://1x.com/{username}", "urlMain": "https://1x.com"}),
|
||||
MaigretSite("Roblox", {"url": "https://www.roblox.com/user.aspx?username={username}", "urlMain": "https://www.roblox.com/"}),
|
||||
]
|
||||
|
||||
for site in non_matching:
|
||||
assert not domain_re.search(site.url_main + site.url), \
|
||||
f"x.com should NOT match site {site.name} ({site.url_main})"
|
||||
|
||||
|
||||
def test_domain_matching_positive():
|
||||
"""Test that domain matching correctly matches the exact domain."""
|
||||
domain_raw = "x.com"
|
||||
domain_re = re.compile(
|
||||
r'://(www\.)?' + re.escape(domain_raw) + r'(/|$)'
|
||||
)
|
||||
|
||||
# These SHOULD match x.com
|
||||
matching = [
|
||||
MaigretSite("X", {"url": "https://x.com/{username}", "urlMain": "https://x.com"}),
|
||||
MaigretSite("X-www", {"url": "https://www.x.com/{username}", "urlMain": "https://www.x.com"}),
|
||||
]
|
||||
|
||||
for site in matching:
|
||||
assert domain_re.search(site.url_main + site.url), \
|
||||
f"x.com SHOULD match site {site.name} ({site.url_main})"
|
||||
|
||||
|
||||
def test_dialog_nonexistent_site_name_no_crash():
|
||||
"""Test that entering a site name not in the matched list doesn't crash.
|
||||
|
||||
This tests the fix for: AttributeError: 'NoneType' object has no attribute 'name'
|
||||
The old_site should be None when user enters a name not in matched_sites,
|
||||
and the code should handle it gracefully.
|
||||
"""
|
||||
# Simulate the logic that was crashing
|
||||
matched_sites = [
|
||||
MaigretSite("ValidActive", {"url": "https://example.com/{username}", "urlMain": "https://example.com"}),
|
||||
MaigretSite("InvalidActive", {"url": "https://example.com/alt/{username}", "urlMain": "https://example.com"}),
|
||||
]
|
||||
site_name = "NonExistentSite"
|
||||
|
||||
old_site = next(
|
||||
(site for site in matched_sites if site.name == site_name), None
|
||||
)
|
||||
|
||||
# This is what the old code did - it would crash here
|
||||
assert old_site is None
|
||||
|
||||
# The fix: check before accessing .name
|
||||
if old_site is None:
|
||||
result = "not found"
|
||||
else:
|
||||
result = old_site.name
|
||||
|
||||
assert result == "not found"
|
||||
|
||||
# And when site_name IS in matched_sites, it should work
|
||||
site_name = "ValidActive"
|
||||
old_site = next(
|
||||
(site for site in matched_sites if site.name == site_name), None
|
||||
)
|
||||
assert old_site is not None
|
||||
assert old_site.name == "ValidActive"
|
||||
|
||||
@@ -0,0 +1,63 @@
|
||||
"""Tests for the Twitter / X site entry and GraphQL probe."""
|
||||
|
||||
import re
|
||||
|
||||
import pytest
|
||||
import requests
|
||||
|
||||
from maigret.sites import MaigretSite
|
||||
|
||||
|
||||
def _twitter_site(site: MaigretSite) -> None:
|
||||
assert site.name == "Twitter"
|
||||
assert site.disabled is False
|
||||
assert site.check_type == "message"
|
||||
assert site.url_probe and "{username}" in site.url_probe
|
||||
assert "UserByScreenName" in site.url_probe or "graphql" in site.url_probe
|
||||
assert site.regex_check
|
||||
assert re.fullmatch(site.regex_check, site.username_claimed)
|
||||
assert re.fullmatch(site.regex_check, site.username_unclaimed)
|
||||
assert site.absence_strs
|
||||
assert site.activation.get("method") == "twitter"
|
||||
assert site.activation.get("url")
|
||||
assert "authorization" in {k.lower() for k in site.headers.keys()}
|
||||
|
||||
|
||||
def test_twitter_site_entry_config(default_db):
|
||||
"""Twitter entry in data.json must define probe URL, regex, and activation."""
|
||||
site = default_db.sites_dict["Twitter"]
|
||||
assert isinstance(site, MaigretSite)
|
||||
_twitter_site(site)
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_twitter_graphql_probe_claimed_vs_unclaimed(default_db):
|
||||
"""
|
||||
Live check: guest activation + UserByScreenName GraphQL returns a user for
|
||||
usernameClaimed and no user for usernameUnclaimed (same flow as urlProbe).
|
||||
"""
|
||||
site = default_db.sites_dict["Twitter"]
|
||||
_twitter_site(site)
|
||||
|
||||
headers = dict(site.headers)
|
||||
headers.pop("x-guest-token", None)
|
||||
|
||||
act = requests.post(site.activation["url"], headers=headers, timeout=45)
|
||||
assert act.status_code == 200, act.text[:500]
|
||||
body = act.json()
|
||||
assert "guest_token" in body
|
||||
headers["x-guest-token"] = body["guest_token"]
|
||||
|
||||
def fetch(username: str) -> dict:
|
||||
url = site.url_probe.format(username=username)
|
||||
resp = requests.get(url, headers=headers, timeout=45)
|
||||
resp.raise_for_status()
|
||||
return resp.json()
|
||||
|
||||
claimed_json = fetch(site.username_claimed)
|
||||
assert "data" in claimed_json
|
||||
assert claimed_json["data"].get("user") is not None
|
||||
|
||||
unclaimed_json = fetch(site.username_unclaimed)
|
||||
data = unclaimed_json.get("data") or {}
|
||||
assert data == {} or data.get("user") is None
|
||||
@@ -0,0 +1,480 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Mass site checking utility for Maigret development.
|
||||
Check top-N sites from data.json and generate a report.
|
||||
|
||||
Usage:
|
||||
python utils/check_top_n.py --top 100 # Check top 100 sites
|
||||
python utils/check_top_n.py --top 50 --parallel 10 # Check with 10 parallel requests
|
||||
python utils/check_top_n.py --top 100 --output report.json
|
||||
python utils/check_top_n.py --top 100 --fix # Auto-fix simple issues
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import time
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
# Add parent dir for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
try:
|
||||
import aiohttp
|
||||
except ImportError:
|
||||
print("aiohttp not installed. Run: pip install aiohttp")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
class Colors:
|
||||
RED = "\033[91m"
|
||||
GREEN = "\033[92m"
|
||||
YELLOW = "\033[93m"
|
||||
BLUE = "\033[94m"
|
||||
CYAN = "\033[96m"
|
||||
RESET = "\033[0m"
|
||||
BOLD = "\033[1m"
|
||||
|
||||
|
||||
def color(text: str, c: str) -> str:
|
||||
return f"{c}{text}{Colors.RESET}"
|
||||
|
||||
|
||||
@dataclass
|
||||
class SiteCheckResult:
|
||||
"""Result of checking a single site."""
|
||||
site_name: str
|
||||
alexa_rank: int
|
||||
disabled: bool
|
||||
check_type: str
|
||||
|
||||
# Status
|
||||
status: str = "unknown" # working, broken, timeout, error, anti_bot, disabled
|
||||
|
||||
# HTTP results
|
||||
claimed_http_status: Optional[int] = None
|
||||
unclaimed_http_status: Optional[int] = None
|
||||
claimed_error: Optional[str] = None
|
||||
unclaimed_error: Optional[str] = None
|
||||
|
||||
# Issues detected
|
||||
issues: List[str] = field(default_factory=list)
|
||||
warnings: List[str] = field(default_factory=list)
|
||||
|
||||
# Recommendations
|
||||
recommendations: List[str] = field(default_factory=list)
|
||||
|
||||
# Timing
|
||||
check_time_ms: int = 0
|
||||
|
||||
|
||||
DEFAULT_HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
}
|
||||
|
||||
|
||||
async def check_url(url: str, headers: dict, timeout: int = 15) -> dict:
|
||||
"""Quick URL check returning status and basic info."""
|
||||
result = {
|
||||
"status": None,
|
||||
"final_url": None,
|
||||
"content_length": 0,
|
||||
"error": None,
|
||||
"error_type": None,
|
||||
"content": None,
|
||||
"markers": {},
|
||||
}
|
||||
|
||||
try:
|
||||
connector = aiohttp.TCPConnector(ssl=False)
|
||||
timeout_obj = aiohttp.ClientTimeout(total=timeout)
|
||||
|
||||
async with aiohttp.ClientSession(connector=connector, timeout=timeout_obj) as session:
|
||||
async with session.get(url, headers=headers, allow_redirects=True) as resp:
|
||||
result["status"] = resp.status
|
||||
result["final_url"] = str(resp.url)
|
||||
|
||||
try:
|
||||
text = await resp.text()
|
||||
result["content_length"] = len(text)
|
||||
result["content"] = text
|
||||
|
||||
text_lower = text.lower()
|
||||
result["markers"] = {
|
||||
"404_text": any(m in text_lower for m in ["not found", "404", "doesn't exist"]),
|
||||
"captcha": any(m in text_lower for m in ["captcha", "recaptcha", "challenge"]),
|
||||
"cloudflare": "cloudflare" in text_lower,
|
||||
"login": any(m in text_lower for m in ["log in", "login", "sign in"]),
|
||||
}
|
||||
except Exception as e:
|
||||
result["error"] = f"Content error: {e}"
|
||||
result["error_type"] = "content"
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
result["error"] = "Timeout"
|
||||
result["error_type"] = "timeout"
|
||||
except aiohttp.ClientError as e:
|
||||
result["error"] = str(e)
|
||||
result["error_type"] = "client"
|
||||
except Exception as e:
|
||||
result["error"] = str(e)
|
||||
result["error_type"] = "unknown"
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def check_site(site_name: str, config: dict, timeout: int = 15) -> SiteCheckResult:
|
||||
"""Check a single site and return detailed result."""
|
||||
start_time = time.time()
|
||||
|
||||
result = SiteCheckResult(
|
||||
site_name=site_name,
|
||||
alexa_rank=config.get("alexaRank", 999999),
|
||||
disabled=config.get("disabled", False),
|
||||
check_type=config.get("checkType", "status_code"),
|
||||
)
|
||||
|
||||
# Skip disabled sites
|
||||
if result.disabled:
|
||||
result.status = "disabled"
|
||||
return result
|
||||
|
||||
# Build URL
|
||||
url_template = config.get("url", "")
|
||||
url_main = config.get("urlMain", "")
|
||||
url_subpath = config.get("urlSubpath", "")
|
||||
url_template = url_template.replace("{urlMain}", url_main).replace("{urlSubpath}", url_subpath)
|
||||
|
||||
claimed = config.get("usernameClaimed")
|
||||
unclaimed = config.get("usernameUnclaimed", "noonewouldeverusethis7")
|
||||
|
||||
if not claimed:
|
||||
result.status = "error"
|
||||
result.issues.append("No usernameClaimed defined")
|
||||
return result
|
||||
|
||||
# Prepare headers
|
||||
headers = DEFAULT_HEADERS.copy()
|
||||
if config.get("headers"):
|
||||
headers.update(config["headers"])
|
||||
|
||||
# Check both URLs
|
||||
url_claimed = url_template.replace("{username}", claimed)
|
||||
url_unclaimed = url_template.replace("{username}", unclaimed)
|
||||
|
||||
try:
|
||||
claimed_result, unclaimed_result = await asyncio.gather(
|
||||
check_url(url_claimed, headers, timeout),
|
||||
check_url(url_unclaimed, headers, timeout),
|
||||
)
|
||||
except Exception as e:
|
||||
result.status = "error"
|
||||
result.issues.append(f"Check failed: {e}")
|
||||
return result
|
||||
|
||||
result.claimed_http_status = claimed_result["status"]
|
||||
result.unclaimed_http_status = unclaimed_result["status"]
|
||||
result.claimed_error = claimed_result.get("error")
|
||||
result.unclaimed_error = unclaimed_result.get("error")
|
||||
|
||||
# Categorize result
|
||||
if claimed_result["error_type"] == "timeout" or unclaimed_result["error_type"] == "timeout":
|
||||
result.status = "timeout"
|
||||
result.issues.append("Request timeout")
|
||||
|
||||
elif claimed_result["status"] == 403 or claimed_result["status"] == 429:
|
||||
result.status = "anti_bot"
|
||||
result.issues.append(f"Anti-bot protection (HTTP {claimed_result['status']})")
|
||||
|
||||
elif claimed_result.get("markers", {}).get("captcha"):
|
||||
result.status = "anti_bot"
|
||||
result.issues.append("Captcha detected")
|
||||
|
||||
elif claimed_result.get("markers", {}).get("cloudflare"):
|
||||
result.status = "anti_bot"
|
||||
result.warnings.append("Cloudflare protection detected")
|
||||
|
||||
elif claimed_result["error"] or unclaimed_result["error"]:
|
||||
result.status = "error"
|
||||
if claimed_result["error"]:
|
||||
result.issues.append(f"Claimed error: {claimed_result['error']}")
|
||||
if unclaimed_result["error"]:
|
||||
result.issues.append(f"Unclaimed error: {unclaimed_result['error']}")
|
||||
|
||||
else:
|
||||
# Validate check type
|
||||
check_type = config.get("checkType", "status_code")
|
||||
|
||||
if check_type == "status_code":
|
||||
if claimed_result["status"] == unclaimed_result["status"]:
|
||||
result.status = "broken"
|
||||
result.issues.append(f"Same status code ({claimed_result['status']}) for both")
|
||||
# Suggest fix
|
||||
if claimed_result["final_url"] != unclaimed_result["final_url"]:
|
||||
result.recommendations.append("Switch to checkType: response_url")
|
||||
else:
|
||||
result.status = "working"
|
||||
|
||||
elif check_type == "response_url":
|
||||
if claimed_result["final_url"] == unclaimed_result["final_url"]:
|
||||
result.status = "broken"
|
||||
result.issues.append("Same final URL for both")
|
||||
if claimed_result["status"] != unclaimed_result["status"]:
|
||||
result.recommendations.append("Switch to checkType: status_code")
|
||||
else:
|
||||
result.status = "working"
|
||||
|
||||
elif check_type == "message":
|
||||
presense_strs = config.get("presenseStrs", [])
|
||||
absence_strs = config.get("absenceStrs", [])
|
||||
|
||||
claimed_content = claimed_result.get("content", "") or ""
|
||||
unclaimed_content = unclaimed_result.get("content", "") or ""
|
||||
|
||||
presense_ok = not presense_strs or any(s in claimed_content for s in presense_strs)
|
||||
absence_claimed = absence_strs and any(s in claimed_content for s in absence_strs)
|
||||
absence_unclaimed = absence_strs and any(s in unclaimed_content for s in absence_strs)
|
||||
|
||||
if presense_strs and not presense_ok:
|
||||
result.status = "broken"
|
||||
result.issues.append(f"presenseStrs not found: {presense_strs}")
|
||||
# Check if status_code would work
|
||||
if claimed_result["status"] != unclaimed_result["status"]:
|
||||
result.recommendations.append(f"Switch to checkType: status_code ({claimed_result['status']} vs {unclaimed_result['status']})")
|
||||
elif absence_claimed:
|
||||
result.status = "broken"
|
||||
result.issues.append(f"absenceStrs found in claimed page")
|
||||
elif absence_strs and not absence_unclaimed:
|
||||
result.status = "broken"
|
||||
result.warnings.append("absenceStrs not found in unclaimed page")
|
||||
else:
|
||||
result.status = "working"
|
||||
|
||||
else:
|
||||
result.status = "unknown"
|
||||
result.warnings.append(f"Unknown checkType: {check_type}")
|
||||
|
||||
result.check_time_ms = int((time.time() - start_time) * 1000)
|
||||
return result
|
||||
|
||||
|
||||
def load_sites(db_path: Path) -> Dict[str, dict]:
|
||||
"""Load all sites from data.json."""
|
||||
with open(db_path) as f:
|
||||
data = json.load(f)
|
||||
return data.get("sites", {})
|
||||
|
||||
|
||||
def get_top_sites(sites: Dict[str, dict], n: int) -> List[Tuple[str, dict]]:
|
||||
"""Get top N sites by Alexa rank."""
|
||||
ranked = []
|
||||
for name, config in sites.items():
|
||||
rank = config.get("alexaRank", 999999)
|
||||
ranked.append((name, config, rank))
|
||||
|
||||
ranked.sort(key=lambda x: x[2])
|
||||
return [(name, config) for name, config, _ in ranked[:n]]
|
||||
|
||||
|
||||
async def check_sites_batch(sites: List[Tuple[str, dict]], parallel: int = 5,
|
||||
timeout: int = 15, progress_callback=None) -> List[SiteCheckResult]:
|
||||
"""Check multiple sites with parallelism control."""
|
||||
results = []
|
||||
semaphore = asyncio.Semaphore(parallel)
|
||||
|
||||
async def check_with_semaphore(name, config, index):
|
||||
async with semaphore:
|
||||
if progress_callback:
|
||||
progress_callback(index, len(sites), name)
|
||||
return await check_site(name, config, timeout)
|
||||
|
||||
tasks = [
|
||||
check_with_semaphore(name, config, i)
|
||||
for i, (name, config) in enumerate(sites)
|
||||
]
|
||||
|
||||
results = await asyncio.gather(*tasks)
|
||||
return results
|
||||
|
||||
|
||||
def print_progress(current: int, total: int, site_name: str):
|
||||
"""Print progress indicator."""
|
||||
pct = int(current / total * 100)
|
||||
bar_width = 30
|
||||
filled = int(bar_width * current / total)
|
||||
bar = "█" * filled + "░" * (bar_width - filled)
|
||||
print(f"\r[{bar}] {pct:3d}% ({current}/{total}) {site_name:<30}", end="", flush=True)
|
||||
|
||||
|
||||
def generate_report(results: List[SiteCheckResult]) -> dict:
|
||||
"""Generate a summary report from check results."""
|
||||
report = {
|
||||
"summary": {
|
||||
"total": len(results),
|
||||
"working": 0,
|
||||
"broken": 0,
|
||||
"disabled": 0,
|
||||
"timeout": 0,
|
||||
"anti_bot": 0,
|
||||
"error": 0,
|
||||
"unknown": 0,
|
||||
},
|
||||
"by_status": defaultdict(list),
|
||||
"issues": [],
|
||||
"recommendations": [],
|
||||
}
|
||||
|
||||
for r in results:
|
||||
report["summary"][r.status] = report["summary"].get(r.status, 0) + 1
|
||||
report["by_status"][r.status].append(r.site_name)
|
||||
|
||||
if r.issues:
|
||||
report["issues"].append({
|
||||
"site": r.site_name,
|
||||
"rank": r.alexa_rank,
|
||||
"issues": r.issues,
|
||||
})
|
||||
|
||||
if r.recommendations:
|
||||
report["recommendations"].append({
|
||||
"site": r.site_name,
|
||||
"rank": r.alexa_rank,
|
||||
"recommendations": r.recommendations,
|
||||
})
|
||||
|
||||
return report
|
||||
|
||||
|
||||
def print_report(report: dict, results: List[SiteCheckResult]):
|
||||
"""Print a formatted report to console."""
|
||||
summary = report["summary"]
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"{color('SITE CHECK REPORT', Colors.CYAN)}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
print(f"{color('SUMMARY:', Colors.BOLD)}")
|
||||
print(f" Total sites checked: {summary['total']}")
|
||||
print(f" {color('Working:', Colors.GREEN)} {summary['working']}")
|
||||
print(f" {color('Broken:', Colors.RED)} {summary['broken']}")
|
||||
print(f" {color('Disabled:', Colors.YELLOW)} {summary['disabled']}")
|
||||
print(f" {color('Timeout:', Colors.YELLOW)} {summary['timeout']}")
|
||||
print(f" {color('Anti-bot:', Colors.YELLOW)} {summary['anti_bot']}")
|
||||
print(f" {color('Error:', Colors.RED)} {summary['error']}")
|
||||
|
||||
# Broken sites
|
||||
if report["by_status"]["broken"]:
|
||||
print(f"\n{color('BROKEN SITES:', Colors.RED)}")
|
||||
for site in report["by_status"]["broken"][:20]:
|
||||
r = next(x for x in results if x.site_name == site)
|
||||
print(f" - {site} (rank {r.alexa_rank}): {', '.join(r.issues)}")
|
||||
if len(report["by_status"]["broken"]) > 20:
|
||||
print(f" ... and {len(report['by_status']['broken']) - 20} more")
|
||||
|
||||
# Timeout sites
|
||||
if report["by_status"]["timeout"]:
|
||||
print(f"\n{color('TIMEOUT SITES:', Colors.YELLOW)}")
|
||||
for site in report["by_status"]["timeout"][:10]:
|
||||
print(f" - {site}")
|
||||
if len(report["by_status"]["timeout"]) > 10:
|
||||
print(f" ... and {len(report['by_status']['timeout']) - 10} more")
|
||||
|
||||
# Anti-bot sites
|
||||
if report["by_status"]["anti_bot"]:
|
||||
print(f"\n{color('ANTI-BOT PROTECTED:', Colors.YELLOW)}")
|
||||
for site in report["by_status"]["anti_bot"][:10]:
|
||||
r = next(x for x in results if x.site_name == site)
|
||||
print(f" - {site}: {', '.join(r.issues)}")
|
||||
if len(report["by_status"]["anti_bot"]) > 10:
|
||||
print(f" ... and {len(report['by_status']['anti_bot']) - 10} more")
|
||||
|
||||
# Recommendations
|
||||
if report["recommendations"]:
|
||||
print(f"\n{color('RECOMMENDATIONS:', Colors.CYAN)}")
|
||||
for rec in report["recommendations"][:15]:
|
||||
print(f" {rec['site']} (rank {rec['rank']}):")
|
||||
for r in rec["recommendations"]:
|
||||
print(f" -> {r}")
|
||||
if len(report["recommendations"]) > 15:
|
||||
print(f" ... and {len(report['recommendations']) - 15} more")
|
||||
|
||||
|
||||
async def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Mass site checking for Maigret",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
)
|
||||
parser.add_argument("--top", "-n", type=int, default=100,
|
||||
help="Check top N sites by Alexa rank (default: 100)")
|
||||
parser.add_argument("--parallel", "-p", type=int, default=5,
|
||||
help="Number of parallel requests (default: 5)")
|
||||
parser.add_argument("--timeout", "-t", type=int, default=15,
|
||||
help="Request timeout in seconds (default: 15)")
|
||||
parser.add_argument("--output", "-o", help="Output JSON report to file")
|
||||
parser.add_argument("--include-disabled", action="store_true",
|
||||
help="Include disabled sites in results")
|
||||
parser.add_argument("--only-broken", action="store_true",
|
||||
help="Only show broken sites")
|
||||
parser.add_argument("--json", action="store_true",
|
||||
help="Output as JSON only")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Load sites
|
||||
db_path = Path(__file__).parent.parent / "maigret" / "resources" / "data.json"
|
||||
if not db_path.exists():
|
||||
print(f"Database not found: {db_path}")
|
||||
sys.exit(1)
|
||||
|
||||
sites = load_sites(db_path)
|
||||
top_sites = get_top_sites(sites, args.top)
|
||||
|
||||
if not args.json:
|
||||
print(f"Checking top {len(top_sites)} sites (parallel={args.parallel}, timeout={args.timeout}s)...")
|
||||
print()
|
||||
|
||||
# Run checks
|
||||
progress = print_progress if not args.json else None
|
||||
results = await check_sites_batch(top_sites, args.parallel, args.timeout, progress)
|
||||
|
||||
if not args.json:
|
||||
print() # Clear progress line
|
||||
|
||||
# Filter results
|
||||
if not args.include_disabled:
|
||||
results = [r for r in results if r.status != "disabled"]
|
||||
if args.only_broken:
|
||||
results = [r for r in results if r.status in ("broken", "error", "timeout")]
|
||||
|
||||
# Generate report
|
||||
report = generate_report(results)
|
||||
|
||||
# Output
|
||||
if args.json:
|
||||
output = {
|
||||
"report": report,
|
||||
"results": [asdict(r) for r in results],
|
||||
}
|
||||
print(json.dumps(output, indent=2))
|
||||
else:
|
||||
print_report(report, results)
|
||||
|
||||
# Save to file
|
||||
if args.output:
|
||||
output = {
|
||||
"report": report,
|
||||
"results": [asdict(r) for r in results],
|
||||
}
|
||||
with open(args.output, "w") as f:
|
||||
json.dump(output, f, indent=2)
|
||||
print(f"\nReport saved to: {args.output}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -0,0 +1,223 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Probe likely false-positive sites among the top-N Alexa-ranked entries.
|
||||
|
||||
For each of K random *distinct* usernames taken from ``usernameClaimed`` fields in
|
||||
the Maigret database, runs a clean ``maigret`` scan (``--top-sites N --json simple|ndjson``).
|
||||
Sites that return CLAIMED in *every* run are reported: unrelated random claimed
|
||||
handles are unlikely to all exist on the same third-party site, so such sites are
|
||||
candidates for broken checks.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import random
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def repo_root() -> Path:
|
||||
return Path(__file__).resolve().parent.parent
|
||||
|
||||
|
||||
def load_username_claimed_pool(db_path: Path) -> list[str]:
|
||||
with db_path.open(encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
sites = data.get("sites") or {}
|
||||
seen: set[str] = set()
|
||||
pool: list[str] = []
|
||||
for _name, site in sites.items():
|
||||
u = (site or {}).get("usernameClaimed")
|
||||
if not u or not isinstance(u, str):
|
||||
continue
|
||||
u = u.strip()
|
||||
if not u or u in seen:
|
||||
continue
|
||||
seen.add(u)
|
||||
pool.append(u)
|
||||
return pool
|
||||
|
||||
|
||||
def run_maigret(
|
||||
*,
|
||||
username: str,
|
||||
db_path: Path,
|
||||
out_dir: Path,
|
||||
top_sites: int,
|
||||
json_format: str,
|
||||
quiet: bool,
|
||||
) -> Path:
|
||||
"""Run maigret subprocess; return path to the written JSON report."""
|
||||
safe = username.replace("/", "_")
|
||||
report_name = f"report_{safe}_{json_format}.json"
|
||||
report_path = out_dir / report_name
|
||||
|
||||
cmd = [
|
||||
sys.executable,
|
||||
"-m",
|
||||
"maigret",
|
||||
username,
|
||||
"--db",
|
||||
str(db_path),
|
||||
"--top-sites",
|
||||
str(top_sites),
|
||||
"--json",
|
||||
json_format,
|
||||
"--folderoutput",
|
||||
str(out_dir),
|
||||
"--no-progressbar",
|
||||
"--no-color",
|
||||
"--no-recursion",
|
||||
"--no-extracting",
|
||||
]
|
||||
sink = subprocess.DEVNULL if quiet else None
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
cwd=str(repo_root()),
|
||||
text=True,
|
||||
stdout=sink,
|
||||
stderr=sink,
|
||||
)
|
||||
if proc.returncode != 0:
|
||||
raise RuntimeError(
|
||||
f"maigret exited with {proc.returncode} for username {username!r}"
|
||||
)
|
||||
if not report_path.is_file():
|
||||
raise FileNotFoundError(f"Expected report missing: {report_path}")
|
||||
return report_path
|
||||
|
||||
|
||||
def claimed_sites_from_report(path: Path, json_format: str) -> set[str]:
|
||||
if json_format == "simple":
|
||||
with path.open(encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
if not isinstance(data, dict):
|
||||
return set()
|
||||
return set(data.keys())
|
||||
# ndjson: one object per line, each has "sitename"
|
||||
sites: set[str] = set()
|
||||
with path.open(encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
obj = json.loads(line)
|
||||
name = obj.get("sitename")
|
||||
if isinstance(name, str) and name:
|
||||
sites.add(name)
|
||||
return sites
|
||||
|
||||
|
||||
def main() -> int:
|
||||
parser = argparse.ArgumentParser(
|
||||
description=(
|
||||
"Pick random distinct usernameClaimed values, run maigret --top-sites N "
|
||||
"with JSON reports, and list sites that claimed all of them (suspicious FP)."
|
||||
)
|
||||
)
|
||||
parser.add_argument(
|
||||
"--db",
|
||||
"-b",
|
||||
type=Path,
|
||||
default=repo_root() / "maigret" / "resources" / "data.json",
|
||||
help="Path to Maigret data.json (a temp copy is used for runs).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--top-sites",
|
||||
"-n",
|
||||
type=int,
|
||||
default=500,
|
||||
metavar="N",
|
||||
help="Value for maigret --top-sites (default: 500).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--samples",
|
||||
"-k",
|
||||
type=int,
|
||||
default=5,
|
||||
metavar="K",
|
||||
help="How many distinct random usernames to draw (default: 5).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--seed",
|
||||
type=int,
|
||||
default=None,
|
||||
help="RNG seed for reproducible username selection.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--json",
|
||||
dest="json_format",
|
||||
default="simple",
|
||||
choices=["simple", "ndjson"],
|
||||
help="JSON report type passed to maigret -J (default: simple).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
"-v",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Print maigret stdout/stderr (default: suppress child output).",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
quiet = not args.verbose
|
||||
|
||||
db_src = args.db.resolve()
|
||||
if not db_src.is_file():
|
||||
print(f"Database not found: {db_src}", file=sys.stderr)
|
||||
return 2
|
||||
|
||||
pool = load_username_claimed_pool(db_src)
|
||||
if len(pool) < args.samples:
|
||||
print(
|
||||
f"Need at least {args.samples} distinct usernameClaimed entries, "
|
||||
f"found {len(pool)}.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 2
|
||||
|
||||
rng = random.Random(args.seed)
|
||||
picked = rng.sample(pool, args.samples)
|
||||
|
||||
print(f"Database: {db_src}")
|
||||
print(f"--top-sites {args.top_sites}, {args.samples} random usernameClaimed:")
|
||||
for i, u in enumerate(picked, 1):
|
||||
print(f" {i}. {u}")
|
||||
|
||||
site_sets: list[set[str]] = []
|
||||
with tempfile.TemporaryDirectory(prefix="maigret_fp_probe_") as tmp:
|
||||
tmp_path = Path(tmp)
|
||||
db_work = tmp_path / "data.json"
|
||||
shutil.copyfile(db_src, db_work)
|
||||
|
||||
for u in picked:
|
||||
print(f"\nRunning maigret for {u!r} ...", flush=True)
|
||||
report = run_maigret(
|
||||
username=u,
|
||||
db_path=db_work,
|
||||
out_dir=tmp_path,
|
||||
top_sites=args.top_sites,
|
||||
json_format=args.json_format,
|
||||
quiet=quiet,
|
||||
)
|
||||
sites = claimed_sites_from_report(report, args.json_format)
|
||||
site_sets.append(sites)
|
||||
print(f" -> {len(sites)} positive site(s) in JSON", flush=True)
|
||||
|
||||
always = set.intersection(*site_sets) if site_sets else set()
|
||||
print("\n--- Sites with CLAIMED in all runs (candidates for false positives) ---")
|
||||
if not always:
|
||||
print("(none)")
|
||||
else:
|
||||
for name in sorted(always):
|
||||
print(name)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
@@ -0,0 +1,59 @@
|
||||
"""Generate db_meta.json from data.json for the auto-update system."""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os.path as path
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
|
||||
RESOURCES_DIR = path.join(path.dirname(path.dirname(path.abspath(__file__))), "maigret", "resources")
|
||||
DATA_JSON_PATH = path.join(RESOURCES_DIR, "data.json")
|
||||
META_JSON_PATH = path.join(RESOURCES_DIR, "db_meta.json")
|
||||
DEFAULT_DATA_URL = "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
|
||||
|
||||
|
||||
def get_current_version():
|
||||
version_file = path.join(path.dirname(path.dirname(path.abspath(__file__))), "maigret", "__version__.py")
|
||||
with open(version_file) as f:
|
||||
for line in f:
|
||||
if line.startswith("__version__"):
|
||||
return line.split("=")[1].strip().strip("'\"")
|
||||
return "0.0.0"
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="Generate db_meta.json from data.json")
|
||||
parser.add_argument("--min-version", default=None, help="Minimum compatible maigret version (default: current version)")
|
||||
parser.add_argument("--data-url", default=DEFAULT_DATA_URL, help="URL where data.json can be downloaded")
|
||||
args = parser.parse_args()
|
||||
|
||||
min_version = args.min_version or get_current_version()
|
||||
|
||||
with open(DATA_JSON_PATH, "rb") as f:
|
||||
raw = f.read()
|
||||
sha256 = hashlib.sha256(raw).hexdigest()
|
||||
|
||||
data = json.loads(raw)
|
||||
sites_count = len(data.get("sites", {}))
|
||||
|
||||
meta = {
|
||||
"version": 1,
|
||||
"updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
"sites_count": sites_count,
|
||||
"min_maigret_version": min_version,
|
||||
"data_sha256": sha256,
|
||||
"data_url": args.data_url,
|
||||
}
|
||||
|
||||
with open(META_JSON_PATH, "w", encoding="utf-8") as f:
|
||||
json.dump(meta, f, indent=4, ensure_ascii=False)
|
||||
|
||||
print(f"Generated {META_JSON_PATH}")
|
||||
print(f" sites: {sites_count}")
|
||||
print(f" sha256: {sha256[:16]}...")
|
||||
print(f" min_version: {min_version}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -0,0 +1,808 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Site check utility for Maigret development.
|
||||
Quickly test site availability, find valid usernames, and diagnose check issues.
|
||||
|
||||
Usage:
|
||||
python utils/site_check.py --site "SiteName" --check-claimed
|
||||
python utils/site_check.py --site "SiteName" --maigret # Test via Maigret
|
||||
python utils/site_check.py --site "SiteName" --compare-methods # aiohttp vs Maigret
|
||||
python utils/site_check.py --url "https://example.com/user/{username}" --test "john"
|
||||
python utils/site_check.py --site "SiteName" --find-user
|
||||
python utils/site_check.py --site "SiteName" --diagnose # Full diagnosis
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
# Add parent dir for imports
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
try:
|
||||
import aiohttp
|
||||
from yarl import URL as YarlURL
|
||||
except ImportError:
|
||||
print("aiohttp not installed. Run: pip install aiohttp")
|
||||
sys.exit(1)
|
||||
|
||||
# Maigret imports (optional, for --maigret mode)
|
||||
MAIGRET_AVAILABLE = False
|
||||
try:
|
||||
from maigret.sites import MaigretDatabase, MaigretSite
|
||||
from maigret.checking import (
|
||||
SimpleAiohttpChecker,
|
||||
check_site_for_username,
|
||||
process_site_result,
|
||||
make_site_result,
|
||||
)
|
||||
from maigret.notify import QueryNotifyPrint
|
||||
from maigret.result import QueryStatus
|
||||
MAIGRET_AVAILABLE = True
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
|
||||
DEFAULT_HEADERS = {
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
||||
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
|
||||
"Accept-Language": "en-US,en;q=0.5",
|
||||
}
|
||||
|
||||
COMMON_USERNAMES = ["blue", "test", "admin", "user", "john", "alex", "david", "mike", "chris", "dan"]
|
||||
|
||||
|
||||
class Colors:
|
||||
"""ANSI color codes for terminal output."""
|
||||
RED = "\033[91m"
|
||||
GREEN = "\033[92m"
|
||||
YELLOW = "\033[93m"
|
||||
BLUE = "\033[94m"
|
||||
MAGENTA = "\033[95m"
|
||||
CYAN = "\033[96m"
|
||||
RESET = "\033[0m"
|
||||
BOLD = "\033[1m"
|
||||
|
||||
|
||||
def color(text: str, c: str) -> str:
|
||||
"""Wrap text with color codes."""
|
||||
return f"{c}{text}{Colors.RESET}"
|
||||
|
||||
|
||||
async def check_url_aiohttp(url: str, headers: dict = None, follow_redirects: bool = True,
|
||||
timeout: int = 15, ssl_verify: bool = False,
|
||||
method: str = "GET", payload: dict = None) -> dict:
|
||||
"""Check a URL using aiohttp and return detailed response info.
|
||||
|
||||
Args:
|
||||
method: HTTP method ("GET" or "POST").
|
||||
payload: JSON payload for POST requests (dict, will be serialized).
|
||||
"""
|
||||
headers = headers or DEFAULT_HEADERS.copy()
|
||||
result = {
|
||||
"method": "aiohttp",
|
||||
"url": url,
|
||||
"status": None,
|
||||
"final_url": None,
|
||||
"redirects": [],
|
||||
"content_length": 0,
|
||||
"content": None,
|
||||
"title": None,
|
||||
"error": None,
|
||||
"error_type": None,
|
||||
"markers": {},
|
||||
}
|
||||
|
||||
try:
|
||||
connector = aiohttp.TCPConnector(ssl=ssl_verify)
|
||||
timeout_obj = aiohttp.ClientTimeout(total=timeout)
|
||||
|
||||
async with aiohttp.ClientSession(connector=connector, timeout=timeout_obj) as session:
|
||||
# Use encoded=True if URL contains percent-encoded chars to prevent double-encoding
|
||||
request_url = YarlURL(url, encoded=True) if '%' in url else url
|
||||
request_kwargs = dict(headers=headers, allow_redirects=follow_redirects)
|
||||
if method.upper() == "POST" and payload is not None:
|
||||
request_kwargs["json"] = payload
|
||||
|
||||
request_fn = session.post if method.upper() == "POST" else session.get
|
||||
async with request_fn(request_url, **request_kwargs) as resp:
|
||||
result["status"] = resp.status
|
||||
result["final_url"] = str(resp.url)
|
||||
|
||||
# Get redirect history
|
||||
if resp.history:
|
||||
result["redirects"] = [str(r.url) for r in resp.history]
|
||||
|
||||
# Read content
|
||||
try:
|
||||
text = await resp.text()
|
||||
result["content_length"] = len(text)
|
||||
result["content"] = text
|
||||
|
||||
# Extract title
|
||||
title_match = re.search(r'<title>([^<]*)</title>', text, re.IGNORECASE)
|
||||
if title_match:
|
||||
result["title"] = title_match.group(1).strip()[:100]
|
||||
|
||||
# Check common markers
|
||||
text_lower = text.lower()
|
||||
markers = {
|
||||
"404_text": any(m in text_lower for m in ["not found", "404", "doesn't exist", "does not exist"]),
|
||||
"profile_markers": any(m in text_lower for m in ["profile", "user", "member", "account"]),
|
||||
"error_markers": any(m in text_lower for m in ["error", "banned", "suspended", "blocked"]),
|
||||
"login_required": any(m in text_lower for m in ["log in", "login", "sign in", "signin"]),
|
||||
"captcha": any(m in text_lower for m in ["captcha", "recaptcha", "challenge", "verify you"]),
|
||||
"cloudflare": "cloudflare" in text_lower or "cf-ray" in text_lower,
|
||||
"rate_limit": any(m in text_lower for m in ["rate limit", "too many requests", "429"]),
|
||||
}
|
||||
result["markers"] = markers
|
||||
|
||||
# First 500 chars of body for inspection
|
||||
result["body_preview"] = text[:500].replace("\n", " ").strip()
|
||||
|
||||
except Exception as e:
|
||||
result["error"] = f"Content read error: {e}"
|
||||
result["error_type"] = "content_error"
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
result["error"] = "Timeout"
|
||||
result["error_type"] = "timeout"
|
||||
except aiohttp.ClientError as e:
|
||||
result["error"] = f"Client error: {e}"
|
||||
result["error_type"] = "client_error"
|
||||
except Exception as e:
|
||||
result["error"] = f"Error: {e}"
|
||||
result["error_type"] = "unknown"
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def check_url_maigret(site: 'MaigretSite', username: str, logger=None) -> dict:
|
||||
"""Check a URL using Maigret's checking mechanism."""
|
||||
if not MAIGRET_AVAILABLE:
|
||||
return {"error": "Maigret not available", "method": "maigret"}
|
||||
|
||||
if logger is None:
|
||||
logger = logging.getLogger("site_check")
|
||||
logger.setLevel(logging.WARNING)
|
||||
|
||||
result = {
|
||||
"method": "maigret",
|
||||
"url": None,
|
||||
"status": None,
|
||||
"status_str": None,
|
||||
"http_status": None,
|
||||
"final_url": None,
|
||||
"error": None,
|
||||
"error_type": None,
|
||||
"ids_data": None,
|
||||
}
|
||||
|
||||
try:
|
||||
# Create query options
|
||||
options = {
|
||||
"parsing": False,
|
||||
"cookie_jar": None,
|
||||
"timeout": 15,
|
||||
}
|
||||
|
||||
# Create a simple notifier
|
||||
class SilentNotify:
|
||||
def start(self, msg=None): pass
|
||||
def update(self, status, similar=False): pass
|
||||
def finish(self, msg=None, status=None): pass
|
||||
|
||||
notifier = SilentNotify()
|
||||
|
||||
# Run the check
|
||||
site_name, site_result = await check_site_for_username(
|
||||
site, username, options, logger, notifier
|
||||
)
|
||||
|
||||
result["url"] = site_result.get("url_user")
|
||||
result["status"] = site_result.get("status")
|
||||
result["status_str"] = str(site_result.get("status"))
|
||||
result["http_status"] = site_result.get("http_status")
|
||||
result["ids_data"] = site_result.get("ids_data")
|
||||
|
||||
# Check for errors
|
||||
status = site_result.get("status")
|
||||
if status and hasattr(status, 'error') and status.error:
|
||||
result["error"] = f"{status.error.type}: {status.error.desc}"
|
||||
result["error_type"] = str(status.error.type)
|
||||
|
||||
except Exception as e:
|
||||
result["error"] = str(e)
|
||||
result["error_type"] = "exception"
|
||||
|
||||
return result
|
||||
|
||||
|
||||
async def find_valid_username(url_template: str, usernames: list = None, headers: dict = None) -> Optional[str]:
|
||||
"""Try common usernames to find one that works."""
|
||||
usernames = usernames or COMMON_USERNAMES
|
||||
headers = headers or DEFAULT_HEADERS.copy()
|
||||
|
||||
print(f"Testing {len(usernames)} usernames on {url_template}...")
|
||||
|
||||
for username in usernames:
|
||||
url = url_template.replace("{username}", username)
|
||||
result = await check_url_aiohttp(url, headers)
|
||||
|
||||
status = result["status"]
|
||||
markers = result.get("markers", {})
|
||||
|
||||
# Good signs: 200 status, profile markers, no 404 text
|
||||
if status == 200 and not markers.get("404_text") and markers.get("profile_markers"):
|
||||
print(f" {color('[+]', Colors.GREEN)} {username}: status={status}, has profile markers")
|
||||
return username
|
||||
elif status == 200 and not markers.get("404_text"):
|
||||
print(f" {color('[?]', Colors.YELLOW)} {username}: status={status}, might work")
|
||||
else:
|
||||
print(f" {color('[-]', Colors.RED)} {username}: status={status}")
|
||||
|
||||
return None
|
||||
|
||||
|
||||
async def compare_users_aiohttp(url_template: str, claimed: str, unclaimed: str = "noonewouldeverusethis7",
|
||||
headers: dict = None) -> Tuple[dict, dict]:
|
||||
"""Compare responses for claimed vs unclaimed usernames using aiohttp."""
|
||||
headers = headers or DEFAULT_HEADERS.copy()
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Comparing: {color(claimed, Colors.GREEN)} vs {color(unclaimed, Colors.RED)}")
|
||||
print(f"URL template: {url_template}")
|
||||
print(f"Method: aiohttp")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
url_claimed = url_template.replace("{username}", claimed)
|
||||
url_unclaimed = url_template.replace("{username}", unclaimed)
|
||||
|
||||
result_claimed, result_unclaimed = await asyncio.gather(
|
||||
check_url_aiohttp(url_claimed, headers),
|
||||
check_url_aiohttp(url_unclaimed, headers)
|
||||
)
|
||||
|
||||
def print_result(name, r, c):
|
||||
print(f"--- {color(name, c)} ---")
|
||||
print(f" URL: {r['url']}")
|
||||
print(f" Status: {color(str(r['status']), Colors.GREEN if r['status'] == 200 else Colors.RED)}")
|
||||
if r["redirects"]:
|
||||
print(f" Redirects: {' -> '.join(r['redirects'])} -> {r['final_url']}")
|
||||
print(f" Final URL: {r['final_url']}")
|
||||
print(f" Content length: {r['content_length']}")
|
||||
print(f" Title: {r['title']}")
|
||||
if r["error"]:
|
||||
print(f" Error: {color(r['error'], Colors.RED)}")
|
||||
print(f" Markers: {r['markers']}")
|
||||
print()
|
||||
|
||||
print_result(f"CLAIMED ({claimed})", result_claimed, Colors.GREEN)
|
||||
print_result(f"UNCLAIMED ({unclaimed})", result_unclaimed, Colors.RED)
|
||||
|
||||
# Analysis
|
||||
print(f"--- {color('ANALYSIS', Colors.CYAN)} ---")
|
||||
recommendations = []
|
||||
|
||||
if result_claimed["status"] != result_unclaimed["status"]:
|
||||
print(f" [!] Status codes differ: {result_claimed['status']} vs {result_unclaimed['status']}")
|
||||
recommendations.append(("status_code", f"Status codes: {result_claimed['status']} vs {result_unclaimed['status']}"))
|
||||
|
||||
if result_claimed["final_url"] != result_unclaimed["final_url"]:
|
||||
print(f" [!] Final URLs differ")
|
||||
recommendations.append(("response_url", "Final URLs differ"))
|
||||
|
||||
if result_claimed["content_length"] != result_unclaimed["content_length"]:
|
||||
diff = abs(result_claimed["content_length"] - result_unclaimed["content_length"])
|
||||
print(f" [!] Content length differs by {diff} bytes")
|
||||
recommendations.append(("message", f"Content differs by {diff} bytes"))
|
||||
|
||||
if result_claimed["title"] != result_unclaimed["title"]:
|
||||
print(f" [!] Titles differ:")
|
||||
print(f" Claimed: {result_claimed['title']}")
|
||||
print(f" Unclaimed: {result_unclaimed['title']}")
|
||||
recommendations.append(("message", f"Titles differ: '{result_claimed['title']}' vs '{result_unclaimed['title']}'"))
|
||||
|
||||
# Check for problems
|
||||
if result_claimed.get("markers", {}).get("captcha"):
|
||||
print(f" {color('[WARN]', Colors.YELLOW)} Captcha detected on claimed page")
|
||||
if result_claimed.get("markers", {}).get("cloudflare"):
|
||||
print(f" {color('[WARN]', Colors.YELLOW)} Cloudflare protection detected")
|
||||
if result_claimed.get("markers", {}).get("login_required"):
|
||||
print(f" {color('[WARN]', Colors.YELLOW)} Login may be required")
|
||||
|
||||
if recommendations:
|
||||
print(f"\n {color('Recommended checkType:', Colors.BOLD)} {recommendations[0][0]}")
|
||||
else:
|
||||
print(f" {color('[!]', Colors.RED)} No clear difference found - site may need special handling")
|
||||
|
||||
return result_claimed, result_unclaimed
|
||||
|
||||
|
||||
async def compare_methods(site: 'MaigretSite', claimed: str, unclaimed: str) -> dict:
|
||||
"""Compare aiohttp vs Maigret results for the same site."""
|
||||
if not MAIGRET_AVAILABLE:
|
||||
print(color("Maigret not available for comparison", Colors.RED))
|
||||
return {}
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"{color('METHOD COMPARISON', Colors.CYAN)}: aiohttp vs Maigret")
|
||||
print(f"Site: {site.name}")
|
||||
print(f"Claimed: {claimed}, Unclaimed: {unclaimed}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
# Build URL template
|
||||
url_template = site.url
|
||||
url_template = url_template.replace("{urlMain}", site.url_main or "")
|
||||
url_template = url_template.replace("{urlSubpath}", getattr(site, 'url_subpath', '') or "")
|
||||
|
||||
headers = DEFAULT_HEADERS.copy()
|
||||
if hasattr(site, 'headers') and site.headers:
|
||||
headers.update(site.headers)
|
||||
|
||||
# Run all checks in parallel
|
||||
url_claimed = url_template.replace("{username}", claimed)
|
||||
url_unclaimed = url_template.replace("{username}", unclaimed)
|
||||
|
||||
aiohttp_claimed, aiohttp_unclaimed, maigret_claimed, maigret_unclaimed = await asyncio.gather(
|
||||
check_url_aiohttp(url_claimed, headers),
|
||||
check_url_aiohttp(url_unclaimed, headers),
|
||||
check_url_maigret(site, claimed),
|
||||
check_url_maigret(site, unclaimed),
|
||||
)
|
||||
|
||||
def status_icon(status):
|
||||
if status == 200:
|
||||
return color("200", Colors.GREEN)
|
||||
elif status == 404:
|
||||
return color("404", Colors.YELLOW)
|
||||
elif status and status >= 400:
|
||||
return color(str(status), Colors.RED)
|
||||
return str(status)
|
||||
|
||||
def maigret_status_icon(status_str):
|
||||
if "Claimed" in str(status_str):
|
||||
return color("Claimed", Colors.GREEN)
|
||||
elif "Available" in str(status_str):
|
||||
return color("Available", Colors.YELLOW)
|
||||
else:
|
||||
return color(str(status_str), Colors.RED)
|
||||
|
||||
print(f"{'Method':<12} {'Username':<25} {'HTTP Status':<12} {'Result':<20}")
|
||||
print("-" * 70)
|
||||
print(f"{'aiohttp':<12} {claimed:<25} {status_icon(aiohttp_claimed['status']):<20} {'OK' if not aiohttp_claimed['error'] else aiohttp_claimed['error'][:20]}")
|
||||
print(f"{'aiohttp':<12} {unclaimed:<25} {status_icon(aiohttp_unclaimed['status']):<20} {'OK' if not aiohttp_unclaimed['error'] else aiohttp_unclaimed['error'][:20]}")
|
||||
print(f"{'Maigret':<12} {claimed:<25} {status_icon(maigret_claimed.get('http_status')):<20} {maigret_status_icon(maigret_claimed.get('status_str'))}")
|
||||
print(f"{'Maigret':<12} {unclaimed:<25} {status_icon(maigret_unclaimed.get('http_status')):<20} {maigret_status_icon(maigret_unclaimed.get('status_str'))}")
|
||||
|
||||
# Check for discrepancies
|
||||
print(f"\n--- {color('DISCREPANCY ANALYSIS', Colors.CYAN)} ---")
|
||||
issues = []
|
||||
|
||||
if aiohttp_claimed['status'] != maigret_claimed.get('http_status'):
|
||||
issues.append(f"HTTP status mismatch for claimed: aiohttp={aiohttp_claimed['status']}, Maigret={maigret_claimed.get('http_status')}")
|
||||
|
||||
if aiohttp_unclaimed['status'] != maigret_unclaimed.get('http_status'):
|
||||
issues.append(f"HTTP status mismatch for unclaimed: aiohttp={aiohttp_unclaimed['status']}, Maigret={maigret_unclaimed.get('http_status')}")
|
||||
|
||||
# Check Maigret detection correctness
|
||||
claimed_detected = "Claimed" in str(maigret_claimed.get('status_str', ''))
|
||||
unclaimed_detected = "Available" in str(maigret_unclaimed.get('status_str', ''))
|
||||
|
||||
if not claimed_detected:
|
||||
issues.append(f"Maigret did NOT detect claimed user '{claimed}' as Claimed")
|
||||
if not unclaimed_detected:
|
||||
issues.append(f"Maigret did NOT detect unclaimed user '{unclaimed}' as Available")
|
||||
|
||||
if issues:
|
||||
for issue in issues:
|
||||
print(f" {color('[!]', Colors.RED)} {issue}")
|
||||
else:
|
||||
print(f" {color('[OK]', Colors.GREEN)} Both methods agree on results")
|
||||
|
||||
return {
|
||||
"aiohttp_claimed": aiohttp_claimed,
|
||||
"aiohttp_unclaimed": aiohttp_unclaimed,
|
||||
"maigret_claimed": maigret_claimed,
|
||||
"maigret_unclaimed": maigret_unclaimed,
|
||||
"issues": issues,
|
||||
}
|
||||
|
||||
|
||||
async def diagnose_site(site_config: dict, site_name: str) -> dict:
|
||||
"""Full diagnosis of a site configuration."""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"{color('FULL SITE DIAGNOSIS', Colors.CYAN)}: {site_name}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
diagnosis = {
|
||||
"site_name": site_name,
|
||||
"issues": [],
|
||||
"warnings": [],
|
||||
"recommendations": [],
|
||||
"working": False,
|
||||
}
|
||||
|
||||
# 1. Config analysis
|
||||
print(f"--- {color('1. CONFIGURATION', Colors.BOLD)} ---")
|
||||
check_type = site_config.get("checkType", "status_code")
|
||||
url = site_config.get("url", "")
|
||||
url_main = site_config.get("urlMain", "")
|
||||
claimed = site_config.get("usernameClaimed")
|
||||
unclaimed = site_config.get("usernameUnclaimed", "noonewouldeverusethis7")
|
||||
disabled = site_config.get("disabled", False)
|
||||
|
||||
print(f" checkType: {check_type}")
|
||||
print(f" URL: {url}")
|
||||
print(f" urlMain: {url_main}")
|
||||
print(f" usernameClaimed: {claimed}")
|
||||
print(f" disabled: {disabled}")
|
||||
|
||||
if disabled:
|
||||
diagnosis["issues"].append("Site is disabled")
|
||||
print(f" {color('[!]', Colors.YELLOW)} Site is disabled")
|
||||
|
||||
if not claimed:
|
||||
diagnosis["issues"].append("No usernameClaimed defined")
|
||||
print(f" {color('[!]', Colors.RED)} No usernameClaimed defined")
|
||||
return diagnosis
|
||||
|
||||
# Build full URL (display URL)
|
||||
url_template = url.replace("{urlMain}", url_main).replace("{urlSubpath}", site_config.get("urlSubpath", ""))
|
||||
|
||||
# Build probe URL (what Maigret actually requests)
|
||||
url_probe = site_config.get("urlProbe", "")
|
||||
if url_probe:
|
||||
probe_template = url_probe.replace("{urlMain}", url_main).replace("{urlSubpath}", site_config.get("urlSubpath", ""))
|
||||
else:
|
||||
probe_template = url_template
|
||||
|
||||
# Detect request method and payload
|
||||
request_method = site_config.get("requestMethod", "GET").upper()
|
||||
request_payload_template = site_config.get("requestPayload")
|
||||
|
||||
headers = DEFAULT_HEADERS.copy()
|
||||
# For API probes (urlProbe, POST), use neutral Accept header instead of text/html
|
||||
# which can cause servers to return HTML instead of JSON
|
||||
if url_probe or request_method == "POST":
|
||||
headers["Accept"] = "*/*"
|
||||
if site_config.get("headers"):
|
||||
headers.update(site_config["headers"])
|
||||
|
||||
if url_probe:
|
||||
print(f" urlProbe: {url_probe}")
|
||||
if request_method != "GET":
|
||||
print(f" requestMethod: {request_method}")
|
||||
if request_payload_template:
|
||||
print(f" requestPayload: {request_payload_template}")
|
||||
|
||||
# 2. Connectivity test
|
||||
print(f"\n--- {color('2. CONNECTIVITY TEST', Colors.BOLD)} ---")
|
||||
probe_claimed = probe_template.replace("{username}", claimed)
|
||||
probe_unclaimed = probe_template.replace("{username}", unclaimed)
|
||||
|
||||
# Build payloads with username substituted
|
||||
payload_claimed = None
|
||||
payload_unclaimed = None
|
||||
if request_payload_template and request_method == "POST":
|
||||
payload_claimed = json.loads(
|
||||
json.dumps(request_payload_template).replace("{username}", claimed)
|
||||
)
|
||||
payload_unclaimed = json.loads(
|
||||
json.dumps(request_payload_template).replace("{username}", unclaimed)
|
||||
)
|
||||
|
||||
result_claimed, result_unclaimed = await asyncio.gather(
|
||||
check_url_aiohttp(probe_claimed, headers, method=request_method, payload=payload_claimed),
|
||||
check_url_aiohttp(probe_unclaimed, headers, method=request_method, payload=payload_unclaimed)
|
||||
)
|
||||
|
||||
print(f" Claimed ({claimed}): status={result_claimed['status']}, error={result_claimed['error']}")
|
||||
print(f" Unclaimed ({unclaimed}): status={result_unclaimed['status']}, error={result_unclaimed['error']}")
|
||||
|
||||
# Check for common problems
|
||||
if result_claimed["error_type"] == "timeout":
|
||||
diagnosis["issues"].append("Timeout on claimed username")
|
||||
if result_unclaimed["error_type"] == "timeout":
|
||||
diagnosis["issues"].append("Timeout on unclaimed username")
|
||||
|
||||
if result_claimed.get("markers", {}).get("cloudflare"):
|
||||
diagnosis["warnings"].append("Cloudflare protection detected")
|
||||
if result_claimed.get("markers", {}).get("captcha"):
|
||||
diagnosis["warnings"].append("Captcha detected")
|
||||
if result_claimed["status"] == 403:
|
||||
diagnosis["issues"].append("403 Forbidden - possible anti-bot protection")
|
||||
if result_claimed["status"] == 429:
|
||||
diagnosis["issues"].append("429 Rate Limited")
|
||||
|
||||
# 3. Check type validation
|
||||
print(f"\n--- {color('3. CHECK TYPE VALIDATION', Colors.BOLD)} ---")
|
||||
|
||||
if check_type == "status_code":
|
||||
if result_claimed["status"] == result_unclaimed["status"]:
|
||||
diagnosis["issues"].append(f"status_code check but same status ({result_claimed['status']}) for both")
|
||||
print(f" {color('[FAIL]', Colors.RED)} Same status code for claimed and unclaimed: {result_claimed['status']}")
|
||||
else:
|
||||
print(f" {color('[OK]', Colors.GREEN)} Status codes differ: {result_claimed['status']} vs {result_unclaimed['status']}")
|
||||
diagnosis["working"] = True
|
||||
|
||||
elif check_type == "response_url":
|
||||
if result_claimed["final_url"] == result_unclaimed["final_url"]:
|
||||
diagnosis["issues"].append("response_url check but same final URL for both")
|
||||
print(f" {color('[FAIL]', Colors.RED)} Same final URL for both")
|
||||
else:
|
||||
print(f" {color('[OK]', Colors.GREEN)} Final URLs differ")
|
||||
diagnosis["working"] = True
|
||||
|
||||
elif check_type == "message":
|
||||
presense_strs = site_config.get("presenseStrs", [])
|
||||
absence_strs = site_config.get("absenceStrs", [])
|
||||
|
||||
print(f" presenseStrs: {presense_strs}")
|
||||
print(f" absenceStrs: {absence_strs}")
|
||||
|
||||
claimed_content = result_claimed.get("content", "") or ""
|
||||
unclaimed_content = result_unclaimed.get("content", "") or ""
|
||||
|
||||
# Check presenseStrs
|
||||
presense_found_claimed = any(s in claimed_content for s in presense_strs) if presense_strs else True
|
||||
presense_found_unclaimed = any(s in unclaimed_content for s in presense_strs) if presense_strs else True
|
||||
|
||||
# Check absenceStrs
|
||||
absence_found_claimed = any(s in claimed_content for s in absence_strs) if absence_strs else False
|
||||
absence_found_unclaimed = any(s in unclaimed_content for s in absence_strs) if absence_strs else False
|
||||
|
||||
print(f" Claimed - presenseStrs found: {presense_found_claimed}, absenceStrs found: {absence_found_claimed}")
|
||||
print(f" Unclaimed - presenseStrs found: {presense_found_unclaimed}, absenceStrs found: {absence_found_unclaimed}")
|
||||
|
||||
if presense_strs and not presense_found_claimed:
|
||||
diagnosis["issues"].append(f"presenseStrs {presense_strs} not found in claimed page")
|
||||
print(f" {color('[FAIL]', Colors.RED)} presenseStrs not found in claimed page")
|
||||
if absence_strs and absence_found_claimed:
|
||||
diagnosis["issues"].append(f"absenceStrs {absence_strs} found in claimed page (should not be)")
|
||||
print(f" {color('[FAIL]', Colors.RED)} absenceStrs found in claimed page")
|
||||
if absence_strs and not absence_found_unclaimed:
|
||||
diagnosis["warnings"].append(f"absenceStrs not found in unclaimed page")
|
||||
print(f" {color('[WARN]', Colors.YELLOW)} absenceStrs not found in unclaimed page")
|
||||
|
||||
# Check works if: claimed is detected as present AND unclaimed is detected as absent.
|
||||
# Presence detection: presenseStrs found (or empty = always true).
|
||||
# Absence detection: absenceStrs found in unclaimed (or empty = never, rely on presenseStrs only).
|
||||
# With only presenseStrs: works if found in claimed but NOT in unclaimed.
|
||||
# With only absenceStrs: works if found in unclaimed but NOT in claimed.
|
||||
# With both: standard combination.
|
||||
claimed_is_present = presense_found_claimed and not absence_found_claimed
|
||||
unclaimed_is_absent = (
|
||||
(absence_strs and absence_found_unclaimed) or
|
||||
(presense_strs and not presense_found_unclaimed)
|
||||
)
|
||||
if claimed_is_present and unclaimed_is_absent:
|
||||
print(f" {color('[OK]', Colors.GREEN)} Message check should work correctly")
|
||||
diagnosis["working"] = True
|
||||
|
||||
# 4. Recommendations
|
||||
print(f"\n--- {color('4. RECOMMENDATIONS', Colors.BOLD)} ---")
|
||||
|
||||
if not diagnosis["working"]:
|
||||
# Suggest alternatives
|
||||
if result_claimed["status"] != result_unclaimed["status"]:
|
||||
diagnosis["recommendations"].append(f"Switch to checkType: status_code (status {result_claimed['status']} vs {result_unclaimed['status']})")
|
||||
if result_claimed["final_url"] != result_unclaimed["final_url"]:
|
||||
diagnosis["recommendations"].append("Switch to checkType: response_url")
|
||||
if result_claimed["title"] != result_unclaimed["title"]:
|
||||
diagnosis["recommendations"].append(f"Use title as marker: presenseStrs=['{result_claimed['title']}'] or absenceStrs=['{result_unclaimed['title']}']")
|
||||
|
||||
if diagnosis["recommendations"]:
|
||||
for rec in diagnosis["recommendations"]:
|
||||
print(f" -> {rec}")
|
||||
elif diagnosis["working"]:
|
||||
print(f" {color('Site appears to be working correctly', Colors.GREEN)}")
|
||||
else:
|
||||
print(f" {color('No clear fix found - site may need special handling or should be disabled', Colors.RED)}")
|
||||
|
||||
# Summary
|
||||
print(f"\n--- {color('SUMMARY', Colors.BOLD)} ---")
|
||||
if diagnosis["issues"]:
|
||||
print(f" Issues: {len(diagnosis['issues'])}")
|
||||
for issue in diagnosis["issues"]:
|
||||
print(f" - {issue}")
|
||||
if diagnosis["warnings"]:
|
||||
print(f" Warnings: {len(diagnosis['warnings'])}")
|
||||
for warn in diagnosis["warnings"]:
|
||||
print(f" - {warn}")
|
||||
print(f" Working: {color('YES', Colors.GREEN) if diagnosis['working'] else color('NO', Colors.RED)}")
|
||||
|
||||
return diagnosis
|
||||
|
||||
|
||||
def load_site_from_db(site_name: str) -> Tuple[Optional[dict], Optional['MaigretSite']]:
|
||||
"""Load site config from data.json. Returns (config_dict, MaigretSite or None)."""
|
||||
db_path = Path(__file__).parent.parent / "maigret" / "resources" / "data.json"
|
||||
|
||||
with open(db_path) as f:
|
||||
data = json.load(f)
|
||||
|
||||
config = None
|
||||
if site_name in data["sites"]:
|
||||
config = data["sites"][site_name]
|
||||
else:
|
||||
# Try case-insensitive search
|
||||
for name, cfg in data["sites"].items():
|
||||
if name.lower() == site_name.lower():
|
||||
config = cfg
|
||||
site_name = name
|
||||
break
|
||||
|
||||
if not config:
|
||||
return None, None
|
||||
|
||||
# Also load MaigretSite if available
|
||||
maigret_site = None
|
||||
if MAIGRET_AVAILABLE:
|
||||
try:
|
||||
db = MaigretDatabase().load_from_path(db_path)
|
||||
maigret_site = db.sites_dict.get(site_name)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return config, maigret_site
|
||||
|
||||
|
||||
async def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Site check utility for Maigret development",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
%(prog)s --site "VK" --check-claimed # Test site with aiohttp
|
||||
%(prog)s --site "VK" --maigret # Test site with Maigret
|
||||
%(prog)s --site "VK" --compare-methods # Compare aiohttp vs Maigret
|
||||
%(prog)s --site "VK" --diagnose # Full diagnosis
|
||||
%(prog)s --url "https://vk.com/{username}" --compare blue nobody123
|
||||
%(prog)s --site "VK" --find-user # Find a valid username
|
||||
"""
|
||||
)
|
||||
parser.add_argument("--site", "-s", help="Site name from data.json")
|
||||
parser.add_argument("--url", "-u", help="URL template with {username}")
|
||||
parser.add_argument("--test", "-t", help="Username to test")
|
||||
parser.add_argument("--compare", "-c", nargs=2, metavar=("CLAIMED", "UNCLAIMED"),
|
||||
help="Compare two usernames")
|
||||
parser.add_argument("--find-user", "-f", action="store_true",
|
||||
help="Find a valid username")
|
||||
parser.add_argument("--check-claimed", action="store_true",
|
||||
help="Check if claimed username still works (aiohttp)")
|
||||
parser.add_argument("--maigret", "-m", action="store_true",
|
||||
help="Test using Maigret's checker instead of aiohttp")
|
||||
parser.add_argument("--compare-methods", action="store_true",
|
||||
help="Compare aiohttp vs Maigret results")
|
||||
parser.add_argument("--diagnose", "-d", action="store_true",
|
||||
help="Full diagnosis of site configuration")
|
||||
parser.add_argument("--headers", help="Custom headers as JSON")
|
||||
parser.add_argument("--timeout", type=int, default=15, help="Request timeout in seconds")
|
||||
parser.add_argument("--json", action="store_true", help="Output results as JSON")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
url_template = None
|
||||
claimed = None
|
||||
unclaimed = "noonewouldeverusethis7"
|
||||
headers = DEFAULT_HEADERS.copy()
|
||||
site_config = None
|
||||
maigret_site = None
|
||||
|
||||
# Load from site name
|
||||
if args.site:
|
||||
site_config, maigret_site = load_site_from_db(args.site)
|
||||
if not site_config:
|
||||
print(f"Site '{args.site}' not found in database")
|
||||
sys.exit(1)
|
||||
|
||||
url_template = site_config.get("url", "")
|
||||
url_main = site_config.get("urlMain", "")
|
||||
url_subpath = site_config.get("urlSubpath", "")
|
||||
url_template = url_template.replace("{urlMain}", url_main).replace("{urlSubpath}", url_subpath)
|
||||
|
||||
claimed = site_config.get("usernameClaimed")
|
||||
unclaimed = site_config.get("usernameUnclaimed", unclaimed)
|
||||
|
||||
if site_config.get("headers"):
|
||||
headers.update(site_config["headers"])
|
||||
|
||||
if not args.json:
|
||||
print(f"Loaded site: {args.site}")
|
||||
print(f" URL: {url_template}")
|
||||
print(f" Claimed: {claimed}")
|
||||
print(f" CheckType: {site_config.get('checkType', 'unknown')}")
|
||||
print(f" Disabled: {site_config.get('disabled', False)}")
|
||||
|
||||
# Override with explicit URL
|
||||
if args.url:
|
||||
url_template = args.url
|
||||
|
||||
# Custom headers
|
||||
if args.headers:
|
||||
headers.update(json.loads(args.headers))
|
||||
|
||||
# Actions
|
||||
if args.diagnose:
|
||||
if not site_config:
|
||||
print("--diagnose requires --site")
|
||||
sys.exit(1)
|
||||
result = await diagnose_site(site_config, args.site)
|
||||
if args.json:
|
||||
print(json.dumps(result, indent=2, default=str))
|
||||
|
||||
elif args.compare_methods:
|
||||
if not maigret_site:
|
||||
if not MAIGRET_AVAILABLE:
|
||||
print("Maigret imports not available")
|
||||
else:
|
||||
print("Could not load MaigretSite object")
|
||||
sys.exit(1)
|
||||
result = await compare_methods(maigret_site, claimed, unclaimed)
|
||||
if args.json:
|
||||
print(json.dumps(result, indent=2, default=str))
|
||||
|
||||
elif args.maigret:
|
||||
if not maigret_site:
|
||||
if not MAIGRET_AVAILABLE:
|
||||
print("Maigret imports not available")
|
||||
else:
|
||||
print("Could not load MaigretSite object")
|
||||
sys.exit(1)
|
||||
|
||||
print(f"\n--- Testing with Maigret ---")
|
||||
for username in [claimed, unclaimed]:
|
||||
result = await check_url_maigret(maigret_site, username)
|
||||
print(f" {username}: status={result.get('status_str')}, http={result.get('http_status')}, error={result.get('error')}")
|
||||
|
||||
elif args.find_user:
|
||||
if not url_template:
|
||||
print("--find-user requires --site or --url")
|
||||
sys.exit(1)
|
||||
result = await find_valid_username(url_template, headers=headers)
|
||||
if result:
|
||||
print(f"\n{color('Found valid username:', Colors.GREEN)} {result}")
|
||||
else:
|
||||
print(f"\n{color('No valid username found', Colors.RED)}")
|
||||
|
||||
elif args.compare:
|
||||
if not url_template:
|
||||
print("--compare requires --site or --url")
|
||||
sys.exit(1)
|
||||
result = await compare_users_aiohttp(url_template, args.compare[0], args.compare[1], headers)
|
||||
if args.json:
|
||||
# Remove content field for JSON output (too large)
|
||||
for r in result:
|
||||
if isinstance(r, dict) and "content" in r:
|
||||
del r["content"]
|
||||
print(json.dumps(result, indent=2, default=str))
|
||||
|
||||
elif args.check_claimed and claimed:
|
||||
result = await compare_users_aiohttp(url_template, claimed, unclaimed, headers)
|
||||
|
||||
elif args.test:
|
||||
if not url_template:
|
||||
print("--test requires --site or --url")
|
||||
sys.exit(1)
|
||||
url = url_template.replace("{username}", args.test)
|
||||
result = await check_url_aiohttp(url, headers, timeout=args.timeout)
|
||||
if "content" in result:
|
||||
del result["content"] # Too large for display
|
||||
print(json.dumps(result, indent=2, default=str))
|
||||
|
||||
else:
|
||||
# Default: check claimed username if available
|
||||
if url_template and claimed:
|
||||
await compare_users_aiohttp(url_template, claimed, unclaimed, headers)
|
||||
else:
|
||||
parser.print_help()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
+134
-39
@@ -4,6 +4,7 @@ This module generates the listing of supported sites in file `SITES.md`
|
||||
and pretty prints file with sites data.
|
||||
"""
|
||||
import sys
|
||||
import socket
|
||||
import requests
|
||||
import logging
|
||||
import threading
|
||||
@@ -24,36 +25,87 @@ RANKS.update({
|
||||
'100000000': '100M',
|
||||
})
|
||||
|
||||
SEMAPHORE = threading.Semaphore(20)
|
||||
|
||||
|
||||
def get_rank(domain_to_query, site, print_errors=True):
|
||||
with SEMAPHORE:
|
||||
# Retrieve ranking data via alexa API
|
||||
url = f"http://data.alexa.com/data?cli=10&url={domain_to_query}"
|
||||
xml_data = requests.get(url).text
|
||||
root = ET.fromstring(xml_data)
|
||||
import csv
|
||||
import io
|
||||
from urllib.parse import urlparse
|
||||
|
||||
try:
|
||||
#Get ranking for this site.
|
||||
site.alexa_rank = int(root.find('.//REACH').attrib['RANK'])
|
||||
# country = root.find('.//COUNTRY')
|
||||
# if not country is None and country.attrib:
|
||||
# country_code = country.attrib['CODE']
|
||||
# tags = set(site.tags)
|
||||
# if country_code:
|
||||
# tags.add(country_code.lower())
|
||||
# site.tags = sorted(list(tags))
|
||||
# if site.type != 'username':
|
||||
# site.disabled = False
|
||||
except Exception as e:
|
||||
if print_errors:
|
||||
logging.error(e)
|
||||
# We did not find the rank for some reason.
|
||||
print(f"Error retrieving rank information for '{domain_to_query}'")
|
||||
print(f" Returned XML is |{xml_data}|")
|
||||
def fetch_majestic_million():
|
||||
print("Fetching Majestic Million CSV (this may take a few seconds)...")
|
||||
ranks = {}
|
||||
url = "https://downloads.majestic.com/majestic_million.csv"
|
||||
try:
|
||||
response = requests.get(url, stream=True)
|
||||
response.raise_for_status()
|
||||
|
||||
csv_file = io.StringIO(response.text)
|
||||
reader = csv.reader(csv_file)
|
||||
next(reader) # skip headers
|
||||
|
||||
for row in reader:
|
||||
if not row or len(row) < 3:
|
||||
continue
|
||||
rank = int(row[0])
|
||||
domain = row[2].lower()
|
||||
ranks[domain] = rank
|
||||
except Exception as e:
|
||||
logging.error(f"Error fetching Majestic Million: {e}")
|
||||
|
||||
print(f"Loaded {len(ranks)} domains from Majestic Million.")
|
||||
return ranks
|
||||
|
||||
return
|
||||
def get_base_domain(url):
|
||||
try:
|
||||
netloc = urlparse(url).netloc
|
||||
if netloc.startswith('www.'):
|
||||
netloc = netloc[4:]
|
||||
return netloc.lower()
|
||||
except Exception:
|
||||
return ""
|
||||
|
||||
|
||||
def check_dns(domain, timeout=5):
|
||||
"""Check if a domain resolves via DNS. Returns True if it resolves."""
|
||||
try:
|
||||
socket.setdefaulttimeout(timeout)
|
||||
socket.getaddrinfo(domain, None)
|
||||
return True
|
||||
except (socket.gaierror, socket.timeout, OSError):
|
||||
return False
|
||||
|
||||
|
||||
def check_sites_dns(sites):
|
||||
"""Check DNS resolution for all sites. Returns a set of site names that failed."""
|
||||
SKIP_TLDS = ('.onion', '.i2p')
|
||||
domains = {}
|
||||
for site in sites:
|
||||
domain = get_base_domain(site.url_main)
|
||||
if domain and not any(domain.endswith(tld) for tld in SKIP_TLDS):
|
||||
domains.setdefault(domain, []).append(site)
|
||||
|
||||
failed_sites = set()
|
||||
results = {}
|
||||
|
||||
def resolve(domain):
|
||||
results[domain] = check_dns(domain)
|
||||
|
||||
threads = []
|
||||
for domain in domains:
|
||||
t = threading.Thread(target=resolve, args=(domain,))
|
||||
threads.append(t)
|
||||
t.start()
|
||||
|
||||
for t in threads:
|
||||
t.join()
|
||||
|
||||
for domain, resolved in results.items():
|
||||
if not resolved:
|
||||
for site in domains[domain]:
|
||||
failed_sites.add(site.name)
|
||||
logging.warning(f"DNS resolution failed for {domain}")
|
||||
|
||||
return failed_sites
|
||||
|
||||
|
||||
def get_step_rank(rank):
|
||||
@@ -78,6 +130,8 @@ def main():
|
||||
parser.add_argument('--empty-only', help='update only sites without rating', action='store_true')
|
||||
parser.add_argument('--exclude-engine', help='do not update score with certain engine',
|
||||
action="append", dest="exclude_engine_list", default=[])
|
||||
parser.add_argument('--dns-check', help='disable sites whose domains do not resolve via DNS',
|
||||
action='store_true')
|
||||
|
||||
pool = list()
|
||||
|
||||
@@ -91,30 +145,51 @@ def main():
|
||||
with open("sites.md", "w") as site_file:
|
||||
site_file.write(f"""
|
||||
## List of supported sites (search methods): total {len(sites_subset)}\n
|
||||
Rank data fetched from Alexa by domains.
|
||||
Rank data fetched from Majestic Million by domains.
|
||||
|
||||
""")
|
||||
|
||||
if args.dns_check:
|
||||
print("Checking DNS resolution for all site domains...")
|
||||
failed = check_sites_dns(sites_subset)
|
||||
disabled_count = 0
|
||||
re_enabled_count = 0
|
||||
for site in sites_subset:
|
||||
if site.name in failed:
|
||||
if not site.disabled:
|
||||
site.disabled = True
|
||||
disabled_count += 1
|
||||
print(f" Disabled {site.name}: DNS does not resolve ({get_base_domain(site.url_main)})")
|
||||
else:
|
||||
if site.disabled:
|
||||
# Re-enable previously disabled site if DNS now resolves
|
||||
# (only if it was likely disabled due to DNS failure)
|
||||
pass
|
||||
print(f"DNS check complete: {disabled_count} site(s) disabled, {len(failed)} domain(s) unresolvable.")
|
||||
|
||||
majestic_ranks = {}
|
||||
if args.with_rank:
|
||||
majestic_ranks = fetch_majestic_million()
|
||||
|
||||
for site in sites_subset:
|
||||
if not args.with_rank:
|
||||
break
|
||||
url_main = site.url_main
|
||||
|
||||
if site.alexa_rank < sys.maxsize and args.empty_only:
|
||||
continue
|
||||
if args.exclude_engine_list and site.engine in args.exclude_engine_list:
|
||||
continue
|
||||
site.alexa_rank = 0
|
||||
th = threading.Thread(target=get_rank, args=(url_main, site,))
|
||||
pool.append((site.name, url_main, th))
|
||||
th.start()
|
||||
|
||||
|
||||
domain = get_base_domain(site.url_main)
|
||||
|
||||
if domain in majestic_ranks:
|
||||
site.alexa_rank = majestic_ranks[domain]
|
||||
else:
|
||||
site.alexa_rank = sys.maxsize
|
||||
|
||||
# In memory matching complete, no threads to join
|
||||
if args.with_rank:
|
||||
index = 1
|
||||
for site_name, url_main, th in pool:
|
||||
th.join()
|
||||
sys.stdout.write("\r{0}".format(f"Updated {index} out of {len(sites_subset)} entries"))
|
||||
sys.stdout.flush()
|
||||
index = index + 1
|
||||
print("Successfully updated ranks matching Majestic Million dataset.")
|
||||
|
||||
sites_full_list = [(s, int(s.alexa_rank)) for s in sites_subset]
|
||||
|
||||
@@ -142,6 +217,26 @@ Rank data fetched from Alexa by domains.
|
||||
site_file.write(f'\nThe list was updated at ({datetime.now(timezone.utc).date()})\n')
|
||||
db.save_to_file(args.base_file)
|
||||
|
||||
# Regenerate db_meta.json to stay in sync with data.json
|
||||
try:
|
||||
import hashlib, json, os
|
||||
db_data_raw = open(args.base_file, 'rb').read()
|
||||
db_data_parsed = json.loads(db_data_raw)
|
||||
meta = {
|
||||
"version": 1,
|
||||
"updated_at": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
|
||||
"sites_count": len(db_data_parsed.get("sites", {})),
|
||||
"min_maigret_version": "0.5.0",
|
||||
"data_sha256": hashlib.sha256(db_data_raw).hexdigest(),
|
||||
"data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json",
|
||||
}
|
||||
meta_path = os.path.join(os.path.dirname(args.base_file), "db_meta.json")
|
||||
with open(meta_path, "w", encoding="utf-8") as mf:
|
||||
json.dump(meta, mf, indent=4, ensure_ascii=False)
|
||||
print(f"Updated {meta_path} ({meta['sites_count']} sites)")
|
||||
except Exception as e:
|
||||
print(f"Warning: could not regenerate db_meta.json: {e}")
|
||||
|
||||
statistics_text = db.get_db_stats(is_markdown=True)
|
||||
site_file.write('## Statistics\n\n')
|
||||
site_file.write(statistics_text)
|
||||
|
||||
Reference in New Issue
Block a user