Compare commits
308 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 470ef5721f | |||
| fd2c8afd33 | |||
| 8c007219f5 | |||
| a425e5ceff | |||
| d0fd3533b5 | |||
| 5bf361a1ac | |||
| e07d3b60ba | |||
| 1e2d5cf742 | |||
| 694e024ba1 | |||
| 6862425215 | |||
| 54c8074e51 | |||
| 71e1fb6dcf | |||
| 364187861d | |||
| 8a53a38543 | |||
| bc787cdf51 | |||
| dcf5181e28 | |||
| 61452d56d3 | |||
| be204ff119 | |||
| 8a865a1ce6 | |||
| a29c3c6abe | |||
| ea6fd30a30 | |||
| 8dbe9a415c | |||
| 222398154e | |||
| 3030025ea3 | |||
| 40233e66cb | |||
| 2ea75f7f76 | |||
| dbd393da58 | |||
| b9f72151ea | |||
| dc2989a47d | |||
| c86e558a57 | |||
| 3c8c1d1f5a | |||
| 1683e5b744 | |||
| 31fc656721 | |||
| 79f872c77c | |||
| 22f158e749 | |||
| ff1eac0b20 | |||
| f2d3fed9c7 | |||
| cbbdc5a820 | |||
| 8a614001fd | |||
| 7a50f2922a | |||
| da0f4ae7cf | |||
| d12310bb53 | |||
| 211b8ccfd0 | |||
| f352f9f58b | |||
| 0d70ee1abc | |||
| 032ca8141a | |||
| 3acf6e5180 | |||
| 14f2b0c756 | |||
| e0a4775205 | |||
| d056eb545f | |||
| 10f8e1f597 | |||
| 6cc789d800 | |||
| c214f38841 | |||
| 392b83c230 | |||
| 96bebd49d3 | |||
| 92950f1b88 | |||
| 07b5874802 | |||
| 6a62586a59 | |||
| 883abe7877 | |||
| fc58046a34 | |||
| b6a1eb26e7 | |||
| 42169397fe | |||
| 870d68ec1c | |||
| 12ef7f62c2 | |||
| 8b7ea67edc | |||
| 182a493b6a | |||
| 4f7781b7a2 | |||
| 3579f2fd09 | |||
| 34b8d938f7 | |||
| ea963af29b | |||
| 5ea5f6337d | |||
| 292d0a2665 | |||
| 057bdce751 | |||
| f051cc768e | |||
| 985f4075f4 | |||
| d88abc6271 | |||
| 63b99338d7 | |||
| bd3503f3c8 | |||
| d7f94076bf | |||
| 10879c8bf3 | |||
| b48d126118 | |||
| c2c2707fb6 | |||
| 5e16edc003 | |||
| e84b5e3d5d | |||
| 4d65d03074 | |||
| 222e8d3d09 | |||
| 92c7e41439 | |||
| 55f941cf18 | |||
| fa6bb1ee17 | |||
| 58ae979904 | |||
| e8d63ef273 | |||
| 41f2ae6faa | |||
| 6cf9b296e5 | |||
| 1301e66e90 | |||
| 549a8b43fe | |||
| 2c33d797ce | |||
| 5c05cfa5bc | |||
| 3e884d4b76 | |||
| 66c80aa878 | |||
| e51aba743a | |||
| 55dea38b6b | |||
| d516c93bfc | |||
| e520418f6a | |||
| ecabf88c3a | |||
| 8801f7e6de | |||
| d52ff10186 | |||
| 4ee65e0445 | |||
| 1dfc45722b | |||
| bc8e29e92a | |||
| c5df7ca990 | |||
| bda85b290e | |||
| b781602474 | |||
| 56ad1d1c60 | |||
| 744ad1deda | |||
| aee9125c96 | |||
| 262f97ce33 | |||
| 4880b71246 | |||
| 5f220b652d | |||
| f533c30564 | |||
| 2b905ae996 | |||
| c154cf9f23 | |||
| 90ec62d657 | |||
| 09ae96e4da | |||
| b664efc3f1 | |||
| 39a523c188 | |||
| d1c708e8c3 | |||
| 954465f2d6 | |||
| fb75e9e5a2 | |||
| ef8f9ce15b | |||
| 0aec913eee | |||
| fa064b6c1b | |||
| 7f151a0d6a | |||
| 8b20799a34 | |||
| 6477a36ae1 | |||
| a7a56839a9 | |||
| b33656c02c | |||
| c5ac36affe | |||
| e4f87e1a9b | |||
| b7b902f108 | |||
| 447cd8511c | |||
| 220c749af3 | |||
| 9e6d38dfea | |||
| 1283ac01bf | |||
| dbcd52da81 | |||
| 3e370ce967 | |||
| 0bc11da598 | |||
| 0a6d2bed2e | |||
| 2059e69e99 | |||
| 10523e98c5 | |||
| cae9bf99ff | |||
| 7decbce08d | |||
| c0f2a550f5 | |||
| 6688479c1c | |||
| 3dc8ae1f41 | |||
| 1290a9863f | |||
| 282a3bef73 | |||
| 1b9ce3bac7 | |||
| 1931877756 | |||
| 646265791a | |||
| e38e302b6d | |||
| 4ff19970dd | |||
| 267d9e505b | |||
| 979e0c4dd4 | |||
| 24a446bd3a | |||
| 7a362406d5 | |||
| cc0ecb49d4 | |||
| 216e02111e | |||
| 59f573e754 | |||
| d993c4883e | |||
| f81a500d72 | |||
| 89711ff036 | |||
| dc8fdc25f5 | |||
| 4f5222df1c | |||
| 1a0db9032d | |||
| b4a13562a2 | |||
| fa3225a7cf | |||
| 6aef69cc81 | |||
| 74665283ed | |||
| 4ce241893b | |||
| 784eec7748 | |||
| eeab6ba82c | |||
| 516861e0ae | |||
| 87a7a2cc59 | |||
| 8f86d76db6 | |||
| 290c162094 | |||
| 63a7e8feac | |||
| e3b4512c47 | |||
| 37854a867b | |||
| 6480eebbdf | |||
| c57204ff2f | |||
| c147f19c3a | |||
| 998ff2e4e6 | |||
| 0dd3f2e137 | |||
| aad862b2ed | |||
| c6d0f332bd | |||
| f1c006159e | |||
| 69a09fcd94 | |||
| 9f948928e6 | |||
| a3034c11ff | |||
| d47c72b972 | |||
| 8062ec30e9 | |||
| 32000a1cfd | |||
| 8af6ce3af5 | |||
| 0dd1dd5d76 | |||
| 4aab21046b | |||
| 92ac9ec8b7 | |||
| ca2c8b3502 | |||
| 4362a41fca | |||
| c7977f1cdf | |||
| 49708da980 | |||
| bc1398061f | |||
| e8634c8c56 | |||
| dc59b93f38 | |||
| c727cbae27 | |||
| e6c6cc8f6d | |||
| c80e8b1207 | |||
| 6e78fdeb81 | |||
| 9c22e09808 | |||
| f057fd3a68 | |||
| 9b0acc092a | |||
| e6b4cdfa77 | |||
| eb721dc7e3 | |||
| eba0c4531c | |||
| b4a26c03fe | |||
| 9b7f36dc24 | |||
| 05167ad30c | |||
| cee6f0aa43 | |||
| 02cf330e37 | |||
| 5c8f7a3af0 | |||
| 13e1b6f4d1 | |||
| 5179cb56eb | |||
| 1a2c7e944a | |||
| f7eae046a1 | |||
| bdff08cb70 | |||
| a468cb1cd3 | |||
| 0fe933e8a1 | |||
| 5c3de91181 | |||
| 3356463102 | |||
| 7ac03cf5ca | |||
| 4aeacef07d | |||
| 8de1830cf3 | |||
| ba6169659e | |||
| 4a5c5c3f07 | |||
| 4ba7fcb1ff | |||
| a76f95858f | |||
| bea900dda0 | |||
| bb1bde833d | |||
| 5b405c6abb | |||
| 99fa58ceed | |||
| c71e404f63 | |||
| 2c04ccce57 | |||
| 435db7cdc9 | |||
| 413a0502a4 | |||
| 2aedcc3166 | |||
| 28835204f5 | |||
| b11a247dfd | |||
| c9219d91ec | |||
| aa6cd0eca9 | |||
| 38e5d5c664 | |||
| 8a562d06ae | |||
| aa50ee9672 | |||
| 51327f9647 | |||
| 4a368c9bb6 | |||
| 6fd5f6e33a | |||
| fa3db9c39c | |||
| 5912ad4fbc | |||
| ee36dc0187 | |||
| 9eb62e4e22 | |||
| ead048af93 | |||
| acc751ff98 | |||
| b7bdd71cf0 | |||
| 43f189f774 | |||
| 5bda7fb339 | |||
| 414523a8ac | |||
| 6d4e268706 | |||
| b696b982f4 | |||
| d4234036c0 | |||
| b57c70091c | |||
| e90df3560b | |||
| bc6ee48b8c | |||
| e70bdf3789 | |||
| 84f9d417cf | |||
| 4333c40be7 | |||
| 9e504c0094 | |||
| 2f752a0368 | |||
| 53e9dab677 | |||
| 11b70a2a48 | |||
| 960708ef2e | |||
| e6f6d8735d | |||
| f77d7d307a | |||
| 158f739a59 | |||
| b6a207d0e3 | |||
| d59867b0d9 | |||
| 2145027196 | |||
| 386e9eba4f | |||
| 0e9655c46a | |||
| 009d51c380 | |||
| 78e9688ece | |||
| 3cbb9df7b3 | |||
| 2fb1f19948 | |||
| 3b91a9cd31 | |||
| 9858e71349 | |||
| c88e194d07 | |||
| ad5c7fbc7d | |||
| 66d6c7a93c | |||
| bdfb4911ce | |||
| 951be44452 | |||
| 188edc1b7f |
@@ -0,0 +1,3 @@
|
||||
# These are supported funding model platforms
|
||||
|
||||
patreon: soxoj
|
||||
@@ -0,0 +1,13 @@
|
||||
---
|
||||
name: Add a site
|
||||
about: I want to add a new site for Maigret checks
|
||||
title: New site
|
||||
labels: new-site
|
||||
assignees: soxoj
|
||||
|
||||
---
|
||||
|
||||
Link to the site main page: https://example.com
|
||||
Link to an existing account: https://example.com/users/john
|
||||
Link to a nonexistent account: https://example.com/users/noonewouldeverusethis7
|
||||
Tags: photo, us, ...
|
||||
@@ -0,0 +1,24 @@
|
||||
---
|
||||
name: Maigret bug report
|
||||
about: I want to report a bug in Maigret functionality
|
||||
title: ''
|
||||
labels: bug
|
||||
assignees: soxoj
|
||||
|
||||
---
|
||||
|
||||
## Checklist
|
||||
|
||||
- [ ] I'm reporting a bug in Maigret functionality
|
||||
- [ ] I've checked for similar bug reports including closed ones
|
||||
- [ ] I've checked for pull requests that attempt to fix this bug
|
||||
|
||||
## Description
|
||||
|
||||
Info about the Maigret version you are running and your environment (`--version`, operating system, ISP provider):
|
||||
<INSERT VERSION INFO HERE>
|
||||
|
||||
How to reproduce this bug (commandline options / conditions):
|
||||
<INSERT EXAMPLE OF CLI COMMAND HERE>
|
||||
|
||||
<DESCRIPTION>
|
||||
@@ -0,0 +1,20 @@
|
||||
---
|
||||
name: Report invalid result
|
||||
about: I want to report invalid result of Maigret search
|
||||
title: Invalid result
|
||||
labels: false-result
|
||||
assignees: soxoj
|
||||
|
||||
---
|
||||
|
||||
Invalid link: <INSERT LINK HERE>
|
||||
|
||||
<!--
|
||||
|
||||
Put x into the box
|
||||
|
||||
[ ] ==> [x]
|
||||
|
||||
-->
|
||||
|
||||
- [ ] I'm sure that the link leads to "not found" page
|
||||
@@ -0,0 +1,6 @@
|
||||
version: 2
|
||||
updates:
|
||||
- package-ecosystem: "pip"
|
||||
directory: "/"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
@@ -0,0 +1,32 @@
|
||||
name: Build docker image and push to DockerHub
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
|
||||
jobs:
|
||||
docker:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
-
|
||||
name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v1
|
||||
-
|
||||
name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
-
|
||||
name: Login to DockerHub
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_HUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
|
||||
-
|
||||
name: Build and push
|
||||
id: docker_build
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
push: true
|
||||
tags: ${{ secrets.DOCKER_HUB_USERNAME }}/maigret:latest
|
||||
-
|
||||
name: Image digest
|
||||
run: echo ${{ steps.docker_build.outputs.digest }}
|
||||
@@ -0,0 +1,67 @@
|
||||
# For most projects, this workflow file will not need changing; you simply need
|
||||
# to commit it to your repository.
|
||||
#
|
||||
# You may wish to alter this file to override the set of languages analyzed,
|
||||
# or to provide custom queries or build logic.
|
||||
#
|
||||
# ******** NOTE ********
|
||||
# We have attempted to detect the languages in your repository. Please check
|
||||
# the `language` matrix defined below to confirm you have the correct set of
|
||||
# supported CodeQL languages.
|
||||
#
|
||||
name: "CodeQL"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
schedule:
|
||||
- cron: '23 6 * * 6'
|
||||
|
||||
jobs:
|
||||
analyze:
|
||||
name: Analyze
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
security-events: write
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
language: [ 'python' ]
|
||||
# CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
|
||||
# Learn more about CodeQL language support at https://git.io/codeql-language-support
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2
|
||||
|
||||
# Initializes the CodeQL tools for scanning.
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v1
|
||||
with:
|
||||
languages: ${{ matrix.language }}
|
||||
# If you wish to specify custom queries, you can do so here or in a config file.
|
||||
# By default, queries listed here will override any specified in a config file.
|
||||
# Prefix the list here with "+" to use these queries and those in the config file.
|
||||
# queries: ./path/to/local/query, your-org/your-repo/queries@main
|
||||
|
||||
# Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
|
||||
# If this step fails, then you should remove it and run the build manually (see below)
|
||||
- name: Autobuild
|
||||
uses: github/codeql-action/autobuild@v1
|
||||
|
||||
# ℹ️ Command-line programs to run using the OS shell.
|
||||
# 📚 https://git.io/JvXDl
|
||||
|
||||
# ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
|
||||
# and modify them (or add more) to build your code if your project
|
||||
# uses a compiled language
|
||||
|
||||
#- run: |
|
||||
# make bootstrap
|
||||
# make release
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v1
|
||||
@@ -0,0 +1,24 @@
|
||||
name: Package exe with PyInstaller - Windows
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
|
||||
jobs:
|
||||
build:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: PyInstaller Windows
|
||||
uses: JackMcKew/pyinstaller-action-windows@main
|
||||
with:
|
||||
path: pyinstaller
|
||||
|
||||
- uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: maigret_standalone_win32
|
||||
path: pyinstaller/dist/windows # or path/to/artifact
|
||||
@@ -1,13 +1,10 @@
|
||||
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
|
||||
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
|
||||
|
||||
name: Python package
|
||||
name: Linting and testing
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
types: [opened, synchronize, reopened]
|
||||
|
||||
jobs:
|
||||
build:
|
||||
@@ -26,7 +23,7 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install flake8 pytest pytest-rerunfailures
|
||||
python -m pip install -r test-requirements.txt
|
||||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
|
||||
@@ -1,6 +1,3 @@
|
||||
# This workflow will upload a Python Package using Twine when a release is created
|
||||
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
|
||||
|
||||
name: Upload Python Package
|
||||
|
||||
on:
|
||||
|
||||
@@ -0,0 +1,34 @@
|
||||
name: Update sites rating and statistics
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: [ main ]
|
||||
types: [opened, synchronize]
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v2.3.2
|
||||
with:
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository.
|
||||
|
||||
- name: build application
|
||||
run: |
|
||||
pip3 install .
|
||||
python3 ./utils/update_site_data.py --empty-only
|
||||
|
||||
- name: Commit and push changes
|
||||
run: |
|
||||
git config --global user.name "Maigret autoupdate"
|
||||
git config --global user.email "soxoj@protonmail.com"
|
||||
echo `git name-rev ${{ github.event.pull_request.head.sha }} --name-only`
|
||||
export BRANCH=`git name-rev ${{ github.event.pull_request.head.sha }} --name-only | sed 's/remotes\/origin\///'`
|
||||
echo $BRANCH
|
||||
git remote -v
|
||||
git checkout $BRANCH
|
||||
git add sites.md
|
||||
git commit -m "Updated site list and statistics"
|
||||
git push origin $BRANCH
|
||||
@@ -22,9 +22,15 @@ src/
|
||||
# Comma-Separated Values (CSV) Reports
|
||||
*.csv
|
||||
|
||||
# Excluded sites list
|
||||
tests/.excluded_sites
|
||||
|
||||
# MacOS Folder Metadata File
|
||||
.DS_Store
|
||||
/reports/
|
||||
|
||||
# Testing
|
||||
.coverage
|
||||
dist/
|
||||
htmlcov/
|
||||
/test_*
|
||||
|
||||
# Maigret files
|
||||
settings.json
|
||||
|
||||
@@ -2,6 +2,153 @@
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
## [0.4.2] - 2022-03-07
|
||||
* [ImgBot] Optimize images by @imgbot in https://github.com/soxoj/maigret/pull/319
|
||||
* Bump pytest-asyncio from 0.17.0 to 0.17.1 by @dependabot in https://github.com/soxoj/maigret/pull/321
|
||||
* Bump pytest-asyncio from 0.17.1 to 0.17.2 by @dependabot in https://github.com/soxoj/maigret/pull/323
|
||||
* Disabled Ruboard by @soxoj in https://github.com/soxoj/maigret/pull/327
|
||||
* Disable kinooh, sites list update workflow added by @soxoj in https://github.com/soxoj/maigret/pull/329
|
||||
* Bump multidict from 5.2.0 to 6.0.1 by @dependabot in https://github.com/soxoj/maigret/pull/332
|
||||
* Bump multidict from 6.0.1 to 6.0.2 by @dependabot in https://github.com/soxoj/maigret/pull/333
|
||||
* Bump pytest-httpserver from 1.0.3 to 1.0.4 by @dependabot in https://github.com/soxoj/maigret/pull/334
|
||||
* Bump pytest from 6.2.5 to 7.0.0 by @dependabot in https://github.com/soxoj/maigret/pull/339
|
||||
* Bump pytest-asyncio from 0.17.2 to 0.18.0 by @dependabot in https://github.com/soxoj/maigret/pull/340
|
||||
* Bump pytest-asyncio from 0.18.0 to 0.18.1 by @dependabot in https://github.com/soxoj/maigret/pull/343
|
||||
* Bump pytest from 7.0.0 to 7.0.1 by @dependabot in https://github.com/soxoj/maigret/pull/345
|
||||
* Bump typing-extensions from 4.0.1 to 4.1.1 by @dependabot in https://github.com/soxoj/maigret/pull/346
|
||||
* Bump lxml from 4.7.1 to 4.8.0 by @dependabot in https://github.com/soxoj/maigret/pull/350
|
||||
* Pin reportlab version by @cyb3rk0tik in https://github.com/soxoj/maigret/pull/351
|
||||
* Fix reportlab not only for testing by @cyb3rk0tik in https://github.com/soxoj/maigret/pull/352
|
||||
* Added some scripts by @soxoj in https://github.com/soxoj/maigret/pull/355
|
||||
* Added package publishing instruction by @soxoj in https://github.com/soxoj/maigret/pull/356
|
||||
* Added DB statistics autoupdate and write to sites.md by @soxoj in https://github.com/soxoj/maigret/pull/357
|
||||
* CI autoupdate by @soxoj in https://github.com/soxoj/maigret/pull/359
|
||||
* Op.gg fixes by @soxoj in https://github.com/soxoj/maigret/pull/363
|
||||
* Wikipedia fix by @soxoj in https://github.com/soxoj/maigret/pull/365
|
||||
* Disabled Netvibes and LeetCode by @soxoj in https://github.com/soxoj/maigret/pull/366
|
||||
* Fixed several false positives, improved statistics info by @soxoj in https://github.com/soxoj/maigret/pull/368
|
||||
* Fix false positives by @soxoj in https://github.com/soxoj/maigret/pull/370
|
||||
* Fixed the rest of false positives for now by @soxoj in https://github.com/soxoj/maigret/pull/371
|
||||
* Fix false positive and CI by @soxoj in https://github.com/soxoj/maigret/pull/372
|
||||
* Added new sites to data.json by @kustermariocoding in https://github.com/soxoj/maigret/pull/375
|
||||
* Fixed issue with str alexaRank by @soxoj in https://github.com/soxoj/maigret/pull/382
|
||||
* Bump tqdm from 4.62.3 to 4.63.0 by @dependabot in https://github.com/soxoj/maigret/pull/374
|
||||
* Bump pytest-asyncio from 0.18.1 to 0.18.2 by @dependabot in https://github.com/soxoj/maigret/pull/380
|
||||
* @imgbot made their first contribution in https://github.com/soxoj/maigret/pull/319
|
||||
* @kustermariocoding made their first contribution in https://github.com/soxoj/maigret/pull/375
|
||||
|
||||
**Full Changelog**: https://github.com/soxoj/maigret/compare/v0.4.1...v0.4.2
|
||||
|
||||
## [0.4.1] - 2022-01-15
|
||||
* Added dozen of sites, improved submit mode by @soxoj in https://github.com/soxoj/maigret/pull/288
|
||||
* Bump requests from 2.26.0 to 2.27.0 by @dependabot in https://github.com/soxoj/maigret/pull/292
|
||||
* changed Bayoushooter to use XenForo and foursquare to use correct checkType by @antomarsi in https://github.com/soxoj/maigret/pull/289
|
||||
* Bump requests from 2.27.0 to 2.27.1 by @dependabot in https://github.com/soxoj/maigret/pull/293
|
||||
* Added aparat.com by @soxoj in https://github.com/soxoj/maigret/pull/294
|
||||
* Fixed BongaCams, links parsing improved by @soxoj in https://github.com/soxoj/maigret/pull/297
|
||||
* Temporary fix for Twitter (#299) by @soxoj in https://github.com/soxoj/maigret/pull/300
|
||||
* Fixed TikTok checks (#303) by @soxoj in https://github.com/soxoj/maigret/pull/306
|
||||
* Bump pycountry from 20.7.3 to 22.1.10 by @dependabot in https://github.com/soxoj/maigret/pull/313
|
||||
* Pornhub search improved by @soxoj in https://github.com/soxoj/maigret/pull/315
|
||||
* Codacademy fixed by @soxoj in https://github.com/soxoj/maigret/pull/316
|
||||
* Bump pytest-asyncio from 0.16.0 to 0.17.0 by @dependabot in https://github.com/soxoj/maigret/pull/314
|
||||
|
||||
**Full Changelog**: https://github.com/soxoj/maigret/compare/v0.4.0...v0.4.1
|
||||
|
||||
## [0.4.0] - 2022-01-03
|
||||
* Delayed import of requests module, speed check command, reqs updated by @soxoj in https://github.com/soxoj/maigret/pull/189
|
||||
* Snapcraft yaml added by @soxoj in https://github.com/soxoj/maigret/pull/190
|
||||
* Create codeql-analysis.yml by @soxoj in https://github.com/soxoj/maigret/pull/191
|
||||
* Move wiki pages to ReadTheDocs by @egornagornov in https://github.com/soxoj/maigret/pull/194
|
||||
* Created ReadTheDocs requirements file by @soxoj in https://github.com/soxoj/maigret/pull/195
|
||||
* Fix incompatible version requirements by @JasperJuergensen in https://github.com/soxoj/maigret/pull/196
|
||||
* Added link to documentation by @soxoj in https://github.com/soxoj/maigret/pull/198
|
||||
* Upgraded base docker image by @soxoj in https://github.com/soxoj/maigret/pull/199
|
||||
* Run CodeQL only after merge and each Saturday by @soxoj in https://github.com/soxoj/maigret/pull/201
|
||||
* Added cascade settings loading from /.maigret/settings.json and ./settings.json by @soxoj in https://github.com/soxoj/maigret/pull/200
|
||||
* Documentation and settings improved by @soxoj in https://github.com/soxoj/maigret/pull/203
|
||||
* New config options added by @soxoj in https://github.com/soxoj/maigret/pull/204
|
||||
* Added export of cli entrypoint by @soxoj in https://github.com/soxoj/maigret/pull/207
|
||||
* Removed redundant logging by @soxoj in https://github.com/soxoj/maigret/pull/210
|
||||
* PyInstaller workflow by @soxoj in https://github.com/soxoj/maigret/pull/206
|
||||
* Create bug.md by @soxoj in https://github.com/soxoj/maigret/pull/213
|
||||
* Fixed path and names of report files by @soxoj in https://github.com/soxoj/maigret/pull/216
|
||||
* Box drawing logic improved, added new settings by @soxoj in https://github.com/soxoj/maigret/pull/217
|
||||
* Fixes for win32 release by @soxoj in https://github.com/soxoj/maigret/pull/218
|
||||
* Bump six from 1.15.0 to 1.16.0 by @dependabot in https://github.com/soxoj/maigret/pull/221
|
||||
* Bump flake8 from 3.8.4 to 4.0.1 by @dependabot in https://github.com/soxoj/maigret/pull/219
|
||||
* Bump aiohttp from 3.7.4 to 3.8.0 by @dependabot in https://github.com/soxoj/maigret/pull/220
|
||||
* Bump aiohttp-socks from 0.5.5 to 0.6.0 by @dependabot in https://github.com/soxoj/maigret/pull/222
|
||||
* Bump typing-extensions from 3.7.4.3 to 3.10.0.2 by @dependabot in https://github.com/soxoj/maigret/pull/224
|
||||
* Bump multidict from 5.1.0 to 5.2.0 by @dependabot in https://github.com/soxoj/maigret/pull/225
|
||||
* Bump idna from 2.10 to 3.3 by @dependabot in https://github.com/soxoj/maigret/pull/228
|
||||
* Bump pytest-cov from 2.10.1 to 3.0.0 by @dependabot in https://github.com/soxoj/maigret/pull/227
|
||||
* Bump mock from 4.0.2 to 4.0.3 by @dependabot in https://github.com/soxoj/maigret/pull/226
|
||||
* Bump certifi from 2020.12.5 to 2021.10.8 by @dependabot in https://github.com/soxoj/maigret/pull/233
|
||||
* Bump pytest-httpserver from 1.0.0 to 1.0.2 by @dependabot in https://github.com/soxoj/maigret/pull/232
|
||||
* Bump lxml from 4.6.3 to 4.6.4 by @dependabot in https://github.com/soxoj/maigret/pull/231
|
||||
* Bump pefile from 2019.4.18 to 2021.9.3 by @dependabot in https://github.com/soxoj/maigret/pull/229
|
||||
* Bump pytest-rerunfailures from 9.1.1 to 10.2 by @dependabot in https://github.com/soxoj/maigret/pull/230
|
||||
* Bump yarl from 1.6.3 to 1.7.2 by @dependabot in https://github.com/soxoj/maigret/pull/237
|
||||
* Bump async-timeout from 4.0.0 to 4.0.1 by @dependabot in https://github.com/soxoj/maigret/pull/236
|
||||
* Bump psutil from 5.7.0 to 5.8.0 by @dependabot in https://github.com/soxoj/maigret/pull/234
|
||||
* Bump jinja2 from 3.0.2 to 3.0.3 by @dependabot in https://github.com/soxoj/maigret/pull/235
|
||||
* Bump pytest from 6.2.4 to 6.2.5 by @dependabot in https://github.com/soxoj/maigret/pull/238
|
||||
* Bump tqdm from 4.55.0 to 4.62.3 by @dependabot in https://github.com/soxoj/maigret/pull/242
|
||||
* Bump arabic-reshaper from 2.1.1 to 2.1.3 by @dependabot in https://github.com/soxoj/maigret/pull/243
|
||||
* Bump pytest-asyncio from 0.14.0 to 0.16.0 by @dependabot in https://github.com/soxoj/maigret/pull/240
|
||||
* Bump chardet from 3.0.4 to 4.0.0 by @dependabot in https://github.com/soxoj/maigret/pull/241
|
||||
* Bump soupsieve from 2.1 to 2.3.1 by @dependabot in https://github.com/soxoj/maigret/pull/239
|
||||
* Bump aiohttp from 3.8.0 to 3.8.1 by @dependabot in https://github.com/soxoj/maigret/pull/246
|
||||
* Bump typing-extensions from 3.10.0.2 to 4.0.0 by @dependabot in https://github.com/soxoj/maigret/pull/245
|
||||
* Bump aiohttp-socks from 0.6.0 to 0.6.1 by @dependabot in https://github.com/soxoj/maigret/pull/249
|
||||
* Bump aiohttp-socks from 0.6.1 to 0.7.1 by @dependabot in https://github.com/soxoj/maigret/pull/250
|
||||
* Bump typing-extensions from 4.0.0 to 4.0.1 by @dependabot in https://github.com/soxoj/maigret/pull/253
|
||||
* Fixed some false positives by @soxoj in https://github.com/soxoj/maigret/pull/254
|
||||
* Disabled non-working sites by @soxoj in https://github.com/soxoj/maigret/pull/255
|
||||
* Added false results buttons to reports, fixed some falses by @soxoj in https://github.com/soxoj/maigret/pull/256
|
||||
* Fixed xHamster, added support of proxies to self-check mode by @soxoj in https://github.com/soxoj/maigret/pull/259
|
||||
* Disabled non-working sites, updated public sites list by @soxoj in https://github.com/soxoj/maigret/pull/263
|
||||
* Bump lxml from 4.6.4 to 4.6.5 by @dependabot in https://github.com/soxoj/maigret/pull/266
|
||||
* Bump lxml from 4.6.5 to 4.7.1 by @dependabot in https://github.com/soxoj/maigret/pull/269
|
||||
* Bump pytest-httpserver from 1.0.2 to 1.0.3 by @dependabot in https://github.com/soxoj/maigret/pull/270
|
||||
* Fixed failed tests (thx to Meta aka Facebook) by @soxoj in https://github.com/soxoj/maigret/pull/273
|
||||
* Fixed votetags, updated issue template by @soxoj in https://github.com/soxoj/maigret/pull/278
|
||||
* Bump async-timeout from 4.0.1 to 4.0.2 by @dependabot in https://github.com/soxoj/maigret/pull/275
|
||||
* Fixed some false positives by @soxoj in https://github.com/soxoj/maigret/pull/280
|
||||
* Bump attrs from 21.2.0 to 21.3.0 by @dependabot in https://github.com/soxoj/maigret/pull/281
|
||||
* Bump psutil from 5.8.0 to 5.9.0 by @dependabot in https://github.com/soxoj/maigret/pull/282
|
||||
* Bump attrs from 21.3.0 to 21.4.0 by @dependabot in https://github.com/soxoj/maigret/pull/283
|
||||
|
||||
**Full Changelog**: https://github.com/soxoj/maigret/compare/v0.3.1...v0.4.0
|
||||
|
||||
## [0.3.1] - 2021-10-31
|
||||
* fixed false positives
|
||||
* accelerated maigret start time by 3 times
|
||||
|
||||
## [0.3.0] - 2021-06-02
|
||||
* added support of Tor and I2P sites
|
||||
* added experimental DNS checking feature
|
||||
* implemented sorting by data points for reports
|
||||
* reports fixes
|
||||
|
||||
## [0.2.4] - 2021-05-18
|
||||
* cli output report
|
||||
* various improvements
|
||||
|
||||
## [0.2.3] - 2021-05-12
|
||||
* added Yelp and yelp_userid support
|
||||
* tags markup stabilization
|
||||
* improved errors detection
|
||||
|
||||
## [0.2.2] - 2021-05-07
|
||||
* improved ids extractors
|
||||
* updated sites and engines
|
||||
* updates CLI options
|
||||
|
||||
## [0.2.1] - 2021-05-02
|
||||
* fixed json reports generation bug, added tests
|
||||
|
||||
## [0.2.0] - 2021-05-02
|
||||
* added `--retries` option
|
||||
* added `source` feature for sites' mirrors
|
||||
|
||||
@@ -0,0 +1,128 @@
|
||||
# Contributor Covenant Code of Conduct
|
||||
|
||||
## Our Pledge
|
||||
|
||||
We as members, contributors, and leaders pledge to make participation in our
|
||||
community a harassment-free experience for everyone, regardless of age, body
|
||||
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
||||
identity and expression, level of experience, education, socio-economic status,
|
||||
nationality, personal appearance, race, religion, or sexual identity
|
||||
and orientation.
|
||||
|
||||
We pledge to act and interact in ways that contribute to an open, welcoming,
|
||||
diverse, inclusive, and healthy community.
|
||||
|
||||
## Our Standards
|
||||
|
||||
Examples of behavior that contributes to a positive environment for our
|
||||
community include:
|
||||
|
||||
* Demonstrating empathy and kindness toward other people
|
||||
* Being respectful of differing opinions, viewpoints, and experiences
|
||||
* Giving and gracefully accepting constructive feedback
|
||||
* Accepting responsibility and apologizing to those affected by our mistakes,
|
||||
and learning from the experience
|
||||
* Focusing on what is best not just for us as individuals, but for the
|
||||
overall community
|
||||
|
||||
Examples of unacceptable behavior include:
|
||||
|
||||
* The use of sexualized language or imagery, and sexual attention or
|
||||
advances of any kind
|
||||
* Trolling, insulting or derogatory comments, and personal or political attacks
|
||||
* Public or private harassment
|
||||
* Publishing others' private information, such as a physical or email
|
||||
address, without their explicit permission
|
||||
* Other conduct which could reasonably be considered inappropriate in a
|
||||
professional setting
|
||||
|
||||
## Enforcement Responsibilities
|
||||
|
||||
Community leaders are responsible for clarifying and enforcing our standards of
|
||||
acceptable behavior and will take appropriate and fair corrective action in
|
||||
response to any behavior that they deem inappropriate, threatening, offensive,
|
||||
or harmful.
|
||||
|
||||
Community leaders have the right and responsibility to remove, edit, or reject
|
||||
comments, commits, code, wiki edits, issues, and other contributions that are
|
||||
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
||||
decisions when appropriate.
|
||||
|
||||
## Scope
|
||||
|
||||
This Code of Conduct applies within all community spaces, and also applies when
|
||||
an individual is officially representing the community in public spaces.
|
||||
Examples of representing our community include using an official e-mail address,
|
||||
posting via an official social media account, or acting as an appointed
|
||||
representative at an online or offline event.
|
||||
|
||||
## Enforcement
|
||||
|
||||
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||
reported to the community leaders responsible for enforcement at
|
||||
https://t.me/soxoj.
|
||||
All complaints will be reviewed and investigated promptly and fairly.
|
||||
|
||||
All community leaders are obligated to respect the privacy and security of the
|
||||
reporter of any incident.
|
||||
|
||||
## Enforcement Guidelines
|
||||
|
||||
Community leaders will follow these Community Impact Guidelines in determining
|
||||
the consequences for any action they deem in violation of this Code of Conduct:
|
||||
|
||||
### 1. Correction
|
||||
|
||||
**Community Impact**: Use of inappropriate language or other behavior deemed
|
||||
unprofessional or unwelcome in the community.
|
||||
|
||||
**Consequence**: A private, written warning from community leaders, providing
|
||||
clarity around the nature of the violation and an explanation of why the
|
||||
behavior was inappropriate. A public apology may be requested.
|
||||
|
||||
### 2. Warning
|
||||
|
||||
**Community Impact**: A violation through a single incident or series
|
||||
of actions.
|
||||
|
||||
**Consequence**: A warning with consequences for continued behavior. No
|
||||
interaction with the people involved, including unsolicited interaction with
|
||||
those enforcing the Code of Conduct, for a specified period of time. This
|
||||
includes avoiding interactions in community spaces as well as external channels
|
||||
like social media. Violating these terms may lead to a temporary or
|
||||
permanent ban.
|
||||
|
||||
### 3. Temporary Ban
|
||||
|
||||
**Community Impact**: A serious violation of community standards, including
|
||||
sustained inappropriate behavior.
|
||||
|
||||
**Consequence**: A temporary ban from any sort of interaction or public
|
||||
communication with the community for a specified period of time. No public or
|
||||
private interaction with the people involved, including unsolicited interaction
|
||||
with those enforcing the Code of Conduct, is allowed during this period.
|
||||
Violating these terms may lead to a permanent ban.
|
||||
|
||||
### 4. Permanent Ban
|
||||
|
||||
**Community Impact**: Demonstrating a pattern of violation of community
|
||||
standards, including sustained inappropriate behavior, harassment of an
|
||||
individual, or aggression toward or disparagement of classes of individuals.
|
||||
|
||||
**Consequence**: A permanent ban from any sort of public interaction within
|
||||
the community.
|
||||
|
||||
## Attribution
|
||||
|
||||
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
||||
version 2.0, available at
|
||||
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
|
||||
|
||||
Community Impact Guidelines were inspired by [Mozilla's code of conduct
|
||||
enforcement ladder](https://github.com/mozilla/diversity).
|
||||
|
||||
[homepage]: https://www.contributor-covenant.org
|
||||
|
||||
For answers to common questions about this code of conduct, see the FAQ at
|
||||
https://www.contributor-covenant.org/faq. Translations are available at
|
||||
https://www.contributor-covenant.org/translations.
|
||||
@@ -0,0 +1,30 @@
|
||||
# How to contribute
|
||||
|
||||
Hey! I'm really glad you're reading this. Maigret contains a lot of sites, and it is very hard to keep all the sites operational. That's why any fix is important.
|
||||
|
||||
## How to add a new site
|
||||
|
||||
#### Beginner level
|
||||
|
||||
You can use Maigret **submit mode** (`maigret --submit URL`) to add a new site or update an existing site. In this mode Maigret does an automatic analysis of the given account URL or site main page URL to determine the site engine and methods to check account presence. After checking, Maigret asks if you want to add the site; answering y/Y will rewrite the local database.
|
||||
|
||||
#### Advanced level
|
||||
|
||||
You can edit [the database JSON file](https://github.com/soxoj/maigret/blob/main/maigret/resources/data.json) (`./maigret/resources/data.json`) manually.
|
||||
|
||||
## Testing
|
||||
|
||||
There are CI checks for every PR to the Maigret repository. But it is better to run `make format`, `make lint` and `make test` to ensure you've made correct changes.
|
||||
|
||||
## Submitting changes
|
||||
|
||||
To submit your changes you must [send a GitHub PR](https://github.com/soxoj/maigret/pulls) to the Maigret project.
|
||||
Always write a clear log message for your commits. One-line messages are fine for small changes, but bigger changes should look like this:
|
||||
|
||||
$ git commit -m "A brief summary of the commit
|
||||
>
|
||||
> A paragraph describing what changed and its impact."
|
||||
|
||||
## Coding conventions
|
||||
|
||||
Start reading the code and you'll get the hang of it. ;)
|
||||
@@ -1,25 +1,16 @@
|
||||
FROM python:3.7
|
||||
LABEL maintainer="Soxoj <soxoj@protonmail.com>"
|
||||
|
||||
FROM python:3.9-slim
|
||||
MAINTAINER Soxoj <soxoj@protonmail.com>
|
||||
WORKDIR /app
|
||||
|
||||
ADD requirements.txt .
|
||||
|
||||
RUN pip install --upgrade pip
|
||||
|
||||
RUN apt update -y
|
||||
|
||||
RUN apt install -y\
|
||||
RUN apt update && \
|
||||
apt install -y \
|
||||
gcc \
|
||||
musl-dev \
|
||||
libxml2 \
|
||||
libxml2-dev \
|
||||
libxslt-dev \
|
||||
&& YARL_NO_EXTENSIONS=1 python3 -m pip install maigret \
|
||||
&& rm -rf /var/cache/apk/* \
|
||||
/tmp/* \
|
||||
/var/tmp/*
|
||||
|
||||
libxslt-dev
|
||||
RUN apt clean \
|
||||
&& rm -rf /var/lib/apt/lists/* /tmp/*
|
||||
ADD . .
|
||||
|
||||
RUN YARL_NO_EXTENSIONS=1 python3 -m pip install .
|
||||
ENTRYPOINT ["maigret"]
|
||||
|
||||
@@ -0,0 +1,41 @@
|
||||
LINT_FILES=maigret wizard.py tests
|
||||
|
||||
test:
|
||||
coverage run --source=./maigret -m pytest tests
|
||||
coverage report -m
|
||||
coverage html
|
||||
|
||||
rerun-tests:
|
||||
pytest --lf -vv
|
||||
|
||||
lint:
|
||||
@echo 'syntax errors or undefined names'
|
||||
flake8 --count --select=E9,F63,F7,F82 --show-source --statistics ${LINT_FILES} maigret.py
|
||||
|
||||
@echo 'warning'
|
||||
flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503,E501 ${LINT_FILES} maigret.py
|
||||
|
||||
@echo 'mypy'
|
||||
mypy ${LINT_FILES}
|
||||
|
||||
speed:
|
||||
time python3 ./maigret.py --version
|
||||
python3 -c "import timeit; t = timeit.Timer('import maigret'); print(t.timeit(number = 1000000))"
|
||||
python3 -X importtime -c "import maigret" 2> maigret-import.log
|
||||
python3 -m tuna maigret-import.log
|
||||
|
||||
format:
|
||||
@echo 'black'
|
||||
black --skip-string-normalization ${LINT_FILES}
|
||||
|
||||
pull:
|
||||
git stash
|
||||
git checkout main
|
||||
git pull origin main
|
||||
git stash pop
|
||||
|
||||
# Remove generated artifacts: report output, the HTML coverage report
# written by `coverage html` (htmlcov), and built distributions.
clean:
	rm -rf reports htmlcov dist
|
||||
|
||||
install:
|
||||
pip3 install .
|
||||
@@ -1,40 +1,58 @@
|
||||
# Maigret
|
||||
|
||||

|
||||

|
||||
[](https://gitter.im/maigret-osint/community)
|
||||
|
||||
<p align="center">
|
||||
<img src="./static/maigret.png" />
|
||||
<p align="center">
|
||||
<a href="https://pypi.org/project/maigret/">
|
||||
<img alt="PyPI" src="https://img.shields.io/pypi/v/maigret?style=flat-square">
|
||||
</a>
|
||||
<a href="https://pypi.org/project/maigret/">
|
||||
<img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dw/maigret?style=flat-square">
|
||||
</a>
|
||||
<a href="https://pypi.org/project/maigret/">
|
||||
<img alt="Views" src="https://komarev.com/ghpvc/?username=maigret&color=brightgreen&label=views&style=flat-square">
|
||||
</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<img src="https://raw.githubusercontent.com/soxoj/maigret/main/static/maigret.png" height="200"/>
|
||||
</p>
|
||||
</p>
|
||||
|
||||
<i>The Commissioner Jules Maigret is a fictional French police detective, created by Georges Simenon. His investigation method is based on understanding the personality of different people and their interactions.</i>
|
||||
|
||||
## About
|
||||
|
||||
Purpose of Maigret - **collect a dossier on a person by username only**, checking for accounts on a huge number of sites.
|
||||
**Maigret** collects a dossier on a person **by username only**, checking for accounts on a huge number of sites and gathering all the available information from web pages. No API keys required. Maigret is an easy-to-use and powerful fork of [Sherlock](https://github.com/sherlock-project/sherlock).
|
||||
|
||||
This is a [sherlock](https://github.com/sherlock-project/) fork with cool features under heavy development.
|
||||
*Don't forget to regularly update source code from repo*.
|
||||
|
||||
Currently supported more than 2000 sites ([full list](./sites.md)), by default search is launched against 500 popular sites in descending order of popularity.
|
||||
Currently more than 2500 sites are supported ([full list](https://github.com/soxoj/maigret/blob/main/sites.md)); by default, the search is launched against the 500 most popular sites in descending order of popularity. Checking of Tor sites, I2P sites, and domains (via DNS resolving) is also supported.
|
||||
|
||||
## Main features
|
||||
|
||||
* Profile pages parsing, [extracting](https://github.com/soxoj/socid_extractor) personal info, links to other profiles, etc.
|
||||
* Recursive search by new usernames found
|
||||
* Profile pages parsing, [extraction](https://github.com/soxoj/socid_extractor) of personal info, links to other profiles, etc.
|
||||
* Recursive search by new usernames and other ids found
|
||||
* Search by tags (site categories, countries)
|
||||
* Censorship and captcha detection
|
||||
* Very few false positives
|
||||
* Failed requests' restarts
|
||||
* Requests retries
|
||||
|
||||
See full description of Maigret features [in the documentation](https://maigret.readthedocs.io/en/latest/features.html).
|
||||
|
||||
## Installation
|
||||
|
||||
**NOTE**: Python 3.6 or higher and pip is required.
|
||||
Maigret can be installed using pip, Docker, or simply can be launched from the cloned repo.
|
||||
|
||||
**Python 3.8 is recommended.**
|
||||
Standalone EXE-binaries for Windows are located in [Releases section](https://github.com/soxoj/maigret/releases) of GitHub repository.
|
||||
|
||||
Also you can run Maigret using cloud shells and Jupyter notebooks (see buttons below).
|
||||
|
||||
[](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md)
|
||||
<a href="https://repl.it/github/soxoj/maigret"><img src="https://user-images.githubusercontent.com/27065646/92304596-bf719b00-ef7f-11ea-987f-2c1f3c323088.png" alt="Run on Repl.it" height="50"></a>
|
||||
|
||||
<a href="https://colab.research.google.com/gist/soxoj/879b51bc3b2f8b695abb054090645000/maigret-collab.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="45"></a>
|
||||
<a href="https://mybinder.org/v2/gist/soxoj/9d65c2f4d3bec5dd25949197ea73cf3a/HEAD"><img src="https://mybinder.org/badge_logo.svg" alt="Open In Binder" height="45"></a>
|
||||
|
||||
### Package installing
|
||||
|
||||
**NOTE**: Python 3.6 or higher and pip is required, **Python 3.8 is recommended.**
|
||||
|
||||
```bash
|
||||
# install from pypi
|
||||
pip3 install maigret
|
||||
@@ -42,34 +60,36 @@ pip3 install maigret
|
||||
# or clone and install manually
|
||||
git clone https://github.com/soxoj/maigret && cd maigret
|
||||
pip3 install .
|
||||
|
||||
# usage
|
||||
maigret username
|
||||
```
|
||||
|
||||
### Cloning a repository
|
||||
|
||||
```bash
|
||||
git clone https://github.com/soxoj/maigret && cd maigret
|
||||
```
|
||||
|
||||
You can use a free virtual machine, the repo will be automatically cloned:
|
||||
|
||||
[](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md) [](https://repl.it/github/soxoj/maigret)
|
||||
<a href="https://colab.research.google.com/gist//soxoj/879b51bc3b2f8b695abb054090645000/maigret.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="40"></a>
|
||||
|
||||
```bash
|
||||
pip3 install -r requirements.txt
|
||||
|
||||
# usage
|
||||
./maigret.py username
|
||||
```
|
||||
|
||||
## Using examples
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
# for a cloned repo
|
||||
./maigret.py user
|
||||
# official image
|
||||
docker pull soxoj/maigret
|
||||
|
||||
# for a package
|
||||
maigret user
|
||||
# usage
|
||||
docker run soxoj/maigret:latest username
|
||||
|
||||
# manual build
|
||||
docker build -t maigret .
|
||||
```
|
||||
|
||||
Features:
|
||||
## Usage examples
|
||||
|
||||
```bash
|
||||
# make HTML and PDF reports
|
||||
maigret user --html --pdf
|
||||
@@ -77,35 +97,25 @@ maigret user --html --pdf
|
||||
# search on sites marked with tags photo & dating
|
||||
maigret user --tags photo,dating
|
||||
|
||||
|
||||
# search for three usernames on all available sites
|
||||
maigret user1 user2 user3 -a
|
||||
|
||||
```
|
||||
|
||||
Run `maigret --help` to get arguments description. Also options are documented in [the Maigret Wiki](https://github.com/soxoj/maigret/wiki/Command-line-options).
|
||||
Use `maigret --help` to get full options description. Also options [are documented](https://maigret.readthedocs.io/en/latest/command-line-options.html).
|
||||
|
||||
With Docker:
|
||||
```
|
||||
# manual build
|
||||
docker build -t maigret . && docker run maigret user
|
||||
|
||||
# official image
|
||||
docker run soxoj/maigret:latest user
|
||||
```
|
||||
|
||||
## Demo with page parsing and recursive username search
|
||||
|
||||
[PDF report](./static/report_alexaimephotographycars.pdf), [HTML report](https://htmlpreview.github.io/?https://raw.githubusercontent.com/soxoj/maigret/main/static/report_alexaimephotographycars.html)
|
||||
[PDF report](https://raw.githubusercontent.com/soxoj/maigret/main/static/report_alexaimephotographycars.pdf), [HTML report](https://htmlpreview.github.io/?https://raw.githubusercontent.com/soxoj/maigret/main/static/report_alexaimephotographycars.html)
|
||||
|
||||

|
||||

|
||||
|
||||

|
||||

|
||||
|
||||

|
||||

|
||||
|
||||
|
||||
[Full console output](./static/recursive_search.md)
|
||||
[Full console output](https://raw.githubusercontent.com/soxoj/maigret/main/static/recursive_search.md)
|
||||
|
||||
## License
|
||||
|
||||
|
||||
@@ -0,0 +1,18 @@
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import sys
|
||||
|
||||
from maigret.maigret import main
|
||||
|
||||
|
||||
def run():
    """Run the awaitable returned by ``main()`` until it completes.

    On ``KeyboardInterrupt`` a short notice is printed and the process
    exits with status code 1.
    """
    try:
        asyncio.get_event_loop().run_until_complete(main())
    except KeyboardInterrupt:
        print('Maigret is interrupted.')
        sys.exit(1)


if __name__ == "__main__":
    run()
|
||||
@@ -0,0 +1,20 @@
|
||||
# Minimal makefile for Sphinx documentation
|
||||
#
|
||||
|
||||
# You can set these variables from the command line, and also
|
||||
# from the environment for the first two.
|
||||
SPHINXOPTS ?=
|
||||
SPHINXBUILD ?= sphinx-build
|
||||
SOURCEDIR = source
|
||||
BUILDDIR = build
|
||||
|
||||
# Put it first so that "make" without argument is like "make help".
|
||||
help:
|
||||
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
|
||||
.PHONY: help Makefile
|
||||
|
||||
# Catch-all target: route all unknown targets to Sphinx using the new
|
||||
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
|
||||
%: Makefile
|
||||
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
|
||||
@@ -0,0 +1,35 @@
|
||||
@ECHO OFF
|
||||
|
||||
pushd %~dp0
|
||||
|
||||
REM Command file for Sphinx documentation
|
||||
|
||||
if "%SPHINXBUILD%" == "" (
|
||||
set SPHINXBUILD=sphinx-build
|
||||
)
|
||||
set SOURCEDIR=source
|
||||
set BUILDDIR=build
|
||||
|
||||
if "%1" == "" goto help
|
||||
|
||||
%SPHINXBUILD% >NUL 2>NUL
|
||||
if errorlevel 9009 (
|
||||
echo.
|
||||
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
|
||||
echo.installed, then set the SPHINXBUILD environment variable to point
|
||||
echo.to the full path of the 'sphinx-build' executable. Alternatively you
|
||||
echo.may add the Sphinx directory to PATH.
|
||||
echo.
|
||||
echo.If you don't have Sphinx installed, grab it from
|
||||
echo.http://sphinx-doc.org/
|
||||
exit /b 1
|
||||
)
|
||||
|
||||
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
goto end
|
||||
|
||||
:help
|
||||
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
|
||||
|
||||
:end
|
||||
popd
|
||||
@@ -0,0 +1 @@
|
||||
sphinx-copybutton
|
||||
@@ -0,0 +1,127 @@
|
||||
.. _command-line-options:
|
||||
|
||||
Command line options
|
||||
====================
|
||||
|
||||
Usernames
|
||||
---------
|
||||
|
||||
``maigret username1 username2 ...``
|
||||
|
||||
You can specify several usernames separated by space. Usernames are
|
||||
**not** mandatory as there are other operations modes (see below).
|
||||
|
||||
Parsing of account pages and online documents
|
||||
---------------------------------------------
|
||||
|
||||
``maigret --parse URL``
|
||||
|
||||
Maigret will try to extract information about the document/account owner
|
||||
(including username and other ids) and will make a search by the
|
||||
extracted username and ids. :doc:`Examples <extracting-information-from-pages>`.
|
||||
|
||||
Main options
|
||||
------------
|
||||
|
||||
Options are also configurable through settings files, see
|
||||
:doc:`settings section <settings>`.
|
||||
|
||||
``--tags`` - Filter sites for searching by tags: sites categories and
|
||||
two-letter country codes. E.g. photo, dating, sport; jp, us, global.
|
||||
Multiple tags can be associated with one site. **Warning: tags markup is
|
||||
not stable now.**
|
||||
|
||||
``-n``, ``--max-connections`` - Allowed number of concurrent connections
|
||||
**(default: 100)**.
|
||||
|
||||
``-a``, ``--all-sites`` - Use all sites for scan **(default: top 500)**.
|
||||
|
||||
``--top-sites`` - Count of sites for scan ranked by Alexa Top
|
||||
**(default: top 500)**.
|
||||
|
||||
``--timeout`` - Time (in seconds) to wait for responses from sites
|
||||
**(default: 30)**. A longer timeout will be more likely to get results
|
||||
from slow sites. On the other hand, this may cause a long delay to
|
||||
gather all results. The choice of the right timeout should be carried
|
||||
out taking into account the bandwidth of the Internet connection.
|
||||
|
||||
``--cookies-jar-file`` - File with custom cookies in Netscape format
|
||||
(aka cookies.txt). You can install an extension to your browser to
|
||||
download own cookies (`Chrome <https://chrome.google.com/webstore/detail/get-cookiestxt/bgaddhkoddajcdgocldbbfleckgcbcid>`_, `Firefox <https://addons.mozilla.org/en-US/firefox/addon/cookies-txt/>`_).
|
||||
|
||||
``--no-recursion`` - Disable parsing pages for other usernames and
|
||||
recursive search by them.
|
||||
|
||||
``--use-disabled-sites`` - Use disabled sites to search (may cause many
|
||||
false positives).
|
||||
|
||||
``--id-type`` - Specify identifier(s) type (default: username).
|
||||
Supported types: gaia_id, vk_id, yandex_public_id, ok_id, wikimapia_uid.
|
||||
Currently, you must add ``-a`` flag to run a scan on sites with custom
|
||||
id types, sites will be filtered automatically.
|
||||
|
||||
``--ignore-ids`` - Do not make search by the specified username or other
|
||||
ids. Useful for repeated scanning with found known irrelevant usernames.
|
||||
|
||||
``--db`` - Load Maigret database from a JSON file or an online, valid,
|
||||
JSON file.
|
||||
|
||||
``--retries RETRIES`` - Count of attempts to restart temporarily failed
|
||||
requests.
|
||||
|
||||
Reports
|
||||
-------
|
||||
|
||||
``-P``, ``--pdf`` - Generate a PDF report (general report on all
|
||||
usernames).
|
||||
|
||||
``-H``, ``--html`` - Generate an HTML report file (general report on all
|
||||
usernames).
|
||||
|
||||
``-X``, ``--xmind`` - Generate an XMind 8 mindmap (one report per
|
||||
username).
|
||||
|
||||
``-C``, ``--csv`` - Generate a CSV report (one report per username).
|
||||
|
||||
``-T``, ``--txt`` - Generate a TXT report (one report per username).
|
||||
|
||||
``-J``, ``--json`` - Generate a JSON report of specific type: simple,
|
||||
ndjson (one report per username). E.g. ``--json ndjson``
|
||||
|
||||
``-fo``, ``--folderoutput`` - Results will be saved to this folder,
|
||||
``results`` by default. Will be created if it doesn’t exist.
|
||||
|
||||
Output options
|
||||
--------------
|
||||
|
||||
``-v``, ``--verbose`` - Display extra information and metrics.
|
||||
*(loglevel=WARNING)*
|
||||
|
||||
``-vv``, ``--info`` - Display service information. *(loglevel=INFO)*
|
||||
|
||||
``-vvv``, ``--debug``, ``-d`` - Display debugging information and site
|
||||
responses. *(loglevel=DEBUG)*
|
||||
|
||||
``--print-not-found`` - Print sites where the username was not found.
|
||||
|
||||
``--print-errors`` - Print error messages: connection, captcha, site
|
||||
country ban, etc.
|
||||
|
||||
Other operations modes
|
||||
----------------------
|
||||
|
||||
``--version`` - Display version information and dependencies.
|
||||
|
||||
``--self-check`` - Do self-checking for sites and database and disable
|
||||
non-working ones **for current search session** by default. It’s useful
|
||||
for testing new internet connection (it depends on provider/hosting on
|
||||
which sites there will be censorship stub or captcha display). After
|
||||
checking Maigret asks if you want to save updates, answering y/Y will
|
||||
rewrite the local database.
|
||||
|
||||
``--submit URL`` - Do an automatic analysis of the given account URL or
|
||||
site main page URL to determine the site engine and methods to check
|
||||
account presence. After checking Maigret asks if you want to add the
|
||||
site, answering y/Y will rewrite the local database.
|
||||
|
||||
|
||||
@@ -0,0 +1,36 @@
|
||||
# Configuration file for the Sphinx documentation builder.
|
||||
|
||||
# -- Project information
|
||||
|
||||
project = 'Maigret'
|
||||
copyright = '2021, soxoj'
|
||||
author = 'soxoj'
|
||||
|
||||
release = '0.4.2'
|
||||
version = '0.4.2'
|
||||
|
||||
# -- General configuration
|
||||
|
||||
extensions = [
|
||||
'sphinx.ext.duration',
|
||||
'sphinx.ext.doctest',
|
||||
'sphinx.ext.autodoc',
|
||||
'sphinx.ext.autosummary',
|
||||
'sphinx.ext.intersphinx',
|
||||
'sphinx_copybutton'
|
||||
]
|
||||
|
||||
intersphinx_mapping = {
|
||||
'python': ('https://docs.python.org/3/', None),
|
||||
'sphinx': ('https://www.sphinx-doc.org/en/master/', None),
|
||||
}
|
||||
intersphinx_disabled_domains = ['std']
|
||||
|
||||
templates_path = ['_templates']
|
||||
|
||||
# -- Options for HTML output
|
||||
|
||||
html_theme = 'sphinx_rtd_theme'
|
||||
|
||||
# -- Options for EPUB output
|
||||
epub_show_urls = 'footnote'
|
||||
@@ -0,0 +1,101 @@
|
||||
.. _development:
|
||||
|
||||
Development
|
||||
==============
|
||||
|
||||
Testing
|
||||
-------
|
||||
|
||||
It is recommended to use Python 3.7/3.8 for testing due to some conflicts in 3.9.
|
||||
|
||||
Install test requirements:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
pip install -r test-requirements.txt
|
||||
|
||||
|
||||
Use the following commands to check Maigret:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
# run linter and typing checks
|
||||
# order of checks:
|
||||
# - critical syntax errors or undefined names
|
||||
# - flake checks
|
||||
# - mypy checks
|
||||
make lint
|
||||
|
||||
# run testing with coverage html report
|
||||
# current test coverage is 60%
|
||||
make test
|
||||
|
||||
# open html report
|
||||
open htmlcov/index.html
|
||||
|
||||
|
||||
How to publish new version of Maigret
|
||||
-------------------------------------
|
||||
|
||||
**Collaborator rights are required; write to Soxoj to get them**.
|
||||
|
||||
For new version publishing you must create a new branch in repository
|
||||
with a bumped version number and actual changelog first. After it you
|
||||
must create a release, and a GitHub action will automatically create a new
|
||||
PyPi package.
|
||||
|
||||
- New branch example: https://github.com/soxoj/maigret/commit/e520418f6a25d7edacde2d73b41a8ae7c80ddf39
|
||||
- Release example: https://github.com/soxoj/maigret/releases/tag/v0.4.1
|
||||
|
||||
1. Make a new branch locally with a new version name. Check the current version number here: https://pypi.org/project/maigret/.
|
||||
**Increase only patch version (third number)** if there are no breaking changes.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
git checkout -b 0.4.0
|
||||
|
||||
2. Update Maigret version in three files manually:
|
||||
|
||||
- setup.py
|
||||
- maigret/__version__.py
|
||||
- docs/source/conf.py
|
||||
|
||||
3. Create a new empty text section in the beginning of the file `CHANGELOG.md` with a current date:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
## [0.4.0] - 2022-01-03
|
||||
|
||||
4. Get auto-generated release notes:
|
||||
|
||||
- Open https://github.com/soxoj/maigret/releases/new
|
||||
- Click `Choose a tag`, enter `test`
|
||||
- Click `Create new tag`
|
||||
- Press `+ Auto-generate release notes`
|
||||
- Copy all the text from description text field below
|
||||
- Paste it into the empty text section in `CHANGELOG.md`
|
||||
- Remove redundant lines `## What's Changed` and `## New Contributors` section if it exists
|
||||
- *Close the new release page*
|
||||
|
||||
5. Commit all the changes, push, make pull request
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
git add ...
|
||||
git commit -m 'Bump to 0.4.0'
|
||||
git push origin HEAD
|
||||
|
||||
|
||||
6. Merge pull request
|
||||
|
||||
7. Create new release
|
||||
|
||||
- Open https://github.com/soxoj/maigret/releases/new again
|
||||
- Click `Choose a tag`
|
||||
- Enter actual version in format `v0.4.0`
|
||||
- Also enter actual version in the field `Release title`
|
||||
- Click `Create new tag`
|
||||
- Press `+ Auto-generate release notes`
|
||||
- **Press "Publish release" button**
|
||||
|
||||
8. That's all, now you can simply wait push to PyPi. You can monitor it in Action page: https://github.com/soxoj/maigret/actions/workflows/python-publish.yml
|
||||
@@ -0,0 +1,35 @@
|
||||
.. _extracting-information-from-pages:
|
||||
|
||||
Extracting information from pages
|
||||
=================================
|
||||
Maigret can parse URLs and content of web pages by URLs to extract info about account owner and other meta information.
|
||||
|
||||
You must specify the URL with the option ``--parse``; it can be a link to an account or an online document. For the list of supported sites, `see here <https://github.com/soxoj/socid-extractor#sites>`_.
|
||||
|
||||
After the end of the parsing phase, Maigret will start the search phase by :doc:`supported identifiers <supported-identifier-types>` found (usernames, ids, etc.).
|
||||
|
||||
Examples
|
||||
--------
|
||||
.. code-block:: console
|
||||
|
||||
$ maigret --parse https://docs.google.com/spreadsheets/d/1HtZKMLRXNsZ0HjtBmo0Gi03nUPiJIA4CC4jTYbCAnXw/edit\#gid\=0
|
||||
|
||||
Scanning webpage by URL https://docs.google.com/spreadsheets/d/1HtZKMLRXNsZ0HjtBmo0Gi03nUPiJIA4CC4jTYbCAnXw/edit#gid=0...
|
||||
┣╸org_name: Gooten
|
||||
┗╸mime_type: application/vnd.google-apps.ritz
|
||||
Scanning webpage by URL https://clients6.google.com/drive/v2beta/files/1HtZKMLRXNsZ0HjtBmo0Gi03nUPiJIA4CC4jTYbCAnXw?fields=alternateLink%2CcopyRequiresWriterPermission%2CcreatedDate%2Cdescription%2CdriveId%2CfileSize%2CiconLink%2Cid%2Clabels(starred%2C%20trashed)%2ClastViewedByMeDate%2CmodifiedDate%2Cshared%2CteamDriveId%2CuserPermission(id%2Cname%2CemailAddress%2Cdomain%2Crole%2CadditionalRoles%2CphotoLink%2Ctype%2CwithLink)%2Cpermissions(id%2Cname%2CemailAddress%2Cdomain%2Crole%2CadditionalRoles%2CphotoLink%2Ctype%2CwithLink)%2Cparents(id)%2Ccapabilities(canMoveItemWithinDrive%2CcanMoveItemOutOfDrive%2CcanMoveItemOutOfTeamDrive%2CcanAddChildren%2CcanEdit%2CcanDownload%2CcanComment%2CcanMoveChildrenWithinDrive%2CcanRename%2CcanRemoveChildren%2CcanMoveItemIntoTeamDrive)%2Ckind&supportsTeamDrives=true&enforceSingleParent=true&key=AIzaSyC1eQ1xj69IdTMeii5r7brs3R90eck-m7k...
|
||||
┣╸created_at: 2016-02-16T18:51:52.021Z
|
||||
┣╸updated_at: 2019-10-23T17:15:47.157Z
|
||||
┣╸gaia_id: 15696155517366416778
|
||||
┣╸fullname: Nadia Burgess
|
||||
┣╸email: nadia@gooten.com
|
||||
┣╸image: https://lh3.googleusercontent.com/a-/AOh14GheZe1CyNa3NeJInWAl70qkip4oJ7qLsD8vDy6X=s64
|
||||
┗╸email_username: nadia
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
$ maigret.py --parse https://steamcommunity.com/profiles/76561199113454789
|
||||
Scanning webpage by URL https://steamcommunity.com/profiles/76561199113454789...
|
||||
┣╸steam_id: 76561199113454789
|
||||
┣╸nickname: Pok
|
||||
┗╸username: Machine42
|
||||
@@ -0,0 +1,76 @@
|
||||
.. _features:
|
||||
|
||||
Features
|
||||
========
|
||||
|
||||
This is the list of Maigret features.
|
||||
|
||||
Personal info gathering
|
||||
-----------------------
|
||||
|
||||
Maigret does the `parsing of accounts webpages and extraction <https://github.com/soxoj/socid-extractor>`_ of personal info, links to other profiles, etc.
|
||||
Extracted info is displayed as an additional result in CLI output and as tables in HTML and PDF reports.
|
||||
Also, Maigret uses found ids and usernames from links to start a recursive search.
|
||||
|
||||
Enabled by default, can be disabled with ``--no-extracting``.
|
||||
|
||||
Recursive search
|
||||
----------------
|
||||
|
||||
Maigret can extract some :ref:`common ids <supported-identifier-types>` and usernames from links on the account page (often people placed links to their other accounts) and immediately start new searches. All the gathered information will be displayed in CLI output and reports.
|
||||
|
||||
Enabled by default, can be disabled with ``--no-recursion``.
|
||||
|
||||
Reports
|
||||
-------
|
||||
|
||||
Maigret currently supports HTML, PDF, TXT, XMind mindmap, and JSON reports.
|
||||
|
||||
HTML/PDF reports contain:
|
||||
|
||||
- profile photo
|
||||
- all the gathered personal info
|
||||
- additional information about supposed personal data (full name, gender, location), resulting from statistics of all found accounts
|
||||
|
||||
Also, there is a short text report in the CLI output after the end of a searching phase.
|
||||
|
||||
Tags
|
||||
----
|
||||
|
||||
The Maigret sites database is very big (and will get bigger), and it may be an overhead to run a search for all the sites.
|
||||
Also, it is often hard to understand which sites are more interesting for us in the case of a certain person.
|
||||
|
||||
Tags markup allows selecting a subset of sites by interests (photo, messaging, finance, etc.) or by country. Tags of found accounts are grouped and displayed in the reports.
|
||||
|
||||
See full description :doc:`in the Tags Wiki page <tags>`.
|
||||
|
||||
Censorship and captcha detection
|
||||
--------------------------------
|
||||
|
||||
Maigret can detect common errors such as censorship stub pages, CloudFlare captcha pages, and others.
|
||||
If you get more than 3% errors of a certain type in a session, you will get a warning message in the CLI output with recommendations to improve performance and avoid problems.
|
||||
|
||||
Retries
|
||||
-------
|
||||
|
||||
Maigret will retry requests that fail with temporary errors (connection failures, proxy errors, etc.).
|
||||
|
||||
One attempt by default, can be changed with option ``--retries N``.
|
||||
|
||||
Archives and mirrors checking
|
||||
-----------------------------
|
||||
|
||||
The Maigret database contains not only the original websites, but also mirrors, archives, and aggregators. For example:
|
||||
|
||||
- `Reddit BigData search <https://camas.github.io/reddit-search/>`_
|
||||
- `Picuki <https://www.picuki.com/>`_, Instagram mirror
|
||||
- `Twitter shadowban <https://shadowban.eu/>`_ checker
|
||||
|
||||
It allows getting additional info about the person and checking the existence of the account even if the main site is unavailable (bot protection, captcha, etc.)
|
||||
|
||||
Simple API
|
||||
----------
|
||||
|
||||
Maigret can be easily integrated with the use of Python package `maigret <https://pypi.org/project/maigret/>`_.
|
||||
|
||||
Example: the official `Telegram bot <https://github.com/soxoj/maigret-tg-bot>`_
|
||||
@@ -0,0 +1,31 @@
|
||||
.. _index:
|
||||
|
||||
Welcome to the Maigret docs!
|
||||
============================
|
||||
|
||||
**Maigret** is an easy-to-use and powerful OSINT tool for collecting a dossier on a person by username only.
|
||||
|
||||
This is achieved by checking for accounts on a huge number of sites and gathering all the available information from web pages.
|
||||
|
||||
The project's main goal is to give OSINT researchers and pentesters a **universal tool** to get maximum information about a subject and integrate it with other tools in automation pipelines.
|
||||
|
||||
You may be interested in:
|
||||
-------------------------
|
||||
- :doc:`Command line options description <command-line-options>` and :doc:`usage examples <usage-examples>`
|
||||
- :doc:`Features list <features>`
|
||||
- :doc:`Project roadmap <roadmap>`
|
||||
|
||||
.. toctree::
|
||||
:hidden:
|
||||
:caption: Sections
|
||||
|
||||
command-line-options
|
||||
extracting-information-from-pages
|
||||
features
|
||||
philosophy
|
||||
roadmap
|
||||
supported-identifier-types
|
||||
tags
|
||||
usage-examples
|
||||
settings
|
||||
development
|
||||
@@ -0,0 +1,6 @@
|
||||
.. _philosophy:
|
||||
|
||||
Philosophy
|
||||
==========
|
||||
|
||||
Username => Dossier
|
||||
@@ -0,0 +1,18 @@
|
||||
.. _roadmap:
|
||||
|
||||
Roadmap
|
||||
=======
|
||||
|
||||
.. figure:: https://i.imgur.com/kk8cFdR.png
|
||||
:target: https://i.imgur.com/kk8cFdR.png
|
||||
:align: center
|
||||
|
||||
Current status
|
||||
--------------
|
||||
|
||||
- Sites DB stats - ok
|
||||
- Scan sessions stats - ok
|
||||
- Site engine autodetect - ok
|
||||
- Engines for all the sites - WIP
|
||||
- Unified reporting flow - ok
|
||||
- Retries - ok
|
||||
@@ -0,0 +1,26 @@
|
||||
.. _settings:
|
||||
|
||||
Settings
|
||||
==============
|
||||
|
||||
Options are also configurable through settings files. See
|
||||
`settings JSON file <https://github.com/soxoj/maigret/blob/main/maigret/resources/settings.json>`_
|
||||
for the list of currently supported options.
|
||||
|
||||
On startup, Maigret tries to load configuration from the following sources, in exactly this order:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
# relative path, based on installed package path
|
||||
resources/settings.json
|
||||
|
||||
# absolute path, configuration file in home directory
|
||||
~/.maigret/settings.json
|
||||
|
||||
# relative path, based on current working directory
|
||||
settings.json
|
||||
|
||||
Missing any of these files is not an error.
|
||||
If a later settings file contains an already known option,
|
||||
this option will be overwritten. So it is possible to make
|
||||
custom configuration for different users and directories.
|
||||
@@ -0,0 +1,15 @@
|
||||
.. _supported-identifier-types:
|
||||
|
||||
Supported identifier types
|
||||
==========================
|
||||
|
||||
Maigret can search not only by ordinary usernames, but also by certain common identifiers. Below is the list of all currently supported identifiers.
|
||||
|
||||
- **gaia_id** - Google internal numeric user identifier; it was formerly part of a Google Plus account URL.
|
||||
- **steam_id** - Steam internal numeric user identifier.
|
||||
- **wikimapia_uid** - Wikimapia.org internal numeric user identifier.
|
||||
- **uidme_uguid** - uID.me internal numeric user identifier.
|
||||
- **yandex_public_id** - Yandex sites internal letter-based user identifier. See also: `YaSeeker <https://github.com/HowToFind-bot/YaSeeker>`_.
|
||||
- **vk_id** - VK.com internal numeric user identifier.
|
||||
- **ok_id** - OK.ru internal numeric user identifier.
|
||||
- **yelp_userid** - Yelp internal user identifier.
|
||||
@@ -0,0 +1,24 @@
|
||||
.. _tags:
|
||||
|
||||
Tags
|
||||
====
|
||||
|
||||
Tags allow you to select a subset of the sites from the large Maigret DB to search.
|
||||
|
||||
**Warning: the tag markup is not stable yet.**
|
||||
|
||||
There are several types of tags:
|
||||
|
||||
1. **Country codes**: ``us``, ``jp``, ``br``... (`ISO 3166-1 alpha-2 <https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2>`_). These tags reflect the site language and regional origin of its users and are then used to locate the owner of a username. If the regional origin is difficult to establish or a site is positioned as worldwide, *no country code is given*. There could be multiple country code tags for one site.
|
||||
|
||||
2. **Site engines**. Most of them are forum engines now: ``uCoz``, ``vBulletin``, ``XenForo`` et al. The full list of engines is stored in the Maigret database.
|
||||
|
||||
3. **Sites' subject/type and interests of its users**. For the moment, the full list of "standard" tags is `present in the source code <https://github.com/soxoj/maigret/blob/main/maigret/sites.py#L13>`_ only.
|
||||
|
||||
Usage
|
||||
-----
|
||||
``--tags us,jp`` -- search on US and Japanese sites (actually marked as such in the Maigret database)
|
||||
|
||||
``--tags coding`` -- search on sites related to software development.
|
||||
|
||||
``--tags ucoz`` -- search on uCoz sites only (mostly CIS countries)
|
||||
@@ -0,0 +1,53 @@
|
||||
.. _usage-examples:
|
||||
|
||||
Usage examples
|
||||
==============
|
||||
|
||||
Start a search for accounts with username ``machine42`` on the top 500 sites from the Maigret DB.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret machine42
|
||||
|
||||
Start a search for accounts with username ``machine42`` on **all sites** from the Maigret DB.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret machine42 -a
|
||||
|
||||
Start a search for accounts with username ``machine42`` on **all sites** from the Maigret DB and generate HTML and PDF reports.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret machine42 -a -HP
|
||||
|
||||
Start a search for accounts with username ``machine42`` only on Facebook.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret machine42 --site Facebook
|
||||
|
||||
Extract information from the Steam page by URL and start a search for accounts with the found username ``machine42``.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret --parse https://steamcommunity.com/profiles/76561199113454789
|
||||
|
||||
Start a search for accounts with username ``machine42`` only on US and Japanese sites.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret machine42 --tags us,jp
|
||||
|
||||
Start a search for accounts with username ``machine42`` only on sites related to software development.
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret machine42 --tags coding
|
||||
|
||||
Start a search for accounts with username ``machine42`` on uCoz sites only (mostly CIS countries).
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
maigret machine42 --tags ucoz
|
||||
|
||||
@@ -0,0 +1,68 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "8v6PEfyXb0Gx"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# clone the repo\n",
|
||||
"!git clone https://github.com/soxoj/maigret\n",
|
||||
"!pip3 install -r maigret/requirements.txt"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "cXOQUAhDchkl"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# help\n",
|
||||
"!python3 maigret/maigret.py --help"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "SjDmpN4QGnJu"
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# search\n",
|
||||
"!python3 maigret/maigret.py user"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"collapsed_sections": [],
|
||||
"include_colab_link": true,
|
||||
"name": "maigret.ipynb",
|
||||
"provenance": []
|
||||
},
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.7.10"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 1
|
||||
}
|
||||
@@ -1,5 +0,0 @@
|
||||
#!/bin/sh
|
||||
FILES="maigret wizard.py maigret.py tests"
|
||||
|
||||
echo 'black'
|
||||
black --skip-string-normalization $FILES
|
||||
@@ -1,11 +0,0 @@
|
||||
#!/bin/sh
|
||||
FILES="maigret wizard.py maigret.py tests"
|
||||
|
||||
echo 'syntax errors or undefined names'
|
||||
flake8 --count --select=E9,F63,F7,F82 --show-source --statistics $FILES
|
||||
|
||||
echo 'warning'
|
||||
flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503 $FILES
|
||||
|
||||
echo 'mypy'
|
||||
mypy ./maigret ./wizard.py ./tests
|
||||
@@ -1,5 +1,13 @@
|
||||
"""Maigret"""
|
||||
|
||||
__title__ = 'Maigret'
|
||||
__package__ = 'maigret'
|
||||
__author__ = 'Soxoj'
|
||||
__author_email__ = 'soxoj@protonmail.com'
|
||||
|
||||
|
||||
from .__version__ import __version__
|
||||
from .checking import maigret as search
|
||||
from .maigret import main as cli
|
||||
from .sites import MaigretEngine, MaigretSite, MaigretDatabase
|
||||
from .notify import QueryNotifyPrint as Notifier
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
"""Maigret version file"""
|
||||
|
||||
__version__ = '0.4.2'
|
||||
@@ -1,7 +1,6 @@
|
||||
from http.cookiejar import MozillaCookieJar
|
||||
from http.cookies import Morsel
|
||||
|
||||
import requests
|
||||
from aiohttp import CookieJar
|
||||
|
||||
|
||||
@@ -10,6 +9,8 @@ class ParsingActivator:
|
||||
def twitter(site, logger, cookies={}):
|
||||
headers = dict(site.headers)
|
||||
del headers["x-guest-token"]
|
||||
import requests
|
||||
|
||||
r = requests.post(site.activation["url"], headers=headers)
|
||||
logger.info(r)
|
||||
j = r.json()
|
||||
@@ -21,6 +22,8 @@ class ParsingActivator:
|
||||
headers = dict(site.headers)
|
||||
if "Authorization" in headers:
|
||||
del headers["Authorization"]
|
||||
import requests
|
||||
|
||||
r = requests.get(site.activation["url"], headers=headers)
|
||||
jwt_token = r.json()["jwt"]
|
||||
site.headers["Authorization"] = "jwt " + jwt_token
|
||||
@@ -30,30 +33,14 @@ class ParsingActivator:
|
||||
headers = dict(site.headers)
|
||||
if "Authorization" in headers:
|
||||
del headers["Authorization"]
|
||||
import requests
|
||||
|
||||
r = requests.get(site.activation["url"])
|
||||
bearer_token = r.json()["accessToken"]
|
||||
site.headers["authorization"] = f"Bearer {bearer_token}"
|
||||
|
||||
@staticmethod
|
||||
def xssis(site, logger, cookies={}):
|
||||
if not cookies:
|
||||
logger.debug("You must have cookies to activate xss.is parsing!")
|
||||
return
|
||||
|
||||
headers = dict(site.headers)
|
||||
post_data = {
|
||||
"_xfResponseType": "json",
|
||||
"_xfToken": "1611177919,a2710362e45dad9aa1da381e21941a38",
|
||||
}
|
||||
headers["content-type"] = "application/x-www-form-urlencoded; charset=UTF-8"
|
||||
r = requests.post(
|
||||
site.activation["url"], headers=headers, cookies=cookies, data=post_data
|
||||
)
|
||||
csrf = r.json()["csrf"]
|
||||
site.get_params["_xfToken"] = csrf
|
||||
|
||||
|
||||
async def import_aiohttp_cookies(cookiestxt_filename):
|
||||
def import_aiohttp_cookies(cookiestxt_filename):
|
||||
cookies_obj = MozillaCookieJar(cookiestxt_filename)
|
||||
cookies_obj.load(ignore_discard=True, ignore_expires=True)
|
||||
|
||||
|
||||
@@ -1,17 +1,24 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from mock import Mock
|
||||
|
||||
try:
|
||||
from mock import Mock
|
||||
except ImportError:
|
||||
from unittest.mock import Mock
|
||||
|
||||
import re
|
||||
import ssl
|
||||
import sys
|
||||
import tqdm
|
||||
from typing import Tuple, Optional, Dict, List
|
||||
from urllib.parse import quote
|
||||
|
||||
import aiohttp
|
||||
import aiodns
|
||||
import tqdm.asyncio
|
||||
from aiohttp_socks import ProxyConnector
|
||||
from python_socks import _errors as proxy_errors
|
||||
from socid_extractor import extract
|
||||
from aiohttp import TCPConnector, ClientSession, http_exceptions
|
||||
from aiohttp.client_exceptions import ServerDisconnectedError, ClientConnectorError
|
||||
|
||||
from .activation import ParsingActivator, import_aiohttp_cookies
|
||||
from . import errors
|
||||
@@ -21,13 +28,15 @@ from .executors import (
|
||||
AsyncioSimpleExecutor,
|
||||
AsyncioProgressbarQueueExecutor,
|
||||
)
|
||||
|
||||
from .result import QueryResult, QueryStatus
|
||||
from .sites import MaigretDatabase, MaigretSite
|
||||
from .types import QueryOptions, QueryResultWrapper
|
||||
from .utils import get_random_user_agent
|
||||
from .utils import get_random_user_agent, ascii_data_display
|
||||
|
||||
|
||||
supported_recursive_search_ids = (
|
||||
SUPPORTED_IDS = (
|
||||
"username",
|
||||
"yandex_public_id",
|
||||
"gaia_id",
|
||||
"vk_id",
|
||||
@@ -35,54 +44,155 @@ supported_recursive_search_ids = (
|
||||
"wikimapia_uid",
|
||||
"steam_id",
|
||||
"uidme_uguid",
|
||||
"yelp_userid",
|
||||
)
|
||||
|
||||
unsupported_characters = "#"
|
||||
BAD_CHARS = "#"
|
||||
|
||||
|
||||
async def get_response(request_future, logger) -> Tuple[str, int, Optional[CheckError]]:
|
||||
html_text = None
|
||||
status_code = 0
|
||||
error: Optional[CheckError] = CheckError("Unknown")
|
||||
class CheckerBase:
    """Common ancestor of the checker classes; defines no behaviour of its own."""
|
||||
|
||||
try:
|
||||
response = await request_future
|
||||
|
||||
status_code = response.status
|
||||
response_content = await response.content.read()
|
||||
charset = response.charset or "utf-8"
|
||||
decoded_content = response_content.decode(charset, "ignore")
|
||||
html_text = decoded_content
|
||||
class SimpleAiohttpChecker(CheckerBase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
proxy = kwargs.get('proxy')
|
||||
cookie_jar = kwargs.get('cookie_jar')
|
||||
self.logger = kwargs.get('logger', Mock())
|
||||
|
||||
if status_code == 0:
|
||||
error = CheckError("Connection lost")
|
||||
# moved here to speed up the launch of Maigret
|
||||
from aiohttp_socks import ProxyConnector
|
||||
|
||||
# make http client session
|
||||
connector = ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=False)
|
||||
connector.verify_ssl = False
|
||||
self.session = ClientSession(
|
||||
connector=connector, trust_env=True, cookie_jar=cookie_jar
|
||||
)
|
||||
|
||||
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||
if method == 'get':
|
||||
request_method = self.session.get
|
||||
else:
|
||||
request_method = self.session.head
|
||||
|
||||
future = request_method(
|
||||
url=url,
|
||||
headers=headers,
|
||||
allow_redirects=allow_redirects,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
return future
|
||||
|
||||
async def close(self):
|
||||
await self.session.close()
|
||||
|
||||
async def check(self, future) -> Tuple[str, int, Optional[CheckError]]:
|
||||
html_text = None
|
||||
status_code = 0
|
||||
error: Optional[CheckError] = CheckError("Unknown")
|
||||
|
||||
try:
|
||||
response = await future
|
||||
|
||||
status_code = response.status
|
||||
response_content = await response.content.read()
|
||||
charset = response.charset or "utf-8"
|
||||
decoded_content = response_content.decode(charset, "ignore")
|
||||
html_text = decoded_content
|
||||
|
||||
error = None
|
||||
if status_code == 0:
|
||||
error = CheckError("Connection lost")
|
||||
|
||||
logger.debug(html_text)
|
||||
self.logger.debug(html_text)
|
||||
|
||||
except asyncio.TimeoutError as e:
|
||||
error = CheckError("Request timeout", str(e))
|
||||
except aiohttp.client_exceptions.ClientConnectorError as e:
|
||||
error = CheckError("Connecting failure", str(e))
|
||||
except aiohttp.http_exceptions.BadHttpMessage as e:
|
||||
error = CheckError("HTTP", str(e))
|
||||
except proxy_errors.ProxyError as e:
|
||||
error = CheckError("Proxy", str(e))
|
||||
except KeyboardInterrupt:
|
||||
error = CheckError("Interrupted")
|
||||
except Exception as e:
|
||||
# python-specific exceptions
|
||||
if sys.version_info.minor > 6:
|
||||
if isinstance(e, ssl.SSLCertVerificationError) or isinstance(
|
||||
e, ssl.SSLError
|
||||
except asyncio.TimeoutError as e:
|
||||
error = CheckError("Request timeout", str(e))
|
||||
except ClientConnectorError as e:
|
||||
error = CheckError("Connecting failure", str(e))
|
||||
except ServerDisconnectedError as e:
|
||||
error = CheckError("Server disconnected", str(e))
|
||||
except http_exceptions.BadHttpMessage as e:
|
||||
error = CheckError("HTTP", str(e))
|
||||
except proxy_errors.ProxyError as e:
|
||||
error = CheckError("Proxy", str(e))
|
||||
except KeyboardInterrupt:
|
||||
error = CheckError("Interrupted")
|
||||
except Exception as e:
|
||||
# python-specific exceptions
|
||||
if sys.version_info.minor > 6 and (
|
||||
isinstance(e, ssl.SSLCertVerificationError)
|
||||
or isinstance(e, ssl.SSLError)
|
||||
):
|
||||
error = CheckError("SSL", str(e))
|
||||
else:
|
||||
logger.debug(e, exc_info=True)
|
||||
error = CheckError("Unexpected", str(e))
|
||||
else:
|
||||
self.logger.debug(e, exc_info=True)
|
||||
error = CheckError("Unexpected", str(e))
|
||||
|
||||
return str(html_text), status_code, error
|
||||
if error == "Invalid proxy response":
|
||||
self.logger.debug(error, exc_info=True)
|
||||
|
||||
return str(html_text), status_code, error
|
||||
|
||||
|
||||
class ProxiedAiohttpChecker(SimpleAiohttpChecker):
    """HTTP checker whose session is always routed through a proxy.

    The constructor builds the connector from the ``proxy`` keyword
    unconditionally; everything else is inherited from
    ``SimpleAiohttpChecker``.
    """

    def __init__(self, *args, **kwargs):
        """Create the proxied client session.

        Keyword arguments read:
            proxy      -- proxy URL handed to ``ProxyConnector.from_url``.
            cookie_jar -- cookie jar passed to the client session.
            logger     -- logger object; a ``Mock`` is used when absent.
        """
        self.logger = kwargs.get('logger', Mock())

        # imported lazily to keep Maigret's start-up fast
        from aiohttp_socks import ProxyConnector

        proxy_connector = ProxyConnector.from_url(kwargs.get('proxy'))
        proxy_connector.verify_ssl = False

        self.session = ClientSession(
            connector=proxy_connector,
            trust_env=True,
            cookie_jar=kwargs.get('cookie_jar'),
        )
|
||||
|
||||
|
||||
class AiodnsDomainResolver(CheckerBase):
    """Checker that resolves a name through ``aiodns`` instead of making an HTTP request."""

    def __init__(self, *args, **kwargs):
        """Bind an ``aiodns.DNSResolver`` to the current event loop.

        Keyword arguments read:
            logger -- logger object; a ``Mock`` is used when absent.
        """
        event_loop = asyncio.get_event_loop()
        self.logger = kwargs.get('logger', Mock())
        self.resolver = aiodns.DNSResolver(loop=event_loop)

    def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
        """Start an ``A``-record query for ``url`` and return its future.

        The remaining parameters are accepted but not used.
        """
        query_future = self.resolver.query(url, 'A')
        return query_future

    async def check(self, future) -> Tuple[str, int, Optional[CheckError]]:
        """Await the DNS query and return ``(text, status, error)``.

        On success: the first record's host as text, status 200, no error.
        On ``aiodns.error.DNSError``: empty text, status 404, no error.
        On any other exception: empty text, status 404 and a
        ``'DNS resolve error'`` CheckError; the exception is logged.
        """
        try:
            records = await future
            resolved_host = str(records[0].host)
        except aiodns.error.DNSError:
            return '', 404, None
        except Exception as exc:
            self.logger.error(exc, exc_info=True)
            return '', 404, CheckError('DNS resolve error', str(exc))

        return resolved_host, 200, None
|
||||
|
||||
|
||||
class CheckerMock:
    """No-op checker: makes no request and reports an empty response.

    It offers the same ``prepare`` / ``check`` / ``close`` methods as the
    real checkers.
    """

    def __init__(self, *args, **kwargs):
        """Accept and ignore any arguments."""

    def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
        """Return ``None`` in place of a request future; all arguments are ignored."""
        return

    async def check(self, future) -> Tuple[str, int, Optional[CheckError]]:
        """Yield to the event loop once, then return ``('', 0, None)``."""
        empty_response = ('', 0, None)
        await asyncio.sleep(0)
        return empty_response

    async def close(self):
        """Nothing to release."""
        return None
|
||||
|
||||
|
||||
# TODO: move to separate class
|
||||
@@ -109,6 +219,14 @@ def detect_error_page(
|
||||
return None
|
||||
|
||||
|
||||
def debug_response_logging(url, html_text, status_code, check_error):
    """Append the outcome of one site check to ``debug.log``.

    The file is opened in append mode, relative to the current working
    directory.

    url         -- URL that was requested.
    html_text   -- response body; the body section is written only when
                   this is truthy.
    status_code -- HTTP status; a falsy value is logged as "No response".
    check_error -- error detected for the check, written as-is.
    """
    status = status_code or "No response"

    # Explicit UTF-8: response bodies contain arbitrary characters that the
    # platform's default encoding may be unable to represent.
    with open("debug.log", "a", encoding="utf-8") as f:
        f.write(f"url: {url}\nerror: {check_error}\nr: {status}\n")
        if html_text:
            f.write(f"code: {status}\nresponse: {str(html_text)}\n")
|
||||
|
||||
|
||||
def process_site_result(
|
||||
response, query_notify, logger, results_info: QueryResultWrapper, site: MaigretSite
|
||||
):
|
||||
@@ -121,7 +239,7 @@ def process_site_result(
|
||||
username = results_info["username"]
|
||||
is_parsing_enabled = results_info["parsing_enabled"]
|
||||
url = results_info.get("url_user")
|
||||
logger.debug(url)
|
||||
logger.info(url)
|
||||
|
||||
status = results_info.get("status")
|
||||
if status is not None:
|
||||
@@ -142,40 +260,42 @@ def process_site_result(
|
||||
response_time = None
|
||||
|
||||
if logger.level == logging.DEBUG:
|
||||
with open("debug.txt", "a") as f:
|
||||
status = status_code or "No response"
|
||||
f.write(f"url: {url}\nerror: {check_error}\nr: {status}\n")
|
||||
if html_text:
|
||||
f.write(f"code: {status}\nresponse: {str(html_text)}\n")
|
||||
debug_response_logging(url, html_text, status_code, check_error)
|
||||
|
||||
# additional check for errors
|
||||
if status_code and not check_error:
|
||||
check_error = detect_error_page(
|
||||
html_text, status_code, site.errors, site.ignore403
|
||||
html_text, status_code, site.errors_dict, site.ignore403
|
||||
)
|
||||
|
||||
if site.activation and html_text:
|
||||
is_need_activation = any(
|
||||
[s for s in site.activation["marks"] if s in html_text]
|
||||
)
|
||||
if is_need_activation:
|
||||
method = site.activation["method"]
|
||||
try:
|
||||
activate_fun = getattr(ParsingActivator(), method)
|
||||
# TODO: async call
|
||||
activate_fun(site, logger)
|
||||
except AttributeError:
|
||||
logger.warning(
|
||||
f"Activation method {method} for site {site.name} not found!"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed activation {method} for site {site.name}: {e}")
|
||||
# parsing activation
|
||||
is_need_activation = any(
|
||||
[s for s in site.activation.get("marks", []) if s in html_text]
|
||||
)
|
||||
|
||||
if site.activation and html_text and is_need_activation:
|
||||
method = site.activation["method"]
|
||||
try:
|
||||
activate_fun = getattr(ParsingActivator(), method)
|
||||
# TODO: async call
|
||||
activate_fun(site, logger)
|
||||
except AttributeError:
|
||||
logger.warning(
|
||||
f"Activation method {method} for site {site.name} not found!"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed activation {method} for site {site.name}: {str(e)}",
|
||||
exc_info=True,
|
||||
)
|
||||
# TODO: temporary check error
|
||||
|
||||
site_name = site.pretty_name
|
||||
# presense flags
|
||||
# True by default
|
||||
presense_flags = site.presense_strs
|
||||
is_presense_detected = False
|
||||
|
||||
if html_text:
|
||||
if not presense_flags:
|
||||
is_presense_detected = True
|
||||
@@ -200,7 +320,7 @@ def process_site_result(
|
||||
)
|
||||
|
||||
if check_error:
|
||||
logger.debug(check_error)
|
||||
logger.warning(check_error)
|
||||
result = QueryResult(
|
||||
username,
|
||||
site_name,
|
||||
@@ -220,9 +340,9 @@ def process_site_result(
|
||||
result = build_result(QueryStatus.CLAIMED)
|
||||
else:
|
||||
result = build_result(QueryStatus.AVAILABLE)
|
||||
elif check_type == "status_code":
|
||||
elif check_type in "status_code":
|
||||
# Checks if the status code of the response is 2XX
|
||||
if is_presense_detected and (not status_code >= 300 or status_code < 200):
|
||||
if 200 <= status_code < 300:
|
||||
result = build_result(QueryStatus.CLAIMED)
|
||||
else:
|
||||
result = build_result(QueryStatus.AVAILABLE)
|
||||
@@ -255,16 +375,16 @@ def process_site_result(
|
||||
for k, v in extracted_ids_data.items():
|
||||
if "username" in k:
|
||||
new_usernames[v] = "username"
|
||||
if k in supported_recursive_search_ids:
|
||||
if k in SUPPORTED_IDS:
|
||||
new_usernames[v] = k
|
||||
|
||||
results_info["ids_usernames"] = new_usernames
|
||||
results_info["ids_links"] = eval(extracted_ids_data.get("links", "[]"))
|
||||
links = ascii_data_display(extracted_ids_data.get("links", "[]"))
|
||||
if "website" in extracted_ids_data:
|
||||
links.append(extracted_ids_data["website"])
|
||||
results_info["ids_links"] = links
|
||||
result.ids_data = extracted_ids_data
|
||||
|
||||
# Notify caller about results of query.
|
||||
query_notify.update(result, site.similar_search)
|
||||
|
||||
# Save status of request
|
||||
results_info["status"] = result
|
||||
|
||||
@@ -303,13 +423,14 @@ def make_site_result(
|
||||
|
||||
# URL of user on site (if it exists)
|
||||
url = site.url.format(
|
||||
urlMain=site.url_main, urlSubpath=site.url_subpath, username=username
|
||||
urlMain=site.url_main, urlSubpath=site.url_subpath, username=quote(username)
|
||||
)
|
||||
|
||||
# workaround to prevent slash errors
|
||||
url = re.sub("(?<!:)/+", "/", url)
|
||||
|
||||
session = options['session']
|
||||
# always clearweb_checker for now
|
||||
checker = options["checkers"][site.protocol]
|
||||
|
||||
# site check is disabled
|
||||
if site.disabled and not options['forced']:
|
||||
@@ -368,12 +489,12 @@ def make_site_result(
|
||||
# In most cases when we are detecting by status code,
|
||||
# it is not necessary to get the entire body: we can
|
||||
# detect fine with just the HEAD response.
|
||||
request_method = session.head
|
||||
request_method = 'head'
|
||||
else:
|
||||
# Either this detect method needs the content associated
|
||||
# with the GET response, or this specific website will
|
||||
# not respond properly unless we request the whole page.
|
||||
request_method = session.get
|
||||
request_method = 'get'
|
||||
|
||||
if site.check_type == "response_url":
|
||||
# Site forwards request to a different URL if username not
|
||||
@@ -385,7 +506,8 @@ def make_site_result(
|
||||
# The final result of the request will be what is available.
|
||||
allow_redirects = True
|
||||
|
||||
future = request_method(
|
||||
future = checker.prepare(
|
||||
method=request_method,
|
||||
url=url_probe,
|
||||
headers=headers,
|
||||
allow_redirects=allow_redirects,
|
||||
@@ -394,6 +516,7 @@ def make_site_result(
|
||||
|
||||
# Store future request object in the results object
|
||||
results_site["future"] = future
|
||||
results_site["checker"] = checker
|
||||
|
||||
return results_site
|
||||
|
||||
@@ -406,18 +529,22 @@ async def check_site_for_username(
|
||||
if not future:
|
||||
return site.name, default_result
|
||||
|
||||
response = await get_response(request_future=future, logger=logger)
|
||||
checker = default_result["checker"]
|
||||
|
||||
response = await checker.check(future=future)
|
||||
|
||||
response_result = process_site_result(
|
||||
response, query_notify, logger, default_result, site
|
||||
)
|
||||
|
||||
query_notify.update(response_result['status'], site.similar_search)
|
||||
|
||||
return site.name, response_result
|
||||
|
||||
|
||||
async def debug_ip_request(session, logger):
|
||||
future = session.get(url="https://icanhazip.com")
|
||||
ip, status, check_error = await get_response(future, logger)
|
||||
async def debug_ip_request(checker, logger):
|
||||
future = checker.prepare(url="https://icanhazip.com")
|
||||
ip, status, check_error = await checker.check(future)
|
||||
if ip:
|
||||
logger.debug(f"My IP is: {ip.strip()}")
|
||||
else:
|
||||
@@ -441,7 +568,9 @@ async def maigret(
|
||||
logger,
|
||||
query_notify=None,
|
||||
proxy=None,
|
||||
timeout=None,
|
||||
tor_proxy=None,
|
||||
i2p_proxy=None,
|
||||
timeout=3,
|
||||
is_parsing_enabled=False,
|
||||
id_type="username",
|
||||
debug=False,
|
||||
@@ -450,6 +579,7 @@ async def maigret(
|
||||
no_progressbar=False,
|
||||
cookies=None,
|
||||
retries=0,
|
||||
check_domains=False,
|
||||
) -> QueryResultWrapper:
|
||||
"""Main search func
|
||||
|
||||
@@ -463,7 +593,7 @@ async def maigret(
|
||||
query results.
|
||||
logger -- Standard Python logger object.
|
||||
timeout -- Time in seconds to wait before timing out request.
|
||||
Default is no timeout.
|
||||
Default is 3 seconds.
|
||||
is_parsing_enabled -- Extract additional info from account pages.
|
||||
id_type -- Type of username to search.
|
||||
Default is 'username', see all supported here:
|
||||
@@ -493,23 +623,36 @@ async def maigret(
|
||||
|
||||
query_notify.start(username, id_type)
|
||||
|
||||
# make http client session
|
||||
connector = (
|
||||
ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
|
||||
)
|
||||
connector.verify_ssl = False
|
||||
|
||||
cookie_jar = None
|
||||
if cookies:
|
||||
logger.debug(f"Using cookies jar file {cookies}")
|
||||
cookie_jar = await import_aiohttp_cookies(cookies)
|
||||
cookie_jar = import_aiohttp_cookies(cookies)
|
||||
|
||||
session = aiohttp.ClientSession(
|
||||
connector=connector, trust_env=True, cookie_jar=cookie_jar
|
||||
clearweb_checker = SimpleAiohttpChecker(
|
||||
proxy=proxy, cookie_jar=cookie_jar, logger=logger
|
||||
)
|
||||
|
||||
# TODO
|
||||
tor_checker = CheckerMock()
|
||||
if tor_proxy:
|
||||
tor_checker = ProxiedAiohttpChecker( # type: ignore
|
||||
proxy=tor_proxy, cookie_jar=cookie_jar, logger=logger
|
||||
)
|
||||
|
||||
# TODO
|
||||
i2p_checker = CheckerMock()
|
||||
if i2p_proxy:
|
||||
i2p_checker = ProxiedAiohttpChecker( # type: ignore
|
||||
proxy=i2p_proxy, cookie_jar=cookie_jar, logger=logger
|
||||
)
|
||||
|
||||
# TODO
|
||||
dns_checker = CheckerMock()
|
||||
if check_domains:
|
||||
dns_checker = AiodnsDomainResolver(logger=logger) # type: ignore
|
||||
|
||||
if logger.level == logging.DEBUG:
|
||||
await debug_ip_request(session, logger)
|
||||
await debug_ip_request(clearweb_checker, logger)
|
||||
|
||||
# setup parallel executor
|
||||
executor: Optional[AsyncExecutor] = None
|
||||
@@ -523,7 +666,12 @@ async def maigret(
|
||||
# make options objects for all the requests
|
||||
options: QueryOptions = {}
|
||||
options["cookies"] = cookie_jar
|
||||
options["session"] = session
|
||||
options["checkers"] = {
|
||||
'': clearweb_checker,
|
||||
'tor': tor_checker,
|
||||
'dns': dns_checker,
|
||||
'i2p': i2p_checker,
|
||||
}
|
||||
options["parsing"] = is_parsing_enabled
|
||||
options["timeout"] = timeout
|
||||
options["id_type"] = id_type
|
||||
@@ -576,7 +724,11 @@ async def maigret(
|
||||
)
|
||||
|
||||
# closing http client session
|
||||
await session.close()
|
||||
await clearweb_checker.close()
|
||||
if tor_proxy:
|
||||
await tor_checker.close()
|
||||
if i2p_proxy:
|
||||
await i2p_checker.close()
|
||||
|
||||
# notify caller that all queries are finished
|
||||
query_notify.finish()
|
||||
@@ -610,21 +762,23 @@ def timeout_check(value):
|
||||
|
||||
|
||||
async def site_self_check(
|
||||
site: MaigretSite, logger, semaphore, db: MaigretDatabase, silent=False
|
||||
site: MaigretSite,
|
||||
logger,
|
||||
semaphore,
|
||||
db: MaigretDatabase,
|
||||
silent=False,
|
||||
proxy=None,
|
||||
tor_proxy=None,
|
||||
i2p_proxy=None,
|
||||
):
|
||||
changes = {
|
||||
"disabled": False,
|
||||
}
|
||||
|
||||
try:
|
||||
check_data = [
|
||||
(site.username_claimed, QueryStatus.CLAIMED),
|
||||
(site.username_unclaimed, QueryStatus.AVAILABLE),
|
||||
]
|
||||
except Exception as e:
|
||||
logger.error(e)
|
||||
logger.error(site.__dict__)
|
||||
check_data = []
|
||||
check_data = [
|
||||
(site.username_claimed, QueryStatus.CLAIMED),
|
||||
(site.username_unclaimed, QueryStatus.AVAILABLE),
|
||||
]
|
||||
|
||||
logger.info(f"Checking {site.name}...")
|
||||
|
||||
@@ -639,6 +793,9 @@ async def site_self_check(
|
||||
forced=True,
|
||||
no_progressbar=True,
|
||||
retries=1,
|
||||
proxy=proxy,
|
||||
tor_proxy=tor_proxy,
|
||||
i2p_proxy=i2p_proxy,
|
||||
)
|
||||
|
||||
# don't disable entries with other ids types
|
||||
@@ -648,6 +805,8 @@ async def site_self_check(
|
||||
changes["disabled"] = True
|
||||
continue
|
||||
|
||||
logger.debug(results_dict)
|
||||
|
||||
result = results_dict[site.name]["status"]
|
||||
|
||||
site_status = result.status
|
||||
@@ -686,7 +845,14 @@ async def site_self_check(
|
||||
|
||||
|
||||
async def self_check(
|
||||
db: MaigretDatabase, site_data: dict, logger, silent=False, max_connections=10
|
||||
db: MaigretDatabase,
|
||||
site_data: dict,
|
||||
logger,
|
||||
silent=False,
|
||||
max_connections=10,
|
||||
proxy=None,
|
||||
tor_proxy=None,
|
||||
i2p_proxy=None,
|
||||
) -> bool:
|
||||
sem = asyncio.Semaphore(max_connections)
|
||||
tasks = []
|
||||
@@ -698,7 +864,9 @@ async def self_check(
|
||||
disabled_old_count = disabled_count(all_sites.values())
|
||||
|
||||
for _, site in all_sites.items():
|
||||
check_coro = site_self_check(site, logger, sem, db, silent)
|
||||
check_coro = site_self_check(
|
||||
site, logger, sem, db, silent, proxy, tor_proxy, i2p_proxy
|
||||
)
|
||||
future = asyncio.ensure_future(check_coro)
|
||||
tasks.append(future)
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from typing import Dict, List, Any
|
||||
|
||||
from .result import QueryResult
|
||||
from .types import QueryResultWrapper
|
||||
|
||||
|
||||
# error got as a result of completed search query
|
||||
@@ -34,6 +35,12 @@ COMMON_ERRORS = {
|
||||
'Please stand by, while we are checking your browser': CheckError(
|
||||
'Bot protection', 'Cloudflare'
|
||||
),
|
||||
'<span data-translate="checking_browser">Checking your browser before accessing</span>': CheckError(
|
||||
'Bot protection', 'Cloudflare'
|
||||
),
|
||||
'This website is using a security service to protect itself from online attacks.': CheckError(
|
||||
'Access denied', 'Cloudflare'
|
||||
),
|
||||
'<title>Доступ ограничен</title>': CheckError('Censorship', 'Rostelecom'),
|
||||
'document.getElementById(\'validate_form_submit\').disabled=true': CheckError(
|
||||
'Captcha', 'Mail.ru'
|
||||
@@ -48,6 +55,9 @@ COMMON_ERRORS = {
|
||||
'Censorship', 'MGTS'
|
||||
),
|
||||
'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
|
||||
'Сайт заблокирован хостинг-провайдером': CheckError(
|
||||
'Site-specific', 'Site is disabled (Beget)'
|
||||
),
|
||||
}
|
||||
|
||||
ERRORS_TYPES = {
|
||||
@@ -57,6 +67,11 @@ ERRORS_TYPES = {
|
||||
'Request timeout': 'Try to increase timeout or to switch to another internet service provider',
|
||||
}
|
||||
|
||||
# TODO: checking for reason
# Maps an error reason to a suggested remedy text.
# NOTE(review): no visible code reads this mapping yet (see the TODO above);
# presumably it is meant to be looked up by the error description — confirm.
ERRORS_REASONS = {
    'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)',
}
|
||||
|
||||
TEMPORARY_ERRORS_TYPES = [
|
||||
'Request timeout',
|
||||
'Unknown',
|
||||
@@ -90,9 +105,9 @@ def solution_of(err_type) -> str:
|
||||
return ERRORS_TYPES.get(err_type, '')
|
||||
|
||||
|
||||
def extract_and_group(search_res: dict) -> List[Dict[str, Any]]:
|
||||
def extract_and_group(search_res: QueryResultWrapper) -> List[Dict[str, Any]]:
|
||||
errors_counts: Dict[str, int] = {}
|
||||
for r in search_res:
|
||||
for r in search_res.values():
|
||||
if r and isinstance(r, dict) and r.get('status'):
|
||||
if not isinstance(r['status'], QueryResult):
|
||||
continue
|
||||
|
||||
@@ -1,22 +1,23 @@
|
||||
"""
|
||||
Maigret main module
|
||||
"""
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import platform
|
||||
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||
from typing import List, Tuple
|
||||
import os.path as path
|
||||
|
||||
import requests
|
||||
from socid_extractor import extract, parse, __version__ as socid_version
|
||||
from socid_extractor import extract, parse
|
||||
|
||||
from .__version__ import __version__
|
||||
from .checking import (
|
||||
timeout_check,
|
||||
supported_recursive_search_ids,
|
||||
SUPPORTED_IDS,
|
||||
self_check,
|
||||
unsupported_characters,
|
||||
BAD_CHARS,
|
||||
maigret,
|
||||
)
|
||||
from . import errors
|
||||
@@ -29,26 +30,28 @@ from .report import (
|
||||
generate_report_context,
|
||||
save_txt_report,
|
||||
SUPPORTED_JSON_REPORT_FORMATS,
|
||||
check_supported_json_format,
|
||||
save_json_report,
|
||||
get_plaintext_report,
|
||||
sort_report_by_data_points,
|
||||
save_graph_report,
|
||||
)
|
||||
from .sites import MaigretDatabase
|
||||
from .submit import submit_dialog
|
||||
from .submit import Submitter
|
||||
from .types import QueryResultWrapper
|
||||
from .utils import get_dict_ascii_tree
|
||||
|
||||
__version__ = '0.2.0'
|
||||
from .settings import Settings
|
||||
|
||||
|
||||
def notify_about_errors(search_results, query_notify):
|
||||
errs = errors.extract_and_group(search_results.values())
|
||||
def notify_about_errors(search_results: QueryResultWrapper, query_notify):
|
||||
errs = errors.extract_and_group(search_results)
|
||||
was_errs_displayed = False
|
||||
for e in errs:
|
||||
if not errors.is_important(e):
|
||||
continue
|
||||
text = f'Too many errors of type "{e["err"]}" ({e["perc"]}%)'
|
||||
text = f'Too many errors of type "{e["err"]}" ({round(e["perc"],2)}%)'
|
||||
solution = errors.solution_of(e['err'])
|
||||
if solution:
|
||||
text = '. '.join([text, solution])
|
||||
text = '. '.join([text, solution.capitalize()])
|
||||
|
||||
query_notify.warning(text, '!')
|
||||
was_errs_displayed = True
|
||||
@@ -59,20 +62,82 @@ def notify_about_errors(search_results, query_notify):
|
||||
)
|
||||
|
||||
|
||||
def setup_arguments_parser():
|
||||
def extract_ids_from_page(url, logger, timeout=5) -> dict:
    """Scan a web page (and its URL mutations) for usernames and other ids.

    Args:
        url: address of the page to parse.
        logger: logger that receives a warning when URL mutation fails and
            the fetched page at debug level.
        timeout: value forwarded to ``parse`` as its ``timeout`` argument.

    Returns:
        Dict mapping every extracted value to its id type: ``'username'``
        for any field whose name contains ``username``, or the field name
        itself when it is listed in ``SUPPORTED_IDS`` (the latter wins when
        both conditions hold).
    """
    results = {}
    # each request is a (url, headers) pair
    reqs: List[Tuple[str, set]] = [(url, set())]
    try:
        # temporary workaround for URL mutations MVP
        from socid_extractor import mutate_url

        reqs += list(mutate_url(url))
    except Exception as e:
        # best effort: on any failure only the original URL is scanned
        logger.warning(e)

    # use a dedicated name so the ``url`` parameter is not rebound
    for page_url, headers in reqs:
        print(f'Scanning webpage by URL {page_url}...')
        page, _ = parse(page_url, cookies_str='', headers=headers, timeout=timeout)
        logger.debug(page)
        info = extract(page)
        if not info:
            print('Nothing extracted')
            # nothing to iterate over for this request
            continue

        print(get_dict_ascii_tree(info.items(), new_line=False), ' ')
        for k, v in info.items():
            if 'username' in k:
                results[v] = 'username'
            if k in SUPPORTED_IDS:
                results[v] = k

    return results
|
||||
|
||||
|
||||
def extract_ids_from_results(results: QueryResultWrapper, db: MaigretDatabase) -> dict:
    """Collect new ids discovered in the per-site search results.

    Args:
        results: search results keyed by website name; empty entries are
            skipped.
        db: sites database whose ``extract_ids_from_url`` is applied to
            every link listed under ``ids_links``.

    Returns:
        Dict of id value -> id type, merged from each site's
        ``ids_usernames`` and from the ids detected in its ``ids_links``.
    """
    collected = {}
    for site_result in results.values():
        # TODO: fix no site data issue
        if not site_result:
            continue

        found_usernames = site_result.get('ids_usernames')
        if found_usernames:
            collected.update(found_usernames.items())

        for link in site_result.get('ids_links', []):
            collected.update(db.extract_ids_from_url(link))

    return collected
|
||||
|
||||
|
||||
def setup_arguments_parser(settings: Settings):
|
||||
from aiohttp import __version__ as aiohttp_version
|
||||
from requests import __version__ as requests_version
|
||||
from socid_extractor import __version__ as socid_version
|
||||
|
||||
version_string = '\n'.join(
|
||||
[
|
||||
f'%(prog)s {__version__}',
|
||||
f'Socid-extractor: {socid_version}',
|
||||
f'Aiohttp: {aiohttp.__version__}',
|
||||
f'Requests: {requests.__version__}',
|
||||
f'Aiohttp: {aiohttp_version}',
|
||||
f'Requests: {requests_version}',
|
||||
f'Python: {platform.python_version()}',
|
||||
]
|
||||
)
|
||||
|
||||
parser = ArgumentParser(
|
||||
formatter_class=RawDescriptionHelpFormatter,
|
||||
description=f"Maigret v{__version__}",
|
||||
description=f"Maigret v{__version__}\n"
|
||||
"Documentation: https://maigret.readthedocs.io/\n"
|
||||
"All settings are also configurable through files, see docs.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"username",
|
||||
nargs='*',
|
||||
metavar="USERNAMES",
|
||||
help="One or more usernames to search by.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
@@ -80,71 +145,15 @@ def setup_arguments_parser():
|
||||
version=version_string,
|
||||
help="Display version information and dependencies.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--info",
|
||||
"-vv",
|
||||
action="store_true",
|
||||
dest="info",
|
||||
default=False,
|
||||
help="Display service information.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose",
|
||||
"-v",
|
||||
action="store_true",
|
||||
dest="verbose",
|
||||
default=False,
|
||||
help="Display extra information and metrics.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-d",
|
||||
"--debug",
|
||||
"-vvv",
|
||||
action="store_true",
|
||||
dest="debug",
|
||||
default=False,
|
||||
help="Saving debugging information and sites responses in debug.txt.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--site",
|
||||
action="append",
|
||||
metavar='SITE_NAME',
|
||||
dest="site_list",
|
||||
default=[],
|
||||
help="Limit analysis to just the listed sites (use several times to specify more than one)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--proxy",
|
||||
"-p",
|
||||
metavar='PROXY_URL',
|
||||
action="store",
|
||||
dest="proxy",
|
||||
default=None,
|
||||
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--db",
|
||||
metavar="DB_FILE",
|
||||
dest="db_file",
|
||||
default=None,
|
||||
help="Load Maigret database from a JSON file or an online, valid, JSON file.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cookies-jar-file",
|
||||
metavar="COOKIE_FILE",
|
||||
dest="cookie_file",
|
||||
default=None,
|
||||
help="File with cookies.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--timeout",
|
||||
action="store",
|
||||
metavar='TIMEOUT',
|
||||
dest="timeout",
|
||||
type=timeout_check,
|
||||
default=30,
|
||||
help="Time (in seconds) to wait for response to requests. "
|
||||
"Default timeout of 30.0s. "
|
||||
default=settings.timeout,
|
||||
help="Time in seconds to wait for response to requests "
|
||||
f"(default {settings.timeout}s). "
|
||||
"A longer timeout will be more likely to get results from slow sites. "
|
||||
"On the other hand, this may cause a long delay to gather all results. ",
|
||||
)
|
||||
@@ -153,8 +162,8 @@ def setup_arguments_parser():
|
||||
action="store",
|
||||
type=int,
|
||||
metavar='RETRIES',
|
||||
default=1,
|
||||
help="Attempts to restart temporary failed requests.",
|
||||
default=settings.retries_count,
|
||||
help="Attempts to restart temporarily failed requests.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
@@ -162,193 +171,291 @@ def setup_arguments_parser():
|
||||
action="store",
|
||||
type=int,
|
||||
dest="connections",
|
||||
default=100,
|
||||
default=settings.max_connections,
|
||||
help="Allowed number of concurrent connections.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-a",
|
||||
"--all-sites",
|
||||
action="store_true",
|
||||
dest="all_sites",
|
||||
default=False,
|
||||
help="Use all sites for scan.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--top-sites",
|
||||
action="store",
|
||||
default=500,
|
||||
type=int,
|
||||
help="Count of sites for scan ranked by Alexa Top (default: 500).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--print-not-found",
|
||||
action="store_true",
|
||||
dest="print_not_found",
|
||||
default=False,
|
||||
help="Print sites where the username was not found.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--print-errors",
|
||||
action="store_true",
|
||||
dest="print_check_errors",
|
||||
default=False,
|
||||
help="Print errors messages: connection, captcha, site country ban, etc.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--submit",
|
||||
metavar='EXISTING_USER_URL',
|
||||
type=str,
|
||||
dest="new_site_to_submit",
|
||||
default=False,
|
||||
help="URL of existing profile in new site to submit.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-color",
|
||||
action="store_true",
|
||||
dest="no_color",
|
||||
default=False,
|
||||
help="Don't color terminal output",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-progressbar",
|
||||
action="store_true",
|
||||
dest="no_progressbar",
|
||||
default=False,
|
||||
help="Don't show progressbar.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--browse",
|
||||
"-b",
|
||||
action="store_true",
|
||||
dest="browse",
|
||||
default=False,
|
||||
help="Browse to all results on default bowser.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-recursion",
|
||||
action="store_true",
|
||||
dest="disable_recursive_search",
|
||||
default=False,
|
||||
default=(not settings.recursive_search),
|
||||
help="Disable recursive search by additional data extracted from pages.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--no-extracting",
|
||||
action="store_true",
|
||||
dest="disable_extracting",
|
||||
default=False,
|
||||
default=(not settings.info_extracting),
|
||||
help="Disable parsing pages for additional data and other usernames.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--self-check",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Do self check for sites and database and disable non-working ones.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--stats", action="store_true", default=False, help="Show database statistics."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use-disabled-sites",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Use disabled sites to search (may cause many false positives).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--parse",
|
||||
dest="parse_url",
|
||||
default='',
|
||||
help="Parse page by URL and extract username and IDs to use for search.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--id-type",
|
||||
dest="id_type",
|
||||
default='username',
|
||||
choices=SUPPORTED_IDS,
|
||||
help="Specify identifier(s) type (default: username).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--db",
|
||||
metavar="DB_FILE",
|
||||
dest="db_file",
|
||||
default=settings.sites_db_path,
|
||||
help="Load Maigret database from a JSON file or HTTP web resource.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cookies-jar-file",
|
||||
metavar="COOKIE_FILE",
|
||||
dest="cookie_file",
|
||||
default=settings.cookie_jar_file,
|
||||
help="File with cookies.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ignore-ids",
|
||||
action="append",
|
||||
metavar='IGNORED_IDS',
|
||||
dest="ignore_ids_list",
|
||||
default=[],
|
||||
default=settings.ignore_ids_list,
|
||||
help="Do not make search by the specified username or other ids.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"username",
|
||||
nargs='+',
|
||||
metavar='USERNAMES',
|
||||
action="store",
|
||||
help="One or more usernames to check with social networks.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tags", dest="tags", default='', help="Specify tags of sites."
|
||||
)
|
||||
# reports options
|
||||
parser.add_argument(
|
||||
"--folderoutput",
|
||||
"-fo",
|
||||
dest="folderoutput",
|
||||
default="reports",
|
||||
default=settings.reports_path,
|
||||
metavar="PATH",
|
||||
help="If using multiple usernames, the output of the results will be saved to this folder.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--proxy",
|
||||
"-p",
|
||||
metavar='PROXY_URL',
|
||||
action="store",
|
||||
dest="proxy",
|
||||
default=settings.proxy_url,
|
||||
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tor-proxy",
|
||||
metavar='TOR_PROXY_URL',
|
||||
action="store",
|
||||
default=settings.tor_proxy_url,
|
||||
help="Specify URL of your Tor gateway. Default is socks5://127.0.0.1:9050",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--i2p-proxy",
|
||||
metavar='I2P_PROXY_URL',
|
||||
action="store",
|
||||
default=settings.i2p_proxy_url,
|
||||
help="Specify URL of your I2P gateway. Default is http://127.0.0.1:4444",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--with-domains",
|
||||
action="store_true",
|
||||
default=settings.domain_search,
|
||||
help="Enable (experimental) feature of checking domains on usernames.",
|
||||
)
|
||||
|
||||
filter_group = parser.add_argument_group(
|
||||
'Site filtering', 'Options to set site search scope'
|
||||
)
|
||||
filter_group.add_argument(
|
||||
"-a",
|
||||
"--all-sites",
|
||||
action="store_true",
|
||||
dest="all_sites",
|
||||
default=settings.scan_all_sites,
|
||||
help="Use all sites for scan.",
|
||||
)
|
||||
filter_group.add_argument(
|
||||
"--top-sites",
|
||||
action="store",
|
||||
default=settings.top_sites_count,
|
||||
metavar="N",
|
||||
type=int,
|
||||
help="Count of sites for scan ranked by Alexa Top (default: 500).",
|
||||
)
|
||||
filter_group.add_argument(
|
||||
"--tags", dest="tags", default='', help="Specify tags of sites (see `--stats`)."
|
||||
)
|
||||
filter_group.add_argument(
|
||||
"--site",
|
||||
action="append",
|
||||
metavar='SITE_NAME',
|
||||
dest="site_list",
|
||||
default=settings.scan_sites_list,
|
||||
help="Limit analysis to just the specified sites (multiple option).",
|
||||
)
|
||||
filter_group.add_argument(
|
||||
"--use-disabled-sites",
|
||||
action="store_true",
|
||||
default=settings.scan_disabled_sites,
|
||||
help="Use disabled sites to search (may cause many false positives).",
|
||||
)
|
||||
|
||||
modes_group = parser.add_argument_group(
|
||||
'Operating modes',
|
||||
'Various functions except the default search by a username. '
|
||||
'Modes are executed sequentially in the order of declaration.',
|
||||
)
|
||||
modes_group.add_argument(
|
||||
"--parse",
|
||||
dest="parse_url",
|
||||
default='',
|
||||
metavar='URL',
|
||||
help="Parse page by URL and extract username and IDs to use for search.",
|
||||
)
|
||||
modes_group.add_argument(
|
||||
"--submit",
|
||||
metavar='URL',
|
||||
type=str,
|
||||
dest="new_site_to_submit",
|
||||
default=False,
|
||||
help="URL of existing profile in new site to submit.",
|
||||
)
|
||||
modes_group.add_argument(
|
||||
"--self-check",
|
||||
action="store_true",
|
||||
default=settings.self_check_enabled,
|
||||
help="Do self check for sites and database and disable non-working ones.",
|
||||
)
|
||||
modes_group.add_argument(
|
||||
"--stats",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Show database statistics (most frequent sites engines and tags).",
|
||||
)
|
||||
|
||||
output_group = parser.add_argument_group(
|
||||
'Output options', 'Options to change verbosity and view of the console output'
|
||||
)
|
||||
output_group.add_argument(
|
||||
"--print-not-found",
|
||||
action="store_true",
|
||||
dest="print_not_found",
|
||||
default=settings.print_not_found,
|
||||
help="Print sites where the username was not found.",
|
||||
)
|
||||
output_group.add_argument(
|
||||
"--print-errors",
|
||||
action="store_true",
|
||||
dest="print_check_errors",
|
||||
default=settings.print_check_errors,
|
||||
help="Print errors messages: connection, captcha, site country ban, etc.",
|
||||
)
|
||||
output_group.add_argument(
|
||||
"--verbose",
|
||||
"-v",
|
||||
action="store_true",
|
||||
dest="verbose",
|
||||
default=False,
|
||||
help="Display extra information and metrics.",
|
||||
)
|
||||
output_group.add_argument(
|
||||
"--info",
|
||||
"-vv",
|
||||
action="store_true",
|
||||
dest="info",
|
||||
default=False,
|
||||
help="Display extra/service information and metrics.",
|
||||
)
|
||||
output_group.add_argument(
|
||||
"--debug",
|
||||
"-vvv",
|
||||
"-d",
|
||||
action="store_true",
|
||||
dest="debug",
|
||||
default=False,
|
||||
help="Display extra/service/debug information and metrics, save responses in debug.log.",
|
||||
)
|
||||
output_group.add_argument(
|
||||
"--no-color",
|
||||
action="store_true",
|
||||
dest="no_color",
|
||||
default=(not settings.colored_print),
|
||||
help="Don't color terminal output",
|
||||
)
|
||||
output_group.add_argument(
|
||||
"--no-progressbar",
|
||||
action="store_true",
|
||||
dest="no_progressbar",
|
||||
default=(not settings.show_progressbar),
|
||||
help="Don't show progressbar.",
|
||||
)
|
||||
|
||||
report_group = parser.add_argument_group(
|
||||
'Report formats', 'Supported formats of report files'
|
||||
)
|
||||
report_group.add_argument(
|
||||
"-T",
|
||||
"--txt",
|
||||
action="store_true",
|
||||
dest="txt",
|
||||
default=False,
|
||||
default=settings.txt_report,
|
||||
help="Create a TXT report (one report per username).",
|
||||
)
|
||||
parser.add_argument(
|
||||
report_group.add_argument(
|
||||
"-C",
|
||||
"--csv",
|
||||
action="store_true",
|
||||
dest="csv",
|
||||
default=False,
|
||||
default=settings.csv_report,
|
||||
help="Create a CSV report (one report per username).",
|
||||
)
|
||||
parser.add_argument(
|
||||
report_group.add_argument(
|
||||
"-H",
|
||||
"--html",
|
||||
action="store_true",
|
||||
dest="html",
|
||||
default=False,
|
||||
default=settings.html_report,
|
||||
help="Create an HTML report file (general report on all usernames).",
|
||||
)
|
||||
parser.add_argument(
|
||||
report_group.add_argument(
|
||||
"-X",
|
||||
"--xmind",
|
||||
action="store_true",
|
||||
dest="xmind",
|
||||
default=False,
|
||||
default=settings.xmind_report,
|
||||
help="Generate an XMind 8 mindmap report (one report per username).",
|
||||
)
|
||||
parser.add_argument(
|
||||
report_group.add_argument(
|
||||
"-P",
|
||||
"--pdf",
|
||||
action="store_true",
|
||||
dest="pdf",
|
||||
default=False,
|
||||
default=settings.pdf_report,
|
||||
help="Generate a PDF report (general report on all usernames).",
|
||||
)
|
||||
parser.add_argument(
|
||||
report_group.add_argument(
|
||||
"-G",
|
||||
"--graph",
|
||||
action="store_true",
|
||||
dest="graph",
|
||||
default=settings.graph_report,
|
||||
help="Generate a graph report (general report on all usernames).",
|
||||
)
|
||||
report_group.add_argument(
|
||||
"-J",
|
||||
"--json",
|
||||
action="store",
|
||||
metavar='REPORT_TYPE',
|
||||
metavar='TYPE',
|
||||
dest="json",
|
||||
default='',
|
||||
type=check_supported_json_format,
|
||||
default=settings.json_report_type,
|
||||
choices=SUPPORTED_JSON_REPORT_FORMATS,
|
||||
help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
|
||||
" (one report per username).",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--reports-sorting",
|
||||
default=settings.report_sorting,
|
||||
choices=('default', 'data'),
|
||||
help="Method of results sorting in reports (default: in order of getting the result)",
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
async def main():
|
||||
arg_parser = setup_arguments_parser()
|
||||
args = arg_parser.parse_args()
|
||||
|
||||
# Logging
|
||||
log_level = logging.ERROR
|
||||
logging.basicConfig(
|
||||
@@ -356,22 +463,34 @@ async def main():
|
||||
datefmt='%H:%M:%S',
|
||||
level=log_level,
|
||||
)
|
||||
logger = logging.getLogger('maigret')
|
||||
logger.setLevel(log_level)
|
||||
|
||||
# Load settings
|
||||
settings = Settings()
|
||||
settings_loaded, err = settings.load()
|
||||
|
||||
if not settings_loaded:
|
||||
logger.error(err)
|
||||
sys.exit(3)
|
||||
|
||||
arg_parser = setup_arguments_parser(settings)
|
||||
args = arg_parser.parse_args()
|
||||
|
||||
# Re-set logging level based on args
|
||||
if args.debug:
|
||||
log_level = logging.DEBUG
|
||||
elif args.info:
|
||||
log_level = logging.INFO
|
||||
elif args.verbose:
|
||||
log_level = logging.WARNING
|
||||
|
||||
logger = logging.getLogger('maigret')
|
||||
logger.setLevel(log_level)
|
||||
|
||||
# Usernames initial list
|
||||
usernames = {
|
||||
u: args.id_type
|
||||
for u in args.username
|
||||
if u not in ['-'] and u not in args.ignore_ids_list
|
||||
if u and u not in ['-'] and u not in args.ignore_ids_list
|
||||
}
|
||||
|
||||
parsing_enabled = not args.disable_extracting
|
||||
@@ -382,39 +501,15 @@ async def main():
|
||||
print("Using the proxy: " + args.proxy)
|
||||
|
||||
if args.parse_url:
|
||||
# url, headers
|
||||
reqs = [(args.parse_url, set())]
|
||||
try:
|
||||
# temporary workaround for URL mutations MVP
|
||||
from socid_extractor import mutate_url
|
||||
|
||||
reqs += list(mutate_url(args.parse_url))
|
||||
except Exception as e:
|
||||
logger.warning(e)
|
||||
pass
|
||||
|
||||
for req in reqs:
|
||||
url, headers = req
|
||||
print(f'Scanning webpage by URL {url}...')
|
||||
page, _ = parse(url, cookies_str='', headers=headers)
|
||||
info = extract(page)
|
||||
if not info:
|
||||
print('Nothing extracted')
|
||||
else:
|
||||
print(get_dict_ascii_tree(info.items(), new_line=False), ' ')
|
||||
for k, v in info.items():
|
||||
if 'username' in k:
|
||||
usernames[v] = 'username'
|
||||
if k in supported_recursive_search_ids:
|
||||
usernames[v] = k
|
||||
extracted_ids = extract_ids_from_page(
|
||||
args.parse_url, logger, timeout=args.timeout
|
||||
)
|
||||
usernames.update(extracted_ids)
|
||||
|
||||
if args.tags:
|
||||
args.tags = list(set(str(args.tags).split(',')))
|
||||
|
||||
if args.db_file is None:
|
||||
args.db_file = os.path.join(
|
||||
os.path.dirname(os.path.realpath(__file__)), "resources/data.json"
|
||||
)
|
||||
db_file = path.join(path.dirname(path.realpath(__file__)), args.db_file)
|
||||
|
||||
if args.top_sites == 0 or args.all_sites:
|
||||
args.top_sites = sys.maxsize
|
||||
@@ -429,51 +524,57 @@ async def main():
|
||||
)
|
||||
|
||||
# Create object with all information about sites we are aware of.
|
||||
db = MaigretDatabase().load_from_file(args.db_file)
|
||||
db = MaigretDatabase().load_from_path(db_file)
|
||||
get_top_sites_for_id = lambda x: db.ranked_sites_dict(
|
||||
top=args.top_sites,
|
||||
tags=args.tags,
|
||||
names=args.site_list,
|
||||
disabled=False,
|
||||
disabled=args.use_disabled_sites,
|
||||
id_type=x,
|
||||
)
|
||||
|
||||
site_data = get_top_sites_for_id(args.id_type)
|
||||
|
||||
if args.new_site_to_submit:
|
||||
is_submitted = await submit_dialog(
|
||||
db, args.new_site_to_submit, args.cookie_file, logger
|
||||
)
|
||||
submitter = Submitter(db=db, logger=logger, settings=settings, args=args)
|
||||
is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
|
||||
if is_submitted:
|
||||
db.save_to_file(args.db_file)
|
||||
db.save_to_file(db_file)
|
||||
|
||||
# Database self-checking
|
||||
if args.self_check:
|
||||
print('Maigret sites database self-checking...')
|
||||
is_need_update = await self_check(
|
||||
db, site_data, logger, max_connections=args.connections
|
||||
db,
|
||||
site_data,
|
||||
logger,
|
||||
proxy=args.proxy,
|
||||
max_connections=args.connections,
|
||||
tor_proxy=args.tor_proxy,
|
||||
i2p_proxy=args.i2p_proxy,
|
||||
)
|
||||
if is_need_update:
|
||||
if input('Do you want to save changes permanently? [Yn]\n').lower() == 'y':
|
||||
db.save_to_file(args.db_file)
|
||||
if input('Do you want to save changes permanently? [Yn]\n').lower() in (
|
||||
'y',
|
||||
'',
|
||||
):
|
||||
db.save_to_file(db_file)
|
||||
print('Database was successfully updated.')
|
||||
else:
|
||||
print('Updates will be applied only for current search session.')
|
||||
print(db.get_scan_stats(site_data))
|
||||
print('Scan sessions flags stats: ' + str(db.get_scan_stats(site_data)))
|
||||
|
||||
# Database statistics
|
||||
if args.stats:
|
||||
print(db.get_db_stats(db.sites_dict))
|
||||
print(db.get_db_stats())
|
||||
|
||||
report_dir = path.join(os.getcwd(), args.folderoutput)
|
||||
|
||||
# Create the reports folder if it does not exist
|
||||
os.makedirs(args.folderoutput, exist_ok=True)
|
||||
os.makedirs(report_dir, exist_ok=True)
|
||||
|
||||
# Define one report filename template
|
||||
report_filepath_tpl = os.path.join(args.folderoutput, 'report_{username}{postfix}')
|
||||
|
||||
# Database stats
|
||||
# TODO: verbose info about filtered sites
|
||||
# enabled_count = len(list(filter(lambda x: not x.disabled, site_data.values())))
|
||||
# print(f'Sites in database, enabled/total: {enabled_count}/{len(site_data)}')
|
||||
report_filepath_tpl = path.join(report_dir, 'report_{username}{postfix}')
|
||||
|
||||
if usernames == {}:
|
||||
# magic params to exit after init
|
||||
@@ -483,14 +584,14 @@ async def main():
|
||||
if not site_data:
|
||||
query_notify.warning('No sites to check, exiting!')
|
||||
sys.exit(2)
|
||||
else:
|
||||
|
||||
query_notify.warning(
|
||||
f'Starting a search on top {len(site_data)} sites from the Maigret database...'
|
||||
)
|
||||
if not args.all_sites:
|
||||
query_notify.warning(
|
||||
f'Starting a search on top {len(site_data)} sites from the Maigret database...'
|
||||
'You can run search by full list of sites with flag `-a`', '!'
|
||||
)
|
||||
if not args.all_sites:
|
||||
query_notify.warning(
|
||||
'You can run search by full list of sites with flag `-a`', '!'
|
||||
)
|
||||
|
||||
already_checked = set()
|
||||
general_results = []
|
||||
@@ -501,8 +602,8 @@ async def main():
|
||||
|
||||
if username.lower() in already_checked:
|
||||
continue
|
||||
else:
|
||||
already_checked.add(username.lower())
|
||||
|
||||
already_checked.add(username.lower())
|
||||
|
||||
if username in args.ignore_ids_list:
|
||||
query_notify.warning(
|
||||
@@ -511,10 +612,7 @@ async def main():
|
||||
continue
|
||||
|
||||
# check for characters that are generally not supported by sites
|
||||
found_unsupported_chars = set(unsupported_characters).intersection(
|
||||
set(username)
|
||||
)
|
||||
|
||||
found_unsupported_chars = set(BAD_CHARS).intersection(set(username))
|
||||
if found_unsupported_chars:
|
||||
pretty_chars_str = ','.join(
|
||||
map(lambda s: f'"{s}"', found_unsupported_chars)
|
||||
@@ -531,6 +629,8 @@ async def main():
|
||||
site_dict=dict(sites_to_check),
|
||||
query_notify=query_notify,
|
||||
proxy=args.proxy,
|
||||
tor_proxy=args.tor_proxy,
|
||||
i2p_proxy=args.i2p_proxy,
|
||||
timeout=args.timeout,
|
||||
is_parsing_enabled=parsing_enabled,
|
||||
id_type=id_type,
|
||||
@@ -541,29 +641,20 @@ async def main():
|
||||
max_connections=args.connections,
|
||||
no_progressbar=args.no_progressbar,
|
||||
retries=args.retries,
|
||||
check_domains=args.with_domains,
|
||||
)
|
||||
|
||||
notify_about_errors(results, query_notify)
|
||||
|
||||
if args.reports_sorting == "data":
|
||||
results = sort_report_by_data_points(results)
|
||||
|
||||
general_results.append((username, id_type, results))
|
||||
|
||||
# TODO: tests
|
||||
for website_name in results:
|
||||
dictionary = results[website_name]
|
||||
# TODO: fix no site data issue
|
||||
if not dictionary or not recursive_search_enabled:
|
||||
continue
|
||||
|
||||
new_usernames = dictionary.get('ids_usernames')
|
||||
if new_usernames:
|
||||
for u, utype in new_usernames.items():
|
||||
usernames[u] = utype
|
||||
|
||||
for url in dictionary.get('ids_links', []):
|
||||
for s in db.sites:
|
||||
u = s.detect_username(url)
|
||||
if u:
|
||||
usernames[u] = 'username'
|
||||
if recursive_search_enabled:
|
||||
extracted_ids = extract_ids_from_results(results, db)
|
||||
usernames.update(extracted_ids)
|
||||
|
||||
# reporting for a one username
|
||||
if args.xmind:
|
||||
@@ -599,7 +690,9 @@ async def main():
|
||||
username = report_context['username']
|
||||
|
||||
if args.html:
|
||||
filename = report_filepath_tpl.format(username=username, postfix='.html')
|
||||
filename = report_filepath_tpl.format(
|
||||
username=username, postfix='_plain.html'
|
||||
)
|
||||
save_html_report(filename, report_context)
|
||||
query_notify.warning(f'HTML report on all usernames saved in {filename}')
|
||||
|
||||
@@ -607,8 +700,21 @@ async def main():
|
||||
filename = report_filepath_tpl.format(username=username, postfix='.pdf')
|
||||
save_pdf_report(filename, report_context)
|
||||
query_notify.warning(f'PDF report on all usernames saved in {filename}')
|
||||
|
||||
if args.graph:
|
||||
filename = report_filepath_tpl.format(
|
||||
username=username, postfix='_graph.html'
|
||||
)
|
||||
save_graph_report(filename, general_results, db)
|
||||
query_notify.warning(f'Graph report on all usernames saved in {filename}')
|
||||
|
||||
text_report = get_plaintext_report(report_context)
|
||||
if text_report:
|
||||
query_notify.info('Short text report:')
|
||||
print(text_report)
|
||||
|
||||
# update database
|
||||
db.save_to_file(args.db_file)
|
||||
db.save_to_file(db_file)
|
||||
|
||||
|
||||
def run():
|
||||
|
||||
@@ -152,6 +152,27 @@ class QueryNotifyPrint(QueryNotify):
|
||||
|
||||
return
|
||||
|
||||
def make_colored_terminal_notify(
    self, status, text, status_color, text_color, appendix
):
    """Build a colored ``[status] text: appendix`` line for the terminal.

    The brackets are bright white, the status symbol uses ``status_color``,
    the text uses ``text_color``, and styling is reset before ``appendix``.
    """
    prefix = f"{Style.BRIGHT}{Fore.WHITE}[{status_color}{status}{Fore.WHITE}]"
    body = f"{text_color} {text}: {Style.RESET_ALL}"
    return prefix + body + f"{appendix}"
|
||||
|
||||
def make_simple_terminal_notify(
    self, status, text, status_color, text_color, appendix
):
    """Build a plain ``[status] text: appendix`` line.

    The color arguments are accepted for signature parity with the colored
    variant and are ignored.
    """
    return "[{}] {}: {}".format(status, text, appendix)
|
||||
|
||||
def make_terminal_notify(self, *args):
    """Format a notification line, colored or plain depending on ``self.color``.

    All positional arguments are forwarded unchanged to the chosen builder.
    """
    builder = (
        self.make_colored_terminal_notify
        if self.color
        else self.make_simple_terminal_notify
    )
    return builder(*args)
|
||||
|
||||
def start(self, message, id_type):
|
||||
"""Notify Start.
|
||||
|
||||
@@ -184,13 +205,20 @@ class QueryNotifyPrint(QueryNotify):
|
||||
else:
|
||||
print(f"[*] {title} {message} on:")
|
||||
|
||||
def warning(self, message, symbol="-"):
|
||||
msg = f"[{symbol}] {message}"
|
||||
def _colored_print(self, fore_color, msg):
|
||||
if self.color:
|
||||
print(Style.BRIGHT + Fore.YELLOW + msg)
|
||||
print(Style.BRIGHT + fore_color + msg)
|
||||
else:
|
||||
print(msg)
|
||||
|
||||
def warning(self, message, symbol="-"):
    """Print ``[symbol] message`` as a warning (yellow when colors are enabled)."""
    self._colored_print(Fore.YELLOW, f"[{symbol}] {message}")
|
||||
|
||||
def info(self, message, symbol="*"):
    """Print ``message`` prefixed with ``[symbol]`` using the blue color."""
    self._colored_print(Fore.BLUE, f"[{symbol}] {message}")
|
||||
|
||||
def update(self, result, is_similar=False):
|
||||
"""Notify Update.
|
||||
|
||||
@@ -204,40 +232,18 @@ class QueryNotifyPrint(QueryNotify):
|
||||
Return Value:
|
||||
Nothing.
|
||||
"""
|
||||
notify = None
|
||||
self.result = result
|
||||
|
||||
if not self.result.ids_data:
|
||||
ids_data_text = ""
|
||||
else:
|
||||
ids_data_text = ""
|
||||
if self.result.ids_data:
|
||||
ids_data_text = get_dict_ascii_tree(self.result.ids_data.items(), " ")
|
||||
|
||||
def make_colored_terminal_notify(
|
||||
status, text, status_color, text_color, appendix
|
||||
):
|
||||
text = [
|
||||
f"{Style.BRIGHT}{Fore.WHITE}[{status_color}{status}{Fore.WHITE}]"
|
||||
+ f"{text_color} {text}: {Style.RESET_ALL}"
|
||||
+ f"{appendix}"
|
||||
]
|
||||
return "".join(text)
|
||||
|
||||
def make_simple_terminal_notify(status, text, appendix):
|
||||
return f"[{status}] {text}: {appendix}"
|
||||
|
||||
def make_terminal_notify(is_colored=True, *args):
|
||||
if is_colored:
|
||||
return make_colored_terminal_notify(*args)
|
||||
else:
|
||||
return make_simple_terminal_notify(*args)
|
||||
|
||||
notify = None
|
||||
|
||||
# Output to the terminal is desired.
|
||||
if result.status == QueryStatus.CLAIMED:
|
||||
color = Fore.BLUE if is_similar else Fore.GREEN
|
||||
status = "?" if is_similar else "+"
|
||||
notify = make_terminal_notify(
|
||||
self.color,
|
||||
notify = self.make_terminal_notify(
|
||||
status,
|
||||
result.site_name,
|
||||
color,
|
||||
@@ -246,8 +252,7 @@ class QueryNotifyPrint(QueryNotify):
|
||||
)
|
||||
elif result.status == QueryStatus.AVAILABLE:
|
||||
if not self.print_found_only:
|
||||
notify = make_terminal_notify(
|
||||
self.color,
|
||||
notify = self.make_terminal_notify(
|
||||
"-",
|
||||
result.site_name,
|
||||
Fore.RED,
|
||||
@@ -256,8 +261,7 @@ class QueryNotifyPrint(QueryNotify):
|
||||
)
|
||||
elif result.status == QueryStatus.UNKNOWN:
|
||||
if not self.skip_check_errors:
|
||||
notify = make_terminal_notify(
|
||||
self.color,
|
||||
notify = self.make_terminal_notify(
|
||||
"?",
|
||||
result.site_name,
|
||||
Fore.RED,
|
||||
@@ -267,8 +271,7 @@ class QueryNotifyPrint(QueryNotify):
|
||||
elif result.status == QueryStatus.ILLEGAL:
|
||||
if not self.print_found_only:
|
||||
text = "Illegal Username Format For This Site!"
|
||||
notify = make_terminal_notify(
|
||||
self.color,
|
||||
notify = self.make_terminal_notify(
|
||||
"-",
|
||||
result.site_name,
|
||||
Fore.RED,
|
||||
@@ -286,7 +289,7 @@ class QueryNotifyPrint(QueryNotify):
|
||||
sys.stdout.write("\x1b[1K\r")
|
||||
print(notify)
|
||||
|
||||
return
|
||||
return notify
|
||||
|
||||
def __str__(self):
|
||||
"""Convert Object To String.
|
||||
|
||||
@@ -1,19 +1,19 @@
|
||||
import ast
|
||||
import csv
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from argparse import ArgumentTypeError
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
|
||||
import pycountry
|
||||
import xmind
|
||||
from dateutil.parser import parse as parse_datetime_str
|
||||
from jinja2 import Template
|
||||
from xhtml2pdf import pisa
|
||||
|
||||
from .checking import SUPPORTED_IDS
|
||||
from .result import QueryStatus
|
||||
from .sites import MaigretDatabase
|
||||
from .utils import is_country_tag, CaseConverter, enrich_link_str
|
||||
|
||||
SUPPORTED_JSON_REPORT_FORMATS = [
|
||||
@@ -37,6 +37,18 @@ def filter_supposed_data(data):
|
||||
return filtered_supposed_data
|
||||
|
||||
|
||||
def sort_report_by_data_points(results):
    """Return ``results`` as a dict ordered by extracted data points, descending.

    The number of data points of an entry is the number of keys in its
    ``status.ids_data``; entries without a status or without ids data
    count as zero.
    """

    def data_points_count(item):
        status = item[1].get('status')
        ids_data = status.ids_data if status else None
        return len((ids_data or {}).keys())

    ranked = sorted(results.items(), key=data_points_count, reverse=True)
    return dict(ranked)
|
||||
|
||||
|
||||
"""
|
||||
REPORTS SAVING
|
||||
"""
|
||||
@@ -62,6 +74,10 @@ def save_html_report(filename: str, context: dict):
|
||||
def save_pdf_report(filename: str, context: dict):
    """Render the PDF report template with ``context`` and write it to ``filename``."""
    # moved here to speed up the launch of Maigret
    from xhtml2pdf import pisa

    template, css = generate_report_template(is_pdf=True)
    rendered_html = template.render(**context)

    with open(filename, "w+b") as pdf_file:
        pisa.pisaDocument(io.StringIO(rendered_html), dest=pdf_file, default_css=css)
|
||||
|
||||
@@ -71,6 +87,142 @@ def save_json_report(filename: str, username: str, results: dict, report_type: s
|
||||
generate_json_report(username, results, f, report_type=report_type)
|
||||
|
||||
|
||||
class MaigretGraph:
    """Helper that adds typed nodes and weighted edges to a wrapped graph object.

    The wrapped object only needs ``add_node`` and ``add_edge`` methods.
    """

    # node attributes for each kind of node
    other_params = {'size': 10, 'group': 3}
    site_params = {'size': 15, 'group': 2}
    username_params = {'size': 20, 'group': 1}

    def __init__(self, graph):
        self.G = graph

    def add_node(self, key, value):
        """Add a ``"key: value"`` node and return its name.

        Keys listed in ``SUPPORTED_IDS`` get the username attributes, values
        starting with ``http`` get the site attributes, everything else gets
        the default ones. When ``value`` is not all lowercase, a second node
        for the lowercased value is added and linked to the first.
        """
        name = f'{key}: {value}'

        if key in SUPPORTED_IDS:
            attrs = self.username_params
        elif value.startswith('http'):
            attrs = self.site_params
        else:
            attrs = self.other_params

        self.G.add_node(name, title=name, **attrs)

        lowered = value.lower()
        if value != lowered:
            lowered_name = self.add_node(key, lowered)
            self.link(name, lowered_name)

        return name

    def link(self, node1_name, node2_name):
        """Connect two nodes with an edge of weight 2."""
        self.G.add_edge(node1_name, node2_name, weight=2)
|
||||
|
||||
|
||||
def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
    """Build a graph of usernames, claimed accounts and extracted ids; save it as HTML.

    For every ``(username, id_type, results)`` entry a username node is
    created and linked to a node for each site whose status is
    ``QueryStatus.CLAIMED``; ids extracted for that site are attached below
    the site node. Nodes whose string form is longer than 100 characters
    are removed before rendering.

    Args:
        filename: path of the HTML file passed to pyvis ``Network.show``.
        username_results: iterable of ``(username, id_type, results)``
            tuples, where ``results`` maps a site name to its result dict.
        db: database used to extract additional ids from URL-like values.
    """
    # moved here to speed up the launch of Maigret
    import networkx as nx

    G = nx.Graph()
    graph = MaigretGraph(G)

    for username, id_type, results in username_results:
        username_node_name = graph.add_node(id_type, username)

        for website_name in results:
            dictionary = results[website_name]
            # TODO: fix no site data issue
            if not dictionary or dictionary.get("is_similar"):
                continue

            status = dictionary.get("status")
            # FIXME: status is currently missing in case of timeout
            if not status or status.status != QueryStatus.CLAIMED:
                continue

            site_fallback_name = dictionary.get(
                'url_user', f'{website_name}: {username.lower()}'
            )
            site_node_name = graph.add_node('site', site_fallback_name)
            graph.link(username_node_name, site_node_name)

            def process_ids(parent_node, ids):
                """Attach every id in ``ids`` under ``parent_node``, recursing into ids found in URLs."""
                for k, v in ids.items():
                    # counters, flags and timestamps are not worth a node
                    if k.endswith('_count') or k.startswith('is_') or k.endswith('_at'):
                        continue
                    # exact comparison: a substring test (``k in 'image'``)
                    # would also drop keys such as "age"
                    if k == 'image':
                        continue

                    v_data = v
                    # lists may arrive serialized as strings; real lists are
                    # used as is instead of failing on ``.startswith``
                    if isinstance(v, str) and v.startswith('['):
                        try:
                            v_data = ast.literal_eval(v)
                        except Exception as e:
                            logging.error(e)

                    if isinstance(v_data, list):
                        # value is a list
                        # NOTE(review): the list node is not linked to
                        # parent_node -- confirm whether that is intended
                        list_node_name = graph.add_node(k, site_fallback_name)
                        for vv in v_data:
                            data_node_name = graph.add_node(vv, site_fallback_name)
                            graph.link(list_node_name, data_node_name)

                            add_ids = {
                                found_type: found_id
                                for found_id, found_type in db.extract_ids_from_url(
                                    vv
                                ).items()
                            }
                            if add_ids:
                                process_ids(data_node_name, add_ids)
                    else:
                        # value is just a string
                        ids_data_name = graph.add_node(k, v)
                        graph.link(parent_node, ids_data_name)

                        # check for username
                        if 'username' in k or k in SUPPORTED_IDS:
                            new_username_node_name = graph.add_node('username', v)
                            graph.link(ids_data_name, new_username_node_name)

                        add_ids = {
                            found_type: found_id
                            for found_id, found_type in db.extract_ids_from_url(
                                v
                            ).items()
                        }
                        if add_ids:
                            process_ids(ids_data_name, add_ids)

            if status.ids_data:
                process_ids(site_node_name, status.ids_data)

    # collect first, remove afterwards: the node view must not change
    # while it is being iterated
    nodes_to_remove = [node for node in G.nodes if len(str(node)) > 100]
    for node in nodes_to_remove:
        G.remove_node(node)

    # moved here to speed up the launch of Maigret
    from pyvis.network import Network

    nt = Network(notebook=True, height="750px", width="100%")
    nt.from_nx(G)
    nt.show(filename)
|
||||
|
||||
|
||||
def get_plaintext_report(context: dict) -> str:
    """Compose a short plain-text summary from a report context.

    The ``brief`` text is split so every sentence ending in ``". "`` goes on
    its own line; the first elements of ``countries_tuple_list`` and
    ``interests_tuple_list`` (when non-empty) are appended as
    comma-separated lines. Surrounding whitespace is stripped.
    """
    report = (context['brief'] + " ").replace('. ', '.\n')

    interests = [item[0] for item in context.get('interests_tuple_list', [])]
    countries = [item[0] for item in context.get('countries_tuple_list', [])]

    if countries:
        report += f'Countries: {", ".join(countries)}\n'
    if interests:
        report += f'Interests (tags): {", ".join(interests)}\n'

    return report.strip()
|
||||
|
||||
|
||||
"""
|
||||
REPORTS GENERATING
|
||||
"""
|
||||
@@ -108,6 +260,9 @@ def generate_report_context(username_results: list):
|
||||
|
||||
first_seen = None
|
||||
|
||||
# moved here to speed up the launch of Maigret
|
||||
import pycountry
|
||||
|
||||
for username, id_type, results in username_results:
|
||||
found_accounts = 0
|
||||
new_ids = []
|
||||
@@ -216,6 +371,7 @@ def generate_report_context(username_results: list):
|
||||
|
||||
return {
|
||||
"username": first_username,
|
||||
# TODO: return brief list
|
||||
"brief": brief,
|
||||
"results": username_results,
|
||||
"first_seen": first_seen,
|
||||
@@ -232,14 +388,18 @@ def generate_csv_report(username: str, results: dict, csvfile):
|
||||
["username", "name", "url_main", "url_user", "exists", "http_status"]
|
||||
)
|
||||
for site in results:
|
||||
# TODO: fix the reason
|
||||
status = 'Unknown'
|
||||
if "status" in results[site]:
|
||||
status = str(results[site]["status"].status)
|
||||
writer.writerow(
|
||||
[
|
||||
username,
|
||||
site,
|
||||
results[site]["url_main"],
|
||||
results[site]["url_user"],
|
||||
str(results[site]["status"].status),
|
||||
results[site]["http_status"],
|
||||
results[site].get("url_main", ""),
|
||||
results[site].get("url_user", ""),
|
||||
status,
|
||||
results[site].get("http_status", 0),
|
||||
]
|
||||
)
|
||||
|
||||
@@ -251,7 +411,10 @@ def generate_txt_report(username: str, results: dict, file):
|
||||
# TODO: fix no site data issue
|
||||
if not dictionary:
|
||||
continue
|
||||
if dictionary.get("status").status == QueryStatus.CLAIMED:
|
||||
if (
|
||||
dictionary.get("status")
|
||||
and dictionary["status"].status == QueryStatus.CLAIMED
|
||||
):
|
||||
exists_counter += 1
|
||||
file.write(dictionary["url_user"] + "\n")
|
||||
file.write(f"Total Websites Username Detected On : {exists_counter}")
|
||||
@@ -264,11 +427,18 @@ def generate_json_report(username: str, results: dict, file, report_type):
|
||||
for sitename in results:
|
||||
site_result = results[sitename]
|
||||
# TODO: fix no site data issue
|
||||
if not site_result or site_result.get("status").status != QueryStatus.CLAIMED:
|
||||
if not site_result or not site_result.get("status"):
|
||||
continue
|
||||
|
||||
if site_result["status"].status != QueryStatus.CLAIMED:
|
||||
continue
|
||||
|
||||
data = dict(site_result)
|
||||
data["status"] = data["status"].json()
|
||||
data["site"] = data["site"].json
|
||||
for field in ["future", "checker"]:
|
||||
if field in data:
|
||||
del data[field]
|
||||
|
||||
if is_report_per_line:
|
||||
data["sitename"] = sitename
|
||||
@@ -290,11 +460,20 @@ def save_xmind_report(filename, username, results):
|
||||
os.remove(filename)
|
||||
workbook = xmind.load(filename)
|
||||
sheet = workbook.getPrimarySheet()
|
||||
design_sheet(sheet, username, results)
|
||||
design_xmind_sheet(sheet, username, results)
|
||||
xmind.save(workbook, path=filename)
|
||||
|
||||
|
||||
def design_sheet(sheet, username, results):
|
||||
def add_xmind_subtopic(userlink, k, v, supposed_data):
    """Add a ``"k: v"`` subtopic under ``userlink`` and record ``v`` in ``supposed_data``.

    Values are collected per field name; the key ``"name"`` is stored under
    ``"fullname"``.
    """
    subtopic = userlink.addSubTopic()
    bucket = "fullname" if k == "name" else k
    supposed_data.setdefault(bucket, []).append(v)
    subtopic.setTitle("%s: %s" % (k, v))
|
||||
|
||||
|
||||
def design_xmind_sheet(sheet, username, results):
|
||||
alltags = {}
|
||||
supposed_data = {}
|
||||
|
||||
@@ -308,64 +487,45 @@ def design_sheet(sheet, username, results):
|
||||
|
||||
for website_name in results:
|
||||
dictionary = results[website_name]
|
||||
if not dictionary:
|
||||
continue
|
||||
result_status = dictionary.get("status")
|
||||
# TODO: fix the reason
|
||||
if not result_status or result_status.status != QueryStatus.CLAIMED:
|
||||
continue
|
||||
|
||||
if dictionary.get("status").status == QueryStatus.CLAIMED:
|
||||
# firsttime I found that entry
|
||||
for tag in dictionary.get("status").tags:
|
||||
if tag.strip() == "":
|
||||
continue
|
||||
if tag not in alltags.keys():
|
||||
if not is_country_tag(tag):
|
||||
tagsection = root_topic1.addSubTopic()
|
||||
tagsection.setTitle(tag)
|
||||
alltags[tag] = tagsection
|
||||
stripped_tags = list(map(lambda x: x.strip(), result_status.tags))
|
||||
normalized_tags = list(
|
||||
filter(lambda x: x and not is_country_tag(x), stripped_tags)
|
||||
)
|
||||
|
||||
category = None
|
||||
for tag in dictionary.get("status").tags:
|
||||
if tag.strip() == "":
|
||||
continue
|
||||
if not is_country_tag(tag):
|
||||
category = tag
|
||||
category = None
|
||||
for tag in normalized_tags:
|
||||
if tag in alltags.keys():
|
||||
continue
|
||||
tagsection = root_topic1.addSubTopic()
|
||||
tagsection.setTitle(tag)
|
||||
alltags[tag] = tagsection
|
||||
category = tag
|
||||
|
||||
if category is None:
|
||||
userlink = undefinedsection.addSubTopic()
|
||||
userlink.addLabel(dictionary.get("status").site_url_user)
|
||||
section = alltags[category] if category else undefinedsection
|
||||
userlink = section.addSubTopic()
|
||||
userlink.addLabel(result_status.site_url_user)
|
||||
|
||||
ids_data = result_status.ids_data or {}
|
||||
for k, v in ids_data.items():
|
||||
# suppose target data
|
||||
if isinstance(v, list):
|
||||
for currentval in v:
|
||||
add_xmind_subtopic(userlink, k, currentval, supposed_data)
|
||||
else:
|
||||
userlink = alltags[category].addSubTopic()
|
||||
userlink.addLabel(dictionary.get("status").site_url_user)
|
||||
add_xmind_subtopic(userlink, k, v, supposed_data)
|
||||
|
||||
if dictionary.get("status").ids_data:
|
||||
for k, v in dictionary.get("status").ids_data.items():
|
||||
# suppose target data
|
||||
if not isinstance(v, list):
|
||||
currentsublabel = userlink.addSubTopic()
|
||||
field = "fullname" if k == "name" else k
|
||||
if field not in supposed_data:
|
||||
supposed_data[field] = []
|
||||
supposed_data[field].append(v)
|
||||
currentsublabel.setTitle("%s: %s" % (k, v))
|
||||
else:
|
||||
for currentval in v:
|
||||
currentsublabel = userlink.addSubTopic()
|
||||
field = "fullname" if k == "name" else k
|
||||
if field not in supposed_data:
|
||||
supposed_data[field] = []
|
||||
supposed_data[field].append(currentval)
|
||||
currentsublabel.setTitle("%s: %s" % (k, currentval))
|
||||
# add supposed data
|
||||
filterede_supposed_data = filter_supposed_data(supposed_data)
|
||||
if len(filterede_supposed_data) > 0:
|
||||
filtered_supposed_data = filter_supposed_data(supposed_data)
|
||||
if len(filtered_supposed_data) > 0:
|
||||
undefinedsection = root_topic1.addSubTopic()
|
||||
undefinedsection.setTitle("SUPPOSED DATA")
|
||||
for k, v in filterede_supposed_data.items():
|
||||
for k, v in filtered_supposed_data.items():
|
||||
currentsublabel = undefinedsection.addSubTopic()
|
||||
currentsublabel.setTitle("%s: %s" % (k, v))
|
||||
|
||||
|
||||
def check_supported_json_format(value):
    """Validate a JSON report type given on the command line.

    Returns ``value`` unchanged when it is empty or listed in
    ``SUPPORTED_JSON_REPORT_FORMATS``; raises ``ArgumentTypeError``
    otherwise.
    """
    if not value or value in SUPPORTED_JSON_REPORT_FORMATS:
        return value

    raise ArgumentTypeError(
        "JSON report type must be one of the following types: "
        + ", ".join(SUPPORTED_JSON_REPORT_FORMATS)
    )
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
{
|
||||
"presence_strings": [
|
||||
"username",
|
||||
"not found",
|
||||
"пользователь",
|
||||
"profile",
|
||||
"lastname",
|
||||
"firstname",
|
||||
"biography",
|
||||
"birthday",
|
||||
"репутация",
|
||||
"информация",
|
||||
"e-mail"
|
||||
],
|
||||
"supposed_usernames": [
|
||||
"alex", "god", "admin", "red", "blue", "john"
|
||||
],
|
||||
"retries_count": 1,
|
||||
"sites_db_path": "resources/data.json",
|
||||
"timeout": 30,
|
||||
"max_connections": 100,
|
||||
"recursive_search": true,
|
||||
"info_extracting": true,
|
||||
"cookie_jar_file": null,
|
||||
"ignore_ids_list": [],
|
||||
"reports_path": "reports",
|
||||
"proxy_url": null,
|
||||
"tor_proxy_url": "socks5://127.0.0.1:9050",
|
||||
"i2p_proxy_url": "http://127.0.0.1:4444",
|
||||
"domain_search": false,
|
||||
"scan_all_sites": false,
|
||||
"top_sites_count": 500,
|
||||
"scan_disabled_sites": false,
|
||||
"scan_sites_list": [],
|
||||
"self_check_enabled": false,
|
||||
"print_not_found": false,
|
||||
"print_check_errors": false,
|
||||
"colored_print": true,
|
||||
"show_progressbar": true,
|
||||
"report_sorting": "default",
|
||||
"json_report_type": "",
|
||||
"txt_report": false,
|
||||
"csv_report": false,
|
||||
"xmind_report": false,
|
||||
"graph_report": false,
|
||||
"pdf_report": false,
|
||||
"html_report": false
|
||||
}
|
||||
@@ -68,7 +68,8 @@
|
||||
<div class="row-mb">
|
||||
<div class="col-md">
|
||||
<div class="card flex-md-row mb-4 box-shadow h-md-250">
|
||||
<img class="card-img-right flex-auto d-md-block" alt="Photo" style="width: 200px; height: 200px; object-fit: scale-down;" src="{{ v.status.ids_data.image or 'https://i.imgur.com/040fmbw.png' }}" data-holder-rendered="true">
|
||||
<span style="position: absolute; right: 10px;"><a href="https://github.com/soxoj/maigret/issues/new?assignees=soxoj&labels=bug&template=report-false-result.md&title=Invalid%20result%20{{ v.url_user }}">Invalid?</a></span>
|
||||
<img class="card-img-right flex-auto d-md-block" alt="Photo" style="width: 200px; height: 200px; object-fit: scale-down;" src="{{ v.status and v.status.ids_data and v.status.ids_data.image or 'https://i.imgur.com/040fmbw.png' }}" data-holder-rendered="true">
|
||||
<div class="card-body d-flex flex-column align-items-start" style="padding-top: 0;">
|
||||
<h3 class="mb-0" style="padding-top: 1rem;">
|
||||
<a class="text-dark" href="{{ v.url_main }}" target="_blank">{{ k }}</a>
|
||||
|
||||
@@ -38,4 +38,8 @@ div {
|
||||
border-bottom-color: #3e3e3e;
|
||||
border-bottom-width: 1px;
|
||||
border-bottom-style: solid;
|
||||
}
|
||||
.invalid-button {
|
||||
position: absolute;
|
||||
left: 10px;
|
||||
}
|
||||
@@ -64,6 +64,7 @@
|
||||
<div class="sitebox" style="margin-top: 20px;" >
|
||||
<div>
|
||||
<div>
|
||||
<span class="invalid-button"><a href="https://github.com/soxoj/maigret/issues/new?assignees=soxoj&labels=bug&template=report-false-result.md&title=Invalid%20result%20{{ v.url_user }}">Invalid?</a></span>
|
||||
<table>
|
||||
<tr>
|
||||
<td valign="top">
|
||||
|
||||
@@ -0,0 +1,85 @@
|
||||
import os
|
||||
import os.path as path
|
||||
import json
|
||||
from typing import List
|
||||
|
||||
SETTINGS_FILES_PATHS = [
|
||||
path.join(path.dirname(path.realpath(__file__)), "resources/settings.json"),
|
||||
'~/.maigret/settings.json',
|
||||
path.join(os.getcwd(), 'settings.json'),
|
||||
]
|
||||
|
||||
|
||||
class Settings:
    """Maigret settings loaded from one or more JSON files.

    Every key of a loaded file becomes an instance attribute; files loaded
    later override keys set by earlier ones.
    """

    # main maigret setting
    retries_count: int
    sites_db_path: str
    timeout: int
    max_connections: int
    recursive_search: bool
    info_extracting: bool
    cookie_jar_file: str
    ignore_ids_list: List
    reports_path: str
    proxy_url: str
    tor_proxy_url: str
    i2p_proxy_url: str
    domain_search: bool
    scan_all_sites: bool
    top_sites_count: int
    scan_disabled_sites: bool
    scan_sites_list: List
    self_check_enabled: bool
    print_not_found: bool
    print_check_errors: bool
    colored_print: bool
    show_progressbar: bool
    report_sorting: str
    json_report_type: str
    txt_report: bool
    csv_report: bool
    xmind_report: bool
    pdf_report: bool
    html_report: bool
    graph_report: bool

    # submit mode settings
    presence_strings: list
    supposed_usernames: list

    def __init__(self):
        pass

    def load(self, paths=None):
        """Load settings from JSON files, later files overriding earlier ones.

        Args:
            paths: file paths to read in order; ``SETTINGS_FILES_PATHS`` is
                used when empty or ``None``. A leading ``~`` is expanded to
                the user's home directory.

        Returns:
            A ``(was_inited, info)`` tuple. ``was_inited`` is True when at
            least one file provided data. ``info`` is a ``ValueError``
            instance when a file could not be read or parsed (loading stops
            there), otherwise a message string that is only meaningful when
            ``was_inited`` is False.
        """
        was_inited = False

        if not paths:
            paths = SETTINGS_FILES_PATHS

        for filename in paths:
            data = {}

            try:
                # open() does not expand "~", so paths such as
                # "~/.maigret/settings.json" must be expanded explicitly
                with open(path.expanduser(filename), "r", encoding="utf-8") as file:
                    data = json.load(file)
            except FileNotFoundError:
                # treat as a normal situation
                pass
            except Exception as error:
                return False, ValueError(
                    f"Problem with parsing json contents of "
                    f"settings file '{filename}': {str(error)}."
                )

            self.__dict__.update(data)
            if data:
                was_inited = True

        return (
            was_inited,
            f'None of the default settings files found: {", ".join(paths)}',
        )

    @property
    def json(self):
        """The loaded settings as a dictionary (the instance ``__dict__`` itself)."""
        return self.__dict__
|
||||
@@ -3,58 +3,10 @@
|
||||
import copy
|
||||
import json
|
||||
import sys
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
import requests
|
||||
from typing import Optional, List, Dict, Any, Tuple
|
||||
|
||||
from .utils import CaseConverter, URLMatcher, is_country_tag
|
||||
|
||||
# TODO: move to data.json
|
||||
SUPPORTED_TAGS = [
|
||||
"gaming",
|
||||
"coding",
|
||||
"photo",
|
||||
"music",
|
||||
"blog",
|
||||
"finance",
|
||||
"freelance",
|
||||
"dating",
|
||||
"tech",
|
||||
"forum",
|
||||
"porn",
|
||||
"erotic",
|
||||
"webcam",
|
||||
"video",
|
||||
"movies",
|
||||
"hacking",
|
||||
"art",
|
||||
"discussion",
|
||||
"sharing",
|
||||
"writing",
|
||||
"wiki",
|
||||
"business",
|
||||
"shopping",
|
||||
"sport",
|
||||
"books",
|
||||
"news",
|
||||
"documents",
|
||||
"travel",
|
||||
"maps",
|
||||
"hobby",
|
||||
"apps",
|
||||
"classified",
|
||||
"career",
|
||||
"geosocial",
|
||||
"streaming",
|
||||
"education",
|
||||
"networking",
|
||||
"torrent",
|
||||
"science",
|
||||
"medicine",
|
||||
"reading",
|
||||
"stock",
|
||||
]
|
||||
|
||||
|
||||
class MaigretEngine:
|
||||
site: Dict[str, Any] = {}
|
||||
@@ -110,6 +62,8 @@ class MaigretSite:
|
||||
alexa_rank = None
|
||||
source = None
|
||||
|
||||
protocol = ''
|
||||
|
||||
def __init__(self, name, information):
|
||||
self.name = name
|
||||
self.url_subpath = ""
|
||||
@@ -146,6 +100,19 @@ class MaigretSite:
|
||||
|
||||
return None
|
||||
|
||||
def extract_id_from_url(self, url: str) -> Optional[Tuple[str, str]]:
    """Extract an identifier from ``url`` using the site's URL regexp.

    Returns an ``(id, type)`` tuple, where the id is the last regexp group
    with trailing slashes removed and the type is the site's ``type``;
    returns ``None`` when the site has no regexp or the URL does not match.
    """
    matched = self.url_regexp.match(url) if self.url_regexp else None
    if not matched:
        return None

    return matched.groups()[-1].rstrip("/"), self.type
|
||||
|
||||
@property
|
||||
def pretty_name(self):
|
||||
if self.source:
|
||||
@@ -167,6 +134,25 @@ class MaigretSite:
|
||||
|
||||
return result
|
||||
|
||||
@property
def errors_dict(self) -> dict:
    """Error definitions of the site merged over those of its engine.

    Entries from ``self.errors`` override engine entries with the same key.
    """
    engine_errors = self.engine_obj.site.get('errors', {}) if self.engine_obj else {}
    merged: Dict[str, str] = dict(engine_errors)
    merged.update(self.errors)
    return merged
|
||||
|
||||
def get_url_template(self) -> str:
    """Classify the site's profile URL into a template string.

    Returns ``"SUBDOMAIN"`` when the main part of the URL starts with the
    ``{username}`` placeholder, the full URL plus engine name when the main
    part is empty, and otherwise the main part without its first
    ``/``-separated segment, prefixed with ``/``.
    """
    main_part = URLMatcher.extract_main_part(self.url)

    if main_part.startswith("{username}"):
        return "SUBDOMAIN"
    if main_part == "":
        return f"{self.url} ({self.engine or 'no engine'})"

    return "/" + "/".join(main_part.split("/")[1:])
|
||||
|
||||
def update(self, updates: "dict") -> "MaigretSite":
|
||||
self.__dict__.update(updates)
|
||||
self.update_detectors()
|
||||
@@ -225,8 +211,9 @@ class MaigretSite:
|
||||
|
||||
class MaigretDatabase:
|
||||
def __init__(self):
|
||||
self._sites = []
|
||||
self._engines = []
|
||||
self._tags: list = []
|
||||
self._sites: list = []
|
||||
self._engines: list = []
|
||||
|
||||
@property
|
||||
def sites(self):
|
||||
@@ -257,12 +244,18 @@ class MaigretDatabase:
|
||||
lambda x: isinstance(x.engine, str) and x.engine.lower() in normalized_tags
|
||||
)
|
||||
is_tags_ok = lambda x: set(x.tags).intersection(set(normalized_tags))
|
||||
is_protocol_in_tags = lambda x: x.protocol and x.protocol in normalized_tags
|
||||
is_disabled_needed = lambda x: not x.disabled or (
|
||||
"disabled" in tags or disabled
|
||||
)
|
||||
is_id_type_ok = lambda x: x.type == id_type
|
||||
|
||||
filter_tags_engines_fun = lambda x: not tags or is_engine_ok(x) or is_tags_ok(x)
|
||||
filter_tags_engines_fun = (
|
||||
lambda x: not tags
|
||||
or is_engine_ok(x)
|
||||
or is_tags_ok(x)
|
||||
or is_protocol_in_tags(x)
|
||||
)
|
||||
filter_names_fun = lambda x: not names or is_name_ok(x) or is_source_ok(x)
|
||||
|
||||
filter_fun = (
|
||||
@@ -297,9 +290,13 @@ class MaigretDatabase:
|
||||
return self
|
||||
|
||||
def save_to_file(self, filename: str) -> "MaigretDatabase":
|
||||
if '://' in filename:
|
||||
return self
|
||||
|
||||
db_data = {
|
||||
"sites": {site.name: site.strip_engine_data().json for site in self._sites},
|
||||
"engines": {engine.name: engine.json for engine in self._engines},
|
||||
"tags": self._tags,
|
||||
}
|
||||
|
||||
json_data = json.dumps(db_data, indent=4)
|
||||
@@ -313,6 +310,9 @@ class MaigretDatabase:
|
||||
# Add all of site information from the json file to internal site list.
|
||||
site_data = json_data.get("sites", {})
|
||||
engines_data = json_data.get("engines", {})
|
||||
tags = json_data.get("tags", [])
|
||||
|
||||
self._tags += tags
|
||||
|
||||
for engine_name in engines_data:
|
||||
self._engines.append(MaigretEngine(engine_name, engines_data[engine_name]))
|
||||
@@ -345,12 +345,20 @@ class MaigretDatabase:
|
||||
|
||||
return self.load_from_json(data)
|
||||
|
||||
def load_from_url(self, url: str) -> "MaigretDatabase":
|
||||
def load_from_path(self, path: str) -> "MaigretDatabase":
    """Load the database from ``path``: over HTTP when it contains ``://``, else from a file."""
    loader = self.load_from_http if '://' in path else self.load_from_file
    return loader(path)
|
||||
|
||||
def load_from_http(self, url: str) -> "MaigretDatabase":
|
||||
is_url_valid = url.startswith("http://") or url.startswith("https://")
|
||||
|
||||
if not is_url_valid:
|
||||
raise FileNotFoundError(f"Invalid data file URL '{url}'.")
|
||||
|
||||
import requests
|
||||
|
||||
try:
|
||||
response = requests.get(url=url)
|
||||
except Exception as error:
|
||||
@@ -401,51 +409,66 @@ class MaigretDatabase:
|
||||
|
||||
return found_flags
|
||||
|
||||
def get_db_stats(self, sites_dict):
|
||||
if not sites_dict:
|
||||
sites_dict = self.sites_dict()
|
||||
def extract_ids_from_url(self, url: str) -> dict:
    """Collect the ids that the known sites can extract from ``url``.

    Returns a dict mapping each extracted id to its type; sites that
    extract nothing are skipped.
    """
    extracted = (site.extract_id_from_url(url) for site in self._sites)
    return {_id: _type for _id, _type in filter(None, extracted)}
|
||||
|
||||
def get_db_stats(self, is_markdown=False):
|
||||
sites_dict = self.sites_dict
|
||||
|
||||
urls = {}
|
||||
tags = {}
|
||||
output = ""
|
||||
disabled_count = 0
|
||||
total_count = len(sites_dict)
|
||||
urls = {}
|
||||
tags = {}
|
||||
|
||||
message_checks = 0
|
||||
message_checks_one_factor = 0
|
||||
|
||||
for _, site in sites_dict.items():
|
||||
if site.disabled:
|
||||
disabled_count += 1
|
||||
|
||||
url = URLMatcher.extract_main_part(site.url)
|
||||
if url.startswith("{username}"):
|
||||
url = "SUBDOMAIN"
|
||||
elif url == "":
|
||||
url = f"{site.url} ({site.engine})"
|
||||
else:
|
||||
parts = url.split("/")
|
||||
url = "/" + "/".join(parts[1:])
|
||||
url_type = site.get_url_template()
|
||||
urls[url_type] = urls.get(url_type, 0) + 1
|
||||
|
||||
urls[url] = urls.get(url, 0) + 1
|
||||
if site.check_type == 'message' and not site.disabled:
|
||||
message_checks += 1
|
||||
if site.absence_strs and site.presense_strs:
|
||||
continue
|
||||
message_checks_one_factor += 1
|
||||
|
||||
if not site.tags:
|
||||
tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1
|
||||
|
||||
for tag in site.tags:
|
||||
if is_country_tag(tag):
|
||||
# currently do not display country tags
|
||||
continue
|
||||
for tag in filter(lambda x: not is_country_tag(x), site.tags):
|
||||
tags[tag] = tags.get(tag, 0) + 1
|
||||
|
||||
output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n"
|
||||
output += "Top sites' profile URLs:\n"
|
||||
for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:20]:
|
||||
enabled_perc = round(100*(total_count-disabled_count)/total_count, 2)
|
||||
output += f"Enabled/total sites: {total_count - disabled_count}/{total_count} = {enabled_perc}%\n\n"
|
||||
|
||||
checks_perc = round(100*message_checks_one_factor/message_checks, 2)
|
||||
output += f"Incomplete checks: {message_checks_one_factor}/{message_checks} = {checks_perc}% (false positive risks)\n\n"
|
||||
|
||||
top_urls_count = 20
|
||||
output += f"Top {top_urls_count} profile URLs:\n"
|
||||
for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:top_urls_count]:
|
||||
if count == 1:
|
||||
break
|
||||
output += f"{count}\t{url}\n"
|
||||
output += "Top sites' tags:\n"
|
||||
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True):
|
||||
output += f"- ({count})\t`{url}`\n" if is_markdown else f"{count}\t{url}\n"
|
||||
|
||||
top_tags_count = 20
|
||||
output += f"\nTop {top_tags_count} tags:\n"
|
||||
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:top_tags_count]:
|
||||
mark = ""
|
||||
if tag not in SUPPORTED_TAGS:
|
||||
if tag not in self._tags:
|
||||
mark = " (non-standard)"
|
||||
output += f"{count}\t{tag}{mark}\n"
|
||||
output += f"- ({count})\t`{tag}`{mark}\n" if is_markdown else f"{count}\t{tag}{mark}\n"
|
||||
|
||||
return output
|
||||
|
||||
@@ -1,336 +1,407 @@
|
||||
import asyncio
|
||||
import difflib
|
||||
import json
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
from aiohttp import TCPConnector, ClientSession
|
||||
import requests
|
||||
|
||||
from .activation import import_aiohttp_cookies
|
||||
from .checking import maigret
|
||||
from .result import QueryStatus
|
||||
from .settings import Settings
|
||||
from .sites import MaigretDatabase, MaigretSite, MaigretEngine
|
||||
from .utils import get_random_user_agent
|
||||
from .utils import get_random_user_agent, get_match_ratio
|
||||
|
||||
|
||||
DESIRED_STRINGS = [
|
||||
"username",
|
||||
"not found",
|
||||
"пользователь",
|
||||
"profile",
|
||||
"lastname",
|
||||
"firstname",
|
||||
"biography",
|
||||
"birthday",
|
||||
"репутация",
|
||||
"информация",
|
||||
"e-mail",
|
||||
]
|
||||
|
||||
SUPPOSED_USERNAMES = ["alex", "god", "admin", "red", "blue", "john"]
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": get_random_user_agent(),
|
||||
}
|
||||
|
||||
RATIO = 0.6
|
||||
TOP_FEATURES = 5
|
||||
URL_RE = re.compile(r"https?://(www\.)?")
|
||||
|
||||
|
||||
def get_match_ratio(x):
    """Return the best similarity ratio between ``x`` (lowercased) and any of ``DESIRED_STRINGS``, rounded to 2 digits."""
    ratios = (
        difflib.SequenceMatcher(a=x.lower(), b=candidate).ratio()
        for candidate in DESIRED_STRINGS
    )
    return round(max(ratios), 2)
|
||||
|
||||
|
||||
def extract_mainpage_url(url):
    """Return the first three ``/``-separated parts of ``url`` joined back with ``/``."""
    leading_parts = url.split("/", 3)[:3]
    return "/".join(leading_parts)
|
||||
|
||||
|
||||
async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
|
||||
changes = {
|
||||
"disabled": False,
|
||||
class Submitter:
|
||||
HEADERS = {
|
||||
"User-Agent": get_random_user_agent(),
|
||||
}
|
||||
|
||||
check_data = [
|
||||
(site.username_claimed, QueryStatus.CLAIMED),
|
||||
(site.username_unclaimed, QueryStatus.AVAILABLE),
|
||||
]
|
||||
SEPARATORS = "\"'"
|
||||
|
||||
logger.info(f"Checking {site.name}...")
|
||||
RATIO = 0.6
|
||||
TOP_FEATURES = 5
|
||||
URL_RE = re.compile(r"https?://(www\.)?")
|
||||
|
||||
for username, status in check_data:
|
||||
results_dict = await maigret(
|
||||
username=username,
|
||||
site_dict={site.name: site},
|
||||
logger=logger,
|
||||
timeout=30,
|
||||
id_type=site.type,
|
||||
forced=True,
|
||||
no_progressbar=True,
|
||||
def __init__(self, db: MaigretDatabase, settings: Settings, logger, args):
    """Store the collaborators and open the aiohttp session used for requests.

    :param db: site database that is searched and later updated.
    :param settings: settings object kept on ``self.settings``.
    :param logger: logger used for diagnostics.
    :param args: parsed CLI arguments; ``proxy`` and ``cookie_file`` are read here.
    """
    self.settings = settings
    self.args = args
    self.db = db
    self.logger = logger

    # Imported locally, as in the original, so the module can be imported
    # without aiohttp_socks being loaded up front.
    from aiohttp_socks import ProxyConnector

    proxy = self.args.proxy
    cookie_jar = None
    if args.cookie_file:
        cookie_jar = import_aiohttp_cookies(args.cookie_file)

    # Certificate verification is switched off for both connector kinds.
    connector = ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=False)
    connector.verify_ssl = False
    self.session = ClientSession(
        connector=connector, trust_env=True, cookie_jar=cookie_jar
    )
|
||||
|
||||
# don't disable entries with other ids types
|
||||
# TODO: make normal checking
|
||||
if site.name not in results_dict:
|
||||
logger.info(results_dict)
|
||||
changes["disabled"] = True
|
||||
continue
|
||||
@staticmethod
|
||||
def get_alexa_rank(site_url_main):
|
||||
url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
|
||||
xml_data = requests.get(url).text
|
||||
root = ET.fromstring(xml_data)
|
||||
alexa_rank = 0
|
||||
|
||||
result = results_dict[site.name]["status"]
|
||||
try:
|
||||
alexa_rank = int(root.find('.//REACH').attrib['RANK'])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
site_status = result.status
|
||||
return alexa_rank
|
||||
|
||||
if site_status != status:
|
||||
if site_status == QueryStatus.UNKNOWN:
|
||||
msgs = site.absence_strs
|
||||
etype = site.check_type
|
||||
logger.warning(
|
||||
"Error while searching '%s' in %s: %s, %s, check type %s",
|
||||
username,
|
||||
site.name,
|
||||
result.context,
|
||||
msgs,
|
||||
etype,
|
||||
)
|
||||
# don't disable in case of available username
|
||||
if status == QueryStatus.CLAIMED:
|
||||
changes["disabled"] = True
|
||||
elif status == QueryStatus.CLAIMED:
|
||||
logger.warning(
|
||||
f"Not found `{username}` in {site.name}, must be claimed"
|
||||
)
|
||||
logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
else:
|
||||
logger.warning(f"Found `{username}` in {site.name}, must be available")
|
||||
logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
@staticmethod
|
||||
def extract_mainpage_url(url):
|
||||
return "/".join(url.split("/", 3)[:3])
|
||||
|
||||
logger.info(f"Site {site.name} checking is finished")
|
||||
async def site_self_check(self, site, semaphore, silent=False):
|
||||
changes = {
|
||||
"disabled": False,
|
||||
}
|
||||
|
||||
return changes
|
||||
|
||||
|
||||
def generate_additional_fields_dialog(engine: MaigretEngine, dialog):
    """Ask the user for extra site fields required by ``engine``.

    Only one field is handled: when the engine's URL template mentions
    ``urlSubpath`` the user is prompted for it on stdin.

    :param engine: engine whose ``site`` mapping is inspected.
    :param dialog: not used by this function.
    :return: dict with an optional ``urlSubpath`` entry (leading slash added).
    """
    fields = {}
    if 'urlSubpath' in engine.site.get('url', ''):
        msg = (
            'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
            'Enter in manually if it exists: '
        )
        # Slashes are stripped from both ends so exactly one leading slash is stored.
        subpath = input(msg).strip('/')
        if subpath:
            fields['urlSubpath'] = f'/{subpath}'
    return fields
|
||||
|
||||
|
||||
async def detect_known_engine(
    db, url_exists, url_mainpage, logger
) -> List[MaigretSite]:
    """Try to recognise the engine behind ``url_mainpage``.

    The main page is downloaded once; an engine matches when every one of its
    ``presenseStrs`` occurs in the page text. For the first matching engine a
    candidate ``MaigretSite`` is built per username to try, and the list is
    returned.

    :param db: database providing ``engines`` and ``engines_dict``.
    :param url_exists: URL of an existing profile, used to guess a username.
    :param url_mainpage: main page of the site being added.
    :param logger: logger for diagnostics.
    :return: candidate sites, or an empty list when nothing matched or the
        request failed.
    """
    try:
        r = requests.get(url_mainpage)
    except Exception as e:
        logger.warning(e)
        print("Some error while checking main page")
        return []

    for engine in db.engines:
        strs_to_check = engine.__dict__.get("presenseStrs")
        # `r` is falsy for error status codes (requests maps truthiness to
        # Response.ok), so such pages never match any engine.
        if not (strs_to_check and r and r.text):
            continue
        if not all(s in r.text for s in strs_to_check):
            continue

        engine_name = engine.__dict__.get("name")

        print(f"Detected engine {engine_name} for site {url_mainpage}")

        usernames_to_check = SUPPOSED_USERNAMES
        supposed_username = extract_username_dialog(url_exists)
        if supposed_username:
            # New list: the module-level defaults are left untouched.
            usernames_to_check = [supposed_username] + usernames_to_check

        add_fields = generate_additional_fields_dialog(engine, url_exists)

        sites = []
        for u in usernames_to_check:
            site_data = {
                "urlMain": url_mainpage,
                "name": url_mainpage.split("//")[1],
                "engine": engine_name,
                "usernameClaimed": u,
                "usernameUnclaimed": "noonewouldeverusethis7",
                **add_fields,
            }
            logger.info(site_data)

            maigret_site = MaigretSite(url_mainpage.split("/")[-1], site_data)
            maigret_site.update_from_engine(db.engines_dict[engine_name])
            sites.append(maigret_site)

        return sites

    return []
|
||||
|
||||
|
||||
def extract_username_dialog(url):
    """Guess the username from ``url`` and let the user correct it.

    The guess is the last path segment of ``url`` (trailing slashes ignored).
    The user is prompted on stdin; an empty answer accepts the guess.

    :return: the entered username, or the guess when nothing was entered.
    """
    url_parts = url.rstrip("/").split("/")
    supposed_username = url_parts[-1]
    entered_username = input(
        f'Is "{supposed_username}" a valid username? If not, write it manually: '
    )
    return entered_username or supposed_username
|
||||
|
||||
|
||||
async def check_features_manually(
    db, url_exists, url_mainpage, cookie_file, logger, redirects=True
):
    """Derive presence/absence markers by diffing two profile pages.

    The page of an existing account and the page of a surely non-existing
    account are downloaded, both are split on double quotes, and the tokens
    unique to each page are ranked with ``get_match_ratio``. The user may
    override the detected tokens on stdin.

    :param db: not used by this function.
    :param url_exists: URL of an existing profile.
    :param url_mainpage: main page URL stored in the resulting site.
    :param cookie_file: optional cookie file passed to ``import_aiohttp_cookies``.
    :param logger: logger for diagnostics.
    :param redirects: forwarded to ``requests.get`` as ``allow_redirects``.
    :return: a ``MaigretSite`` using the ``message`` check type.
    """
    supposed_username = extract_username_dialog(url_exists)
    non_exist_username = "noonewouldeverusethis7"

    url_user = url_exists.replace(supposed_username, "{username}")
    url_not_exists = url_exists.replace(supposed_username, non_exist_username)

    # cookies
    cookie_dict = None
    if cookie_file:
        cookie_jar = await import_aiohttp_cookies(cookie_file)
        cookie_dict = {c.key: c.value for c in cookie_jar}

    exists_resp = requests.get(
        url_exists, cookies=cookie_dict, headers=HEADERS, allow_redirects=redirects
    )
    logger.debug(exists_resp.status_code)
    logger.debug(exists_resp.text)

    non_exists_resp = requests.get(
        url_not_exists, cookies=cookie_dict, headers=HEADERS, allow_redirects=redirects
    )
    logger.debug(non_exists_resp.status_code)
    logger.debug(non_exists_resp.text)

    tokens_a = set(exists_resp.text.split('"'))
    tokens_b = set(non_exists_resp.text.split('"'))

    a_minus_b = tokens_a.difference(tokens_b)
    b_minus_a = tokens_b.difference(tokens_a)

    if not a_minus_b and not b_minus_a:
        print("The pages for existing and non-existing account are the same!")

    top_features_count = int(
        input(f"Specify count of features to extract [default {TOP_FEATURES}]: ")
        or TOP_FEATURES
    )

    presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[
        :top_features_count
    ]

    print("Detected text features of existing account: " + ", ".join(presence_list))
    features = input("If features was not detected correctly, write it manually: ")

    if features:
        # Strip each entry so "a, b" does not yield the unmatched token " b".
        presence_list = [f.strip() for f in features.split(",")]

    absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[
        :top_features_count
    ]
    print("Detected text features of non-existing account: " + ", ".join(absence_list))
    features = input("If features was not detected correctly, write it manually: ")

    if features:
        absence_list = [f.strip() for f in features.split(",")]

    site_data = {
        "absenceStrs": absence_list,
        "presenseStrs": presence_list,
        "url": url_user,
        "urlMain": url_mainpage,
        "usernameClaimed": supposed_username,
        "usernameUnclaimed": non_exist_username,
        "checkType": "message",
    }

    site = MaigretSite(url_mainpage.split("/")[-1], site_data)
    return site
|
||||
|
||||
|
||||
async def submit_dialog(db, url_exists, cookie_file, logger):
|
||||
domain_raw = URL_RE.sub("", url_exists).strip().strip("/")
|
||||
domain_raw = domain_raw.split("/")[0]
|
||||
|
||||
# check for existence
|
||||
matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
|
||||
|
||||
if matched_sites:
|
||||
print(
|
||||
f'Sites with domain "{domain_raw}" already exists in the Maigret database!'
|
||||
)
|
||||
status = lambda s: "(disabled)" if s.disabled else ""
|
||||
url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
|
||||
print(
|
||||
"\n".join(
|
||||
[
|
||||
f"{site.name} {status(site)}{url_block(site)}"
|
||||
for site in matched_sites
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
if input("Do you want to continue? [yN] ").lower() in "n":
|
||||
return False
|
||||
|
||||
url_mainpage = extract_mainpage_url(url_exists)
|
||||
|
||||
sites = await detect_known_engine(db, url_exists, url_mainpage, logger)
|
||||
if not sites:
|
||||
print("Unable to detect site engine, lets generate checking features")
|
||||
sites = [
|
||||
await check_features_manually(
|
||||
db, url_exists, url_mainpage, cookie_file, logger
|
||||
)
|
||||
check_data = [
|
||||
(site.username_claimed, QueryStatus.CLAIMED),
|
||||
(site.username_unclaimed, QueryStatus.AVAILABLE),
|
||||
]
|
||||
|
||||
logger.debug(sites[0].__dict__)
|
||||
self.logger.info(f"Checking {site.name}...")
|
||||
|
||||
sem = asyncio.Semaphore(1)
|
||||
for username, status in check_data:
|
||||
results_dict = await maigret(
|
||||
username=username,
|
||||
site_dict={site.name: site},
|
||||
proxy=self.args.proxy,
|
||||
logger=self.logger,
|
||||
cookies=self.args.cookie_file,
|
||||
timeout=30,
|
||||
id_type=site.type,
|
||||
forced=True,
|
||||
no_progressbar=True,
|
||||
)
|
||||
|
||||
found = False
|
||||
chosen_site = None
|
||||
for s in sites:
|
||||
chosen_site = s
|
||||
result = await site_self_check(s, logger, sem, db)
|
||||
if not result["disabled"]:
|
||||
found = True
|
||||
break
|
||||
# don't disable entries with other ids types
|
||||
# TODO: make normal checking
|
||||
if site.name not in results_dict:
|
||||
self.logger.info(results_dict)
|
||||
changes["disabled"] = True
|
||||
continue
|
||||
|
||||
if not found:
|
||||
print(
|
||||
f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
|
||||
result = results_dict[site.name]["status"]
|
||||
|
||||
site_status = result.status
|
||||
|
||||
if site_status != status:
|
||||
if site_status == QueryStatus.UNKNOWN:
|
||||
msgs = site.absence_strs
|
||||
etype = site.check_type
|
||||
self.logger.warning(
|
||||
"Error while searching '%s' in %s: %s, %s, check type %s",
|
||||
username,
|
||||
site.name,
|
||||
result.context,
|
||||
msgs,
|
||||
etype,
|
||||
)
|
||||
# don't disable in case of available username
|
||||
if status == QueryStatus.CLAIMED:
|
||||
changes["disabled"] = True
|
||||
elif status == QueryStatus.CLAIMED:
|
||||
self.logger.warning(
|
||||
f"Not found `{username}` in {site.name}, must be claimed"
|
||||
)
|
||||
self.logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
else:
|
||||
self.logger.warning(
|
||||
f"Found `{username}` in {site.name}, must be available"
|
||||
)
|
||||
self.logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
|
||||
self.logger.info(f"Site {site.name} checking is finished")
|
||||
|
||||
return changes
|
||||
|
||||
def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog):
    """Ask the user for extra site fields required by ``engine``.

    Only ``urlSubpath`` is handled: the user is prompted for it on stdin when
    the engine's URL template mentions it.

    :param engine: engine whose ``site`` mapping is inspected.
    :param dialog: not used by this method.
    :return: dict with an optional ``urlSubpath`` entry (leading slash added).
    """
    fields = {}
    if 'urlSubpath' in engine.site.get('url', ''):
        msg = (
            'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
            'Enter in manually if it exists: '
        )
        # Slashes are stripped from both ends so exactly one leading slash is stored.
        subpath = input(msg).strip('/')
        if subpath:
            fields['urlSubpath'] = f'/{subpath}'
    return fields
|
||||
|
||||
async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]:
    """Try to recognise the engine behind ``url_mainpage``.

    The main page is fetched through ``self.session``; an engine matches when
    every one of its ``presenseStrs`` occurs in the page text. For the first
    matching engine one candidate ``MaigretSite`` is built per username to try.

    :param url_exists: URL of an existing profile, used to guess a username.
    :param url_mainpage: main page of the site being added.
    :return: candidate sites, or an empty list when nothing matched or the
        request failed.
    """
    resp_text = ''
    try:
        # The context manager releases the connection once the body is read.
        async with self.session.get(url_mainpage) as r:
            resp_text = await r.text()
        self.logger.debug(resp_text)
    except Exception as e:
        self.logger.warning(e)
        print("Some error while checking main page")
        return []

    for engine in self.db.engines:
        strs_to_check = engine.__dict__.get("presenseStrs")
        if not (strs_to_check and resp_text):
            continue
        if not all(s in resp_text for s in strs_to_check):
            continue

        engine_name = engine.__dict__.get("name")

        print(f"Detected engine {engine_name} for site {url_mainpage}")

        usernames_to_check = self.settings.supposed_usernames
        supposed_username = self.extract_username_dialog(url_exists)
        if supposed_username:
            # New list: the list held by settings is left untouched.
            usernames_to_check = [supposed_username] + usernames_to_check

        add_fields = self.generate_additional_fields_dialog(
            engine, url_exists
        )

        sites = []
        for u in usernames_to_check:
            site_data = {
                "urlMain": url_mainpage,
                "name": url_mainpage.split("//")[1],
                "engine": engine_name,
                "usernameClaimed": u,
                "usernameUnclaimed": "noonewouldeverusethis7",
                **add_fields,
            }
            self.logger.info(site_data)

            maigret_site = MaigretSite(
                url_mainpage.split("/")[-1], site_data
            )
            maigret_site.update_from_engine(
                self.db.engines_dict[engine_name]
            )
            sites.append(maigret_site)

        return sites

    return []
|
||||
|
||||
def extract_username_dialog(self, url):
|
||||
url_parts = url.rstrip("/").split("/")
|
||||
supposed_username = url_parts[-1].strip('@')
|
||||
entered_username = input(
|
||||
f'Is "{supposed_username}" a valid username? If not, write it manually: '
|
||||
)
|
||||
print(
|
||||
"Try to run this mode again and increase features count or choose others."
|
||||
return entered_username if entered_username else supposed_username
|
||||
|
||||
async def check_features_manually(
|
||||
self, url_exists, url_mainpage, cookie_file, redirects=False
|
||||
):
|
||||
custom_headers = {}
|
||||
while self.args.verbose:
|
||||
header_key = input(
|
||||
'Specify custom header if you need or just press Enter to skip. Header name: '
|
||||
)
|
||||
if not header_key:
|
||||
break
|
||||
header_value = input('Header value: ')
|
||||
custom_headers[header_key.strip()] = header_value.strip()
|
||||
|
||||
supposed_username = self.extract_username_dialog(url_exists)
|
||||
non_exist_username = "noonewouldeverusethis7"
|
||||
|
||||
url_user = url_exists.replace(supposed_username, "{username}")
|
||||
url_not_exists = url_exists.replace(supposed_username, non_exist_username)
|
||||
|
||||
headers = dict(self.HEADERS)
|
||||
headers.update(custom_headers)
|
||||
|
||||
exists_resp = await self.session.get(
|
||||
url_exists,
|
||||
headers=headers,
|
||||
allow_redirects=redirects,
|
||||
)
|
||||
else:
|
||||
if (
|
||||
exists_resp_text = await exists_resp.text()
|
||||
self.logger.debug(url_exists)
|
||||
self.logger.debug(exists_resp.status)
|
||||
self.logger.debug(exists_resp_text)
|
||||
|
||||
non_exists_resp = await self.session.get(
|
||||
url_not_exists,
|
||||
headers=headers,
|
||||
allow_redirects=redirects,
|
||||
)
|
||||
non_exists_resp_text = await non_exists_resp.text()
|
||||
self.logger.debug(url_not_exists)
|
||||
self.logger.debug(non_exists_resp.status)
|
||||
self.logger.debug(non_exists_resp_text)
|
||||
|
||||
a = exists_resp_text
|
||||
b = non_exists_resp_text
|
||||
|
||||
tokens_a = set(re.split(f'[{self.SEPARATORS}]', a))
|
||||
tokens_b = set(re.split(f'[{self.SEPARATORS}]', b))
|
||||
|
||||
a_minus_b = tokens_a.difference(tokens_b)
|
||||
b_minus_a = tokens_b.difference(tokens_a)
|
||||
|
||||
if len(a_minus_b) == len(b_minus_a) == 0:
|
||||
print("The pages for existing and non-existing account are the same!")
|
||||
|
||||
top_features_count = int(
|
||||
input(
|
||||
f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
|
||||
).lower()
|
||||
in "y"
|
||||
):
|
||||
logger.debug(chosen_site.json)
|
||||
site_data = chosen_site.strip_engine_data()
|
||||
logger.debug(site_data.json)
|
||||
db.update_site(site_data)
|
||||
return True
|
||||
f"Specify count of features to extract [default {self.TOP_FEATURES}]: "
|
||||
)
|
||||
or self.TOP_FEATURES
|
||||
)
|
||||
|
||||
return False
|
||||
match_fun = get_match_ratio(self.settings.presence_strings)
|
||||
|
||||
presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[
|
||||
:top_features_count
|
||||
]
|
||||
|
||||
print("Detected text features of existing account: " + ", ".join(presence_list))
|
||||
features = input("If features was not detected correctly, write it manually: ")
|
||||
|
||||
if features:
|
||||
presence_list = list(map(str.strip, features.split(",")))
|
||||
|
||||
absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[
|
||||
:top_features_count
|
||||
]
|
||||
print(
|
||||
"Detected text features of non-existing account: " + ", ".join(absence_list)
|
||||
)
|
||||
features = input("If features was not detected correctly, write it manually: ")
|
||||
|
||||
if features:
|
||||
absence_list = list(map(str.strip, features.split(",")))
|
||||
|
||||
site_data = {
|
||||
"absenceStrs": absence_list,
|
||||
"presenseStrs": presence_list,
|
||||
"url": url_user,
|
||||
"urlMain": url_mainpage,
|
||||
"usernameClaimed": supposed_username,
|
||||
"usernameUnclaimed": non_exist_username,
|
||||
"checkType": "message",
|
||||
}
|
||||
|
||||
if headers != self.HEADERS:
|
||||
site_data['headers'] = headers
|
||||
|
||||
site = MaigretSite(url_mainpage.split("/")[-1], site_data)
|
||||
return site
|
||||
|
||||
async def dialog(self, url_exists, cookie_file):
    """Run the interactive "submit a new site" flow.

    Steps: warn about sites already in the database for the same domain,
    detect a known engine (or fall back to manual feature extraction),
    self-check each candidate site, and finally ask whether to save the
    first candidate that passes.

    :param url_exists: URL of an existing profile on the site to add.
    :param cookie_file: cookie file forwarded to the manual feature check.
    :return: ``True`` when the site was stored in the database, else ``False``.
    """
    domain_raw = self.URL_RE.sub("", url_exists).strip().strip("/")
    domain_raw = domain_raw.split("/")[0]
    self.logger.info('Domain is %s', domain_raw)

    # check for existence
    matched_sites = [
        x for x in self.db.sites if domain_raw in x.url_main + x.url
    ]

    if matched_sites:
        print(
            f'Sites with domain "{domain_raw}" already exists in the Maigret database!'
        )
        print(
            "\n".join(
                f"{site.name} {'(disabled)' if site.disabled else ''}"
                f"\n\t{site.url_main}\n\t{site.url}"
                for site in matched_sites
            )
        )

        # Default is "no": only an answer starting with "y" continues.
        answer = input("Do you want to continue? [yN] ").strip().lower()
        if not answer.startswith("y"):
            return False

    url_mainpage = self.extract_mainpage_url(url_exists)

    print('Detecting site engine, please wait...')
    sites = []
    try:
        sites = await self.detect_known_engine(url_exists, url_mainpage)
    except KeyboardInterrupt:
        print('Engine detect process is interrupted.')

    if not sites:
        print("Unable to detect site engine, lets generate checking features")

        redirects = False
        if self.args.verbose:
            redirects = 'y' in input('Should we do redirects automatically? [yN] ').lower()

        sites = [
            await self.check_features_manually(
                url_exists, url_mainpage, cookie_file, redirects,
            )
        ]

    self.logger.debug(sites[0].__dict__)

    sem = asyncio.Semaphore(1)

    print("Checking, please wait...")
    found = False
    chosen_site = None
    for s in sites:
        chosen_site = s
        result = await self.site_self_check(s, sem)
        if not result["disabled"]:
            found = True
            break

    if not found:
        print(
            f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
        )
        print(
            "Try to run this mode again and increase features count or choose others."
        )
        self.logger.debug(json.dumps(chosen_site.json))
        return False

    # Default is "yes": an empty answer or one starting with "y" saves the site.
    answer = input(
        f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
    ).strip().lower()
    if answer and not answer.startswith("y"):
        return False

    if self.args.verbose:
        source = input("Name the source site if it is mirror: ")
        if source:
            chosen_site.source = source

    chosen_site.name = input("Change site name if you want: ") or chosen_site.name
    # Empty entries are dropped so a blank answer gives no tags instead of [''].
    chosen_site.tags = [
        tag for tag in map(str.strip, input("Site tags: ").split(',')) if tag
    ]
    rank = Submitter.get_alexa_rank(chosen_site.url_main)
    if rank:
        print(f'New alexa rank: {rank}')
        chosen_site.alexa_rank = rank

    self.logger.debug(chosen_site.json)
    site_data = chosen_site.strip_engine_data()
    self.logger.debug(site_data.json)
    self.db.update_site(site_data)
    return True
|
||||
|
||||
@@ -1,5 +1,9 @@
|
||||
# coding: utf8
|
||||
import ast
|
||||
import difflib
|
||||
import re
|
||||
import random
|
||||
from typing import Any
|
||||
|
||||
|
||||
DEFAULT_USER_AGENTS = [
|
||||
@@ -38,7 +42,7 @@ def enrich_link_str(link: str) -> str:
|
||||
|
||||
|
||||
class URLMatcher:
|
||||
_HTTP_URL_RE_STR = "^https?://(www.)?(.+)$"
|
||||
_HTTP_URL_RE_STR = "^https?://(www.|m.)?(.+)$"
|
||||
HTTP_URL_RE = re.compile(_HTTP_URL_RE_STR)
|
||||
UNSAFE_SYMBOLS = ".?"
|
||||
|
||||
@@ -55,25 +59,39 @@ class URLMatcher:
|
||||
url_main_part = self.extract_main_part(url)
|
||||
for c in self.UNSAFE_SYMBOLS:
|
||||
url_main_part = url_main_part.replace(c, f"\\{c}")
|
||||
username_regexp = username_regexp or ".+?"
|
||||
prepared_username_regexp = (username_regexp or ".+?").lstrip('^').rstrip('$')
|
||||
|
||||
url_regexp = url_main_part.replace("{username}", f"({username_regexp})")
|
||||
url_regexp = url_main_part.replace(
|
||||
"{username}", f"({prepared_username_regexp})"
|
||||
)
|
||||
regexp_str = self._HTTP_URL_RE_STR.replace("(.+)", url_regexp)
|
||||
|
||||
return re.compile(regexp_str)
|
||||
return re.compile(regexp_str, re.IGNORECASE)
|
||||
|
||||
|
||||
def ascii_data_display(data: str) -> Any:
    """Parse ``data`` as a Python literal and return the resulting object.

    ``ast.literal_eval`` is used, so only literal syntax is accepted and no
    code is executed.
    """
    return ast.literal_eval(data)
|
||||
|
||||
|
||||
def get_dict_ascii_tree(items, prepend="", new_line=True):
|
||||
new_result = b'\xe2\x94\x9c'.decode()
|
||||
new_line = b'\xe2\x94\x80'.decode()
|
||||
last_result = b'\xe2\x94\x94'.decode()
|
||||
skip_result = b'\xe2\x94\x82'.decode()
|
||||
|
||||
text = ""
|
||||
for num, item in enumerate(items):
|
||||
box_symbol = "┣╸" if num != len(items) - 1 else "┗╸"
|
||||
box_symbol = (
|
||||
new_result + new_line if num != len(items) - 1 else last_result + new_line
|
||||
)
|
||||
|
||||
if type(item) == tuple:
|
||||
field_name, field_value = item
|
||||
if field_value.startswith("['"):
|
||||
is_last_item = num == len(items) - 1
|
||||
prepend_symbols = " " * 3 if is_last_item else " ┃ "
|
||||
field_value = get_dict_ascii_tree(eval(field_value), prepend_symbols)
|
||||
prepend_symbols = " " * 3 if is_last_item else f" {skip_result} "
|
||||
data = ascii_data_display(field_value)
|
||||
field_value = get_dict_ascii_tree(data, prepend_symbols)
|
||||
text += f"\n{prepend}{box_symbol}{field_name}: {field_value}"
|
||||
else:
|
||||
text += f"\n{prepend}{box_symbol} {item}"
|
||||
@@ -86,3 +104,18 @@ def get_dict_ascii_tree(items, prepend="", new_line=True):
|
||||
|
||||
def get_random_user_agent():
    """Return one entry of ``DEFAULT_USER_AGENTS`` chosen at random."""
    return random.choice(DEFAULT_USER_AGENTS)
|
||||
|
||||
|
||||
def get_match_ratio(base_strs: list):
    """Build a scoring function that rates strings against ``base_strs``.

    The returned callable takes a string and returns its best
    case-insensitive ``difflib.SequenceMatcher`` ratio against any entry of
    ``base_strs``, rounded to two decimal places. It is suitable as a
    ``key=`` function for ``sorted``.

    :param base_strs: reference strings to compare against.
    :return: function ``str -> float``; it yields ``0.0`` when ``base_strs``
        is empty instead of raising.
    """

    def get_match_inner(s: str):
        s_lower = s.lower()
        return round(
            max(
                (
                    difflib.SequenceMatcher(a=s_lower, b=s2.lower()).ratio()
                    for s2 in base_strs
                ),
                # Without a default, max() raises ValueError on no references.
                default=0.0,
            ),
            2,
        )

    return get_match_inner
|
||||
|
||||
@@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
|
||||
import maigret
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(maigret.cli())
|
||||
@@ -0,0 +1,55 @@
|
||||
# -*- mode: python ; coding: utf-8 -*-
# PyInstaller spec for the one-file `maigret_standalone` console executable.
# Analysis, PYZ and EXE are not imported here; they are provided by
# PyInstaller when it executes a spec file.
from PyInstaller.utils.hooks import collect_all

datas = []
binaries = []
hiddenimports = []

# Packages bundled in full: data files, binaries and submodules.
full_import_modules = ['maigret', 'socid_extractor', 'arabic_reshaper', 'pyvis', 'reportlab.graphics.barcode']

for module in full_import_modules:
    module_datas, module_binaries, module_hiddenimports = collect_all(module)
    datas += module_datas
    binaries += module_binaries
    hiddenimports += module_hiddenimports

# NOTE(review): several entries look like distribution names rather than
# importable module names (e.g. 'beautifulsoup4', 'python-dateutil') - confirm.
hiddenimports += ['PySocks', 'beautifulsoup4', 'python-dateutil',
                  'future-annotations', 'six', 'python-bidi',
                  'typing-extensions', 'attrs', 'torrequest']

block_cipher = None


a = Analysis(['maigret_standalone.py'],
             pathex=[],
             binaries=binaries,
             datas=datas,
             hiddenimports=hiddenimports,
             hookspath=[],
             hooksconfig={},
             runtime_hooks=[],
             excludes=[],
             win_no_prefer_redirects=False,
             win_private_assemblies=False,
             cipher=block_cipher,
             noarchive=False)

pyz = PYZ(a.pure, a.zipped_data,
          cipher=block_cipher)

exe = EXE(pyz,
          a.scripts,
          a.binaries,
          a.zipfiles,
          a.datas,
          [],
          name='maigret_standalone',
          debug=False,
          bootloader_ignore_signals=False,
          strip=False,
          upx=True,
          upx_exclude=[],
          runtime_tmpdir=None,
          console=True,
          disable_windowed_traceback=False,
          target_arch=None,
          codesign_identity=None,
          entitlements_file=None)
|
||||
@@ -0,0 +1,5 @@
|
||||
maigret @ https://github.com/soxoj/maigret/archive/refs/heads/main.zip
|
||||
pefile==2021.9.3
|
||||
psutil==5.9.0
|
||||
pyinstaller @ https://github.com/pyinstaller/pyinstaller/archive/develop.zip
|
||||
pywin32-ctypes==0.2.0
|
||||
@@ -3,3 +3,4 @@
|
||||
filterwarnings =
|
||||
error
|
||||
ignore::UserWarning
|
||||
asyncio_mode=auto
|
||||
@@ -1,38 +1,39 @@
|
||||
aiohttp==3.7.4
|
||||
aiohttp-socks==0.5.5
|
||||
arabic-reshaper==2.1.1
|
||||
async-timeout==3.0.1
|
||||
attrs==20.3.0
|
||||
beautifulsoup4==4.9.3
|
||||
bs4==0.0.1
|
||||
certifi==2020.12.5
|
||||
chardet==3.0.4
|
||||
aiodns==3.0.0
|
||||
aiohttp==3.8.1
|
||||
aiohttp-socks==0.7.1
|
||||
arabic-reshaper==2.1.3
|
||||
async-timeout==4.0.2
|
||||
attrs==21.4.0
|
||||
certifi==2021.10.8
|
||||
chardet==4.0.0
|
||||
colorama==0.4.4
|
||||
python-dateutil==2.8.1
|
||||
future==0.18.2
|
||||
future-annotations==1.0.0
|
||||
html5lib==1.1
|
||||
idna==2.10
|
||||
Jinja2==2.11.3
|
||||
lxml==4.6.3
|
||||
MarkupSafe==1.1.1
|
||||
mock==4.0.2
|
||||
multidict==5.1.0
|
||||
pycountry==20.7.3
|
||||
idna==3.3
|
||||
Jinja2==3.0.3
|
||||
lxml==4.8.0
|
||||
MarkupSafe==2.0.1
|
||||
mock==4.0.3
|
||||
multidict==5.2.0;python_version<"3.7"
|
||||
multidict==6.0.2;python_version>="3.7"
|
||||
pycountry==22.1.10
|
||||
PyPDF2==1.26.0
|
||||
PySocks==1.7.1
|
||||
python-bidi==0.4.2
|
||||
python-socks==1.1.2
|
||||
requests>=2.24.0
|
||||
requests==2.27.1
|
||||
requests-futures==1.0.0
|
||||
six==1.15.0
|
||||
socid-extractor>=0.0.16
|
||||
soupsieve==2.1
|
||||
six==1.16.0
|
||||
socid-extractor>=0.0.21
|
||||
soupsieve==2.3.1
|
||||
stem==1.8.0
|
||||
torrequest==0.1.0
|
||||
tqdm==4.55.0
|
||||
typing-extensions==3.7.4.3
|
||||
tqdm==4.63.0
|
||||
typing-extensions==4.1.1
|
||||
webencodings==0.5.1
|
||||
xhtml2pdf==0.2.5
|
||||
XMind==1.2.0
|
||||
yarl==1.6.3
|
||||
yarl==1.7.2
|
||||
networkx==2.5.1
|
||||
pyvis==0.1.9
|
||||
reportlab==3.6.6
|
||||
|
||||
@@ -5,14 +5,13 @@ from setuptools import (
|
||||
|
||||
|
||||
with open('README.md') as fh:
|
||||
readme = fh.read()
|
||||
long_description = readme.replace('./', 'https://raw.githubusercontent.com/soxoj/maigret/main/')
|
||||
long_description = fh.read()
|
||||
|
||||
with open('requirements.txt') as rf:
|
||||
requires = rf.read().splitlines()
|
||||
|
||||
setup(name='maigret',
|
||||
version='0.2.0',
|
||||
version='0.4.2',
|
||||
description='Collect a dossier on a person by username from a huge number of sites',
|
||||
long_description=long_description,
|
||||
long_description_content_type="text/markdown",
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
name: maigret2
|
||||
version: git
|
||||
summary: SOCMINT / Instagram
|
||||
description: |
|
||||
Test Test Test
|
||||
base: core18
|
||||
confinement: strict
|
||||
|
||||
|
||||
parts:
|
||||
maigret2:
|
||||
plugin: python
|
||||
python-version: python3
|
||||
source: .
|
||||
stage-packages:
|
||||
- python-six
|
||||
|
||||
|
||||
apps:
|
||||
maigret2:
|
||||
command: bin/maigret
|
||||
|
||||
|
||||
architectures:
|
||||
- build-on: amd64
|
||||
- build-on: i386
|
||||
|
||||
|
||||
|
||||
|
||||
|
Before Width: | Height: | Size: 15 KiB After Width: | Height: | Size: 9.0 KiB |
|
Before Width: | Height: | Size: 44 KiB After Width: | Height: | Size: 44 KiB |
|
Before Width: | Height: | Size: 607 KiB After Width: | Height: | Size: 451 KiB |
|
Before Width: | Height: | Size: 773 KiB After Width: | Height: | Size: 351 KiB |
@@ -0,0 +1,8 @@
|
||||
reportlab==3.6.6
|
||||
flake8==4.0.1
|
||||
pytest==7.0.1
|
||||
pytest-asyncio==0.16.0;python_version<"3.7"
|
||||
pytest-asyncio==0.18.2;python_version>="3.7"
|
||||
pytest-cov==3.0.0
|
||||
pytest-httpserver==1.0.4
|
||||
pytest-rerunfailures==10.2
|
||||
@@ -6,10 +6,16 @@ import pytest
|
||||
from _pytest.mark import Mark
|
||||
|
||||
from maigret.sites import MaigretDatabase
|
||||
from maigret.maigret import setup_arguments_parser
|
||||
from maigret.settings import Settings
|
||||
|
||||
|
||||
CUR_PATH = os.path.dirname(os.path.realpath(__file__))
|
||||
JSON_FILE = os.path.join(CUR_PATH, '../maigret/resources/data.json')
|
||||
empty_mark = Mark('', [], {})
|
||||
SETTINGS_FILE = os.path.join(CUR_PATH, '../maigret/resources/settings.json')
|
||||
TEST_JSON_FILE = os.path.join(CUR_PATH, 'db.json')
|
||||
LOCAL_TEST_JSON_FILE = os.path.join(CUR_PATH, 'local.json')
|
||||
empty_mark = Mark('', (), {})
|
||||
|
||||
|
||||
def by_slow_marker(item):
|
||||
@@ -33,9 +39,17 @@ def remove_test_reports():
|
||||
|
||||
@pytest.fixture(scope='session')
|
||||
def default_db():
|
||||
db = MaigretDatabase().load_from_file(JSON_FILE)
|
||||
return MaigretDatabase().load_from_file(JSON_FILE)
|
||||
|
||||
return db
|
||||
|
||||
@pytest.fixture(scope='function')
def test_db():
    """Load a fresh ``MaigretDatabase`` from ``TEST_JSON_FILE`` for each test."""
    return MaigretDatabase().load_from_file(TEST_JSON_FILE)
|
||||
|
||||
|
||||
@pytest.fixture(scope='function')
def local_test_db():
    """Load a fresh ``MaigretDatabase`` from ``LOCAL_TEST_JSON_FILE`` for each test."""
    return MaigretDatabase().load_from_file(LOCAL_TEST_JSON_FILE)
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
@@ -43,3 +57,15 @@ def reports_autoclean():
|
||||
remove_test_reports()
|
||||
yield
|
||||
remove_test_reports()
|
||||
|
||||
|
||||
@pytest.fixture(scope='session')
def argparser():
    """Build the CLI argument parser from the settings in ``SETTINGS_FILE``."""
    settings = Settings()
    settings.load([SETTINGS_FILE])
    return setup_arguments_parser(settings)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def httpserver_listen_address():
    """Return the ``(host, port)`` pair ``("localhost", 8989)``."""
    return ("localhost", 8989)
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"engines": {},
|
||||
"sites": {
|
||||
"GooglePlayStore": {
|
||||
"tags": ["global", "us"],
|
||||
"disabled": false,
|
||||
"checkType": "status_code",
|
||||
"alexaRank": 1,
|
||||
"url": "https://play.google.com/store/apps/developer?id={username}",
|
||||
"urlMain": "https://play.google.com/store",
|
||||
"usernameClaimed": "Facebook_nosuchname",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
"Reddit": {
|
||||
"tags": ["news", "social", "us"],
|
||||
"checkType": "status_code",
|
||||
"presenseStrs": ["totalKarma"],
|
||||
"disabled": true,
|
||||
"alexaRank": 17,
|
||||
"url": "https://www.reddit.com/user/{username}",
|
||||
"urlMain": "https://www.reddit.com/",
|
||||
"usernameClaimed": "blue",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,21 @@
|
||||
{
|
||||
"engines": {},
|
||||
"sites": {
|
||||
"StatusCode": {
|
||||
"checkType": "status_code",
|
||||
"url": "http://localhost:8989/url?id={username}",
|
||||
"urlMain": "http://localhost:8989/",
|
||||
"usernameClaimed": "claimed",
|
||||
"usernameUnclaimed": "unclaimed"
|
||||
},
|
||||
"Message": {
|
||||
"checkType": "message",
|
||||
"url": "http://localhost:8989/url?id={username}",
|
||||
"urlMain": "http://localhost:8989/",
|
||||
"presenseStrs": ["user", "profile"],
|
||||
"absenseStrs": ["not found", "404"],
|
||||
"usernameClaimed": "claimed",
|
||||
"usernameUnclaimed": "unclaimed"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -22,6 +22,7 @@ httpbin.org FALSE / FALSE 0 a b
|
||||
"""
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="periodically fails")
|
||||
@pytest.mark.slow
|
||||
def test_twitter_activation(default_db):
|
||||
twitter_site = default_db.sites_dict['Twitter']
|
||||
@@ -39,7 +40,7 @@ async def test_import_aiohttp_cookies():
|
||||
with open(cookies_filename, 'w') as f:
|
||||
f.write(COOKIES_TXT)
|
||||
|
||||
cookie_jar = await import_aiohttp_cookies(cookies_filename)
|
||||
cookie_jar = import_aiohttp_cookies(cookies_filename)
|
||||
assert list(cookie_jar._cookies.keys()) == ['xss.is', 'httpbin.org']
|
||||
|
||||
url = 'https://httpbin.org/cookies'
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
from mock import Mock
|
||||
import pytest
|
||||
|
||||
from maigret import search
|
||||
|
||||
|
||||
def site_result_except(server, username, **kwargs):
    """Register a canned response on *server* for ``/url?id=<username>``.

    All keyword arguments are forwarded unchanged to ``respond_with_data``.
    """
    handler = server.expect_request('/url', query_string='id={}'.format(username))
    handler.respond_with_data(**kwargs)
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.asyncio
async def test_checking_by_status_code(httpserver, local_test_db):
    """Status-code check: HTTP 200 is reported as found, HTTP 404 as not found."""
    sites = local_test_db.sites_dict

    # Register both canned responses before running any search.
    for name, http_status in (('claimed', 200), ('unclaimed', 404)):
        site_result_except(httpserver, name, status=http_status)

    for name, expected in (('claimed', True), ('unclaimed', False)):
        outcome = await search(name, site_dict=sites, logger=Mock())
        assert outcome['StatusCode']['status'].is_found() is expected
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.asyncio
async def test_checking_by_message_positive_full(httpserver, local_test_db):
    """Message check with bodies containing every presence / absence marker."""
    sites = local_test_db.sites_dict

    # Register both canned responses before running any search.
    for name, body in (('claimed', "user profile"), ('unclaimed', "404 not found")):
        site_result_except(httpserver, name, response_data=body)

    for name, expected in (('claimed', True), ('unclaimed', False)):
        outcome = await search(name, site_dict=sites, logger=Mock())
        assert outcome['Message']['status'].is_found() is expected
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.asyncio
async def test_checking_by_message_positive_part(httpserver, local_test_db):
    """Message check with bodies containing only one presence / absence marker."""
    sites = local_test_db.sites_dict

    # Register both canned responses before running any search.
    for name, body in (('claimed', "profile"), ('unclaimed', "404")):
        site_result_except(httpserver, name, response_data=body)

    for name, expected in (('claimed', True), ('unclaimed', False)):
        outcome = await search(name, site_dict=sites, logger=Mock())
        assert outcome['Message']['status'].is_found() is expected
|
||||
|
||||
|
||||
@pytest.mark.slow
@pytest.mark.asyncio
async def test_checking_by_message_negative(httpserver, local_test_db):
    """Message check: an empty body is not found; a body with "user 404" is found."""
    sites = local_test_db.sites_dict

    # Register both canned responses before running any search.
    for name, body in (('claimed', ""), ('unclaimed', "user 404")):
        site_result_except(httpserver, name, response_data=body)

    for name, expected in (('claimed', False), ('unclaimed', True)):
        outcome = await search(name, site_dict=sites, logger=Mock())
        assert outcome['Message']['status'].is_found() is expected
|
||||
@@ -0,0 +1,98 @@
|
||||
"""Maigret command-line arguments parsing tests"""
|
||||
from argparse import Namespace
|
||||
from typing import Dict, Any
|
||||
|
||||
# Namespace contents the tests below expect when no optional flag is given;
# each test copies this mapping and overrides only the entries it exercises.
DEFAULT_ARGS: Dict[str, Any] = dict(
    all_sites=False,
    connections=100,
    cookie_file=None,
    csv=False,
    db_file='resources/data.json',
    debug=False,
    disable_extracting=False,
    disable_recursive_search=False,
    folderoutput='reports',
    html=False,
    graph=False,
    id_type='username',
    ignore_ids_list=[],
    info=False,
    json='',
    new_site_to_submit=False,
    no_color=False,
    no_progressbar=False,
    parse_url='',
    pdf=False,
    print_check_errors=False,
    print_not_found=False,
    proxy=None,
    reports_sorting='default',
    retries=1,
    self_check=False,
    site_list=[],
    stats=False,
    tags='',
    timeout=30,
    tor_proxy='socks5://127.0.0.1:9050',
    i2p_proxy='http://127.0.0.1:4444',
    top_sites=500,
    txt=False,
    use_disabled_sites=False,
    username=[],
    verbose=False,
    with_domains=False,
    xmind=False,
)
|
||||
|
||||
|
||||
def test_args_search_mode(argparser):
    """A single positional argument is parsed as the only username."""
    parsed = argparser.parse_args(['username'])

    assert parsed.username == ['username']

    expected = {**DEFAULT_ARGS, 'username': ['username']}
    assert parsed == Namespace(**expected)
|
||||
|
||||
|
||||
def test_args_search_mode_several_usernames(argparser):
    """Several positional arguments are all collected as usernames."""
    usernames = ['username1', 'username2']
    parsed = argparser.parse_args(list(usernames))

    assert parsed.username == ['username1', 'username2']

    expected = {**DEFAULT_ARGS, 'username': ['username1', 'username2']}
    assert parsed == Namespace(**expected)
|
||||
|
||||
|
||||
def test_args_self_check_mode(argparser):
    """``--self-check --site GitHub`` sets the flag and the site list, no usernames."""
    parsed = argparser.parse_args(['--self-check', '--site', 'GitHub'])

    overrides = {
        'self_check': True,
        'site_list': ['GitHub'],
        'username': [],
    }
    expected = {**DEFAULT_ARGS, **overrides}

    assert parsed == Namespace(**expected)
|
||||
|
||||
|
||||
def test_args_multiple_sites(argparser):
    """Repeated ``--site`` options accumulate; a stray word becomes a username."""
    command_line = [
        '--site', 'GitHub',
        'VK',
        '--site', 'PornHub',
        '--site', 'Taringa,Steam',
    ]
    parsed = argparser.parse_args(command_line)

    overrides = {
        'site_list': ['GitHub', 'PornHub', 'Taringa,Steam'],
        'username': ['VK'],
    }
    expected = {**DEFAULT_ARGS, **overrides}

    assert parsed == Namespace(**expected)
|
||||
@@ -0,0 +1,16 @@
|
||||
"""Maigret data test functions"""
|
||||
|
||||
from maigret.utils import is_country_tag
|
||||
|
||||
|
||||
def test_tags_validity(default_db):
    """Every non-country tag used by a site must be listed in the database tags."""
    known_tags = default_db._tags

    unknown_tags = {
        tag
        for site in default_db.sites
        for tag in site.tags
        if not is_country_tag(tag) and tag not in known_tags
    }

    assert unknown_tags == set()
|
||||
@@ -63,7 +63,10 @@ async def test_asyncio_progressbar_queue_executor():
|
||||
assert executor.execution_time < 0.5
|
||||
|
||||
executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=5)
|
||||
assert await executor.run(tasks) == [0, 3, 6, 1, 4, 7, 9, 2, 5, 8]
|
||||
assert await executor.run(tasks) in (
|
||||
[0, 3, 6, 1, 4, 7, 9, 2, 5, 8],
|
||||
[0, 3, 6, 1, 4, 9, 7, 2, 5, 8],
|
||||
)
|
||||
assert executor.execution_time > 0.3
|
||||
assert executor.execution_time < 0.4
|
||||
|
||||
|
||||
@@ -1,96 +1,177 @@
|
||||
"""Maigret main module test functions"""
|
||||
import asyncio
|
||||
import copy
|
||||
|
||||
import pytest
|
||||
from mock import Mock
|
||||
|
||||
from maigret.maigret import self_check
|
||||
from maigret.sites import MaigretDatabase
|
||||
from maigret.maigret import self_check, maigret
|
||||
from maigret.maigret import (
|
||||
extract_ids_from_page,
|
||||
extract_ids_from_results,
|
||||
)
|
||||
from maigret.sites import MaigretSite
|
||||
from maigret.result import QueryResult, QueryStatus
|
||||
|
||||
EXAMPLE_DB = {
|
||||
'engines': {},
|
||||
'sites': {
|
||||
"GooglePlayStore": {
|
||||
"tags": ["global", "us"],
|
||||
"disabled": False,
|
||||
"checkType": "status_code",
|
||||
"alexaRank": 1,
|
||||
"url": "https://play.google.com/store/apps/developer?id={username}",
|
||||
"urlMain": "https://play.google.com/store",
|
||||
"usernameClaimed": "Facebook_nosuchname",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
},
|
||||
"Reddit": {
|
||||
"tags": ["news", "social", "us"],
|
||||
"checkType": "status_code",
|
||||
"presenseStrs": ["totalKarma"],
|
||||
"disabled": True,
|
||||
"alexaRank": 17,
|
||||
"url": "https://www.reddit.com/user/{username}",
|
||||
"urlMain": "https://www.reddit.com/",
|
||||
"usernameClaimed": "blue",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
},
|
||||
|
||||
RESULTS_EXAMPLE = {
|
||||
'Reddit': {
|
||||
'cookies': None,
|
||||
'parsing_enabled': False,
|
||||
'url_main': 'https://www.reddit.com/',
|
||||
'username': 'Skyeng',
|
||||
},
|
||||
'GooglePlayStore': {
|
||||
'cookies': None,
|
||||
'http_status': 200,
|
||||
'is_similar': False,
|
||||
'parsing_enabled': False,
|
||||
'rank': 1,
|
||||
'url_main': 'https://play.google.com/store',
|
||||
'url_user': 'https://play.google.com/store/apps/developer?id=Skyeng',
|
||||
'username': 'Skyeng',
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_self_check_db_positive_disable():
|
||||
def test_self_check_db_positive_disable(test_db):
|
||||
logger = Mock()
|
||||
db = MaigretDatabase()
|
||||
db.load_from_json(EXAMPLE_DB)
|
||||
|
||||
assert db.sites[0].disabled == False
|
||||
assert test_db.sites[0].disabled is False
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
|
||||
loop.run_until_complete(
|
||||
self_check(test_db, test_db.sites_dict, logger, silent=True)
|
||||
)
|
||||
|
||||
assert db.sites[0].disabled == True
|
||||
assert test_db.sites[0].disabled is True
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_self_check_db_positive_enable():
|
||||
def test_self_check_db_positive_enable(test_db):
|
||||
logger = Mock()
|
||||
db = MaigretDatabase()
|
||||
db.load_from_json(EXAMPLE_DB)
|
||||
|
||||
db.sites[0].disabled = True
|
||||
db.sites[0].username_claimed = 'Facebook'
|
||||
assert db.sites[0].disabled == True
|
||||
test_db.sites[0].disabled = True
|
||||
test_db.sites[0].username_claimed = 'Skyeng'
|
||||
assert test_db.sites[0].disabled is True
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
|
||||
loop.run_until_complete(
|
||||
self_check(test_db, test_db.sites_dict, logger, silent=True)
|
||||
)
|
||||
|
||||
assert db.sites[0].disabled == False
|
||||
assert test_db.sites[0].disabled is False
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_self_check_db_negative_disabled():
|
||||
def test_self_check_db_negative_disabled(test_db):
|
||||
logger = Mock()
|
||||
db = MaigretDatabase()
|
||||
db.load_from_json(EXAMPLE_DB)
|
||||
|
||||
db.sites[0].disabled = True
|
||||
assert db.sites[0].disabled == True
|
||||
test_db.sites[0].disabled = True
|
||||
assert test_db.sites[0].disabled is True
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
|
||||
loop.run_until_complete(
|
||||
self_check(test_db, test_db.sites_dict, logger, silent=True)
|
||||
)
|
||||
|
||||
assert db.sites[0].disabled == True
|
||||
assert test_db.sites[0].disabled is True
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_self_check_db_negative_enabled():
|
||||
def test_self_check_db_negative_enabled(test_db):
|
||||
logger = Mock()
|
||||
db = MaigretDatabase()
|
||||
db.load_from_json(EXAMPLE_DB)
|
||||
|
||||
db.sites[0].disabled = False
|
||||
db.sites[0].username_claimed = 'Facebook'
|
||||
assert db.sites[0].disabled == False
|
||||
test_db.sites[0].disabled = False
|
||||
test_db.sites[0].username_claimed = 'Skyeng'
|
||||
assert test_db.sites[0].disabled is False
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
|
||||
loop.run_until_complete(
|
||||
self_check(test_db, test_db.sites_dict, logger, silent=True)
|
||||
)
|
||||
|
||||
assert db.sites[0].disabled == False
|
||||
assert test_db.sites[0].disabled is False
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_maigret_results(test_db):
    """Run a search for 'Skyeng' over the test database and check the raw results."""
    results = asyncio.get_event_loop().run_until_complete(
        maigret('Skyeng', site_dict=test_db.sites_dict, logger=Mock(), timeout=30)
    )

    assert isinstance(results, dict)

    reddit = results['Reddit']
    playstore = results['GooglePlayStore']

    assert isinstance(reddit['site'], MaigretSite)
    assert reddit['site'].json == {
        'tags': ['news', 'social', 'us'],
        'checkType': 'status_code',
        'presenseStrs': ['totalKarma'],
        'disabled': True,
        'alexaRank': 17,
        'url': 'https://www.reddit.com/user/{username}',
        'urlMain': 'https://www.reddit.com/',
        'usernameClaimed': 'blue',
        'usernameUnclaimed': 'noonewouldeverusethis7',
    }

    assert isinstance(reddit['status'], QueryResult)
    assert reddit['status'].status == QueryStatus.ILLEGAL

    assert isinstance(playstore['status'], QueryResult)
    assert playstore['status'].status == QueryStatus.CLAIMED

    assert reddit.get('future') is None

    # Drop the entries already checked above (and the runtime-only ones) so the
    # remainder can be compared with the static expectation.
    for key in ('site', 'status'):
        del reddit[key]
    for key in ('site', 'status', 'future', 'checker'):
        del playstore[key]

    assert results == RESULTS_EXAMPLE
|
||||
|
||||
|
||||
def test_extract_ids_from_url(default_db):
    """Profile URLs are mapped to ``{identifier: id_type}`` dictionaries."""
    cases = [
        ('https://www.reddit.com/user/test', {'test': 'username'}),
        ('https://vk.com/id123', {'123': 'vk_id'}),
        ('https://vk.com/ida123', {'ida123': 'username'}),
        ('https://my.mail.ru/yandex.ru/dipres8904/', {'dipres8904': 'username'}),
        ('https://reviews.yandex.ru/user/adbced123', {'adbced123': 'yandex_public_id'}),
    ]

    for url, expected_ids in cases:
        assert default_db.extract_ids_from_url(url) == expected_ids
|
||||
|
||||
|
||||
@pytest.mark.slow
def test_extract_ids_from_page(test_db):
    """IDs extracted from a profile page URL include the username.

    The comparison used to be a bare expression, so its result was discarded
    and the test could never fail; it is now asserted.
    """
    logger = Mock()
    extracted = extract_ids_from_page('https://www.reddit.com/user/test', logger)

    assert extracted == {'test': 'username'}
|
||||
|
||||
|
||||
def test_extract_ids_from_results(test_db):
    """IDs are collected from both ``ids_usernames`` and ``ids_links`` of a result.

    The comparison used to be a bare expression, so its result was discarded
    and the test could never fail; it is now asserted.
    """
    # Work on a deep copy so the shared RESULTS_EXAMPLE constant is not mutated.
    TEST_EXAMPLE = copy.deepcopy(RESULTS_EXAMPLE)
    TEST_EXAMPLE['Reddit']['ids_usernames'] = {'test1': 'yandex_public_id'}
    TEST_EXAMPLE['Reddit']['ids_links'] = ['https://www.reddit.com/user/test2']

    assert extract_ids_from_results(TEST_EXAMPLE, test_db) == {
        'test1': 'yandex_public_id',
        'test2': 'username',
    }
|
||||
|
||||
@@ -0,0 +1,64 @@
|
||||
from maigret.errors import CheckError
|
||||
from maigret.notify import QueryNotifyPrint
|
||||
from maigret.result import QueryStatus, QueryResult
|
||||
|
||||
|
||||
def test_notify_illegal():
    """An ILLEGAL result is rendered as the illegal-username-format message."""
    notifier = QueryNotifyPrint(color=False)
    illegal_result = QueryResult(
        username="test",
        status=QueryStatus.ILLEGAL,
        site_name="TEST_SITE",
        site_url_user="http://example.com/test",
    )

    message = notifier.update(illegal_result)

    assert message == "[-] TEST_SITE: Illegal Username Format For This Site!"
|
||||
|
||||
|
||||
def test_notify_claimed():
    """A CLAIMED result is rendered as ``[+] <site>: <profile url>``."""
    notifier = QueryNotifyPrint(color=False)
    claimed_result = QueryResult(
        username="test",
        status=QueryStatus.CLAIMED,
        site_name="TEST_SITE",
        site_url_user="http://example.com/test",
    )

    message = notifier.update(claimed_result)

    assert message == "[+] TEST_SITE: http://example.com/test"
|
||||
|
||||
|
||||
def test_notify_available():
    """An AVAILABLE result is rendered as the not-found message."""
    notifier = QueryNotifyPrint(color=False)
    available_result = QueryResult(
        username="test",
        status=QueryStatus.AVAILABLE,
        site_name="TEST_SITE",
        site_url_user="http://example.com/test",
    )

    message = notifier.update(available_result)

    assert message == "[-] TEST_SITE: Not found!"
|
||||
|
||||
|
||||
def test_notify_unknown():
    """An UNKNOWN result carrying a CheckError is rendered with the error details."""
    notifier = QueryNotifyPrint(color=False)

    unknown_result = QueryResult(
        username="test",
        status=QueryStatus.UNKNOWN,
        site_name="TEST_SITE",
        site_url_user="http://example.com/test",
    )
    # The error is attached after construction, as an attribute.
    unknown_result.error = CheckError('Type', 'Reason')

    message = notifier.update(unknown_result)

    assert message == "[?] TEST_SITE: Type error: Reason"
|
||||
@@ -16,8 +16,14 @@ from maigret.report import (
|
||||
generate_report_template,
|
||||
generate_report_context,
|
||||
generate_json_report,
|
||||
get_plaintext_report,
|
||||
)
|
||||
from maigret.result import QueryResult, QueryStatus
|
||||
from maigret.sites import MaigretSite
|
||||
|
||||
|
||||
GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
|
||||
BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
|
||||
|
||||
EXAMPLE_RESULTS = {
|
||||
'GitHub': {
|
||||
@@ -35,11 +41,22 @@ EXAMPLE_RESULTS = {
|
||||
'http_status': 200,
|
||||
'is_similar': False,
|
||||
'rank': 78,
|
||||
'site': MaigretSite('test', {}),
|
||||
}
|
||||
}
|
||||
|
||||
GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
|
||||
BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
|
||||
# Search results whose only entry has no 'status' key; fed to the *_broken
# report tests.
BROKEN_RESULTS = dict(
    GitHub=dict(
        username='test',
        parsing_enabled=True,
        url_main='https://www.github.com/',
        url_user='https://www.github.com/test',
        http_status=200,
        is_similar=False,
        rank=78,
        site=MaigretSite('test', {}),
    ),
)
|
||||
|
||||
GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
|
||||
GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
|
||||
@@ -235,10 +252,13 @@ TEST = [
|
||||
]
|
||||
|
||||
SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""
|
||||
|
||||
SUPPOSED_INTERESTS = "Interests: photo <span class=\"text-muted\">(2)</span>, news <span class=\"text-muted\">(1)</span>, social <span class=\"text-muted\">(1)</span>"
|
||||
SUPPOSED_BROKEN_BRIEF = """Search by username alexaimephotographycars returned 0 accounts. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 2 accounts."""
|
||||
|
||||
SUPPOSED_GEO = "Geo: us <span class=\"text-muted\">(3)</span>"
|
||||
SUPPOSED_BROKEN_GEO = "Geo: us <span class=\"text-muted\">(2)</span>"
|
||||
|
||||
SUPPOSED_INTERESTS = "Interests: photo <span class=\"text-muted\">(2)</span>, news <span class=\"text-muted\">(1)</span>, social <span class=\"text-muted\">(1)</span>"
|
||||
SUPPOSED_BROKEN_INTERESTS = "Interests: news <span class=\"text-muted\">(1)</span>, photo <span class=\"text-muted\">(1)</span>, social <span class=\"text-muted\">(1)</span>"
|
||||
|
||||
|
||||
def test_generate_report_template():
|
||||
@@ -266,6 +286,19 @@ def test_generate_csv_report():
|
||||
]
|
||||
|
||||
|
||||
def test_generate_csv_report_broken():
    """A result without a status is written to CSV with ``exists`` = Unknown."""
    buffer = StringIO()
    generate_csv_report('test', BROKEN_RESULTS, buffer)

    # Rewind and read back everything the generator wrote.
    buffer.seek(0)
    rows = list(buffer)

    expected_rows = [
        'username,name,url_main,url_user,exists,http_status\r\n',
        'test,GitHub,https://www.github.com/,https://www.github.com/test,Unknown,200\r\n',
    ]
    assert rows == expected_rows
|
||||
|
||||
|
||||
def test_generate_txt_report():
|
||||
txtfile = StringIO()
|
||||
generate_txt_report('test', EXAMPLE_RESULTS, txtfile)
|
||||
@@ -279,6 +312,18 @@ def test_generate_txt_report():
|
||||
]
|
||||
|
||||
|
||||
def test_generate_txt_report_broken():
    """A result without a status yields a TXT report with only the zero total."""
    buffer = StringIO()
    generate_txt_report('test', BROKEN_RESULTS, buffer)

    # Rewind and read back everything the generator wrote.
    buffer.seek(0)
    report_lines = list(buffer)

    expected_lines = [
        'Total Websites Username Detected On : 0',
    ]
    assert report_lines == expected_lines
|
||||
|
||||
|
||||
def test_generate_json_simple_report():
|
||||
jsonfile = StringIO()
|
||||
MODIFIED_RESULTS = dict(EXAMPLE_RESULTS)
|
||||
@@ -292,6 +337,19 @@ def test_generate_json_simple_report():
|
||||
assert list(json.loads(data[0]).keys()) == ['GitHub', 'GitHub2']
|
||||
|
||||
|
||||
def test_generate_json_simple_report_broken():
    """Results without a status produce an empty object in the simple JSON report.

    ``MODIFIED_RESULTS`` (two status-less entries) used to be built and then
    ignored in favour of ``BROKEN_RESULTS``; it is now the mapping actually
    passed to the generator.
    """
    jsonfile = StringIO()
    MODIFIED_RESULTS = dict(BROKEN_RESULTS)
    MODIFIED_RESULTS['GitHub2'] = BROKEN_RESULTS['GitHub']
    generate_json_report('test', MODIFIED_RESULTS, jsonfile, 'simple')

    jsonfile.seek(0)
    data = jsonfile.readlines()

    # The whole report is a single JSON line with no entries.
    assert len(data) == 1
    assert list(json.loads(data[0]).keys()) == []
|
||||
|
||||
|
||||
def test_generate_json_ndjson_report():
|
||||
jsonfile = StringIO()
|
||||
MODIFIED_RESULTS = dict(EXAMPLE_RESULTS)
|
||||
@@ -325,6 +383,20 @@ def test_save_xmind_report():
|
||||
)
|
||||
|
||||
|
||||
def test_save_xmind_report_broken():
    """A result without a status ends up under a single 'Undefined' XMind topic."""
    report_path = 'report_test.xmind'
    save_xmind_report(report_path, 'test', BROKEN_RESULTS)

    # Load the file back and inspect the primary sheet.
    sheet_data = xmind.load(report_path).getPrimarySheet().getData()
    root_topic = sheet_data['topic']

    assert sheet_data['title'] == 'test Analysis'
    assert root_topic['title'] == 'test'

    subtopics = root_topic['topics']
    assert len(subtopics) == 1
    assert subtopics[0]['title'] == 'Undefined'
|
||||
|
||||
|
||||
def test_html_report():
|
||||
report_name = 'report_test.html'
|
||||
context = generate_report_context(TEST)
|
||||
@@ -337,9 +409,47 @@ def test_html_report():
|
||||
assert SUPPOSED_INTERESTS in report_text
|
||||
|
||||
|
||||
def test_html_report_broken():
    """The HTML report is still generated when one result has ``status`` = None."""
    report_name = 'report_test_broken.html'

    # Deep copy so the shared TEST fixture data is not mutated.
    BROKEN_DATA = copy.deepcopy(TEST)
    BROKEN_DATA[0][2]['500px']['status'] = None

    context = generate_report_context(BROKEN_DATA)
    save_html_report(report_name, context)

    # Context manager closes the handle instead of leaving it to the GC.
    with open(report_name) as report_file:
        report_text = report_file.read()

    assert SUPPOSED_BROKEN_BRIEF in report_text
    assert SUPPOSED_BROKEN_GEO in report_text
    assert SUPPOSED_BROKEN_INTERESTS in report_text
|
||||
|
||||
|
||||
def test_pdf_report():
    """Saving a PDF report creates the target file."""
    pdf_path = 'report_test.pdf'
    save_pdf_report(pdf_path, generate_report_context(TEST))

    assert os.path.exists(pdf_path)
|
||||
|
||||
|
||||
def test_text_report():
    """The plain-text report contains every word of the expected brief."""
    report_text = get_plaintext_report(generate_report_context(TEST))

    absent_words = [
        word for word in SUPPOSED_BRIEF.split() if word not in report_text
    ]
    assert not absent_words

    for keyword in ('us', 'photo'):
        assert keyword in report_text
|
||||
|
||||
|
||||
def test_text_report_broken():
    """The plain-text report is still generated when one result has no status."""
    # Deep copy so the shared TEST fixture data is not mutated.
    damaged = copy.deepcopy(TEST)
    damaged[0][2]['500px']['status'] = None

    report_text = get_plaintext_report(generate_report_context(damaged))

    absent_words = [
        word for word in SUPPOSED_BROKEN_BRIEF.split() if word not in report_text
    ]
    assert not absent_words

    for keyword in ('us', 'photo'):
        assert keyword in report_text
|
||||
|
||||
@@ -103,6 +103,7 @@ def test_saving_site_error():
|
||||
|
||||
amperka = db.sites[0]
|
||||
assert len(amperka.errors) == 2
|
||||
assert len(amperka.errors_dict) == 2
|
||||
|
||||
assert amperka.strip_engine_data().errors == {'error1': 'text1'}
|
||||
assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
|
||||
@@ -114,7 +115,7 @@ def test_site_url_detector():
|
||||
|
||||
assert (
|
||||
db.sites[0].url_regexp.pattern
|
||||
== r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
|
||||
== r'^https?://(www.|m.)?forum\.amperka\.ru/members/\?username=(.+?)$'
|
||||
)
|
||||
assert (
|
||||
db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test')
|
||||
@@ -178,3 +179,26 @@ def test_ranked_sites_dict_id_type():
|
||||
assert len(db.ranked_sites_dict()) == 2
|
||||
assert len(db.ranked_sites_dict(id_type='username')) == 2
|
||||
assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
|
||||
|
||||
|
||||
def test_get_url_template():
    """``get_url_template`` output for an engine-style URL and a subdomain URL."""

    def template_for(url):
        # Same site name and main URL in both cases; only the URL pattern varies.
        site = MaigretSite("test", {"urlMain": "https://ya.ru/", "url": url})
        return site.get_url_template()

    engine_style = template_for("{urlMain}{urlSubpath}/members/?username={username}")
    assert engine_style == "{urlMain}{urlSubpath}/members/?username={username} (no engine)"

    subdomain_style = template_for("https://{username}.ya.ru")
    assert subdomain_style == "SUBDOMAIN"
|
||||
|
||||
@@ -8,6 +8,7 @@ from maigret.utils import (
|
||||
enrich_link_str,
|
||||
URLMatcher,
|
||||
get_dict_ascii_tree,
|
||||
get_match_ratio,
|
||||
)
|
||||
|
||||
|
||||
@@ -40,13 +41,13 @@ def test_case_convert_camel_with_digits_to_snake():
|
||||
|
||||
|
||||
def test_is_country_tag():
|
||||
assert is_country_tag('ru') == True
|
||||
assert is_country_tag('FR') == True
|
||||
assert is_country_tag('ru') is True
|
||||
assert is_country_tag('FR') is True
|
||||
|
||||
assert is_country_tag('a1') == False
|
||||
assert is_country_tag('dating') == False
|
||||
assert is_country_tag('a1') is False
|
||||
assert is_country_tag('dating') is False
|
||||
|
||||
assert is_country_tag('global') == True
|
||||
assert is_country_tag('global') is True
|
||||
|
||||
|
||||
def test_enrich_link_str():
|
||||
@@ -57,6 +58,11 @@ def test_enrich_link_str():
|
||||
)
|
||||
|
||||
|
||||
def test_url_extract_main_part_negative():
    """The literal string 'None' has no extractable main part."""
    assert URLMatcher.extract_main_part('None') == ''
|
||||
|
||||
|
||||
def test_url_extract_main_part():
|
||||
url_main_part = 'flickr.com/photos/alexaimephotography'
|
||||
|
||||
@@ -67,9 +73,11 @@ def test_url_extract_main_part():
|
||||
['/', ''],
|
||||
]
|
||||
|
||||
url_regexp = re.compile('^https?://(www.)?flickr.com/photos/(.+?)$')
|
||||
url_regexp = re.compile(r'^https?://(www\.)?flickr.com/photos/(.+?)$')
|
||||
# combine parts variations
|
||||
for url_parts in itertools.product(*parts):
|
||||
url = ''.join(url_parts)
|
||||
# ensure all combinations give valid main part
|
||||
assert URLMatcher.extract_main_part(url) == url_main_part
|
||||
assert not url_regexp.match(url) is None
|
||||
|
||||
@@ -84,11 +92,13 @@ def test_url_make_profile_url_regexp():
|
||||
['/', ''],
|
||||
]
|
||||
|
||||
# combine parts variations
|
||||
for url_parts in itertools.product(*parts):
|
||||
url = ''.join(url_parts)
|
||||
# ensure all combinations match pattern
|
||||
assert (
|
||||
URLMatcher.make_profile_url_regexp(url).pattern
|
||||
== r'^https?://(www.)?flickr\.com/photos/(.+?)$'
|
||||
== r'^https?://(www.|m.)?flickr\.com/photos/(.+?)$'
|
||||
)
|
||||
|
||||
|
||||
@@ -98,6 +108,7 @@ def test_get_dict_ascii_tree():
|
||||
'legacy_id': '26403415',
|
||||
'username': 'alexaimephotographycars',
|
||||
'name': 'Alex Aimé',
|
||||
'links': "['www.instagram.com/street.reality.photography/']",
|
||||
'created_at': '2018-05-04T10:17:01.000+0000',
|
||||
'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b',
|
||||
'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201',
|
||||
@@ -107,20 +118,28 @@ def test_get_dict_ascii_tree():
|
||||
'twitter_username': 'Alexaimephotogr',
|
||||
}
|
||||
|
||||
ascii_tree = get_dict_ascii_tree(data.items())
|
||||
ascii_tree = get_dict_ascii_tree(data.items(), prepend=" ")
|
||||
|
||||
assert (
|
||||
ascii_tree
|
||||
== """
|
||||
┣╸uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==
|
||||
┣╸legacy_id: 26403415
|
||||
┣╸username: alexaimephotographycars
|
||||
┣╸name: Alex Aimé
|
||||
┣╸created_at: 2018-05-04T10:17:01.000+0000
|
||||
┣╸image: https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b
|
||||
┣╸image_bg: https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201
|
||||
┣╸website: www.instagram.com/street.reality.photography/
|
||||
┣╸facebook_link: www.instagram.com/street.reality.photography/
|
||||
┣╸instagram_username: Street.Reality.Photography
|
||||
┗╸twitter_username: Alexaimephotogr"""
|
||||
├─uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==
|
||||
├─legacy_id: 26403415
|
||||
├─username: alexaimephotographycars
|
||||
├─name: Alex Aimé
|
||||
├─links:
|
||||
│ └─ www.instagram.com/street.reality.photography/
|
||||
├─created_at: 2018-05-04T10:17:01.000+0000
|
||||
├─image: https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b
|
||||
├─image_bg: https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201
|
||||
├─website: www.instagram.com/street.reality.photography/
|
||||
├─facebook_link: www.instagram.com/street.reality.photography/
|
||||
├─instagram_username: Street.Reality.Photography
|
||||
└─twitter_username: Alexaimephotogr"""
|
||||
)
|
||||
|
||||
|
||||
def test_get_match_ratio():
    """A string identical to one of the reference strings has match ratio 1."""
    ratio_of = get_match_ratio(["test", "maigret", "username"])

    assert ratio_of("test") == 1
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
#!/usr/bin/env python3
|
||||
import random
|
||||
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||
|
||||
from maigret.maigret import MaigretDatabase
|
||||
from maigret.submit import get_alexa_rank
|
||||
|
||||
|
||||
def update_tags(site):
    """Interactively replace the tags of *site* and refresh its Alexa rank.

    Prints the current tags and main URL, then reads a ``', '``-separated tag
    list from standard input. The pseudo-tag ``disabled`` is not stored as a
    tag; it sets ``site.disabled`` to True instead. If ``get_alexa_rank``
    returns a truthy rank, ``site.alexa_rank`` is updated.

    The new tags are stored sorted, so that the saved data does not depend on
    set iteration order (which is not stable between interpreter runs).
    """
    if site.tags:
        print(f'Site {site.name} tags: ' + ', '.join(site.tags))
    else:
        print(f'Site {site.name} doesn\'t have tags')

    print(f'URL: {site.url_main}')

    new_tags = set(input('Enter new tags: ').split(', '))
    if "disabled" in new_tags:
        new_tags.remove("disabled")
        site.disabled = True

    print(f'Old alexa rank: {site.alexa_rank}')
    rank = get_alexa_rank(site.url_main)
    if rank:
        print(f'New alexa rank: {rank}')
        site.alexa_rank = rank

    # Empty strings come from blank input or a trailing separator; drop them.
    site.tags = sorted(tag for tag in new_tags if tag)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Interactive loop: pick random sites tagged 'in' and ask for new tags.
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument(
        "--base", "-b",
        metavar="BASE_FILE",
        dest="base_file",
        default="maigret/resources/data.json",
        help="JSON file with sites data to update.",
    )

    args = parser.parse_args()

    db = MaigretDatabase()
    db.load_from_file(args.base_file)

    # NOTE(review): the loop has no exit condition; it runs until interrupted.
    while True:
        site = random.choice(db.sites)

        # Skip uCoz-engine sites and sites without the 'in' tag.
        if site.engine == 'uCoz' or 'in' not in site.tags:
            continue

        update_tags(site)

        # Save after every edited site: a save placed after the endless loop
        # would never be reached.
        db.save_to_file(args.base_file)
|
||||
@@ -0,0 +1,152 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Maigret: Supported Site Listing with Alexa ranking and country tags
|
||||
This module generates the listing of supported sites in file `SITES.md`
|
||||
and pretty prints file with sites data.
|
||||
"""
|
||||
import aiohttp
|
||||
import asyncio
|
||||
import json
|
||||
import sys
|
||||
import requests
|
||||
import logging
|
||||
import threading
|
||||
import xml.etree.ElementTree as ET
|
||||
from datetime import datetime
|
||||
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||
|
||||
import tqdm.asyncio
|
||||
|
||||
from maigret.maigret import get_response, site_self_check
|
||||
from maigret.sites import MaigretSite, MaigretDatabase, MaigretEngine
|
||||
from maigret.utils import CaseConverter
|
||||
|
||||
|
||||
async def check_engine_of_site(site_name, sites_with_engines, future, engine_name, semaphore, logger):
    """Check whether the response for ``future`` mentions ``engine_name``.

    The response is awaited through ``get_response`` while holding
    ``semaphore``. When the returned text is non-empty and contains
    ``engine_name``, ``site_name`` is appended to ``sites_with_engines``.

    :return: ``True`` if the site was appended, ``False`` otherwise.
    """
    async with semaphore:
        page_text, _status_code, _error_text, _exception_text = await get_response(
            request_future=future,
            site_name=site_name,
            logger=logger,
        )

        if not page_text or engine_name not in page_text:
            return False

        sites_with_engines.append(site_name)
        return True
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # For every engine that declares 'presenseStrs', probe the main page of
    # each site that has no engine yet, attach the engine to the sites that
    # match, self-check them and save the ones that pass.
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")
    parser.add_argument('--engine', '-e', help='check only selected engine', type=str)
    args = parser.parse_args()

    log_level = logging.INFO
    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level
    )
    logger = logging.getLogger('engines-check')
    logger.setLevel(log_level)

    db = MaigretDatabase()
    sites_subset = db.load_from_file(args.base_file).sites
    sites = {site.name: site for site in sites_subset}

    # Raw engine definitions are read straight from the JSON file.
    with open(args.base_file, "r", encoding="utf-8") as data_file:
        sites_info = json.load(data_file)
        engines = sites_info['engines']

    async def update_site_data(site_name, site_data, all_sites, logger, no_progressbar):
        """Self-check one site and pass the resulting updates to its ``update()``."""
        updates = await site_self_check(site_name, site_data, logger, no_progressbar)
        all_sites[site_name].update(updates)

    loop = asyncio.get_event_loop()

    for engine_name, engine_data in engines.items():
        if args.engine and args.engine != engine_name:
            continue

        if 'presenseStrs' not in engine_data:
            print(f'No features to automatically detect sites on engine {engine_name}')
            continue

        engine_obj = MaigretEngine(engine_name, engine_data)

        # setup connections for checking both engine and usernames
        connector = aiohttp.TCPConnector(ssl=False)
        connector.verify_ssl=False
        session = aiohttp.ClientSession(connector=connector)

        # try/finally guarantees the session is closed even when one of the
        # checks below raises.
        try:
            sem = asyncio.Semaphore(100)
            tasks = []

            # check sites without engine if they look like sites on this engine
            new_engine_sites = []
            for site_name, site_data in sites.items():
                if site_data.engine:
                    continue

                future = session.get(url=site_data.url_main,
                                     allow_redirects=True,
                                     timeout=10,
                                     )

                check_engine_coro = check_engine_of_site(site_name, new_engine_sites, future, engine_name, sem, logger)
                tasks.append(asyncio.ensure_future(check_engine_coro))

            # progress bar
            for f in tqdm.asyncio.tqdm.as_completed(tasks):
                loop.run_until_complete(f)

            print(f'Total detected {len(new_engine_sites)} sites on engine {engine_name}')
            # dict with new found engine sites
            new_sites = {site_name: sites[site_name] for site_name in new_engine_sites}

            # update sites obj from engine
            for site_name, site in new_sites.items():
                site.request_future = None
                site.engine = engine_name
                site.update_from_engine(engine_obj)

            # Self-checks run one at a time, in detection order.
            for new_site_name, new_site_data in new_sites.items():
                coro = update_site_data(new_site_name, new_site_data, new_sites, logger, no_progressbar=True)
                loop.run_until_complete(coro)

            updated_sites_count = 0

            for s, site in new_sites.items():
                site.request_future = None

                if site.disabled:
                    print(f'{site.name} failed username checking of engine {engine_name}')
                    continue

                site = site.strip_engine_data()

                db.update_site(site)
                updated_sites_count += 1
                # Saved after every site so partial progress survives a crash.
                db.save_to_file(args.base_file)

                print(f'Site "{s}": ' + json.dumps(site.json, indent=4))

            print(f'Updated total {updated_sites_count} sites!')
            print(f'Checking all sites on engine {engine_name}')
        finally:
            loop.run_until_complete(session.close())

    print("\nFinished updating supported site listing!")
|
||||
@@ -0,0 +1,280 @@
|
||||
#!/usr/bin/env python3
|
||||
import json
|
||||
import random
|
||||
import re
|
||||
|
||||
import tqdm.asyncio
|
||||
from mock import Mock
|
||||
import requests
|
||||
|
||||
from maigret.maigret import *
|
||||
from maigret.result import QueryStatus
|
||||
from maigret.sites import MaigretSite
|
||||
|
||||
URL_RE = re.compile(r"https?://(www\.)?")
|
||||
TIMEOUT = 200
|
||||
|
||||
|
||||
async def maigret_check(site, site_data, username, status, logger):
    """Run one maigret lookup and verify that it reports the expected status.

    :param site: site name, used as the key of the single-entry site dict
        passed to ``maigret`` and of its result.
    :param site_data: site object to check.
    :param username: username to look up.
    :param status: the ``QueryStatus`` the lookup is expected to produce.
    :param logger: logger that receives debug details about mismatches.
    :return: ``site_data`` when the reported status equals ``status``,
        otherwise ``False``.
    """
    query_notify = Mock()
    logger.debug(f'Checking {site}...')

    results = await maigret(
        username,
        {site: site_data},
        logger,
        query_notify,
        timeout=TIMEOUT,
        forced=True,
        no_progressbar=True,
    )

    result = results[site]
    actual_status = result['status'].status
    if actual_status == status:
        return site_data

    if actual_status == QueryStatus.UNKNOWN:
        # The lookup itself failed; log the context reported with the status.
        context = result['status'].context
        logger.debug(f'Error while searching {username} in {site}, must be claimed. Context: {context}')
    elif status == QueryStatus.CLAIMED:
        logger.debug(f'Not found {username} in {site}, must be claimed')
        logger.debug(result)
    else:
        logger.debug(f'Found {username} in {site}, must be available')
        logger.debug(result)

    return False
|
||||
|
||||
|
||||
async def check_and_add_maigret_site(site_data, semaphore, logger, ok_usernames, bad_usernames):
    """Validate a candidate site and store it in the database when it passes.

    While holding ``semaphore``, each name in ``ok_usernames`` is tried until
    one is reported as claimed, then each name in ``bad_usernames`` until one
    is reported as available. Only when both cases succeed is the site
    stripped of engine data, written to the module-level ``db``, saved to
    ``args.base_file`` and appended to the module-level ``ok_sites``.
    """
    async with semaphore:
        sitename = site_data.name

        # Positive case: some known username must be found.
        claimed_ok = False
        for candidate in ok_usernames:
            site_data.username_claimed = candidate
            if await maigret_check(sitename, site_data, candidate, QueryStatus.CLAIMED, logger):
                claimed_ok = True
                break

        # Negative case: some nonsense username must be reported as free.
        available_ok = False
        for candidate in bad_usernames:
            site_data.username_unclaimed = candidate
            if await maigret_check(sitename, site_data, candidate, QueryStatus.AVAILABLE, logger):
                available_ok = True
                break

        if not (claimed_ok and available_ok):
            return

        site_data = site_data.strip_engine_data()

        db.update_site(site_data)
        print(site_data.json)
        try:
            db.save_to_file(args.base_file)
        except Exception as e:
            logging.error(e, exc_info=True)
        print(f'Saved new site {sitename}...')
        ok_sites.append(site_data)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Read a list of site URLs, detect which known engine each new site runs
    # on, then verify the resulting site entries with real username lookups
    # and save the ones that pass.
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")

    parser.add_argument("--add-engine", dest="add_engine", help="Additional engine to check")

    parser.add_argument("--only-engine", dest="only_engine", help="Use only this engine from detected to check")

    parser.add_argument('--check', help='only check sites in database', action='store_true')

    parser.add_argument('--random', help='shuffle list of urls', action='store_true', default=False)

    parser.add_argument('--top', help='top count of records in file', type=int, default=10000)

    parser.add_argument('--filter', help='substring to filter input urls', type=str, default='')

    parser.add_argument('--username', help='preferable username to check with', type=str)

    parser.add_argument(
        "--info",
        "-vv",
        action="store_true",
        dest="info",
        default=False,
        help="Display service information.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        dest="verbose",
        default=False,
        help="Display extra information and metrics.",
    )
    parser.add_argument(
        "-d",
        "--debug",
        "-vvv",
        action="store_true",
        dest="debug",
        default=False,
        help="Saving debugging information and sites responses in debug.txt.",
    )

    parser.add_argument("urls_file",
                        metavar='URLS_FILE',
                        action="store",
                        help="File with base site URLs"
                        )

    args = parser.parse_args()

    log_level = logging.ERROR
    if args.debug:
        log_level = logging.DEBUG
    elif args.info:
        log_level = logging.INFO
    elif args.verbose:
        log_level = logging.WARNING

    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level
    )
    logger = logging.getLogger('engines-check')
    logger.setLevel(log_level)

    db = MaigretDatabase()
    sites_subset = db.load_from_file(args.base_file).sites
    sites = {site.name: site for site in sites_subset}
    engines = db.engines

    # TODO: usernames extractors
    ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john']
    if args.username:
        ok_usernames = [args.username] + ok_usernames

    bad_usernames = ['noonewouldeverusethis7']

    with open(args.urls_file, 'r') as urls_file:
        urls = urls_file.read().splitlines()
    if args.random:
        random.shuffle(urls)
    urls = urls[:args.top]

    # Serialised database, used for a cheap "domain already known" substring test.
    raw_maigret_data = json.dumps({site.name: site.json for site in sites_subset})

    def create_site_from_engine(sitename, data, e):
        """Build a MaigretSite from ``data`` and apply the engine named ``e``."""
        site = MaigretSite(sitename, data)
        site.update_from_engine(db.engines_dict[e])
        site.engine = e
        return site

    new_sites = []
    for site in tqdm.asyncio.tqdm(urls):
        site_lowercase = site.lower()

        domain_raw = URL_RE.sub('', site_lowercase).strip().strip('/')
        domain_raw = domain_raw.split('/')[0]

        if args.filter and args.filter not in domain_raw:
            logger.debug('Site %s skipped due to filtering by "%s"', domain_raw, args.filter)
            continue

        if domain_raw in raw_maigret_data:
            logger.debug(f'Site {domain_raw} already exists in the Maigret database!')
            continue

        if '"' in domain_raw:
            logger.debug(f'Invalid site {domain_raw}')
            continue

        # scheme + host part of the URL
        main_page_url = '/'.join(site.split('/', 3)[:3])

        site_data = {
            'url': site,
            'urlMain': main_page_url,
            'name': domain_raw,
        }

        # Best-effort fetch: any request failure just means no engine can be
        # detected. Catching Exception (not a bare except) keeps Ctrl-C and
        # SystemExit working.
        try:
            r = requests.get(main_page_url, timeout=5)
        except Exception as error:
            logger.debug('Could not fetch %s: %s', main_page_url, error)
            r = None

        # Decode the body once; a falsy response (None or an error status)
        # is treated as having no text, as before.
        page_text = r.text if r else ''

        detected_engines = []

        for e in engines:
            strs_to_check = e.__dict__.get('presenseStrs')
            if strs_to_check and page_text and all(s in page_text for s in strs_to_check):
                engine_name = e.__dict__.get('name')
                detected_engines.append(engine_name)
                logger.info(f'Detected engine {engine_name} for site {main_page_url}')

        if args.only_engine and args.only_engine in detected_engines:
            detected_engines = [args.only_engine]
        elif not detected_engines and args.add_engine:
            logging.debug('Could not detect any engine, applying default engine %s...', args.add_engine)
            detected_engines = [args.add_engine]

        for engine_name in detected_engines:
            site = create_site_from_engine(domain_raw, site_data, engine_name)
            new_sites.append(site)
            logger.debug(site.json)

    print(f'Found {len(new_sites)}/{len(urls)} new sites')

    if args.check:
        for s in new_sites:
            print(s.url_main)
        sys.exit(0)

    sem = asyncio.Semaphore(20)
    loop = asyncio.get_event_loop()

    # Filled by check_and_add_maigret_site() with every site that passed.
    ok_sites = []
    tasks = []
    for site in new_sites:
        check_coro = check_and_add_maigret_site(site, sem, logger, ok_usernames, bad_usernames)
        tasks.append(asyncio.ensure_future(check_coro))

    for f in tqdm.asyncio.tqdm.as_completed(tasks, timeout=TIMEOUT):
        try:
            loop.run_until_complete(f)
        except asyncio.exceptions.TimeoutError:
            # A check that exceeds the overall timeout is simply skipped.
            pass

    print(f'Found and saved {len(ok_sites)} sites!')
|
||||
@@ -0,0 +1,36 @@
|
||||
import sys
|
||||
import difflib
|
||||
import requests
|
||||
|
||||
|
||||
# Download the two pages whose URLs are given on the command line.
a = requests.get(sys.argv[1]).text
b = requests.get(sys.argv[2]).text

# Split each page on double quotes and keep the distinct fragments.
tokens_a = {*a.split('"')}
tokens_b = {*b.split('"')}

# Fragments that occur in only one of the two pages.
a_minus_b = tokens_a - tokens_b
b_minus_a = tokens_b - tokens_a

for unique_tokens in (a_minus_b, b_minus_a):
    print(unique_tokens)

for unique_tokens in (a_minus_b, b_minus_a):
    print(len(unique_tokens))
|
||||
|
||||
# Words that typically show up in profile / "user not found" markup.
desired_strings = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
                   "birthday", "репутация", "информация", "e-mail"]


def get_match_ratio(x):
    """Return how closely ``x`` resembles any entry of ``desired_strings``.

    ``x`` is lower-cased and compared with every desired string using
    ``difflib.SequenceMatcher``; the best ratio is returned, rounded to two
    decimal places.

    :param x: string to score.
    :return: float rounded to two decimals; 1.0 for an exact
        (case-insensitive) match.
    """
    # Lower-case once instead of once per candidate.
    lowered = x.lower()
    return round(max(
        difflib.SequenceMatcher(a=lowered, b=y).ratio()
        for y in desired_strings
    ), 2)
|
||||
|
||||
|
||||
RATIO = 0.6

# For each direction of the diff, show the ten fragments that look most like
# profile-related markup.
for unique_tokens in (a_minus_b, b_minus_a):
    print(sorted(unique_tokens, key=get_match_ratio, reverse=True)[:10])
|
||||
@@ -25,7 +25,7 @@ RANKS.update({
|
||||
'100000000': '100M',
|
||||
})
|
||||
|
||||
SEMAPHORE = threading.Semaphore(10)
|
||||
SEMAPHORE = threading.Semaphore(20)
|
||||
|
||||
def get_rank(domain_to_query, site, print_errors=True):
|
||||
with SEMAPHORE:
|
||||
@@ -37,15 +37,15 @@ def get_rank(domain_to_query, site, print_errors=True):
|
||||
try:
|
||||
#Get ranking for this site.
|
||||
site.alexa_rank = int(root.find('.//REACH').attrib['RANK'])
|
||||
country = root.find('.//COUNTRY')
|
||||
if not country is None and country.attrib:
|
||||
country_code = country.attrib['CODE']
|
||||
tags = set(site.tags)
|
||||
if country_code:
|
||||
tags.add(country_code.lower())
|
||||
site.tags = sorted(list(tags))
|
||||
if site.type != 'username':
|
||||
site.disabled = False
|
||||
# country = root.find('.//COUNTRY')
|
||||
# if not country is None and country.attrib:
|
||||
# country_code = country.attrib['CODE']
|
||||
# tags = set(site.tags)
|
||||
# if country_code:
|
||||
# tags.add(country_code.lower())
|
||||
# site.tags = sorted(list(tags))
|
||||
# if site.type != 'username':
|
||||
# site.disabled = False
|
||||
except Exception as e:
|
||||
if print_errors:
|
||||
logging.error(e)
|
||||
@@ -74,6 +74,7 @@ if __name__ == '__main__':
|
||||
dest="base_file", default="maigret/resources/data.json",
|
||||
help="JSON file with sites data to update.")
|
||||
|
||||
parser.add_argument('--with-rank', help='update with use of local data only', action='store_true')
|
||||
parser.add_argument('--empty-only', help='update only sites without rating', action='store_true')
|
||||
parser.add_argument('--exclude-engine', help='do not update score with certain engine',
|
||||
action="append", dest="exclude_engine_list", default=[])
|
||||
@@ -87,30 +88,33 @@ if __name__ == '__main__':
|
||||
|
||||
with open("sites.md", "w") as site_file:
|
||||
site_file.write(f"""
|
||||
## List of supported sites: total {len(sites_subset)}\n
|
||||
## List of supported sites (search methods): total {len(sites_subset)}\n
|
||||
Rank data fetched from Alexa by domains.
|
||||
|
||||
""")
|
||||
|
||||
for site in sites_subset:
|
||||
if not args.with_rank:
|
||||
break
|
||||
url_main = site.url_main
|
||||
if site.alexa_rank < sys.maxsize and args.empty_only:
|
||||
continue
|
||||
if args.exclude_engine_list and site.engine in args.exclude_engine_list:
|
||||
continue
|
||||
site.alexa_rank = 0
|
||||
th = threading.Thread(target=get_rank, args=(url_main, site))
|
||||
th = threading.Thread(target=get_rank, args=(url_main, site,))
|
||||
pool.append((site.name, url_main, th))
|
||||
th.start()
|
||||
|
||||
index = 1
|
||||
for site_name, url_main, th in pool:
|
||||
th.join()
|
||||
sys.stdout.write("\r{0}".format(f"Updated {index} out of {len(sites_subset)} entries"))
|
||||
sys.stdout.flush()
|
||||
index = index + 1
|
||||
if args.with_rank:
|
||||
index = 1
|
||||
for site_name, url_main, th in pool:
|
||||
th.join()
|
||||
sys.stdout.write("\r{0}".format(f"Updated {index} out of {len(sites_subset)} entries"))
|
||||
sys.stdout.flush()
|
||||
index = index + 1
|
||||
|
||||
sites_full_list = [(s, s.alexa_rank) for s in sites_subset]
|
||||
sites_full_list = [(s, int(s.alexa_rank)) for s in sites_subset]
|
||||
|
||||
sites_full_list.sort(reverse=False, key=lambda x: x[1])
|
||||
|
||||
@@ -123,6 +127,7 @@ Rank data fetched from Alexa by domains.
|
||||
url_main = site.url_main
|
||||
valid_rank = get_step_rank(rank)
|
||||
all_tags = site.tags
|
||||
all_tags.sort()
|
||||
tags = ', ' + ', '.join(all_tags) if all_tags else ''
|
||||
note = ''
|
||||
if site.disabled:
|
||||
@@ -132,7 +137,11 @@ Rank data fetched from Alexa by domains.
|
||||
site_file.write(f'1. {favicon} [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n')
|
||||
db.update_site(site)
|
||||
|
||||
site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')
|
||||
site_file.write(f'\nThe list was updated at ({datetime.utcnow()} UTC)\n')
|
||||
db.save_to_file(args.base_file)
|
||||
|
||||
statistics_text = db.get_db_stats(is_markdown=True)
|
||||
site_file.write('## Statistics\n\n')
|
||||
site_file.write(statistics_text)
|
||||
|
||||
print("\nFinished updating supported site listing!")
|
||||
|
||||