mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-07 14:34:33 +00:00
Compare commits
179 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| aa6cd0eca9 | |||
| 38e5d5c664 | |||
| 8a562d06ae | |||
| aa50ee9672 | |||
| 51327f9647 | |||
| 4a368c9bb6 | |||
| 6fd5f6e33a | |||
| fa3db9c39c | |||
| 5912ad4fbc | |||
| ee36dc0187 | |||
| 9eb62e4e22 | |||
| ead048af93 | |||
| acc751ff98 | |||
| b7bdd71cf0 | |||
| 43f189f774 | |||
| 5bda7fb339 | |||
| 414523a8ac | |||
| 6d4e268706 | |||
| b696b982f4 | |||
| d4234036c0 | |||
| b57c70091c | |||
| e90df3560b | |||
| bc6ee48b8c | |||
| e70bdf3789 | |||
| 84f9d417cf | |||
| 4333c40be7 | |||
| 9e504c0094 | |||
| 2f752a0368 | |||
| 53e9dab677 | |||
| 11b70a2a48 | |||
| 960708ef2e | |||
| e6f6d8735d | |||
| f77d7d307a | |||
| 158f739a59 | |||
| b6a207d0e3 | |||
| d59867b0d9 | |||
| 2145027196 | |||
| 386e9eba4f | |||
| 0e9655c46a | |||
| 009d51c380 | |||
| 78e9688ece | |||
| 3cbb9df7b3 | |||
| 2fb1f19948 | |||
| 3b91a9cd31 | |||
| 9858e71349 | |||
| c88e194d07 | |||
| ad5c7fbc7d | |||
| 66d6c7a93c | |||
| bdfb4911ce | |||
| 951be44452 | |||
| 188edc1b7f | |||
| ec0d3a1f70 | |||
| a084203ee1 | |||
| 1afdda7336 | |||
| 252d12ff9e | |||
| 6afb17e24f | |||
| 7fdd965bb2 | |||
| 8e30e969f9 | |||
| 5ee91f6659 | |||
| 7fd4a2c516 | |||
| bfa6afac32 | |||
| bfaf276f6e | |||
| c9194b20ba | |||
| a30a012550 | |||
| 2cdc9bb276 | |||
| 99fc6c8a8f | |||
| b269c4a8e0 | |||
| f43dc5bd6f | |||
| 83cda9e37f | |||
| cc3df85690 | |||
| 8007e92021 | |||
| daaddbde4e | |||
| cea5073962 | |||
| b345512489 | |||
| 786cb59145 | |||
| 481baddec6 | |||
| ecb3d76581 | |||
| 8a8fab5bed | |||
| 2fee65fe4e | |||
| dabba859f3 | |||
| 74d4d40abd | |||
| d6f6d78d3f | |||
| 1b61c5085e | |||
| 01e20518c1 | |||
| 8477385289 | |||
| 491dd8f166 | |||
| c64b7a1c85 | |||
| 03511a7a8f | |||
| 7f1a0fae03 | |||
| b0de174df2 | |||
| b5db3f0035 | |||
| 53d698bb7b | |||
| 23fff42ca7 | |||
| 51d9e6f5f6 | |||
| 640c04f20b | |||
| 69f78e331b | |||
| 69c315b00e | |||
| b755628a1d | |||
| 7490a412db | |||
| 2741680d4a | |||
| e5fc221ce2 | |||
| a044e3dd79 | |||
| 6da4ff1e7b | |||
| eb2442401d | |||
| d23d24eeca | |||
| a2ddb15f09 | |||
| e90e85d2a9 | |||
| 2bb01f7019 | |||
| b586a4cd06 | |||
| 28733282ab | |||
| 0a7a7ad70d | |||
| c895f6b418 | |||
| a6286a0286 | |||
| 314eb25d1f | |||
| fbbc8b49f3 | |||
| faa03b62e5 | |||
| d676f7bb94 | |||
| d4757aab78 | |||
| 908176be85 | |||
| 940f408da3 | |||
| 8c700b9810 | |||
| f9c9af5f41 | |||
| 57a9a82102 | |||
| 9bbca995e9 | |||
| 39b713497d | |||
| 6a84875775 | |||
| 84f7d93478 | |||
| 17870ef5c8 | |||
| d3cd5e45a1 | |||
| 9a3f2f0aa7 | |||
| 4b7d344b41 | |||
| ac9cfe7885 | |||
| 6058a4b70c | |||
| 3aa225bda4 | |||
| c6661e22ff | |||
| fdb68b5e80 | |||
| 9fe6b99239 | |||
| b9d303fde3 | |||
| d29e88d96f | |||
| 731a8e01f9 | |||
| cf7acfd8c8 | |||
| 9e6bd05acc | |||
| 6ea1dc33f7 | |||
| d5bc92d26a | |||
| f7263c9b3c | |||
| e6f82a8ba3 | |||
| ba7a38092c | |||
| 92a1677213 | |||
| 9bbc5e61a7 | |||
| da3e3f6719 | |||
| d28221462a | |||
| 5baccbae0c | |||
| 65de06dc13 | |||
| dd71bc19c0 | |||
| 0625867f2a | |||
| ac7ff47fad | |||
| 0449142745 | |||
| 1a77bc7472 | |||
| 8391d7317d | |||
| 8bf789633e | |||
| 2714ff8fff | |||
| b7c02456e7 | |||
| 15af5e14f2 | |||
| f24ad4abfe | |||
| 2e3eceed81 | |||
| 9bc3615afc | |||
| a9543e8303 | |||
| 31df4eb44d | |||
| 89c33e5409 | |||
| c0956a0e23 | |||
| bb4c5dc67a | |||
| c16fc7c002 | |||
| 53f72edaff | |||
| 631de7b346 | |||
| 7676c053f9 | |||
| 90135d4676 | |||
| 4f9dace1de | |||
| cdec320062 | |||
| 10426c07aa |
@@ -0,0 +1,32 @@
|
||||
name: Build docker image and push to DockerHub
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [ main ]
|
||||
|
||||
jobs:
|
||||
docker:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
-
|
||||
name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v1
|
||||
-
|
||||
name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v1
|
||||
-
|
||||
name: Login to DockerHub
|
||||
uses: docker/login-action@v1
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_HUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
|
||||
-
|
||||
name: Build and push
|
||||
id: docker_build
|
||||
uses: docker/build-push-action@v2
|
||||
with:
|
||||
push: true
|
||||
tags: ${{ secrets.DOCKER_HUB_USERNAME }}/maigret:latest
|
||||
-
|
||||
name: Image digest
|
||||
run: echo ${{ steps.docker_build.outputs.digest }}
|
||||
@@ -15,7 +15,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
matrix:
|
||||
python-version: [3.6, 3.7, 3.8, 3.9]
|
||||
python-version: [3.6.9, 3.7, 3.8, 3.9]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
@@ -26,8 +26,8 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install flake8 pytest
|
||||
python -m pip install flake8 pytest pytest-rerunfailures
|
||||
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
|
||||
- name: Test with pytest
|
||||
run: |
|
||||
pytest
|
||||
pytest --reruns 3 --reruns-delay 5
|
||||
|
||||
+6
-3
@@ -22,9 +22,12 @@ src/
|
||||
# Comma-Separated Values (CSV) Reports
|
||||
*.csv
|
||||
|
||||
# Excluded sites list
|
||||
tests/.excluded_sites
|
||||
|
||||
# MacOS Folder Metadata File
|
||||
.DS_Store
|
||||
/reports/
|
||||
|
||||
# Testing
|
||||
.coverage
|
||||
dist/
|
||||
htmlcov/
|
||||
/test_*
|
||||
@@ -2,6 +2,63 @@
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
## [0.2.3] - 2021-05-12
|
||||
* added Yelp and yelp_userid support
|
||||
* tags markup stabilization
|
||||
* improved errors detection
|
||||
|
||||
## [0.2.2] - 2021-05-07
|
||||
* improved ids extractors
|
||||
* updated sites and engines
|
||||
* updates CLI options
|
||||
|
||||
## [0.2.1] - 2021-05-02
|
||||
* fixed json reports generation bug, added tests
|
||||
|
||||
## [0.2.0] - 2021-05-02
|
||||
* added `--retries` option
|
||||
* added `source` feature for sites' mirrors
|
||||
* improved `submit` mode
|
||||
* lot of style and logic fixes
|
||||
|
||||
## [0.1.20] - 2021-05-02 [YANKED]
|
||||
|
||||
## [0.1.19] - 2021-04-14
|
||||
* added `--no-progressbar` option
|
||||
* fixed ascii tree bug
|
||||
* fixed `python -m maigret` run
|
||||
* fixed requests freeze with timeout async tasks
|
||||
|
||||
## [0.1.18] - 2021-03-30
|
||||
* some API improvements
|
||||
|
||||
## [0.1.17] - 2021-03-30
|
||||
* simplified maigret search API
|
||||
* improved documentation
|
||||
* fixed 403 response code ignoring bug
|
||||
|
||||
## [0.1.16] - 2021-03-21
|
||||
* improved URL parsing mode
|
||||
* improved sites submit mode
|
||||
* added uID.me uguid support
|
||||
* improved requests processing
|
||||
|
||||
## [0.1.15] - 2021-03-14
|
||||
* improved HTML reports
|
||||
* fixed python-3.6-specific error
|
||||
* false positives fixes
|
||||
|
||||
## [0.1.14] - 2021-02-25
|
||||
* added JSON export formats
|
||||
* improved tags markup
|
||||
* realized username detection in userinfo links
|
||||
* added DB stats CLI option
|
||||
* added site submit logic and CLI option
|
||||
* added Spotify parsing activation
|
||||
* main logic refactoring
|
||||
* fixed Dockerfile
|
||||
* fixed requirements
|
||||
|
||||
## [0.1.13] - 2021-02-06
|
||||
* improved sites list filtering
|
||||
* pretty console messages
|
||||
|
||||
+6
-5
@@ -1,20 +1,21 @@
|
||||
FROM python:3.7-alpine
|
||||
FROM python:3.7
|
||||
LABEL maintainer="Soxoj <soxoj@protonmail.com>"
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ADD requirements.txt .
|
||||
|
||||
RUN pip install --upgrade pip \
|
||||
&& apk add --update --virtual .build-dependencies \
|
||||
build-base \
|
||||
RUN pip install --upgrade pip
|
||||
|
||||
RUN apt update -y
|
||||
|
||||
RUN apt install -y\
|
||||
gcc \
|
||||
musl-dev \
|
||||
libxml2 \
|
||||
libxml2-dev \
|
||||
libxslt-dev \
|
||||
&& YARL_NO_EXTENSIONS=1 python3 -m pip install maigret \
|
||||
&& apk del .build-dependencies \
|
||||
&& rm -rf /var/cache/apk/* \
|
||||
/tmp/* \
|
||||
/var/tmp/*
|
||||
|
||||
@@ -1,81 +1,110 @@
|
||||
# Maigret
|
||||
|
||||

|
||||

|
||||
[](https://gitter.im/maigret-osint/community)
|
||||
|
||||
<p align="center">
|
||||
<img src="./static/maigret.png" />
|
||||
<p align="center">
|
||||
<a href="https://pypi.org/project/maigret/">
|
||||
<img alt="PyPI" src="https://img.shields.io/pypi/v/maigret?style=flat-square">
|
||||
</a>
|
||||
<a href="https://pypi.org/project/maigret/">
|
||||
<img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dw/maigret?style=flat-square">
|
||||
</a>
|
||||
<a href="https://gitter.im/maigret-osint/community">
|
||||
<img alt="Chat - Gitter" src="./static/chat_gitter.svg" />
|
||||
</a>
|
||||
<a href="https://twitter.com/intent/follow?screen_name=sox0j">
|
||||
<img src="https://img.shields.io/twitter/follow/sox0j?label=Follow%20sox0j&style=social&color=blue" alt="Follow @sox0j" />
|
||||
</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<img src="./static/maigret.png" height="200"/>
|
||||
</p>
|
||||
</p>
|
||||
|
||||
<i>The Commissioner Jules Maigret is a fictional French police detective, created by Georges Simenon. His investigation method is based on understanding the personality of different people and their interactions.</i>
|
||||
|
||||
## About
|
||||
|
||||
Purpose of Maigret - **collect a dossier on a person by username only**, checking for accounts on a huge number of sites.
|
||||
**Maigret** collect a dossier on a person **by username only**, checking for accounts on a huge number of sites and gathering all the available information from web pages. Maigret is an easy-to-use and powerful fork of [Sherlock](https://github.com/sherlock-project/sherlock).
|
||||
|
||||
This is a [sherlock](https://github.com/sherlock-project/) fork with cool features under heavy development.
|
||||
*Don't forget to regularly update source code from repo*.
|
||||
|
||||
Currently supported more than 2000 sites ([full list](./sites.md)), by default search is launched against 500 popular sites in descending order of popularity.
|
||||
Currently supported more than 2000 sites ([full list](./sites.md)), search is launched against 500 popular sites in descending order of popularity by default.
|
||||
|
||||
## Main features
|
||||
|
||||
* Profile pages parsing, [extracting](https://github.com/soxoj/socid_extractor) personal info, links to other profiles, etc.
|
||||
* Recursive search by new usernames found
|
||||
* Profile pages parsing, [extraction](https://github.com/soxoj/socid_extractor) of personal info, links to other profiles, etc.
|
||||
* Recursive search by new usernames and other ids found
|
||||
* Search by tags (site categories, countries)
|
||||
* Censorship and captcha detection
|
||||
* Very few false positives
|
||||
* Requests retries
|
||||
|
||||
See full description of Maigret features [in the Wiki](https://github.com/soxoj/maigret/wiki/Features).
|
||||
|
||||
## Installation
|
||||
|
||||
**NOTE**: Python 3.6 or higher and pip is required.
|
||||
Maigret can be installed using pip, Docker, or simply can be launched from the cloned repo.
|
||||
Also you can run Maigret using cloud shells (see buttons below).
|
||||
|
||||
**Python 3.8 is recommended.**
|
||||
[](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md) [](https://repl.it/github/soxoj/maigret)
|
||||
<a href="https://colab.research.google.com/gist//soxoj/879b51bc3b2f8b695abb054090645000/maigret.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="40"></a>
|
||||
|
||||
### Package installing
|
||||
|
||||
**NOTE**: Python 3.6 or higher and pip is required, **Python 3.8 is recommended.**
|
||||
|
||||
```bash
|
||||
# install from pypi
|
||||
$ pip3 install maigret
|
||||
pip3 install maigret
|
||||
|
||||
# or clone and install manually
|
||||
$ git clone https://github.com/soxoj/maigret && cd maigret
|
||||
$ pip3 install .
|
||||
git clone https://github.com/soxoj/maigret && cd maigret
|
||||
pip3 install .
|
||||
|
||||
# usage
|
||||
maigret username
|
||||
```
|
||||
|
||||
## Using examples
|
||||
### Cloning a repository
|
||||
|
||||
```bash
|
||||
maigret user
|
||||
git clone https://github.com/soxoj/maigret && cd maigret
|
||||
pip3 install -r requirements.txt
|
||||
|
||||
# usage
|
||||
./maigret.py username
|
||||
```
|
||||
|
||||
### Docker
|
||||
|
||||
```bash
|
||||
# official image
|
||||
docker pull soxoj/maigret
|
||||
|
||||
# usage
|
||||
docker run soxoj/maigret:latest username
|
||||
|
||||
# manual build
|
||||
docker build -t maigret .
|
||||
```
|
||||
|
||||
## Usage examples
|
||||
|
||||
```bash
|
||||
# make HTML and PDF reports
|
||||
maigret user --html --pdf
|
||||
|
||||
# search on sites marked with tags photo & dating
|
||||
maigret user --tags photo,dating
|
||||
|
||||
|
||||
# search for three usernames on all available sites
|
||||
maigret user1 user2 user3 -a
|
||||
|
||||
```
|
||||
|
||||
Run `maigret --help` to get arguments description. Also options are documented in [the Maigret Wiki](https://github.com/soxoj/maigret/wiki/Command-line-options).
|
||||
Use `maigret --help` to get full options description. Also options are documented in [the Maigret Wiki](https://github.com/soxoj/maigret/wiki/Command-line-options).
|
||||
|
||||
With Docker:
|
||||
```
|
||||
docker build -t maigret .
|
||||
|
||||
docker run maigret user
|
||||
```
|
||||
|
||||
## Demo with page parsing and recursive username search
|
||||
|
||||
[PDF report](./static/report_alexaimephotographycars.pdf), [HTML report](https://htmlpreview.github.io/?https://raw.githubusercontent.com/soxoj/maigret/main/static/report_alexaimephotographycars.html)
|
||||
|
||||
```bash
|
||||
maigret alexaimephotographycars
|
||||
```
|
||||
|
||||

|
||||
|
||||

|
||||
|
||||
+9
-11
@@ -1,15 +1,13 @@
|
||||
# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous
|
||||
# This file can be used by wget, curl, aria2c and other standard compliant tools.
|
||||
# Usage Examples:
|
||||
# 1) wget -x --load-cookies cookies.txt "https://xss.is/search/"
|
||||
# 2) curl --cookie cookies.txt "https://xss.is/search/"
|
||||
# 3) aria2c --load-cookies cookies.txt "https://xss.is/search/"
|
||||
# 1) wget -x --load-cookies cookies.txt "https://pixabay.com/users/blue-156711/"
|
||||
# 2) curl --cookie cookies.txt "https://pixabay.com/users/blue-156711/"
|
||||
# 3) aria2c --load-cookies cookies.txt "https://pixabay.com/users/blue-156711/"
|
||||
#
|
||||
xss.is FALSE / TRUE 0 xf_csrf PMnZNsr42HETwYEr
|
||||
xss.is FALSE / TRUE 0 xf_from_search google
|
||||
xss.is FALSE / TRUE 1642709308 xf_user 215268%2CZNKB_-64Wk-BOpsdtLYy-1UxfS5zGpxWaiEGUhmX
|
||||
xss.is FALSE / TRUE 0 xf_session sGdxJtP_sKV0LCG8vUQbr6cL670_EFWM
|
||||
.xss.is TRUE / FALSE 0 muchacho_cache ["00fbb0f2772c9596b0483d6864563cce"]
|
||||
.xss.is TRUE / FALSE 0 muchacho_png ["00fbb0f2772c9596b0483d6864563cce"]
|
||||
.xss.is TRUE / FALSE 0 muchacho_etag ["00fbb0f2772c9596b0483d6864563cce"]
|
||||
.xss.is TRUE / FALSE 1924905600 2e66e4dd94a7a237d0d1b4d50f01e179_evc ["00fbb0f2772c9596b0483d6864563cce"]
|
||||
.pixabay.com TRUE / TRUE 1618356838 __cfduid d56929cd50d11474f421b849df5758a881615764837
|
||||
.pixabay.com TRUE / TRUE 1615766638 __cf_bm ea8f7c565b44d749f65500f0e45176cebccaeb09-1615764837-1800-AYJIXh2boDJ6HPf44JI9fnteWABHOVvkxiSccACP9EiS1E58UDTGhViXtqjFfVE0QRj1WowP4ss2DzCs+pW+qUc=
|
||||
pixabay.com FALSE / FALSE 0 anonymous_user_id c1e4ee09-5674-4252-aa94-8c47b1ea80ab
|
||||
pixabay.com FALSE / FALSE 1647214439 csrftoken vfetTSvIul7gBlURt6s985JNM18GCdEwN5MWMKqX4yI73xoPgEj42dbNefjGx5fr
|
||||
pixabay.com FALSE / FALSE 1647300839 client_width 1680
|
||||
pixabay.com FALSE / FALSE 748111764839 is_human 1
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
#!/bin/sh
|
||||
FILES="maigret wizard.py maigret.py tests"
|
||||
|
||||
echo 'black'
|
||||
black --skip-string-normalization $FILES
|
||||
@@ -0,0 +1,11 @@
|
||||
#!/bin/sh
|
||||
FILES="maigret wizard.py maigret.py tests"
|
||||
|
||||
echo 'syntax errors or undefined names'
|
||||
flake8 --count --select=E9,F63,F7,F82 --show-source --statistics $FILES
|
||||
|
||||
echo 'warning'
|
||||
flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503 $FILES
|
||||
|
||||
echo 'mypy'
|
||||
mypy ./maigret ./wizard.py ./tests
|
||||
+2
-2
@@ -1,4 +1,4 @@
|
||||
#! /usr/bin/env python3
|
||||
#!/usr/bin/env python3
|
||||
import asyncio
|
||||
import sys
|
||||
|
||||
@@ -15,4 +15,4 @@ def run():
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run()
|
||||
run()
|
||||
|
||||
@@ -1 +1,12 @@
|
||||
"""Maigret"""
|
||||
|
||||
__title__ = 'Maigret'
|
||||
__package__ = 'maigret'
|
||||
__author__ = 'Soxoj'
|
||||
__author_email__ = 'soxoj@protonmail.com'
|
||||
|
||||
|
||||
from .__version__ import __version__
|
||||
from .checking import maigret as search
|
||||
from .sites import MaigretEngine, MaigretSite, MaigretDatabase
|
||||
from .notify import QueryNotifyPrint as Notifier
|
||||
|
||||
+2
-2
@@ -6,7 +6,7 @@ Maigret entrypoint
|
||||
|
||||
import asyncio
|
||||
|
||||
import maigret
|
||||
from .maigret import main
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(maigret.main())
|
||||
asyncio.run(main())
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
"""Maigret version file"""
|
||||
|
||||
__version__ = '0.2.3'
|
||||
+19
-28
@@ -1,47 +1,38 @@
|
||||
import aiohttp
|
||||
from aiohttp import CookieJar
|
||||
import asyncio
|
||||
import json
|
||||
from http.cookiejar import MozillaCookieJar
|
||||
from http.cookies import Morsel
|
||||
|
||||
import requests
|
||||
from aiohttp import CookieJar
|
||||
|
||||
|
||||
class ParsingActivator:
|
||||
@staticmethod
|
||||
def twitter(site, logger, cookies={}):
|
||||
headers = dict(site.headers)
|
||||
del headers['x-guest-token']
|
||||
r = requests.post(site.activation['url'], headers=headers)
|
||||
del headers["x-guest-token"]
|
||||
r = requests.post(site.activation["url"], headers=headers)
|
||||
logger.info(r)
|
||||
j = r.json()
|
||||
guest_token = j[site.activation['src']]
|
||||
site.headers['x-guest-token'] = guest_token
|
||||
guest_token = j[site.activation["src"]]
|
||||
site.headers["x-guest-token"] = guest_token
|
||||
|
||||
@staticmethod
|
||||
def vimeo(site, logger, cookies={}):
|
||||
headers = dict(site.headers)
|
||||
if 'Authorization' in headers:
|
||||
del headers['Authorization']
|
||||
r = requests.get(site.activation['url'], headers=headers)
|
||||
jwt_token = r.json()['jwt']
|
||||
site.headers['Authorization'] = 'jwt ' + jwt_token
|
||||
if "Authorization" in headers:
|
||||
del headers["Authorization"]
|
||||
r = requests.get(site.activation["url"], headers=headers)
|
||||
jwt_token = r.json()["jwt"]
|
||||
site.headers["Authorization"] = "jwt " + jwt_token
|
||||
|
||||
@staticmethod
|
||||
def xssis(site, logger, cookies={}):
|
||||
if not cookies:
|
||||
logger.debug('You must have cookies to activate xss.is parsing!')
|
||||
return
|
||||
|
||||
def spotify(site, logger, cookies={}):
|
||||
headers = dict(site.headers)
|
||||
post_data = {
|
||||
'_xfResponseType': 'json',
|
||||
'_xfToken': '1611177919,a2710362e45dad9aa1da381e21941a38'
|
||||
}
|
||||
headers['content-type'] = 'application/x-www-form-urlencoded; charset=UTF-8'
|
||||
r = requests.post(site.activation['url'], headers=headers, cookies=cookies, data=post_data)
|
||||
csrf = r.json()['csrf']
|
||||
site.get_params['_xfToken'] = csrf
|
||||
if "Authorization" in headers:
|
||||
del headers["Authorization"]
|
||||
r = requests.get(site.activation["url"])
|
||||
bearer_token = r.json()["accessToken"]
|
||||
site.headers["authorization"] = f"Bearer {bearer_token}"
|
||||
|
||||
|
||||
async def import_aiohttp_cookies(cookiestxt_filename):
|
||||
@@ -55,8 +46,8 @@ async def import_aiohttp_cookies(cookiestxt_filename):
|
||||
for key, cookie in list(domain.values())[0].items():
|
||||
c = Morsel()
|
||||
c.set(key, cookie.value, cookie.value)
|
||||
c['domain'] = cookie.domain
|
||||
c['path'] = cookie.path
|
||||
c["domain"] = cookie.domain
|
||||
c["path"] = cookie.path
|
||||
cookies_list.append((key, c))
|
||||
|
||||
cookies.update_cookies(cookies_list)
|
||||
|
||||
@@ -0,0 +1,733 @@
|
||||
import asyncio
|
||||
import logging
|
||||
from mock import Mock
|
||||
import re
|
||||
import ssl
|
||||
import sys
|
||||
import tqdm
|
||||
from typing import Tuple, Optional, Dict, List
|
||||
from urllib.parse import quote
|
||||
|
||||
import aiohttp
|
||||
import tqdm.asyncio
|
||||
from aiohttp_socks import ProxyConnector
|
||||
from python_socks import _errors as proxy_errors
|
||||
from socid_extractor import extract
|
||||
from aiohttp.client_exceptions import ServerDisconnectedError, ClientConnectorError
|
||||
|
||||
from .activation import ParsingActivator, import_aiohttp_cookies
|
||||
from . import errors
|
||||
from .errors import CheckError
|
||||
from .executors import (
|
||||
AsyncExecutor,
|
||||
AsyncioSimpleExecutor,
|
||||
AsyncioProgressbarQueueExecutor,
|
||||
)
|
||||
from .result import QueryResult, QueryStatus
|
||||
from .sites import MaigretDatabase, MaigretSite
|
||||
from .types import QueryOptions, QueryResultWrapper
|
||||
from .utils import get_random_user_agent
|
||||
|
||||
|
||||
SUPPORTED_IDS = (
|
||||
"yandex_public_id",
|
||||
"gaia_id",
|
||||
"vk_id",
|
||||
"ok_id",
|
||||
"wikimapia_uid",
|
||||
"steam_id",
|
||||
"uidme_uguid",
|
||||
"yelp_userid",
|
||||
)
|
||||
|
||||
BAD_CHARS = "#"
|
||||
|
||||
|
||||
async def get_response(request_future, logger) -> Tuple[str, int, Optional[CheckError]]:
|
||||
html_text = None
|
||||
status_code = 0
|
||||
error: Optional[CheckError] = CheckError("Unknown")
|
||||
|
||||
try:
|
||||
response = await request_future
|
||||
|
||||
status_code = response.status
|
||||
response_content = await response.content.read()
|
||||
charset = response.charset or "utf-8"
|
||||
decoded_content = response_content.decode(charset, "ignore")
|
||||
html_text = decoded_content
|
||||
|
||||
error = None
|
||||
if status_code == 0:
|
||||
error = CheckError("Connection lost")
|
||||
|
||||
logger.debug(html_text)
|
||||
|
||||
except asyncio.TimeoutError as e:
|
||||
error = CheckError("Request timeout", str(e))
|
||||
except ClientConnectorError as e:
|
||||
error = CheckError("Connecting failure", str(e))
|
||||
except ServerDisconnectedError as e:
|
||||
error = CheckError("Server disconnected", str(e))
|
||||
except aiohttp.http_exceptions.BadHttpMessage as e:
|
||||
error = CheckError("HTTP", str(e))
|
||||
except proxy_errors.ProxyError as e:
|
||||
error = CheckError("Proxy", str(e))
|
||||
except KeyboardInterrupt:
|
||||
error = CheckError("Interrupted")
|
||||
except Exception as e:
|
||||
# python-specific exceptions
|
||||
if sys.version_info.minor > 6 and (
|
||||
isinstance(e, ssl.SSLCertVerificationError) or isinstance(e, ssl.SSLError)
|
||||
):
|
||||
error = CheckError("SSL", str(e))
|
||||
else:
|
||||
logger.debug(e, exc_info=True)
|
||||
error = CheckError("Unexpected", str(e))
|
||||
|
||||
return str(html_text), status_code, error
|
||||
|
||||
|
||||
# TODO: move to separate class
|
||||
def detect_error_page(
|
||||
html_text, status_code, fail_flags, ignore_403
|
||||
) -> Optional[CheckError]:
|
||||
# Detect service restrictions such as a country restriction
|
||||
for flag, msg in fail_flags.items():
|
||||
if flag in html_text:
|
||||
return CheckError("Site-specific", msg)
|
||||
|
||||
# Detect common restrictions such as provider censorship and bot protection
|
||||
err = errors.detect(html_text)
|
||||
if err:
|
||||
return err
|
||||
|
||||
# Detect common site errors
|
||||
if status_code == 403 and not ignore_403:
|
||||
return CheckError("Access denied", "403 status code, use proxy/vpn")
|
||||
|
||||
elif status_code >= 500:
|
||||
return CheckError("Server", f"{status_code} status code")
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def debug_response_logging(url, html_text, status_code, check_error):
|
||||
with open("debug.log", "a") as f:
|
||||
status = status_code or "No response"
|
||||
f.write(f"url: {url}\nerror: {check_error}\nr: {status}\n")
|
||||
if html_text:
|
||||
f.write(f"code: {status}\nresponse: {str(html_text)}\n")
|
||||
|
||||
|
||||
def process_site_result(
|
||||
response, query_notify, logger, results_info: QueryResultWrapper, site: MaigretSite
|
||||
):
|
||||
if not response:
|
||||
return results_info
|
||||
|
||||
fulltags = site.tags
|
||||
|
||||
# Retrieve other site information again
|
||||
username = results_info["username"]
|
||||
is_parsing_enabled = results_info["parsing_enabled"]
|
||||
url = results_info.get("url_user")
|
||||
logger.info(url)
|
||||
|
||||
status = results_info.get("status")
|
||||
if status is not None:
|
||||
# We have already determined the user doesn't exist here
|
||||
return results_info
|
||||
|
||||
# Get the expected check type
|
||||
check_type = site.check_type
|
||||
|
||||
# TODO: refactor
|
||||
if not response:
|
||||
logger.error(f"No response for {site.name}")
|
||||
return results_info
|
||||
|
||||
html_text, status_code, check_error = response
|
||||
|
||||
# TODO: add elapsed request time counting
|
||||
response_time = None
|
||||
|
||||
if logger.level == logging.DEBUG:
|
||||
debug_response_logging(url, html_text, status_code, check_error)
|
||||
|
||||
# additional check for errors
|
||||
if status_code and not check_error:
|
||||
check_error = detect_error_page(
|
||||
html_text, status_code, site.errors_dict, site.ignore403
|
||||
)
|
||||
|
||||
# parsing activation
|
||||
is_need_activation = any(
|
||||
[s for s in site.activation.get("marks", []) if s in html_text]
|
||||
)
|
||||
|
||||
if site.activation and html_text and is_need_activation:
|
||||
method = site.activation["method"]
|
||||
try:
|
||||
activate_fun = getattr(ParsingActivator(), method)
|
||||
# TODO: async call
|
||||
activate_fun(site, logger)
|
||||
except AttributeError:
|
||||
logger.warning(
|
||||
f"Activation method {method} for site {site.name} not found!"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.warning(
|
||||
f"Failed activation {method} for site {site.name}: {str(e)}",
|
||||
exc_info=True,
|
||||
)
|
||||
# TODO: temporary check error
|
||||
|
||||
site_name = site.pretty_name
|
||||
# presense flags
|
||||
# True by default
|
||||
presense_flags = site.presense_strs
|
||||
is_presense_detected = False
|
||||
|
||||
if html_text:
|
||||
if not presense_flags:
|
||||
is_presense_detected = True
|
||||
site.stats["presense_flag"] = None
|
||||
else:
|
||||
for presense_flag in presense_flags:
|
||||
if presense_flag in html_text:
|
||||
is_presense_detected = True
|
||||
site.stats["presense_flag"] = presense_flag
|
||||
logger.debug(presense_flag)
|
||||
break
|
||||
|
||||
def build_result(status, **kwargs):
|
||||
return QueryResult(
|
||||
username,
|
||||
site_name,
|
||||
url,
|
||||
status,
|
||||
query_time=response_time,
|
||||
tags=fulltags,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if check_error:
|
||||
logger.warning(check_error)
|
||||
result = QueryResult(
|
||||
username,
|
||||
site_name,
|
||||
url,
|
||||
QueryStatus.UNKNOWN,
|
||||
query_time=response_time,
|
||||
error=check_error,
|
||||
context=str(CheckError),
|
||||
tags=fulltags,
|
||||
)
|
||||
elif check_type == "message":
|
||||
# Checks if the error message is in the HTML
|
||||
is_absence_detected = any(
|
||||
[(absence_flag in html_text) for absence_flag in site.absence_strs]
|
||||
)
|
||||
if not is_absence_detected and is_presense_detected:
|
||||
result = build_result(QueryStatus.CLAIMED)
|
||||
else:
|
||||
result = build_result(QueryStatus.AVAILABLE)
|
||||
elif check_type == "status_code":
|
||||
# Checks if the status code of the response is 2XX
|
||||
if is_presense_detected and (not status_code >= 300 or status_code < 200):
|
||||
result = build_result(QueryStatus.CLAIMED)
|
||||
else:
|
||||
result = build_result(QueryStatus.AVAILABLE)
|
||||
elif check_type == "response_url":
|
||||
# For this detection method, we have turned off the redirect.
|
||||
# So, there is no need to check the response URL: it will always
|
||||
# match the request. Instead, we will ensure that the response
|
||||
# code indicates that the request was successful (i.e. no 404, or
|
||||
# forward to some odd redirect).
|
||||
if 200 <= status_code < 300 and is_presense_detected:
|
||||
result = build_result(QueryStatus.CLAIMED)
|
||||
else:
|
||||
result = build_result(QueryStatus.AVAILABLE)
|
||||
else:
|
||||
# It should be impossible to ever get here...
|
||||
raise ValueError(
|
||||
f"Unknown check type '{check_type}' for " f"site '{site.name}'"
|
||||
)
|
||||
|
||||
extracted_ids_data = {}
|
||||
|
||||
if is_parsing_enabled and result.status == QueryStatus.CLAIMED:
|
||||
try:
|
||||
extracted_ids_data = extract(html_text)
|
||||
except Exception as e:
|
||||
logger.warning(f"Error while parsing {site.name}: {e}", exc_info=True)
|
||||
|
||||
if extracted_ids_data:
|
||||
new_usernames = {}
|
||||
for k, v in extracted_ids_data.items():
|
||||
if "username" in k:
|
||||
new_usernames[v] = "username"
|
||||
if k in SUPPORTED_IDS:
|
||||
new_usernames[v] = k
|
||||
|
||||
results_info["ids_usernames"] = new_usernames
|
||||
links = eval(extracted_ids_data.get("links", "[]"))
|
||||
if "website" in extracted_ids_data:
|
||||
links.append(extracted_ids_data["website"])
|
||||
results_info["ids_links"] = links
|
||||
result.ids_data = extracted_ids_data
|
||||
|
||||
# Save status of request
|
||||
results_info["status"] = result
|
||||
|
||||
# Save results from request
|
||||
results_info["http_status"] = status_code
|
||||
results_info["is_similar"] = site.similar_search
|
||||
# results_site['response_text'] = html_text
|
||||
results_info["rank"] = site.alexa_rank
|
||||
return results_info
|
||||
|
||||
|
||||
def make_site_result(
|
||||
site: MaigretSite, username: str, options: QueryOptions, logger
|
||||
) -> QueryResultWrapper:
|
||||
results_site: QueryResultWrapper = {}
|
||||
|
||||
# Record URL of main site and username
|
||||
results_site["site"] = site
|
||||
results_site["username"] = username
|
||||
results_site["parsing_enabled"] = options["parsing"]
|
||||
results_site["url_main"] = site.url_main
|
||||
results_site["cookies"] = (
|
||||
options.get("cookie_jar")
|
||||
and options["cookie_jar"].filter_cookies(site.url_main)
|
||||
or None
|
||||
)
|
||||
|
||||
headers = {
|
||||
"User-Agent": get_random_user_agent(),
|
||||
}
|
||||
|
||||
headers.update(site.headers)
|
||||
|
||||
if "url" not in site.__dict__:
|
||||
logger.error("No URL for site %s", site.name)
|
||||
|
||||
# URL of user on site (if it exists)
|
||||
url = site.url.format(
|
||||
urlMain=site.url_main, urlSubpath=site.url_subpath, username=quote(username)
|
||||
)
|
||||
|
||||
# workaround to prevent slash errors
|
||||
url = re.sub("(?<!:)/+", "/", url)
|
||||
|
||||
session = options['session']
|
||||
|
||||
# site check is disabled
|
||||
if site.disabled and not options['forced']:
|
||||
logger.debug(f"Site {site.name} is disabled, skipping...")
|
||||
results_site["status"] = QueryResult(
|
||||
username,
|
||||
site.name,
|
||||
url,
|
||||
QueryStatus.ILLEGAL,
|
||||
error=CheckError("Check is disabled"),
|
||||
)
|
||||
# current username type could not be applied
|
||||
elif site.type != options["id_type"]:
|
||||
results_site["status"] = QueryResult(
|
||||
username,
|
||||
site.name,
|
||||
url,
|
||||
QueryStatus.ILLEGAL,
|
||||
error=CheckError('Unsupported identifier type', f'Want "{site.type}"'),
|
||||
)
|
||||
# username is not allowed.
|
||||
elif site.regex_check and re.search(site.regex_check, username) is None:
|
||||
results_site["status"] = QueryResult(
|
||||
username,
|
||||
site.name,
|
||||
url,
|
||||
QueryStatus.ILLEGAL,
|
||||
error=CheckError(
|
||||
'Unsupported username format', f'Want "{site.regex_check}"'
|
||||
),
|
||||
)
|
||||
results_site["url_user"] = ""
|
||||
results_site["http_status"] = ""
|
||||
results_site["response_text"] = ""
|
||||
# query_notify.update(results_site["status"])
|
||||
else:
|
||||
# URL of user on site (if it exists)
|
||||
results_site["url_user"] = url
|
||||
url_probe = site.url_probe
|
||||
if url_probe is None:
|
||||
# Probe URL is normal one seen by people out on the web.
|
||||
url_probe = url
|
||||
else:
|
||||
# There is a special URL for probing existence separate
|
||||
# from where the user profile normally can be found.
|
||||
url_probe = url_probe.format(
|
||||
urlMain=site.url_main,
|
||||
urlSubpath=site.url_subpath,
|
||||
username=username,
|
||||
)
|
||||
|
||||
for k, v in site.get_params.items():
|
||||
url_probe += f"&{k}={v}"
|
||||
|
||||
if site.check_type == "status_code" and site.request_head_only:
|
||||
# In most cases when we are detecting by status code,
|
||||
# it is not necessary to get the entire body: we can
|
||||
# detect fine with just the HEAD response.
|
||||
request_method = session.head
|
||||
else:
|
||||
# Either this detect method needs the content associated
|
||||
# with the GET response, or this specific website will
|
||||
# not respond properly unless we request the whole page.
|
||||
request_method = session.get
|
||||
|
||||
if site.check_type == "response_url":
|
||||
# Site forwards request to a different URL if username not
|
||||
# found. Disallow the redirect so we can capture the
|
||||
# http status from the original URL request.
|
||||
allow_redirects = False
|
||||
else:
|
||||
# Allow whatever redirect that the site wants to do.
|
||||
# The final result of the request will be what is available.
|
||||
allow_redirects = True
|
||||
|
||||
future = request_method(
|
||||
url=url_probe,
|
||||
headers=headers,
|
||||
allow_redirects=allow_redirects,
|
||||
timeout=options['timeout'],
|
||||
)
|
||||
|
||||
# Store future request object in the results object
|
||||
results_site["future"] = future
|
||||
|
||||
return results_site
|
||||
|
||||
|
||||
async def check_site_for_username(
|
||||
site, username, options: QueryOptions, logger, query_notify, *args, **kwargs
|
||||
) -> Tuple[str, QueryResultWrapper]:
|
||||
default_result = make_site_result(site, username, options, logger)
|
||||
future = default_result.get("future")
|
||||
if not future:
|
||||
return site.name, default_result
|
||||
|
||||
response = await get_response(request_future=future, logger=logger)
|
||||
|
||||
response_result = process_site_result(
|
||||
response, query_notify, logger, default_result, site
|
||||
)
|
||||
|
||||
query_notify.update(response_result['status'], site.similar_search)
|
||||
|
||||
return site.name, response_result
|
||||
|
||||
|
||||
async def debug_ip_request(session, logger):
|
||||
future = session.get(url="https://icanhazip.com")
|
||||
ip, status, check_error = await get_response(future, logger)
|
||||
if ip:
|
||||
logger.debug(f"My IP is: {ip.strip()}")
|
||||
else:
|
||||
logger.debug(f"IP requesting {check_error.type}: {check_error.desc}")
|
||||
|
||||
|
||||
def get_failed_sites(results: Dict[str, QueryResultWrapper]) -> List[str]:
|
||||
sites = []
|
||||
for sitename, r in results.items():
|
||||
status = r.get('status', {})
|
||||
if status and status.error:
|
||||
if errors.is_permanent(status.error.type):
|
||||
continue
|
||||
sites.append(sitename)
|
||||
return sites
|
||||
|
||||
|
||||
async def maigret(
|
||||
username: str,
|
||||
site_dict: Dict[str, MaigretSite],
|
||||
logger,
|
||||
query_notify=None,
|
||||
proxy=None,
|
||||
timeout=None,
|
||||
is_parsing_enabled=False,
|
||||
id_type="username",
|
||||
debug=False,
|
||||
forced=False,
|
||||
max_connections=100,
|
||||
no_progressbar=False,
|
||||
cookies=None,
|
||||
retries=0,
|
||||
) -> QueryResultWrapper:
|
||||
"""Main search func
|
||||
|
||||
Checks for existence of username on certain sites.
|
||||
|
||||
Keyword Arguments:
|
||||
username -- Username string will be used for search.
|
||||
site_dict -- Dictionary containing sites data in MaigretSite objects.
|
||||
query_notify -- Object with base type of QueryNotify().
|
||||
This will be used to notify the caller about
|
||||
query results.
|
||||
logger -- Standard Python logger object.
|
||||
timeout -- Time in seconds to wait before timing out request.
|
||||
Default is no timeout.
|
||||
is_parsing_enabled -- Extract additional info from account pages.
|
||||
id_type -- Type of username to search.
|
||||
Default is 'username', see all supported here:
|
||||
https://github.com/soxoj/maigret/wiki/Supported-identifier-types
|
||||
max_connections -- Maximum number of concurrent connections allowed.
|
||||
Default is 100.
|
||||
no_progressbar -- Displaying of ASCII progressbar during scanner.
|
||||
cookies -- Filename of a cookie jar file to use for each request.
|
||||
|
||||
Return Value:
|
||||
Dictionary containing results from report. Key of dictionary is the name
|
||||
of the social network site, and the value is another dictionary with
|
||||
the following keys:
|
||||
url_main: URL of main site.
|
||||
url_user: URL of user on site (if account exists).
|
||||
status: QueryResult() object indicating results of test for
|
||||
account existence.
|
||||
http_status: HTTP status code of query which checked for existence on
|
||||
site.
|
||||
response_text: Text that came back from request. May be None if
|
||||
there was an HTTP error when checking for existence.
|
||||
"""
|
||||
|
||||
# notify caller that we are starting the query.
|
||||
if not query_notify:
|
||||
query_notify = Mock()
|
||||
|
||||
query_notify.start(username, id_type)
|
||||
|
||||
# make http client session
|
||||
connector = (
|
||||
ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
|
||||
)
|
||||
connector.verify_ssl = False
|
||||
|
||||
cookie_jar = None
|
||||
if cookies:
|
||||
logger.debug(f"Using cookies jar file {cookies}")
|
||||
cookie_jar = await import_aiohttp_cookies(cookies)
|
||||
|
||||
session = aiohttp.ClientSession(
|
||||
connector=connector, trust_env=True, cookie_jar=cookie_jar
|
||||
)
|
||||
|
||||
if logger.level == logging.DEBUG:
|
||||
await debug_ip_request(session, logger)
|
||||
|
||||
# setup parallel executor
|
||||
executor: Optional[AsyncExecutor] = None
|
||||
if no_progressbar:
|
||||
executor = AsyncioSimpleExecutor(logger=logger)
|
||||
else:
|
||||
executor = AsyncioProgressbarQueueExecutor(
|
||||
logger=logger, in_parallel=max_connections, timeout=timeout + 0.5
|
||||
)
|
||||
|
||||
# make options objects for all the requests
|
||||
options: QueryOptions = {}
|
||||
options["cookies"] = cookie_jar
|
||||
options["session"] = session
|
||||
options["parsing"] = is_parsing_enabled
|
||||
options["timeout"] = timeout
|
||||
options["id_type"] = id_type
|
||||
options["forced"] = forced
|
||||
|
||||
# results from analysis of all sites
|
||||
all_results: Dict[str, QueryResultWrapper] = {}
|
||||
|
||||
sites = list(site_dict.keys())
|
||||
|
||||
attempts = retries + 1
|
||||
while attempts:
|
||||
tasks_dict = {}
|
||||
|
||||
for sitename, site in site_dict.items():
|
||||
if sitename not in sites:
|
||||
continue
|
||||
default_result: QueryResultWrapper = {
|
||||
'site': site,
|
||||
'status': QueryResult(
|
||||
username,
|
||||
sitename,
|
||||
'',
|
||||
QueryStatus.UNKNOWN,
|
||||
error=CheckError('Request failed'),
|
||||
),
|
||||
}
|
||||
tasks_dict[sitename] = (
|
||||
check_site_for_username,
|
||||
[site, username, options, logger, query_notify],
|
||||
{'default': (sitename, default_result)},
|
||||
)
|
||||
|
||||
cur_results = await executor.run(tasks_dict.values())
|
||||
|
||||
# wait for executor timeout errors
|
||||
await asyncio.sleep(1)
|
||||
|
||||
all_results.update(cur_results)
|
||||
|
||||
sites = get_failed_sites(dict(cur_results))
|
||||
attempts -= 1
|
||||
|
||||
if not sites:
|
||||
break
|
||||
|
||||
if attempts:
|
||||
query_notify.warning(
|
||||
f'Restarting checks for {len(sites)} sites... ({attempts} attempts left)'
|
||||
)
|
||||
|
||||
# closing http client session
|
||||
await session.close()
|
||||
|
||||
# notify caller that all queries are finished
|
||||
query_notify.finish()
|
||||
|
||||
return all_results
|
||||
|
||||
|
||||
def timeout_check(value):
|
||||
"""Check Timeout Argument.
|
||||
|
||||
Checks timeout for validity.
|
||||
|
||||
Keyword Arguments:
|
||||
value -- Time in seconds to wait before timing out request.
|
||||
|
||||
Return Value:
|
||||
Floating point number representing the time (in seconds) that should be
|
||||
used for the timeout.
|
||||
|
||||
NOTE: Will raise an exception if the timeout in invalid.
|
||||
"""
|
||||
from argparse import ArgumentTypeError
|
||||
|
||||
try:
|
||||
timeout = float(value)
|
||||
except ValueError:
|
||||
raise ArgumentTypeError(f"Timeout '{value}' must be a number.")
|
||||
if timeout <= 0:
|
||||
raise ArgumentTypeError(f"Timeout '{value}' must be greater than 0.0s.")
|
||||
return timeout
|
||||
|
||||
|
||||
async def site_self_check(
|
||||
site: MaigretSite, logger, semaphore, db: MaigretDatabase, silent=False
|
||||
):
|
||||
changes = {
|
||||
"disabled": False,
|
||||
}
|
||||
|
||||
check_data = [
|
||||
(site.username_claimed, QueryStatus.CLAIMED),
|
||||
(site.username_unclaimed, QueryStatus.AVAILABLE),
|
||||
]
|
||||
|
||||
logger.info(f"Checking {site.name}...")
|
||||
|
||||
for username, status in check_data:
|
||||
async with semaphore:
|
||||
results_dict = await maigret(
|
||||
username=username,
|
||||
site_dict={site.name: site},
|
||||
logger=logger,
|
||||
timeout=30,
|
||||
id_type=site.type,
|
||||
forced=True,
|
||||
no_progressbar=True,
|
||||
retries=1,
|
||||
)
|
||||
|
||||
# don't disable entries with other ids types
|
||||
# TODO: make normal checking
|
||||
if site.name not in results_dict:
|
||||
logger.info(results_dict)
|
||||
changes["disabled"] = True
|
||||
continue
|
||||
|
||||
result = results_dict[site.name]["status"]
|
||||
|
||||
site_status = result.status
|
||||
|
||||
if site_status != status:
|
||||
if site_status == QueryStatus.UNKNOWN:
|
||||
msgs = site.absence_strs
|
||||
etype = site.check_type
|
||||
logger.warning(
|
||||
f"Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}"
|
||||
)
|
||||
# don't disable in case of available username
|
||||
if status == QueryStatus.CLAIMED:
|
||||
changes["disabled"] = True
|
||||
elif status == QueryStatus.CLAIMED:
|
||||
logger.warning(
|
||||
f"Not found `{username}` in {site.name}, must be claimed"
|
||||
)
|
||||
logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
else:
|
||||
logger.warning(f"Found `{username}` in {site.name}, must be available")
|
||||
logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
|
||||
logger.info(f"Site {site.name} checking is finished")
|
||||
|
||||
if changes["disabled"] != site.disabled:
|
||||
site.disabled = changes["disabled"]
|
||||
db.update_site(site)
|
||||
if not silent:
|
||||
action = "Disabled" if site.disabled else "Enabled"
|
||||
print(f"{action} site {site.name}...")
|
||||
|
||||
return changes
|
||||
|
||||
|
||||
async def self_check(
|
||||
db: MaigretDatabase, site_data: dict, logger, silent=False, max_connections=10
|
||||
) -> bool:
|
||||
sem = asyncio.Semaphore(max_connections)
|
||||
tasks = []
|
||||
all_sites = site_data
|
||||
|
||||
def disabled_count(lst):
|
||||
return len(list(filter(lambda x: x.disabled, lst)))
|
||||
|
||||
disabled_old_count = disabled_count(all_sites.values())
|
||||
|
||||
for _, site in all_sites.items():
|
||||
check_coro = site_self_check(site, logger, sem, db, silent)
|
||||
future = asyncio.ensure_future(check_coro)
|
||||
tasks.append(future)
|
||||
|
||||
for f in tqdm.asyncio.tqdm.as_completed(tasks):
|
||||
await f
|
||||
|
||||
disabled_new_count = disabled_count(all_sites.values())
|
||||
total_disabled = disabled_new_count - disabled_old_count
|
||||
|
||||
if total_disabled >= 0:
|
||||
message = "Disabled"
|
||||
else:
|
||||
message = "Enabled"
|
||||
total_disabled *= -1
|
||||
|
||||
if not silent:
|
||||
print(
|
||||
f"{message} {total_disabled} ({disabled_old_count} => {disabled_new_count}) checked sites. "
|
||||
"Run with `--info` flag to get more information"
|
||||
)
|
||||
|
||||
return total_disabled != 0
|
||||
@@ -0,0 +1,130 @@
|
||||
from typing import Dict, List, Any
|
||||
|
||||
from .result import QueryResult
|
||||
from .types import QueryResultWrapper
|
||||
|
||||
|
||||
# error got as a result of completed search query
|
||||
class CheckError:
|
||||
_type = 'Unknown'
|
||||
_desc = ''
|
||||
|
||||
def __init__(self, typename, desc=''):
|
||||
self._type = typename
|
||||
self._desc = desc
|
||||
|
||||
def __str__(self):
|
||||
if not self._desc:
|
||||
return f'{self._type} error'
|
||||
|
||||
return f'{self._type} error: {self._desc}'
|
||||
|
||||
@property
|
||||
def type(self):
|
||||
return self._type
|
||||
|
||||
@property
|
||||
def desc(self):
|
||||
return self._desc
|
||||
|
||||
|
||||
COMMON_ERRORS = {
|
||||
'<title>Attention Required! | Cloudflare</title>': CheckError(
|
||||
'Captcha', 'Cloudflare'
|
||||
),
|
||||
'Please stand by, while we are checking your browser': CheckError(
|
||||
'Bot protection', 'Cloudflare'
|
||||
),
|
||||
'<span data-translate="checking_browser">Checking your browser before accessing</span>': CheckError(
|
||||
'Bot protection', 'Cloudflare'
|
||||
),
|
||||
'This website is using a security service to protect itself from online attacks.': CheckError(
|
||||
'Access denied', 'Cloudflare'
|
||||
),
|
||||
'<title>Доступ ограничен</title>': CheckError('Censorship', 'Rostelecom'),
|
||||
'document.getElementById(\'validate_form_submit\').disabled=true': CheckError(
|
||||
'Captcha', 'Mail.ru'
|
||||
),
|
||||
'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': CheckError(
|
||||
'Bot protection', 'Blazingfast'
|
||||
),
|
||||
'404</h1><p class="error-card__description">Мы не нашли страницу': CheckError(
|
||||
'Resolving', 'MegaFon 404 page'
|
||||
),
|
||||
'Доступ к информационному ресурсу ограничен на основании Федерального закона': CheckError(
|
||||
'Censorship', 'MGTS'
|
||||
),
|
||||
'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
|
||||
'Сайт заблокирован хостинг-провайдером': CheckError(
|
||||
'Site-specific', 'Site is disabled (Beget)'
|
||||
),
|
||||
}
|
||||
|
||||
ERRORS_TYPES = {
|
||||
'Captcha': 'Try to switch to another IP address or to use service cookies',
|
||||
'Bot protection': 'Try to switch to another IP address',
|
||||
'Censorship': 'switch to another internet service provider',
|
||||
'Request timeout': 'Try to increase timeout or to switch to another internet service provider',
|
||||
}
|
||||
|
||||
# TODO: checking for reason
|
||||
ERRORS_REASONS = {
|
||||
'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)',
|
||||
}
|
||||
|
||||
TEMPORARY_ERRORS_TYPES = [
|
||||
'Request timeout',
|
||||
'Unknown',
|
||||
'Request failed',
|
||||
'Connecting failure',
|
||||
'HTTP',
|
||||
'Proxy',
|
||||
'Interrupted',
|
||||
'Connection lost',
|
||||
]
|
||||
|
||||
THRESHOLD = 3 # percent
|
||||
|
||||
|
||||
def is_important(err_data):
|
||||
return err_data['perc'] >= THRESHOLD
|
||||
|
||||
|
||||
def is_permanent(err_type):
|
||||
return err_type not in TEMPORARY_ERRORS_TYPES
|
||||
|
||||
|
||||
def detect(text):
|
||||
for flag, err in COMMON_ERRORS.items():
|
||||
if flag in text:
|
||||
return err
|
||||
return None
|
||||
|
||||
|
||||
def solution_of(err_type) -> str:
|
||||
return ERRORS_TYPES.get(err_type, '')
|
||||
|
||||
|
||||
def extract_and_group(search_res: QueryResultWrapper) -> List[Dict[str, Any]]:
|
||||
errors_counts: Dict[str, int] = {}
|
||||
for r in search_res.values():
|
||||
if r and isinstance(r, dict) and r.get('status'):
|
||||
if not isinstance(r['status'], QueryResult):
|
||||
continue
|
||||
|
||||
err = r['status'].error
|
||||
if not err:
|
||||
continue
|
||||
errors_counts[err.type] = errors_counts.get(err.type, 0) + 1
|
||||
|
||||
counts = []
|
||||
for err, count in sorted(errors_counts.items(), key=lambda x: x[1], reverse=True):
|
||||
counts.append(
|
||||
{
|
||||
'err': err,
|
||||
'count': count,
|
||||
'perc': round(count / len(search_res), 2) * 100,
|
||||
}
|
||||
)
|
||||
|
||||
return counts
|
||||
@@ -0,0 +1,118 @@
|
||||
import asyncio
|
||||
import time
|
||||
import tqdm
|
||||
import sys
|
||||
from typing import Iterable, Any, List
|
||||
|
||||
from .types import QueryDraft
|
||||
|
||||
|
||||
def create_task_func():
|
||||
if sys.version_info.minor > 6:
|
||||
create_asyncio_task = asyncio.create_task
|
||||
else:
|
||||
loop = asyncio.get_event_loop()
|
||||
create_asyncio_task = loop.create_task
|
||||
return create_asyncio_task
|
||||
|
||||
|
||||
class AsyncExecutor:
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.logger = kwargs['logger']
|
||||
|
||||
async def run(self, tasks: Iterable[QueryDraft]):
|
||||
start_time = time.time()
|
||||
results = await self._run(tasks)
|
||||
self.execution_time = time.time() - start_time
|
||||
self.logger.debug(f'Spent time: {self.execution_time}')
|
||||
return results
|
||||
|
||||
async def _run(self, tasks: Iterable[QueryDraft]):
|
||||
await asyncio.sleep(0)
|
||||
|
||||
|
||||
class AsyncioSimpleExecutor(AsyncExecutor):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
async def _run(self, tasks: Iterable[QueryDraft]):
|
||||
futures = [f(*args, **kwargs) for f, args, kwargs in tasks]
|
||||
return await asyncio.gather(*futures)
|
||||
|
||||
|
||||
class AsyncioProgressbarExecutor(AsyncExecutor):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
async def _run(self, tasks: Iterable[QueryDraft]):
|
||||
futures = [f(*args, **kwargs) for f, args, kwargs in tasks]
|
||||
results = []
|
||||
for f in tqdm.asyncio.tqdm.as_completed(futures):
|
||||
results.append(await f)
|
||||
return results
|
||||
|
||||
|
||||
class AsyncioProgressbarSemaphoreExecutor(AsyncExecutor):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.semaphore = asyncio.Semaphore(kwargs.get('in_parallel', 1))
|
||||
|
||||
async def _run(self, tasks: Iterable[QueryDraft]):
|
||||
async def _wrap_query(q: QueryDraft):
|
||||
async with self.semaphore:
|
||||
f, args, kwargs = q
|
||||
return await f(*args, **kwargs)
|
||||
|
||||
async def semaphore_gather(tasks: Iterable[QueryDraft]):
|
||||
coros = [_wrap_query(q) for q in tasks]
|
||||
results = []
|
||||
for f in tqdm.asyncio.tqdm.as_completed(coros):
|
||||
results.append(await f)
|
||||
return results
|
||||
|
||||
return await semaphore_gather(tasks)
|
||||
|
||||
|
||||
class AsyncioProgressbarQueueExecutor(AsyncExecutor):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.workers_count = kwargs.get('in_parallel', 10)
|
||||
self.progress_func = kwargs.get('progress_func', tqdm.tqdm)
|
||||
self.queue = asyncio.Queue(self.workers_count)
|
||||
self.timeout = kwargs.get('timeout')
|
||||
|
||||
async def worker(self):
|
||||
while True:
|
||||
try:
|
||||
f, args, kwargs = self.queue.get_nowait()
|
||||
except asyncio.QueueEmpty:
|
||||
return
|
||||
|
||||
query_future = f(*args, **kwargs)
|
||||
query_task = create_task_func()(query_future)
|
||||
try:
|
||||
result = await asyncio.wait_for(query_task, timeout=self.timeout)
|
||||
except asyncio.TimeoutError:
|
||||
result = kwargs.get('default')
|
||||
|
||||
self.results.append(result)
|
||||
self.progress.update(1)
|
||||
self.queue.task_done()
|
||||
|
||||
async def _run(self, queries: Iterable[QueryDraft]):
|
||||
self.results: List[Any] = []
|
||||
|
||||
queries_list = list(queries)
|
||||
|
||||
min_workers = min(len(queries_list), self.workers_count)
|
||||
|
||||
workers = [create_task_func()(self.worker()) for _ in range(min_workers)]
|
||||
|
||||
self.progress = self.progress_func(total=len(queries_list))
|
||||
for t in queries_list:
|
||||
await self.queue.put(t)
|
||||
await self.queue.join()
|
||||
for w in workers:
|
||||
w.cancel()
|
||||
self.progress.close()
|
||||
return self.results
|
||||
+492
-785
File diff suppressed because it is too large
Load Diff
+84
-77
@@ -4,12 +4,14 @@ This module defines the objects for notifying the caller about the
|
||||
results of queries.
|
||||
"""
|
||||
import sys
|
||||
|
||||
from colorama import Fore, Style, init
|
||||
|
||||
from .result import QueryStatus
|
||||
from .utils import get_dict_ascii_tree
|
||||
|
||||
|
||||
class QueryNotify():
|
||||
class QueryNotify:
|
||||
"""Query Notify Object.
|
||||
|
||||
Base class that describes methods available to notify the results of
|
||||
@@ -37,7 +39,7 @@ class QueryNotify():
|
||||
|
||||
return
|
||||
|
||||
def start(self, message=None, id_type='username'):
|
||||
def start(self, message=None, id_type="username"):
|
||||
"""Notify Start.
|
||||
|
||||
Notify method for start of query. This method will be called before
|
||||
@@ -114,8 +116,14 @@ class QueryNotifyPrint(QueryNotify):
|
||||
Query notify class that prints results.
|
||||
"""
|
||||
|
||||
def __init__(self, result=None, verbose=False, print_found_only=False,
|
||||
skip_check_errors=False, color=True):
|
||||
def __init__(
|
||||
self,
|
||||
result=None,
|
||||
verbose=False,
|
||||
print_found_only=False,
|
||||
skip_check_errors=False,
|
||||
color=True,
|
||||
):
|
||||
"""Create Query Notify Print Object.
|
||||
|
||||
Contains information about a specific method of notifying the results
|
||||
@@ -144,6 +152,27 @@ class QueryNotifyPrint(QueryNotify):
|
||||
|
||||
return
|
||||
|
||||
def make_colored_terminal_notify(
|
||||
self, status, text, status_color, text_color, appendix
|
||||
):
|
||||
text = [
|
||||
f"{Style.BRIGHT}{Fore.WHITE}[{status_color}{status}{Fore.WHITE}]"
|
||||
+ f"{text_color} {text}: {Style.RESET_ALL}"
|
||||
+ f"{appendix}"
|
||||
]
|
||||
return "".join(text)
|
||||
|
||||
def make_simple_terminal_notify(
|
||||
self, status, text, status_color, text_color, appendix
|
||||
):
|
||||
return f"[{status}] {text}: {appendix}"
|
||||
|
||||
def make_terminal_notify(self, *args):
|
||||
if self.color:
|
||||
return self.make_colored_terminal_notify(*args)
|
||||
else:
|
||||
return self.make_simple_terminal_notify(*args)
|
||||
|
||||
def start(self, message, id_type):
|
||||
"""Notify Start.
|
||||
|
||||
@@ -160,38 +189,29 @@ class QueryNotifyPrint(QueryNotify):
|
||||
|
||||
title = f"Checking {id_type}"
|
||||
if self.color:
|
||||
print(Style.BRIGHT + Fore.GREEN + "[" +
|
||||
Fore.YELLOW + "*" +
|
||||
Fore.GREEN + f"] {title}" +
|
||||
Fore.WHITE + f" {message}" +
|
||||
Fore.GREEN + " on:")
|
||||
print(
|
||||
Style.BRIGHT
|
||||
+ Fore.GREEN
|
||||
+ "["
|
||||
+ Fore.YELLOW
|
||||
+ "*"
|
||||
+ Fore.GREEN
|
||||
+ f"] {title}"
|
||||
+ Fore.WHITE
|
||||
+ f" {message}"
|
||||
+ Fore.GREEN
|
||||
+ " on:"
|
||||
)
|
||||
else:
|
||||
print(f"[*] {title} {message} on:")
|
||||
|
||||
def warning(self, message, symbol='-'):
|
||||
msg = f'[{symbol}] {message}'
|
||||
def warning(self, message, symbol="-"):
|
||||
msg = f"[{symbol}] {message}"
|
||||
if self.color:
|
||||
print(Style.BRIGHT + Fore.YELLOW + msg)
|
||||
else:
|
||||
print(msg)
|
||||
|
||||
def get_additional_data_text(self, items, prepend=''):
|
||||
text = ''
|
||||
for num, item in enumerate(items):
|
||||
box_symbol = '┣╸' if num != len(items) - 1 else '┗╸'
|
||||
|
||||
if type(item) == tuple:
|
||||
field_name, field_value = item
|
||||
if field_value.startswith('[\''):
|
||||
is_last_item = num == len(items) - 1
|
||||
prepend_symbols = ' ' * 3 if is_last_item else ' ┃ '
|
||||
field_value = self.get_additional_data_text(eval(field_value), prepend_symbols)
|
||||
text += f'\n{prepend}{box_symbol}{field_name}: {field_value}'
|
||||
else:
|
||||
text += f'\n{prepend}{box_symbol} {item}'
|
||||
|
||||
return text
|
||||
|
||||
def update(self, result, is_similar=False):
|
||||
"""Notify Update.
|
||||
|
||||
@@ -205,77 +225,64 @@ class QueryNotifyPrint(QueryNotify):
|
||||
Return Value:
|
||||
Nothing.
|
||||
"""
|
||||
notify = None
|
||||
self.result = result
|
||||
|
||||
if not self.result.ids_data:
|
||||
ids_data_text = ""
|
||||
else:
|
||||
ids_data_text = self.get_additional_data_text(self.result.ids_data.items(), ' ')
|
||||
|
||||
def make_colored_terminal_notify(status, text, status_color, text_color, appendix):
|
||||
text = [
|
||||
f'{Style.BRIGHT}{Fore.WHITE}[{status_color}{status}{Fore.WHITE}]' +
|
||||
f'{text_color} {text}: {Style.RESET_ALL}' +
|
||||
f'{appendix}'
|
||||
]
|
||||
return ''.join(text)
|
||||
|
||||
def make_simple_terminal_notify(status, text, appendix):
|
||||
return f'[{status}] {text}: {appendix}'
|
||||
|
||||
def make_terminal_notify(is_colored=True, *args):
|
||||
if is_colored:
|
||||
return make_colored_terminal_notify(*args)
|
||||
else:
|
||||
return make_simple_terminal_notify(*args)
|
||||
|
||||
notify = None
|
||||
ids_data_text = ""
|
||||
if self.result.ids_data:
|
||||
ids_data_text = get_dict_ascii_tree(self.result.ids_data.items(), " ")
|
||||
|
||||
# Output to the terminal is desired.
|
||||
if result.status == QueryStatus.CLAIMED:
|
||||
color = Fore.BLUE if is_similar else Fore.GREEN
|
||||
status = '?' if is_similar else '+'
|
||||
notify = make_terminal_notify(
|
||||
self.color,
|
||||
status, result.site_name,
|
||||
color, color,
|
||||
result.site_url_user + ids_data_text
|
||||
status = "?" if is_similar else "+"
|
||||
notify = self.make_terminal_notify(
|
||||
status,
|
||||
result.site_name,
|
||||
color,
|
||||
color,
|
||||
result.site_url_user + ids_data_text,
|
||||
)
|
||||
elif result.status == QueryStatus.AVAILABLE:
|
||||
if not self.print_found_only:
|
||||
notify = make_terminal_notify(
|
||||
self.color,
|
||||
'-', result.site_name,
|
||||
Fore.RED, Fore.YELLOW,
|
||||
'Not found!' + ids_data_text
|
||||
notify = self.make_terminal_notify(
|
||||
"-",
|
||||
result.site_name,
|
||||
Fore.RED,
|
||||
Fore.YELLOW,
|
||||
"Not found!" + ids_data_text,
|
||||
)
|
||||
elif result.status == QueryStatus.UNKNOWN:
|
||||
if not self.skip_check_errors:
|
||||
notify = make_terminal_notify(
|
||||
self.color,
|
||||
'?', result.site_name,
|
||||
Fore.RED, Fore.RED,
|
||||
self.result.context + ids_data_text
|
||||
notify = self.make_terminal_notify(
|
||||
"?",
|
||||
result.site_name,
|
||||
Fore.RED,
|
||||
Fore.RED,
|
||||
str(self.result.error) + ids_data_text,
|
||||
)
|
||||
elif result.status == QueryStatus.ILLEGAL:
|
||||
if not self.print_found_only:
|
||||
text = 'Illegal Username Format For This Site!'
|
||||
notify = make_terminal_notify(
|
||||
self.color,
|
||||
'-', result.site_name,
|
||||
Fore.RED, Fore.YELLOW,
|
||||
text + ids_data_text
|
||||
text = "Illegal Username Format For This Site!"
|
||||
notify = self.make_terminal_notify(
|
||||
"-",
|
||||
result.site_name,
|
||||
Fore.RED,
|
||||
Fore.YELLOW,
|
||||
text + ids_data_text,
|
||||
)
|
||||
else:
|
||||
# It should be impossible to ever get here...
|
||||
raise ValueError(f"Unknown Query Status '{str(result.status)}' for "
|
||||
f"site '{self.result.site_name}'")
|
||||
raise ValueError(
|
||||
f"Unknown Query Status '{str(result.status)}' for "
|
||||
f"site '{self.result.site_name}'"
|
||||
)
|
||||
|
||||
if notify:
|
||||
sys.stdout.write('\x1b[1K\r')
|
||||
sys.stdout.write("\x1b[1K\r")
|
||||
print(notify)
|
||||
|
||||
return
|
||||
return notify
|
||||
|
||||
def __str__(self):
|
||||
"""Convert Object To String.
|
||||
|
||||
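For orientation, a minimal sketch of how the refactored notify methods above are meant to be called (assuming the class lives in maigret.notify; the status, site name, and URL values are illustrative):

from colorama import Fore
from maigret.notify import QueryNotifyPrint

# color=False makes make_terminal_notify dispatch to make_simple_terminal_notify
notifier = QueryNotifyPrint(color=False)
line = notifier.make_terminal_notify(
    "+", "GitHub", Fore.GREEN, Fore.GREEN, "https://github.com/soxoj"
)
print(line)  # [+] GitHub: https://github.com/soxoj
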
+180
-128
@@ -1,79 +1,100 @@
|
||||
import csv
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any
|
||||
|
||||
import pycountry
|
||||
import xmind
|
||||
from datetime import datetime
|
||||
from dateutil.parser import parse as parse_datetime_str
|
||||
from jinja2 import Template
|
||||
from xhtml2pdf import pisa
|
||||
from dateutil.parser import parse as parse_datetime_str
|
||||
|
||||
from .result import QueryStatus
|
||||
from .utils import is_country_tag, CaseConverter, enrich_link_str
|
||||
|
||||
SUPPORTED_JSON_REPORT_FORMATS = [
|
||||
"simple",
|
||||
"ndjson",
|
||||
]
|
||||
|
||||
'''
|
||||
"""
|
||||
UTILS
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def filter_supposed_data(data):
|
||||
### interesting fields
|
||||
allowed_fields = ['fullname', 'gender', 'location', 'age']
|
||||
filtered_supposed_data = {CaseConverter.snake_to_title(k): v[0]
|
||||
for k, v in data.items()
|
||||
if k in allowed_fields}
|
||||
# interesting fields
|
||||
allowed_fields = ["fullname", "gender", "location", "age"]
|
||||
filtered_supposed_data = {
|
||||
CaseConverter.snake_to_title(k): v[0]
|
||||
for k, v in data.items()
|
||||
if k in allowed_fields
|
||||
}
|
||||
return filtered_supposed_data
|
||||
|
||||
|
||||
'''
|
||||
"""
|
||||
REPORTS SAVING
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def save_csv_report(filename: str, username: str, results: dict):
|
||||
with open(filename, 'w', newline='', encoding='utf-8') as f:
|
||||
with open(filename, "w", newline="", encoding="utf-8") as f:
|
||||
generate_csv_report(username, results, f)
|
||||
|
||||
|
||||
def save_txt_report(filename: str, username: str, results: dict):
|
||||
with open(filename, 'w', encoding='utf-8') as f:
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
generate_txt_report(username, results, f)
|
||||
|
||||
|
||||
def save_html_report(filename: str, context: dict):
|
||||
template, _ = generate_report_template(is_pdf=False)
|
||||
filled_template = template.render(**context)
|
||||
with open(filename, 'w') as f:
|
||||
with open(filename, "w") as f:
|
||||
f.write(filled_template)
|
||||
|
||||
|
||||
def save_pdf_report(filename: str, context: dict):
|
||||
template, css = generate_report_template(is_pdf=True)
|
||||
filled_template = template.render(**context)
|
||||
with open(filename, 'w+b') as f:
|
||||
with open(filename, "w+b") as f:
|
||||
pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css)
|
||||
|
||||
|
||||
'''
|
||||
def save_json_report(filename: str, username: str, results: dict, report_type: str):
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
generate_json_report(username, results, f, report_type=report_type)
|
||||
|
||||
|
||||
"""
|
||||
REPORTS GENERATING
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def generate_report_template(is_pdf: bool):
|
||||
"""
|
||||
HTML/PDF template generation
|
||||
HTML/PDF template generation
|
||||
"""
|
||||
|
||||
def get_resource_content(filename):
|
||||
return open(os.path.join(maigret_path, 'resources', filename)).read()
|
||||
return open(os.path.join(maigret_path, "resources", filename)).read()
|
||||
|
||||
maigret_path = os.path.dirname(os.path.realpath(__file__))
|
||||
|
||||
if is_pdf:
|
||||
template_content = get_resource_content('simple_report_pdf.tpl')
|
||||
css_content = get_resource_content('simple_report_pdf.css')
|
||||
template_content = get_resource_content("simple_report_pdf.tpl")
|
||||
css_content = get_resource_content("simple_report_pdf.css")
|
||||
else:
|
||||
template_content = get_resource_content('simple_report.tpl')
|
||||
template_content = get_resource_content("simple_report.tpl")
|
||||
css_content = None
|
||||
|
||||
template = Template(template_content)
|
||||
template.globals['title'] = CaseConverter.snake_to_title
|
||||
template.globals['detect_link'] = enrich_link_str
|
||||
template.globals["title"] = CaseConverter.snake_to_title # type: ignore
|
||||
template.globals["detect_link"] = enrich_link_str # type: ignore
|
||||
return template, css_content
|
||||
|
||||
|
||||
@@ -81,15 +102,15 @@ def generate_report_context(username_results: list):
|
||||
brief_text = []
|
||||
usernames = {}
|
||||
extended_info_count = 0
|
||||
tags = {}
|
||||
supposed_data = {}
|
||||
tags: Dict[str, int] = {}
|
||||
supposed_data: Dict[str, Any] = {}
|
||||
|
||||
first_seen = None
|
||||
|
||||
for username, id_type, results in username_results:
|
||||
found_accounts = 0
|
||||
new_ids = []
|
||||
usernames[username] = {'type': id_type}
|
||||
usernames[username] = {"type": id_type}
|
||||
|
||||
for website_name in results:
|
||||
dictionary = results[website_name]
|
||||
@@ -97,16 +118,19 @@ def generate_report_context(username_results: list):
|
||||
if not dictionary:
|
||||
continue
|
||||
|
||||
if dictionary.get('is_similar'):
|
||||
if dictionary.get("is_similar"):
|
||||
continue
|
||||
|
||||
status = dictionary.get("status")
|
||||
if not status: # FIXME: currently in case of timeout
|
||||
continue
|
||||
|
||||
status = dictionary.get('status')
|
||||
if status.ids_data:
|
||||
dictionary['ids_data'] = status.ids_data
|
||||
dictionary["ids_data"] = status.ids_data
|
||||
extended_info_count += 1
|
||||
|
||||
# detect first seen
|
||||
created_at = status.ids_data.get('created_at')
|
||||
created_at = status.ids_data.get("created_at")
|
||||
if created_at:
|
||||
if first_seen is None:
|
||||
first_seen = created_at
|
||||
@@ -116,37 +140,46 @@ def generate_report_context(username_results: list):
|
||||
new_time = parse_datetime_str(created_at)
|
||||
if new_time < known_time:
|
||||
first_seen = created_at
|
||||
except:
|
||||
logging.debug('Problems with converting datetime %s/%s', first_seen, created_at)
|
||||
except Exception as e:
|
||||
logging.debug(
|
||||
"Problems with converting datetime %s/%s: %s",
|
||||
first_seen,
|
||||
created_at,
|
||||
str(e),
|
||||
)
|
||||
|
||||
for k, v in status.ids_data.items():
|
||||
# suppose target data
|
||||
field = 'fullname' if k == 'name' else k
|
||||
if not field in supposed_data:
|
||||
field = "fullname" if k == "name" else k
|
||||
if field not in supposed_data:
|
||||
supposed_data[field] = []
|
||||
supposed_data[field].append(v)
|
||||
# suppose country
|
||||
if k in ['country', 'locale']:
|
||||
if k in ["country", "locale"]:
|
||||
try:
|
||||
if is_country_tag(k):
|
||||
tag = pycountry.countries.get(alpha_2=v).alpha_2.lower()
|
||||
else:
|
||||
tag = pycountry.countries.search_fuzzy(v)[0].alpha_2.lower()
|
||||
tag = pycountry.countries.search_fuzzy(v)[
|
||||
0
|
||||
].alpha_2.lower()
|
||||
# TODO: move countries to another struct
|
||||
tags[tag] = tags.get(tag, 0) + 1
|
||||
except Exception as e:
|
||||
logging.debug('pycountry exception', exc_info=True)
|
||||
logging.debug(
|
||||
"Pycountry exception: %s", str(e), exc_info=True
|
||||
)
|
||||
|
||||
new_usernames = dictionary.get('ids_usernames')
|
||||
new_usernames = dictionary.get("ids_usernames")
|
||||
if new_usernames:
|
||||
for u, utype in new_usernames.items():
|
||||
if not u in usernames:
|
||||
if u not in usernames:
|
||||
new_ids.append((u, utype))
|
||||
usernames[u] = {'type': utype}
|
||||
usernames[u] = {"type": utype}
|
||||
|
||||
if status.status == QueryStatus.CLAIMED:
|
||||
found_accounts += 1
|
||||
dictionary['found'] = True
|
||||
dictionary["found"] = True
|
||||
else:
|
||||
continue
|
||||
|
||||
@@ -155,25 +188,24 @@ def generate_report_context(username_results: list):
|
||||
for t in status.tags:
|
||||
tags[t] = tags.get(t, 0) + 1
|
||||
|
||||
|
||||
brief_text.append(f'Search by {id_type} {username} returned {found_accounts} accounts.')
|
||||
brief_text.append(
|
||||
f"Search by {id_type} {username} returned {found_accounts} accounts."
|
||||
)
|
||||
|
||||
if new_ids:
|
||||
ids_list = []
|
||||
for u, t in new_ids:
|
||||
ids_list.append(f'{u} ({t})' if t != 'username' else u)
|
||||
brief_text.append(f'Found target\'s other IDs: ' + ', '.join(ids_list) + '.')
|
||||
ids_list.append(f"{u} ({t})" if t != "username" else u)
|
||||
brief_text.append("Found target's other IDs: " + ", ".join(ids_list) + ".")
|
||||
|
||||
brief_text.append(f'Extended info extracted from {extended_info_count} accounts.')
|
||||
brief_text.append(f"Extended info extracted from {extended_info_count} accounts.")
|
||||
|
||||
|
||||
|
||||
brief = ' '.join(brief_text).strip()
|
||||
brief = " ".join(brief_text).strip()
|
||||
tuple_sort = lambda d: sorted(d, key=lambda x: x[1], reverse=True)
|
||||
|
||||
if 'global' in tags:
|
||||
if "global" in tags:
|
||||
# remove tag 'global' useless for country detection
|
||||
del tags['global']
|
||||
del tags["global"]
|
||||
|
||||
first_username = username_results[0][0]
|
||||
countries_lists = list(filter(lambda x: is_country_tag(x[0]), tags.items()))
|
||||
@@ -182,35 +214,33 @@ def generate_report_context(username_results: list):
|
||||
filtered_supposed_data = filter_supposed_data(supposed_data)
|
||||
|
||||
return {
|
||||
'username': first_username,
|
||||
'brief': brief,
|
||||
'results': username_results,
|
||||
'first_seen': first_seen,
|
||||
'interests_tuple_list': tuple_sort(interests_list),
|
||||
'countries_tuple_list': tuple_sort(countries_lists),
|
||||
'supposed_data': filtered_supposed_data,
|
||||
'generated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
|
||||
"username": first_username,
|
||||
"brief": brief,
|
||||
"results": username_results,
|
||||
"first_seen": first_seen,
|
||||
"interests_tuple_list": tuple_sort(interests_list),
|
||||
"countries_tuple_list": tuple_sort(countries_lists),
|
||||
"supposed_data": filtered_supposed_data,
|
||||
"generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
}
|
||||
|
||||
|
||||
def generate_csv_report(username: str, results: dict, csvfile):
|
||||
writer = csv.writer(csvfile)
|
||||
writer.writerow(['username',
|
||||
'name',
|
||||
'url_main',
|
||||
'url_user',
|
||||
'exists',
|
||||
'http_status'
|
||||
]
|
||||
)
|
||||
writer.writerow(
|
||||
["username", "name", "url_main", "url_user", "exists", "http_status"]
|
||||
)
|
||||
for site in results:
|
||||
writer.writerow([username,
|
||||
site,
|
||||
results[site]['url_main'],
|
||||
results[site]['url_user'],
|
||||
str(results[site]['status'].status),
|
||||
results[site]['http_status'],
|
||||
])
|
||||
writer.writerow(
|
||||
[
|
||||
username,
|
||||
site,
|
||||
results[site]["url_main"],
|
||||
results[site]["url_user"],
|
||||
str(results[site]["status"].status),
|
||||
results[site]["http_status"],
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def generate_txt_report(username: str, results: dict, file):
|
||||
@@ -223,28 +253,65 @@ def generate_txt_report(username: str, results: dict, file):
|
||||
if dictionary.get("status").status == QueryStatus.CLAIMED:
|
||||
exists_counter += 1
|
||||
file.write(dictionary["url_user"] + "\n")
|
||||
file.write(f'Total Websites Username Detected On : {exists_counter}')
|
||||
file.write(f"Total Websites Username Detected On : {exists_counter}")
|
||||
|
||||
'''
|
||||
|
||||
def generate_json_report(username: str, results: dict, file, report_type):
|
||||
is_report_per_line = report_type.startswith("ndjson")
|
||||
all_json = {}
|
||||
|
||||
for sitename in results:
|
||||
site_result = results[sitename]
|
||||
# TODO: fix no site data issue
|
||||
if not site_result or site_result.get("status").status != QueryStatus.CLAIMED:
|
||||
continue
|
||||
|
||||
data = dict(site_result)
|
||||
data["status"] = data["status"].json()
|
||||
data["site"] = data["site"].json
|
||||
if "future" in data:
|
||||
del data["future"]
|
||||
|
||||
if is_report_per_line:
|
||||
data["sitename"] = sitename
|
||||
file.write(json.dumps(data) + "\n")
|
||||
else:
|
||||
all_json[sitename] = data
|
||||
|
||||
if not is_report_per_line:
|
||||
file.write(json.dumps(all_json))
|
||||
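A brief usage sketch for the JSON report helpers above (the results dict is a placeholder for the structure produced by a finished search):

# "simple" writes one JSON object, "ndjson" writes one JSON line per claimed site
save_json_report("report_alex_ndjson.json", "alex", results, report_type="ndjson")
save_json_report("report_alex_simple.json", "alex", results, report_type="simple")
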
|
||||
|
||||
"""
|
||||
XMIND 8 Functions
|
||||
'''
|
||||
"""
|
||||
|
||||
|
||||
def save_xmind_report(filename, username, results):
|
||||
if os.path.exists(filename):
|
||||
os.remove(filename)
|
||||
workbook = xmind.load(filename)
|
||||
sheet = workbook.getPrimarySheet()
|
||||
design_sheet(sheet, username, results)
|
||||
design_xmind_sheet(sheet, username, results)
|
||||
xmind.save(workbook, path=filename)
|
||||
|
||||
|
||||
def design_sheet(sheet, username, results):
|
||||
##all tag list
|
||||
def add_xmind_subtopic(userlink, k, v, supposed_data):
|
||||
currentsublabel = userlink.addSubTopic()
|
||||
field = "fullname" if k == "name" else k
|
||||
if field not in supposed_data:
|
||||
supposed_data[field] = []
|
||||
supposed_data[field].append(v)
|
||||
currentsublabel.setTitle("%s: %s" % (k, v))
|
||||
|
||||
|
||||
def design_xmind_sheet(sheet, username, results):
|
||||
alltags = {}
|
||||
supposed_data = {}
|
||||
|
||||
sheet.setTitle("%s Analysis"%(username))
|
||||
sheet.setTitle("%s Analysis" % (username))
|
||||
root_topic1 = sheet.getRootTopic()
|
||||
root_topic1.setTitle("%s"%(username))
|
||||
root_topic1.setTitle("%s" % (username))
|
||||
|
||||
undefinedsection = root_topic1.addSubTopic()
|
||||
undefinedsection.setTitle("Undefined")
|
||||
@@ -252,57 +319,42 @@ def design_sheet(sheet, username, results):
|
||||
|
||||
for website_name in results:
|
||||
dictionary = results[website_name]
|
||||
result_status = dictionary.get("status")
|
||||
if result_status.status != QueryStatus.CLAIMED:
|
||||
continue
|
||||
|
||||
if dictionary.get("status").status == QueryStatus.CLAIMED:
|
||||
## firsttime I found that entry
|
||||
for tag in dictionary.get("status").tags:
|
||||
if tag.strip() == "":
|
||||
continue
|
||||
if tag not in alltags.keys():
|
||||
if not is_country_tag(tag):
|
||||
tagsection = root_topic1.addSubTopic()
|
||||
tagsection.setTitle(tag)
|
||||
alltags[tag] = tagsection
|
||||
stripped_tags = list(map(lambda x: x.strip(), result_status.tags))
|
||||
normalized_tags = list(
|
||||
filter(lambda x: x and not is_country_tag(x), stripped_tags)
|
||||
)
|
||||
|
||||
category = None
|
||||
for tag in dictionary.get("status").tags:
|
||||
if tag.strip() == "":
|
||||
continue
|
||||
if not is_country_tag(tag):
|
||||
category = tag
|
||||
category = None
|
||||
for tag in normalized_tags:
|
||||
if tag in alltags.keys():
|
||||
continue
|
||||
tagsection = root_topic1.addSubTopic()
|
||||
tagsection.setTitle(tag)
|
||||
alltags[tag] = tagsection
|
||||
category = tag
|
||||
|
||||
if category is None:
|
||||
userlink = undefinedsection.addSubTopic()
|
||||
userlink.addLabel(dictionary.get("status").site_url_user)
|
||||
section = alltags[category] if category else undefinedsection
|
||||
userlink = section.addSubTopic()
|
||||
userlink.addLabel(result_status.site_url_user)
|
||||
|
||||
ids_data = result_status.ids_data or {}
|
||||
for k, v in ids_data.items():
|
||||
# suppose target data
|
||||
if isinstance(v, list):
|
||||
for currentval in v:
|
||||
add_xmind_subtopic(userlink, k, currentval, supposed_data)
|
||||
else:
|
||||
userlink = alltags[category].addSubTopic()
|
||||
userlink.addLabel(dictionary.get("status").site_url_user)
|
||||
add_xmind_subtopic(userlink, k, v, supposed_data)
|
||||
|
||||
if dictionary.get("status").ids_data:
|
||||
for k, v in dictionary.get("status").ids_data.items():
|
||||
# suppose target data
|
||||
if not isinstance(v, list):
|
||||
currentsublabel = userlink.addSubTopic()
|
||||
field = 'fullname' if k == 'name' else k
|
||||
if not field in supposed_data:
|
||||
supposed_data[field] = []
|
||||
supposed_data[field].append(v)
|
||||
currentsublabel.setTitle("%s: %s" % (k, v))
|
||||
else:
|
||||
for currentval in v:
|
||||
currentsublabel = userlink.addSubTopic()
|
||||
field = 'fullname' if k == 'name' else k
|
||||
if not field in supposed_data:
|
||||
supposed_data[field] = []
|
||||
supposed_data[field].append(currentval)
|
||||
currentsublabel.setTitle("%s: %s" % (k, currentval))
|
||||
### Add Supposed DATA
|
||||
filterede_supposed_data = filter_supposed_data(supposed_data)
|
||||
if(len(filterede_supposed_data) >0):
|
||||
# add supposed data
|
||||
filtered_supposed_data = filter_supposed_data(supposed_data)
|
||||
if len(filtered_supposed_data) > 0:
|
||||
undefinedsection = root_topic1.addSubTopic()
|
||||
undefinedsection.setTitle("SUPPOSED DATA")
|
||||
for k, v in filterede_supposed_data.items():
|
||||
for k, v in filtered_supposed_data.items():
|
||||
currentsublabel = undefinedsection.addSubTopic()
|
||||
currentsublabel.setTitle("%s: %s" % (k, v))
|
||||
|
||||
|
||||
|
||||
+9446
-4494
File diff suppressed because it is too large
@@ -68,7 +68,7 @@
<div class="row-mb">
<div class="col-md">
<div class="card flex-md-row mb-4 box-shadow h-md-250">
<img class="card-img-right flex-auto d-none d-md-block" alt="Photo" style="width: 200px; height: 200px; object-fit: scale-down;" src="{{ v.status.ids_data.image or 'https://i.imgur.com/040fmbw.png' }}" data-holder-rendered="true">
<img class="card-img-right flex-auto d-md-block" alt="Photo" style="width: 200px; height: 200px; object-fit: scale-down;" src="{{ v.status.ids_data.image or 'https://i.imgur.com/040fmbw.png' }}" data-holder-rendered="true">
<div class="card-body d-flex flex-column align-items-start" style="padding-top: 0;">
<h3 class="mb-0" style="padding-top: 1rem;">
<a class="text-dark" href="{{ v.url_main }}" target="_blank">{{ k }}</a>

+28
-4
@@ -1,4 +1,4 @@
"""Sherlock Result Module
"""Maigret Result Module

This module defines various objects for recording the results of queries.
"""
@@ -10,6 +10,7 @@ class QueryStatus(Enum):

Describes status of query about a given username.
"""

CLAIMED = "Claimed" # Username Detected
AVAILABLE = "Available" # Username Not Detected
UNKNOWN = "Unknown" # Error Occurred While Trying To Detect Username
@@ -27,14 +28,24 @@ class QueryStatus(Enum):
return self.value


class QueryResult():
class QueryResult:
"""Query Result Object.

Describes result of query about a given username.
"""

def __init__(self, username, site_name, site_url_user, status, ids_data=None,
query_time=None, context=None, tags=[]):
def __init__(
self,
username,
site_name,
site_url_user,
status,
ids_data=None,
query_time=None,
context=None,
error=None,
tags=[],
):
"""Create Query Result Object.

Contains information about a specific method of detecting usernames on
@@ -73,7 +84,20 @@ class QueryResult():
self.context = context
self.ids_data = ids_data
self.tags = tags
self.error = error

def json(self):
return {
"username": self.username,
"site_name": self.site_name,
"url": self.site_url_user,
"status": str(self.status),
"ids": self.ids_data or {},
"tags": self.tags,
}

def is_found(self):
return self.status == QueryStatus.CLAIMED

def __str__(self):
"""Convert Object To String.

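For orientation, a minimal usage sketch of the extended QueryResult API above (the username and site values are made-up examples):

from maigret.result import QueryResult, QueryStatus

result = QueryResult(
    username="alex",
    site_name="GitHub",
    site_url_user="https://github.com/alex",
    status=QueryStatus.CLAIMED,
    tags=["coding"],
)
print(result.is_found())  # True, since the status is CLAIMED
print(result.json())      # {"username": "alex", "site_name": "GitHub", "url": ..., "status": "Claimed", ...}
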
+282
-86
@@ -1,18 +1,77 @@
|
||||
# -*- coding: future_annotations -*-
|
||||
# ****************************** -*-
|
||||
"""Maigret Sites Information"""
|
||||
import copy
|
||||
import json
|
||||
import sys
|
||||
from typing import Optional, List, Dict, Any, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
from .utils import CaseConverter
|
||||
from .utils import CaseConverter, URLMatcher, is_country_tag
|
||||
|
||||
# TODO: move to data.json
|
||||
SUPPORTED_TAGS = [
|
||||
"gaming",
|
||||
"coding",
|
||||
"photo",
|
||||
"music",
|
||||
"blog",
|
||||
"finance",
|
||||
"freelance",
|
||||
"dating",
|
||||
"tech",
|
||||
"forum",
|
||||
"porn",
|
||||
"erotic",
|
||||
"webcam",
|
||||
"video",
|
||||
"movies",
|
||||
"hacking",
|
||||
"art",
|
||||
"discussion",
|
||||
"sharing",
|
||||
"writing",
|
||||
"wiki",
|
||||
"business",
|
||||
"shopping",
|
||||
"sport",
|
||||
"books",
|
||||
"news",
|
||||
"documents",
|
||||
"travel",
|
||||
"maps",
|
||||
"hobby",
|
||||
"apps",
|
||||
"classified",
|
||||
"career",
|
||||
"geosocial",
|
||||
"streaming",
|
||||
"education",
|
||||
"networking",
|
||||
"torrent",
|
||||
"science",
|
||||
"medicine",
|
||||
"reading",
|
||||
"stock",
|
||||
"messaging",
|
||||
"trading",
|
||||
"links",
|
||||
"fashion",
|
||||
"tasks",
|
||||
"military",
|
||||
"auto",
|
||||
"gambling",
|
||||
"business",
|
||||
"cybercriminal",
|
||||
"review",
|
||||
]
|
||||
|
||||
|
||||
class MaigretEngine:
|
||||
site: Dict[str, Any] = {}
|
||||
|
||||
def __init__(self, name, data):
|
||||
self.name = name
|
||||
self.site = {}
|
||||
self.__dict__.update(data)
|
||||
|
||||
@property
|
||||
@@ -21,34 +80,50 @@ class MaigretEngine:
|
||||
|
||||
|
||||
class MaigretSite:
|
||||
NOT_SERIALIZABLE_FIELDS = [
|
||||
"name",
|
||||
"engineData",
|
||||
"requestFuture",
|
||||
"detectedEngine",
|
||||
"engineObj",
|
||||
"stats",
|
||||
"urlRegexp",
|
||||
]
|
||||
|
||||
username_claimed = ""
|
||||
username_unclaimed = ""
|
||||
url_subpath = ""
|
||||
url_main = ""
|
||||
url = ""
|
||||
disabled = False
|
||||
similar_search = False
|
||||
ignore403 = False
|
||||
tags: List[str] = []
|
||||
|
||||
type = "username"
|
||||
headers: Dict[str, str] = {}
|
||||
errors: Dict[str, str] = {}
|
||||
activation: Dict[str, Any] = {}
|
||||
regex_check = None
|
||||
url_probe = None
|
||||
check_type = ""
|
||||
request_head_only = ""
|
||||
get_params: Dict[str, Any] = {}
|
||||
|
||||
presense_strs: List[str] = []
|
||||
absence_strs: List[str] = []
|
||||
stats: Dict[str, Any] = {}
|
||||
|
||||
engine = None
|
||||
engine_data: Dict[str, Any] = {}
|
||||
engine_obj: Optional["MaigretEngine"] = None
|
||||
request_future = None
|
||||
alexa_rank = None
|
||||
source = None
|
||||
|
||||
def __init__(self, name, information):
|
||||
self.name = name
|
||||
|
||||
self.disabled = False
|
||||
self.similar_search = False
|
||||
self.ignore_403 = False
|
||||
self.tags = []
|
||||
|
||||
self.type = 'username'
|
||||
self.headers = {}
|
||||
self.errors = {}
|
||||
self.activation = {}
|
||||
self.url_subpath = ''
|
||||
self.regex_check = None
|
||||
self.url_probe = None
|
||||
self.check_type = ''
|
||||
self.request_head_only = ''
|
||||
self.get_params = {}
|
||||
|
||||
self.presense_strs = []
|
||||
self.absence_strs = []
|
||||
self.stats = {}
|
||||
|
||||
self.engine = None
|
||||
self.engine_data = {}
|
||||
self.engine_obj = None
|
||||
self.request_future = None
|
||||
self.alexa_rank = None
|
||||
self.url_subpath = ""
|
||||
|
||||
for k, v in information.items():
|
||||
self.__dict__[CaseConverter.camel_to_snake(k)] = v
|
||||
@@ -57,10 +132,50 @@ class MaigretSite:
|
||||
# We do not know the popularity, so make site go to bottom of list.
|
||||
self.alexa_rank = sys.maxsize
|
||||
|
||||
self.update_detectors()
|
||||
|
||||
def __str__(self):
|
||||
return f"{self.name} ({self.url_main})"
|
||||
|
||||
def update_detectors(self):
|
||||
if "url" in self.__dict__:
|
||||
url = self.url
|
||||
for group in ["urlMain", "urlSubpath"]:
|
||||
if group in url:
|
||||
url = url.replace(
|
||||
"{" + group + "}",
|
||||
self.__dict__[CaseConverter.camel_to_snake(group)],
|
||||
)
|
||||
|
||||
self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check)
|
||||
|
||||
def detect_username(self, url: str) -> Optional[str]:
|
||||
if self.url_regexp:
|
||||
match_groups = self.url_regexp.match(url)
|
||||
if match_groups:
|
||||
return match_groups.groups()[-1].rstrip("/")
|
||||
|
||||
return None
|
||||
|
||||
def extract_id_from_url(self, url: str) -> Optional[Tuple[str, str]]:
|
||||
if not self.url_regexp:
|
||||
return None
|
||||
|
||||
match_groups = self.url_regexp.match(url)
|
||||
if not match_groups:
|
||||
return None
|
||||
|
||||
_id = match_groups.groups()[-1].rstrip("/")
|
||||
_type = self.type
|
||||
|
||||
return _id, _type
|
||||
|
||||
@property
|
||||
def pretty_name(self):
|
||||
if self.source:
|
||||
return f"{self.name} [{self.source}]"
|
||||
return self.name
|
||||
|
||||
@property
|
||||
def json(self):
|
||||
result = {}
|
||||
@@ -68,20 +183,40 @@ class MaigretSite:
|
||||
# convert to camelCase
|
||||
field = CaseConverter.snake_to_camel(k)
|
||||
# strip empty elements
|
||||
if v in (False, '', [], {}, None, sys.maxsize, 'username'):
|
||||
if v in (False, "", [], {}, None, sys.maxsize, "username"):
|
||||
continue
|
||||
if field in ['name', 'engineData', 'requestFuture', 'detectedEngine', 'engineObj', 'stats']:
|
||||
if field in self.NOT_SERIALIZABLE_FIELDS:
|
||||
continue
|
||||
result[field] = v
|
||||
|
||||
return result
|
||||
|
||||
def update(self, updates: dict) -> MaigretSite:
|
||||
@property
|
||||
def errors_dict(self) -> dict:
|
||||
errors: Dict[str, str] = {}
|
||||
if self.engine_obj:
|
||||
errors.update(self.engine_obj.site.get('errors', {}))
|
||||
errors.update(self.errors)
|
||||
return errors
|
||||
|
||||
def get_url_type(self) -> str:
|
||||
url = URLMatcher.extract_main_part(self.url)
|
||||
if url.startswith("{username}"):
|
||||
url = "SUBDOMAIN"
|
||||
elif url == "":
|
||||
url = f"{self.url} ({self.engine})"
|
||||
else:
|
||||
parts = url.split("/")
|
||||
url = "/" + "/".join(parts[1:])
|
||||
return url
|
||||
|
||||
def update(self, updates: "dict") -> "MaigretSite":
|
||||
self.__dict__.update(updates)
|
||||
self.update_detectors()
|
||||
|
||||
return self
|
||||
|
||||
def update_from_engine(self, engine: MaigretEngine) -> MaigretSite:
|
||||
def update_from_engine(self, engine: MaigretEngine) -> "MaigretSite":
|
||||
engine_data = engine.site
|
||||
for k, v in engine_data.items():
|
||||
field = CaseConverter.camel_to_snake(k)
|
||||
@@ -95,16 +230,19 @@ class MaigretSite:
|
||||
self.__dict__[field] = v
|
||||
|
||||
self.engine_obj = engine
|
||||
self.update_detectors()
|
||||
|
||||
return self
|
||||
|
||||
def strip_engine_data(self) -> MaigretSite:
|
||||
def strip_engine_data(self) -> "MaigretSite":
|
||||
if not self.engine_obj:
|
||||
return self
|
||||
|
||||
self.request_future = None
|
||||
self.url_regexp = None
|
||||
|
||||
self_copy = copy.deepcopy(self)
|
||||
engine_data = self_copy.engine_obj.site
|
||||
engine_data = self_copy.engine_obj and self_copy.engine_obj.site or {}
|
||||
site_data_keys = list(self_copy.__dict__.keys())
|
||||
|
||||
for k in engine_data.keys():
|
||||
@@ -113,7 +251,8 @@ class MaigretSite:
|
||||
# remove dict keys
|
||||
if isinstance(engine_data[k], dict) and is_exists:
|
||||
for f in engine_data[k].keys():
|
||||
del self_copy.__dict__[field][f]
|
||||
if f in self_copy.__dict__[field]:
|
||||
del self_copy.__dict__[field][f]
|
||||
continue
|
||||
# remove list items
|
||||
if isinstance(engine_data[k], list) and is_exists:
|
||||
@@ -140,29 +279,47 @@ class MaigretDatabase:
|
||||
def sites_dict(self):
|
||||
return {site.name: site for site in self._sites}
|
||||
|
||||
def ranked_sites_dict(self, reverse=False, top=sys.maxsize, tags=[], names=[],
|
||||
disabled=True, id_type='username'):
|
||||
def ranked_sites_dict(
|
||||
self,
|
||||
reverse=False,
|
||||
top=sys.maxsize,
|
||||
tags=[],
|
||||
names=[],
|
||||
disabled=True,
|
||||
id_type="username",
|
||||
):
|
||||
"""
|
||||
Ranking and filtering of the sites list
|
||||
Ranking and filtering of the sites list
|
||||
"""
|
||||
normalized_names = list(map(str.lower, names))
|
||||
normalized_tags = list(map(str.lower, tags))
|
||||
|
||||
is_name_ok = lambda x: x.name.lower() in normalized_names
|
||||
is_engine_ok = lambda x: isinstance(x.engine, str) and x.engine.lower() in normalized_tags
|
||||
is_source_ok = lambda x: x.source and x.source.lower() in normalized_names
|
||||
is_engine_ok = (
|
||||
lambda x: isinstance(x.engine, str) and x.engine.lower() in normalized_tags
|
||||
)
|
||||
is_tags_ok = lambda x: set(x.tags).intersection(set(normalized_tags))
|
||||
is_disabled_needed = lambda x: not x.disabled or ('disabled' in tags or disabled)
|
||||
is_disabled_needed = lambda x: not x.disabled or (
|
||||
"disabled" in tags or disabled
|
||||
)
|
||||
is_id_type_ok = lambda x: x.type == id_type
|
||||
|
||||
filter_tags_engines_fun = lambda x: not tags or is_engine_ok(x) or is_tags_ok(x)
|
||||
filter_names_fun = lambda x: not names or is_name_ok(x)
|
||||
filter_names_fun = lambda x: not names or is_name_ok(x) or is_source_ok(x)
|
||||
|
||||
filter_fun = lambda x: filter_tags_engines_fun(x) and filter_names_fun(x) \
|
||||
and is_disabled_needed(x) and is_id_type_ok(x)
|
||||
filter_fun = (
|
||||
lambda x: filter_tags_engines_fun(x)
|
||||
and filter_names_fun(x)
|
||||
and is_disabled_needed(x)
|
||||
and is_id_type_ok(x)
|
||||
)
|
||||
|
||||
filtered_list = [s for s in self.sites if filter_fun(s)]
|
||||
|
||||
sorted_list = sorted(filtered_list, key=lambda x: x.alexa_rank, reverse=reverse)[:top]
|
||||
sorted_list = sorted(
|
||||
filtered_list, key=lambda x: x.alexa_rank, reverse=reverse
|
||||
)[:top]
|
||||
return {site.name: site for site in sorted_list}
|
||||
|
||||
@property
|
||||
@@ -173,7 +330,7 @@ class MaigretDatabase:
|
||||
def engines_dict(self):
|
||||
return {engine.name: engine for engine in self._engines}
|
||||
|
||||
def update_site(self, site: MaigretSite) -> MaigretDatabase:
|
||||
def update_site(self, site: MaigretSite) -> "MaigretDatabase":
|
||||
for s in self._sites:
|
||||
if s.name == site.name:
|
||||
s = site
|
||||
@@ -182,21 +339,20 @@ class MaigretDatabase:
|
||||
self._sites.append(site)
|
||||
return self
|
||||
|
||||
def save_to_file(self, filename: str) -> MaigretDatabase:
|
||||
def save_to_file(self, filename: str) -> "MaigretDatabase":
|
||||
db_data = {
|
||||
'sites': {site.name: site.strip_engine_data().json for site in self._sites},
|
||||
'engines': {engine.name: engine.json for engine in self._engines},
|
||||
"sites": {site.name: site.strip_engine_data().json for site in self._sites},
|
||||
"engines": {engine.name: engine.json for engine in self._engines},
|
||||
}
|
||||
|
||||
json_data = json.dumps(db_data, indent=4)
|
||||
|
||||
with open(filename, 'w') as f:
|
||||
with open(filename, "w") as f:
|
||||
f.write(json_data)
|
||||
|
||||
return self
|
||||
|
||||
|
||||
def load_from_json(self, json_data: dict) -> MaigretDatabase:
|
||||
def load_from_json(self, json_data: dict) -> "MaigretDatabase":
|
||||
# Add all of site information from the json file to internal site list.
|
||||
site_data = json_data.get("sites", {})
|
||||
engines_data = json_data.get("engines", {})
|
||||
@@ -208,32 +364,32 @@ class MaigretDatabase:
|
||||
try:
|
||||
maigret_site = MaigretSite(site_name, site_data[site_name])
|
||||
|
||||
engine = site_data[site_name].get('engine')
|
||||
engine = site_data[site_name].get("engine")
|
||||
if engine:
|
||||
maigret_site.update_from_engine(self.engines_dict[engine])
|
||||
|
||||
self._sites.append(maigret_site)
|
||||
except KeyError as error:
|
||||
raise ValueError(f"Problem parsing json content for site {site_name}: "
|
||||
f"Missing attribute {str(error)}."
|
||||
)
|
||||
raise ValueError(
|
||||
f"Problem parsing json content for site {site_name}: "
|
||||
f"Missing attribute {str(error)}."
|
||||
)
|
||||
|
||||
return self
|
||||
|
||||
|
||||
def load_from_str(self, db_str: str) -> MaigretDatabase:
|
||||
def load_from_str(self, db_str: "str") -> "MaigretDatabase":
|
||||
try:
|
||||
data = json.loads(db_str)
|
||||
except Exception as error:
|
||||
raise ValueError(f"Problem parsing json contents from str"
|
||||
f"'{db_str[:50]}'...: {str(error)}."
|
||||
)
|
||||
raise ValueError(
|
||||
f"Problem parsing json contents from str"
|
||||
f"'{db_str[:50]}'...: {str(error)}."
|
||||
)
|
||||
|
||||
return self.load_from_json(data)
|
||||
|
||||
|
||||
def load_from_url(self, url: str) -> MaigretDatabase:
|
||||
is_url_valid = url.startswith('http://') or url.startswith('https://')
|
||||
def load_from_url(self, url: str) -> "MaigretDatabase":
|
||||
is_url_valid = url.startswith("http://") or url.startswith("https://")
|
||||
|
||||
if not is_url_valid:
|
||||
raise FileNotFoundError(f"Invalid data file URL '{url}'.")
|
||||
@@ -241,48 +397,88 @@ class MaigretDatabase:
|
||||
try:
|
||||
response = requests.get(url=url)
|
||||
except Exception as error:
|
||||
raise FileNotFoundError(f"Problem while attempting to access "
|
||||
f"data file URL '{url}': "
|
||||
f"{str(error)}"
|
||||
)
|
||||
raise FileNotFoundError(
|
||||
f"Problem while attempting to access "
|
||||
f"data file URL '{url}': "
|
||||
f"{str(error)}"
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
try:
|
||||
data = response.json()
|
||||
except Exception as error:
|
||||
raise ValueError(f"Problem parsing json contents at "
|
||||
f"'{url}': {str(error)}."
|
||||
)
|
||||
raise ValueError(
|
||||
f"Problem parsing json contents at " f"'{url}': {str(error)}."
|
||||
)
|
||||
else:
|
||||
raise FileNotFoundError(f"Bad response while accessing "
|
||||
f"data file URL '{url}'."
|
||||
)
|
||||
raise FileNotFoundError(
|
||||
f"Bad response while accessing " f"data file URL '{url}'."
|
||||
)
|
||||
|
||||
return self.load_from_json(data)
|
||||
|
||||
|
||||
def load_from_file(self, filename: str) -> MaigretDatabase:
|
||||
def load_from_file(self, filename: "str") -> "MaigretDatabase":
|
||||
try:
|
||||
with open(filename, 'r', encoding='utf-8') as file:
|
||||
with open(filename, "r", encoding="utf-8") as file:
|
||||
try:
|
||||
data = json.load(file)
|
||||
except Exception as error:
|
||||
raise ValueError(f"Problem parsing json contents from "
|
||||
f"file '{filename}': {str(error)}."
|
||||
)
|
||||
raise ValueError(
|
||||
f"Problem parsing json contents from "
|
||||
f"file '{filename}': {str(error)}."
|
||||
)
|
||||
except FileNotFoundError as error:
|
||||
raise FileNotFoundError(f"Problem while attempting to access "
|
||||
f"data file '{filename}'."
|
||||
)
|
||||
raise FileNotFoundError(
|
||||
f"Problem while attempting to access " f"data file '{filename}'."
|
||||
) from error
|
||||
|
||||
return self.load_from_json(data)
|
||||
|
||||
def get_stats(self, sites_dict):
|
||||
def get_scan_stats(self, sites_dict):
|
||||
sites = sites_dict or self.sites_dict
|
||||
found_flags = {}
|
||||
for _, s in sites.items():
|
||||
if 'presense_flag' in s.stats:
|
||||
flag = s.stats['presense_flag']
|
||||
if "presense_flag" in s.stats:
|
||||
flag = s.stats["presense_flag"]
|
||||
found_flags[flag] = found_flags.get(flag, 0) + 1
|
||||
|
||||
return found_flags
|
||||
|
||||
def get_db_stats(self, sites_dict):
|
||||
if not sites_dict:
|
||||
sites_dict = self.sites_dict()
|
||||
|
||||
urls = {}
|
||||
tags = {}
|
||||
output = ""
|
||||
disabled_count = 0
|
||||
total_count = len(sites_dict)
|
||||
|
||||
for _, site in sites_dict.items():
|
||||
if site.disabled:
|
||||
disabled_count += 1
|
||||
|
||||
url_type = site.get_url_type()
|
||||
urls[url_type] = urls.get(url_type, 0) + 1
|
||||
|
||||
if not site.tags:
|
||||
tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1
|
||||
|
||||
for tag in filter(lambda x: not is_country_tag(x), site.tags):
|
||||
tags[tag] = tags.get(tag, 0) + 1
|
||||
|
||||
output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n"
|
||||
output += "Top profile URLs:\n"
|
||||
for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:20]:
|
||||
if count == 1:
|
||||
break
|
||||
output += f"{count}\t{url}\n"
|
||||
|
||||
output += "Top tags:\n"
|
||||
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
|
||||
mark = ""
|
||||
if tag not in SUPPORTED_TAGS:
|
||||
mark = " (non-standard)"
|
||||
output += f"{count}\t{tag}{mark}\n"
|
||||
|
||||
return output
|
||||
|
||||
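To show how the reworked MaigretSite/MaigretDatabase API above fits together, a hedged sketch (the data file path and the filter values are examples):

from maigret.sites import MaigretDatabase

db = MaigretDatabase().load_from_file("maigret/resources/data.json")
# top 100 enabled sites tagged "photo", ranked by Alexa popularity
photo_sites = db.ranked_sites_dict(top=100, tags=["photo"], disabled=False)
print(len(photo_sites))
print(db.get_db_stats(photo_sites))
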
@@ -0,0 +1,368 @@
|
||||
import asyncio
|
||||
import difflib
|
||||
import re
|
||||
from typing import List
|
||||
import xml.etree.ElementTree as ET
|
||||
import requests
|
||||
|
||||
from .activation import import_aiohttp_cookies
|
||||
from .checking import maigret
|
||||
from .result import QueryStatus
|
||||
from .sites import MaigretDatabase, MaigretSite, MaigretEngine
|
||||
from .utils import get_random_user_agent
|
||||
|
||||
|
||||
DESIRED_STRINGS = [
|
||||
"username",
|
||||
"not found",
|
||||
"пользователь",
|
||||
"profile",
|
||||
"lastname",
|
||||
"firstname",
|
||||
"biography",
|
||||
"birthday",
|
||||
"репутация",
|
||||
"информация",
|
||||
"e-mail",
|
||||
]
|
||||
|
||||
SUPPOSED_USERNAMES = ["alex", "god", "admin", "red", "blue", "john"]
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": get_random_user_agent(),
|
||||
}
|
||||
|
||||
RATIO = 0.6
|
||||
TOP_FEATURES = 5
|
||||
URL_RE = re.compile(r"https?://(www\.)?")
|
||||
|
||||
|
||||
def get_match_ratio(x):
|
||||
return round(
|
||||
max(
|
||||
[difflib.SequenceMatcher(a=x.lower(), b=y).ratio() for y in DESIRED_STRINGS]
|
||||
),
|
||||
2,
|
||||
)
|
||||
|
||||
|
||||
def get_alexa_rank(site_url_main):
|
||||
url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
|
||||
xml_data = requests.get(url).text
|
||||
root = ET.fromstring(xml_data)
|
||||
alexa_rank = 0
|
||||
|
||||
try:
|
||||
alexa_rank = int(root.find('.//REACH').attrib['RANK'])
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return alexa_rank
|
||||
|
||||
|
||||
def extract_mainpage_url(url):
|
||||
return "/".join(url.split("/", 3)[:3])
|
||||
|
||||
|
||||
async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
|
||||
changes = {
|
||||
"disabled": False,
|
||||
}
|
||||
|
||||
check_data = [
|
||||
(site.username_claimed, QueryStatus.CLAIMED),
|
||||
(site.username_unclaimed, QueryStatus.AVAILABLE),
|
||||
]
|
||||
|
||||
logger.info(f"Checking {site.name}...")
|
||||
|
||||
for username, status in check_data:
|
||||
results_dict = await maigret(
|
||||
username=username,
|
||||
site_dict={site.name: site},
|
||||
logger=logger,
|
||||
timeout=30,
|
||||
id_type=site.type,
|
||||
forced=True,
|
||||
no_progressbar=True,
|
||||
)
|
||||
|
||||
# don't disable entries with other ids types
|
||||
# TODO: make normal checking
|
||||
if site.name not in results_dict:
|
||||
logger.info(results_dict)
|
||||
changes["disabled"] = True
|
||||
continue
|
||||
|
||||
result = results_dict[site.name]["status"]
|
||||
|
||||
site_status = result.status
|
||||
|
||||
if site_status != status:
|
||||
if site_status == QueryStatus.UNKNOWN:
|
||||
msgs = site.absence_strs
|
||||
etype = site.check_type
|
||||
logger.warning(
|
||||
"Error while searching '%s' in %s: %s, %s, check type %s",
|
||||
username,
|
||||
site.name,
|
||||
result.context,
|
||||
msgs,
|
||||
etype,
|
||||
)
|
||||
# don't disable in case of available username
|
||||
if status == QueryStatus.CLAIMED:
|
||||
changes["disabled"] = True
|
||||
elif status == QueryStatus.CLAIMED:
|
||||
logger.warning(
|
||||
f"Not found `{username}` in {site.name}, must be claimed"
|
||||
)
|
||||
logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
else:
|
||||
logger.warning(f"Found `{username}` in {site.name}, must be available")
|
||||
logger.info(results_dict[site.name])
|
||||
changes["disabled"] = True
|
||||
|
||||
logger.info(f"Site {site.name} checking is finished")
|
||||
|
||||
return changes
|
||||
|
||||
|
||||
def generate_additional_fields_dialog(engine: MaigretEngine, dialog):
|
||||
fields = {}
|
||||
if 'urlSubpath' in engine.site.get('url', ''):
|
||||
msg = (
|
||||
'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
|
||||
'Enter in manually if it exists: '
|
||||
)
|
||||
subpath = input(msg).strip('/')
|
||||
if subpath:
|
||||
fields['urlSubpath'] = f'/{subpath}'
|
||||
return fields
|
||||
|
||||
|
||||
async def detect_known_engine(
|
||||
db, url_exists, url_mainpage, logger
|
||||
) -> List[MaigretSite]:
|
||||
try:
|
||||
r = requests.get(url_mainpage)
|
||||
logger.debug(r.text)
|
||||
except Exception as e:
|
||||
logger.warning(e)
|
||||
print("Some error while checking main page")
|
||||
return []
|
||||
|
||||
for engine in db.engines:
|
||||
strs_to_check = engine.__dict__.get("presenseStrs")
|
||||
if strs_to_check and r and r.text:
|
||||
all_strs_in_response = True
|
||||
for s in strs_to_check:
|
||||
if s not in r.text:
|
||||
all_strs_in_response = False
|
||||
sites = []
|
||||
if all_strs_in_response:
|
||||
engine_name = engine.__dict__.get("name")
|
||||
|
||||
print(f"Detected engine {engine_name} for site {url_mainpage}")
|
||||
|
||||
usernames_to_check = SUPPOSED_USERNAMES
|
||||
supposed_username = extract_username_dialog(url_exists)
|
||||
if supposed_username:
|
||||
usernames_to_check = [supposed_username] + usernames_to_check
|
||||
|
||||
add_fields = generate_additional_fields_dialog(engine, url_exists)
|
||||
|
||||
for u in usernames_to_check:
|
||||
site_data = {
|
||||
"urlMain": url_mainpage,
|
||||
"name": url_mainpage.split("//")[1],
|
||||
"engine": engine_name,
|
||||
"usernameClaimed": u,
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
**add_fields,
|
||||
}
|
||||
logger.info(site_data)
|
||||
|
||||
maigret_site = MaigretSite(url_mainpage.split("/")[-1], site_data)
|
||||
maigret_site.update_from_engine(db.engines_dict[engine_name])
|
||||
sites.append(maigret_site)
|
||||
|
||||
return sites
|
||||
|
||||
return []
|
||||
|
||||
|
||||
def extract_username_dialog(url):
|
||||
url_parts = url.rstrip("/").split("/")
|
||||
supposed_username = url_parts[-1]
|
||||
entered_username = input(
|
||||
f'Is "{supposed_username}" a valid username? If not, write it manually: '
|
||||
)
|
||||
return entered_username if entered_username else supposed_username
|
||||
|
||||
|
||||
async def check_features_manually(
|
||||
db, url_exists, url_mainpage, cookie_file, logger, redirects=True
|
||||
):
|
||||
supposed_username = extract_username_dialog(url_exists)
|
||||
non_exist_username = "noonewouldeverusethis7"
|
||||
|
||||
url_user = url_exists.replace(supposed_username, "{username}")
|
||||
url_not_exists = url_exists.replace(supposed_username, non_exist_username)
|
||||
|
||||
# cookies
|
||||
cookie_dict = None
|
||||
if cookie_file:
|
||||
logger.info(f'Use {cookie_file} for cookies')
|
||||
cookie_jar = await import_aiohttp_cookies(cookie_file)
|
||||
cookie_dict = {c.key: c.value for c in cookie_jar}
|
||||
|
||||
exists_resp = requests.get(
|
||||
url_exists, cookies=cookie_dict, headers=HEADERS, allow_redirects=redirects
|
||||
)
|
||||
logger.debug(exists_resp.status_code)
|
||||
logger.debug(exists_resp.text)
|
||||
|
||||
non_exists_resp = requests.get(
|
||||
url_not_exists, cookies=cookie_dict, headers=HEADERS, allow_redirects=redirects
|
||||
)
|
||||
logger.debug(non_exists_resp.status_code)
|
||||
logger.debug(non_exists_resp.text)
|
||||
|
||||
a = exists_resp.text
|
||||
b = non_exists_resp.text
|
||||
|
||||
tokens_a = set(a.split('"'))
|
||||
tokens_b = set(b.split('"'))
|
||||
|
||||
a_minus_b = tokens_a.difference(tokens_b)
|
||||
b_minus_a = tokens_b.difference(tokens_a)
|
||||
|
||||
if len(a_minus_b) == len(b_minus_a) == 0:
|
||||
print("The pages for existing and non-existing account are the same!")
|
||||
|
||||
top_features_count = int(
|
||||
input(f"Specify count of features to extract [default {TOP_FEATURES}]: ")
|
||||
or TOP_FEATURES
|
||||
)
|
||||
|
||||
presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[
|
||||
:top_features_count
|
||||
]
|
||||
|
||||
print("Detected text features of existing account: " + ", ".join(presence_list))
|
||||
features = input("If features was not detected correctly, write it manually: ")
|
||||
|
||||
if features:
|
||||
presence_list = features.split(",")
|
||||
|
||||
absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[
|
||||
:top_features_count
|
||||
]
|
||||
print("Detected text features of non-existing account: " + ", ".join(absence_list))
|
||||
features = input("If features was not detected correctly, write it manually: ")
|
||||
|
||||
if features:
|
||||
absence_list = features.split(",")
|
||||
|
||||
site_data = {
|
||||
"absenceStrs": absence_list,
|
||||
"presenseStrs": presence_list,
|
||||
"url": url_user,
|
||||
"urlMain": url_mainpage,
|
||||
"usernameClaimed": supposed_username,
|
||||
"usernameUnclaimed": non_exist_username,
|
||||
"checkType": "message",
|
||||
}
|
||||
|
||||
site = MaigretSite(url_mainpage.split("/")[-1], site_data)
|
||||
return site
|
||||
|
||||
|
||||
async def submit_dialog(db, url_exists, cookie_file, logger):
|
||||
domain_raw = URL_RE.sub("", url_exists).strip().strip("/")
|
||||
domain_raw = domain_raw.split("/")[0]
|
||||
|
||||
# check for existence
|
||||
matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
|
||||
|
||||
if matched_sites:
|
||||
print(
|
||||
f'Sites with domain "{domain_raw}" already exists in the Maigret database!'
|
||||
)
|
||||
status = lambda s: "(disabled)" if s.disabled else ""
|
||||
url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
|
||||
print(
|
||||
"\n".join(
|
||||
[
|
||||
f"{site.name} {status(site)}{url_block(site)}"
|
||||
for site in matched_sites
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
if input("Do you want to continue? [yN] ").lower() in "n":
|
||||
return False
|
||||
|
||||
url_mainpage = extract_mainpage_url(url_exists)
|
||||
|
||||
print('Detecting site engine, please wait...')
|
||||
sites = []
|
||||
try:
|
||||
sites = await detect_known_engine(db, url_exists, url_mainpage, logger)
|
||||
except KeyboardInterrupt:
|
||||
print('Engine detect process is interrupted.')
|
||||
|
||||
if not sites:
|
||||
print("Unable to detect site engine, lets generate checking features")
|
||||
sites = [
|
||||
await check_features_manually(
|
||||
db, url_exists, url_mainpage, cookie_file, logger
|
||||
)
|
||||
]
|
||||
|
||||
logger.debug(sites[0].__dict__)
|
||||
|
||||
sem = asyncio.Semaphore(1)
|
||||
|
||||
print("Checking, please wait...")
|
||||
found = False
|
||||
chosen_site = None
|
||||
for s in sites:
|
||||
chosen_site = s
|
||||
result = await site_self_check(s, logger, sem, db)
|
||||
if not result["disabled"]:
|
||||
found = True
|
||||
break
|
||||
|
||||
if not found:
|
||||
print(
|
||||
f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
|
||||
)
|
||||
print(
|
||||
"Try to run this mode again and increase features count or choose others."
|
||||
)
|
||||
return False
|
||||
else:
|
||||
if (
|
||||
input(
|
||||
f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
|
||||
)
|
||||
.lower()
|
||||
.strip("y")
|
||||
):
|
||||
return False
|
||||
|
||||
chosen_site.name = input("Change site name if you want: ") or chosen_site.name
|
||||
chosen_site.tags = input("Site tags: ").split(',')
|
||||
rank = get_alexa_rank(chosen_site.url_main)
|
||||
if rank:
|
||||
print(f'New alexa rank: {rank}')
|
||||
chosen_site.alexa_rank = rank
|
||||
|
||||
logger.debug(chosen_site.json)
|
||||
site_data = chosen_site.strip_engine_data()
|
||||
logger.debug(site_data.json)
|
||||
db.update_site(site_data)
|
||||
return True
|
||||
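For reference, a small sketch of the fuzzy feature ranking used above (assuming the new module is maigret/submit.py; the candidate strings are arbitrary):

from maigret.submit import get_match_ratio

candidates = ["user profile", "page not found", "csrf-token-3f2a"]
ranked = sorted(candidates, key=get_match_ratio, reverse=True)
# strings closest to DESIRED_STRINGS ("profile", "not found", ...) come first
print(ranked)
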
@@ -0,0 +1,11 @@
from typing import Callable, List, Dict, Tuple, Any


# search query
QueryDraft = Tuple[Callable, List, Dict]

# options dict
QueryOptions = Dict[str, Any]

# TODO: throw out
QueryResultWrapper = Dict[str, Any]
+66
-7
@@ -1,31 +1,90 @@
|
||||
import re
|
||||
import random
|
||||
|
||||
|
||||
DEFAULT_USER_AGENTS = [
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
|
||||
]
|
||||
|
||||
|
||||
class CaseConverter:
|
||||
@staticmethod
|
||||
def camel_to_snake(camelcased_string: str) -> str:
|
||||
return re.sub(r'(?<!^)(?=[A-Z])', '_', camelcased_string).lower()
|
||||
return re.sub(r"(?<!^)(?=[A-Z])", "_", camelcased_string).lower()
|
||||
|
||||
@staticmethod
|
||||
def snake_to_camel(snakecased_string: str) -> str:
|
||||
formatted = ''.join(word.title() for word in snakecased_string.split('_'))
|
||||
formatted = "".join(word.title() for word in snakecased_string.split("_"))
|
||||
result = formatted[0].lower() + formatted[1:]
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def snake_to_title(snakecased_string: str) -> str:
|
||||
words = snakecased_string.split('_')
|
||||
words = snakecased_string.split("_")
|
||||
words[0] = words[0].title()
|
||||
return ' '.join(words)
|
||||
return " ".join(words)
|
||||
|
||||
|
||||
def is_country_tag(tag: str) -> bool:
|
||||
"""detect if tag represent a country"""
|
||||
return bool(re.match("^([a-zA-Z]){2}$", tag)) or tag == 'global'
|
||||
return bool(re.match("^([a-zA-Z]){2}$", tag)) or tag == "global"
|
||||
|
||||
|
||||
def enrich_link_str(link: str) -> str:
|
||||
link = link.strip()
|
||||
if link.startswith('www.') or (link.startswith('http') and '//' in link):
|
||||
if link.startswith("www.") or (link.startswith("http") and "//" in link):
|
||||
return f'<a class="auto-link" href="{link}">{link}</a>'
|
||||
return link
|
||||
return link
|
||||
|
||||
|
||||
class URLMatcher:
|
||||
_HTTP_URL_RE_STR = "^https?://(www.)?(.+)$"
|
||||
HTTP_URL_RE = re.compile(_HTTP_URL_RE_STR)
|
||||
UNSAFE_SYMBOLS = ".?"
|
||||
|
||||
@classmethod
|
||||
def extract_main_part(self, url: str) -> str:
|
||||
match = self.HTTP_URL_RE.search(url)
|
||||
if match and match.group(2):
|
||||
return match.group(2).rstrip("/")
|
||||
|
||||
return ""
|
||||
|
||||
@classmethod
|
||||
def make_profile_url_regexp(self, url: str, username_regexp: str = ""):
|
||||
url_main_part = self.extract_main_part(url)
|
||||
for c in self.UNSAFE_SYMBOLS:
|
||||
url_main_part = url_main_part.replace(c, f"\\{c}")
|
||||
prepared_username_regexp = (username_regexp or ".+?").lstrip('^').rstrip('$')
|
||||
|
||||
url_regexp = url_main_part.replace(
|
||||
"{username}", f"({prepared_username_regexp})"
|
||||
)
|
||||
regexp_str = self._HTTP_URL_RE_STR.replace("(.+)", url_regexp)
|
||||
|
||||
return re.compile(regexp_str)
|
||||
|
||||
|
||||
def get_dict_ascii_tree(items, prepend="", new_line=True):
|
||||
text = ""
|
||||
for num, item in enumerate(items):
|
||||
box_symbol = "┣╸" if num != len(items) - 1 else "┗╸"
|
||||
|
||||
if type(item) == tuple:
|
||||
field_name, field_value = item
|
||||
if field_value.startswith("['"):
|
||||
is_last_item = num == len(items) - 1
|
||||
prepend_symbols = " " * 3 if is_last_item else " ┃ "
|
||||
field_value = get_dict_ascii_tree(eval(field_value), prepend_symbols)
|
||||
text += f"\n{prepend}{box_symbol}{field_name}: {field_value}"
|
||||
else:
|
||||
text += f"\n{prepend}{box_symbol} {item}"
|
||||
|
||||
if not new_line:
|
||||
text = text[1:]
|
||||
|
||||
return text
|
||||
|
||||
|
||||
def get_random_user_agent():
|
||||
return random.choice(DEFAULT_USER_AGENTS)
|
||||
|
||||
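A brief, hedged example of the new URLMatcher helper above (the URL and username are illustrative):

from maigret.utils import URLMatcher

regexp = URLMatcher.make_profile_url_regexp("https://github.com/{username}")
match = regexp.match("https://github.com/soxoj")
if match:
    print(match.groups()[-1])  # soxoj
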
+5
-8
@@ -1,4 +1,4 @@
|
||||
aiohttp==3.7.3
|
||||
aiohttp==3.7.4
|
||||
aiohttp-socks==0.5.5
|
||||
arabic-reshaper==2.1.1
|
||||
async-timeout==3.0.1
|
||||
@@ -13,28 +13,25 @@ future==0.18.2
|
||||
future-annotations==1.0.0
|
||||
html5lib==1.1
|
||||
idna==2.10
|
||||
Jinja2==2.11.2
|
||||
lxml==4.6.2
|
||||
Jinja2==2.11.3
|
||||
lxml==4.6.3
|
||||
MarkupSafe==1.1.1
|
||||
mock==4.0.2
|
||||
multidict==5.1.0
|
||||
Pillow==8.1.0
|
||||
pycountry==20.7.3
|
||||
PyPDF2==1.26.0
|
||||
PySocks==1.7.1
|
||||
python-bidi==0.4.2
|
||||
python-socks==1.1.2
|
||||
reportlab==3.5.59
|
||||
requests==2.25.1
|
||||
requests>=2.24.0
|
||||
requests-futures==1.0.0
|
||||
six==1.15.0
|
||||
socid-extractor>=0.0.4
|
||||
socid-extractor>=0.0.19
|
||||
soupsieve==2.1
|
||||
stem==1.8.0
|
||||
torrequest==0.1.0
|
||||
tqdm==4.55.0
|
||||
typing-extensions==3.7.4.3
|
||||
urllib3==1.26.2
|
||||
webencodings==0.5.1
|
||||
xhtml2pdf==0.2.5
|
||||
XMind==1.2.0
|
||||
|
||||
@@ -1,3 +1,9 @@
[egg_info]
tag_build =
tag_date = 0
tag_date = 0

[flake8]
per-file-ignores = __init__.py:F401

[mypy]
ignore_missing_imports = True
@@ -12,7 +12,7 @@ with open('requirements.txt') as rf:
requires = rf.read().splitlines()

setup(name='maigret',
version='0.1.13',
version='0.2.3',
description='Collect a dossier on a person by username from a huge number of sites',
long_description=long_description,
long_description_content_type="text/markdown",

@@ -0,0 +1,4 @@
#!/bin/sh
coverage run --source=./maigret -m pytest tests
coverage report -m
coverage html
+20
-4
@@ -1,15 +1,18 @@
import glob
import logging
import os

import pytest
from _pytest.mark import Mark
from mock import Mock

from maigret.sites import MaigretDatabase, MaigretSite
from maigret.sites import MaigretDatabase
from maigret.maigret import setup_arguments_parser


CUR_PATH = os.path.dirname(os.path.realpath(__file__))
JSON_FILE = os.path.join(CUR_PATH, '../maigret/resources/data.json')
empty_mark = Mark('', [], {})
TEST_JSON_FILE = os.path.join(CUR_PATH, 'db.json')
empty_mark = Mark('', (), {})


def by_slow_marker(item):
@@ -26,7 +29,8 @@ def get_test_reports_filenames():

def remove_test_reports():
reports_list = get_test_reports_filenames()
for f in reports_list: os.remove(f)
for f in reports_list:
os.remove(f)
logging.error(f'Removed test reports {reports_list}')


@@ -37,8 +41,20 @@ def default_db():
return db


@pytest.fixture(scope='function')
def test_db():
db = MaigretDatabase().load_from_file(TEST_JSON_FILE)

return db


@pytest.fixture(autouse=True)
def reports_autoclean():
remove_test_reports()
yield
remove_test_reports()


@pytest.fixture(scope='session')
def argparser():
return setup_arguments_parser()

@@ -0,0 +1,26 @@
{
    "engines": {},
    "sites": {
        "GooglePlayStore": {
            "tags": ["global", "us"],
            "disabled": false,
            "checkType": "status_code",
            "alexaRank": 1,
            "url": "https://play.google.com/store/apps/developer?id={username}",
            "urlMain": "https://play.google.com/store",
            "usernameClaimed": "Facebook_nosuchname",
            "usernameUnclaimed": "noonewouldeverusethis7"
        },
        "Reddit": {
            "tags": ["news", "social", "us"],
            "checkType": "status_code",
            "presenseStrs": ["totalKarma"],
            "disabled": true,
            "alexaRank": 17,
            "url": "https://www.reddit.com/user/{username}",
            "urlMain": "https://www.reddit.com/",
            "usernameClaimed": "blue",
            "usernameUnclaimed": "noonewouldeverusethis7"
        }
    }
}
@@ -1,5 +1,6 @@
"""Maigret activation test functions"""
import json

import aiohttp
import pytest
from mock import Mock
@@ -43,8 +44,9 @@ async def test_import_aiohttp_cookies():

    url = 'https://httpbin.org/cookies'
    connector = aiohttp.TCPConnector(ssl=False)
    session = aiohttp.ClientSession(connector=connector, trust_env=True,
                                    cookie_jar=cookie_jar)
    session = aiohttp.ClientSession(
        connector=connector, trust_env=True, cookie_jar=cookie_jar
    )

    response = await session.get(url=url)
    result = json.loads(await response.content.read())
@@ -0,0 +1,93 @@
"""Maigret command-line arguments parsing tests"""
from argparse import Namespace
from typing import Dict, Any

DEFAULT_ARGS: Dict[str, Any] = {
    'all_sites': False,
    'connections': 100,
    'cookie_file': None,
    'csv': False,
    'db_file': None,
    'debug': False,
    'disable_extracting': False,
    'disable_recursive_search': False,
    'folderoutput': 'reports',
    'html': False,
    'id_type': 'username',
    'ignore_ids_list': [],
    'info': False,
    'json': '',
    'new_site_to_submit': False,
    'no_color': False,
    'no_progressbar': False,
    'parse_url': '',
    'pdf': False,
    'print_check_errors': False,
    'print_not_found': False,
    'proxy': None,
    'retries': 1,
    'self_check': False,
    'site_list': [],
    'stats': False,
    'tags': '',
    'timeout': 30,
    'top_sites': 500,
    'txt': False,
    'use_disabled_sites': False,
    'username': [],
    'verbose': False,
    'xmind': False,
}


def test_args_search_mode(argparser):
    args = argparser.parse_args('username'.split())

    assert args.username == ['username']

    want_args = dict(DEFAULT_ARGS)
    want_args.update({'username': ['username']})

    assert args == Namespace(**want_args)


def test_args_search_mode_several_usernames(argparser):
    args = argparser.parse_args('username1 username2'.split())

    assert args.username == ['username1', 'username2']

    want_args = dict(DEFAULT_ARGS)
    want_args.update({'username': ['username1', 'username2']})

    assert args == Namespace(**want_args)


def test_args_self_check_mode(argparser):
    args = argparser.parse_args('--self-check --site GitHub'.split())

    want_args = dict(DEFAULT_ARGS)
    want_args.update(
        {
            'self_check': True,
            'site_list': ['GitHub'],
            'username': [],
        }
    )

    assert args == Namespace(**want_args)


def test_args_multiple_sites(argparser):
    args = argparser.parse_args(
        '--site GitHub VK --site PornHub --site Taringa,Steam'.split()
    )

    want_args = dict(DEFAULT_ARGS)
    want_args.update(
        {
            'site_list': ['GitHub', 'PornHub', 'Taringa,Steam'],
            'username': ['VK'],
        }
    )

    assert args == Namespace(**want_args)
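Note: the argv strings exercised above can also be fed to the parser directly outside pytest; a minimal sketch, assuming the setup_arguments_parser import already used in conftest.py:

    from maigret.maigret import setup_arguments_parser

    # build the same CLI parser the tests use and inspect the parsed namespace
    parser = setup_arguments_parser()
    args = parser.parse_args(['--self-check', '--site', 'GitHub'])
    print(args.self_check, args.site_list)  # True ['GitHub']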
@@ -0,0 +1,73 @@
"""Maigret checking logic test functions"""
import pytest
import asyncio
import logging
from maigret.executors import (
    AsyncioSimpleExecutor,
    AsyncioProgressbarExecutor,
    AsyncioProgressbarSemaphoreExecutor,
    AsyncioProgressbarQueueExecutor,
)

logger = logging.getLogger(__name__)


async def func(n):
    await asyncio.sleep(0.1 * (n % 3))
    return n


@pytest.mark.asyncio
async def test_simple_asyncio_executor():
    tasks = [(func, [n], {}) for n in range(10)]
    executor = AsyncioSimpleExecutor(logger=logger)
    assert await executor.run(tasks) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    assert executor.execution_time > 0.2
    assert executor.execution_time < 0.3


@pytest.mark.asyncio
async def test_asyncio_progressbar_executor():
    tasks = [(func, [n], {}) for n in range(10)]

    executor = AsyncioProgressbarExecutor(logger=logger)
    # no guarantees for the results order
    assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    assert executor.execution_time > 0.2
    assert executor.execution_time < 0.3


@pytest.mark.asyncio
async def test_asyncio_progressbar_semaphore_executor():
    tasks = [(func, [n], {}) for n in range(10)]

    executor = AsyncioProgressbarSemaphoreExecutor(logger=logger, in_parallel=5)
    # no guarantees for the results order
    assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    assert executor.execution_time > 0.2
    assert executor.execution_time < 0.4


@pytest.mark.asyncio
async def test_asyncio_progressbar_queue_executor():
    tasks = [(func, [n], {}) for n in range(10)]

    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=2)
    assert await executor.run(tasks) == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8]
    assert executor.execution_time > 0.5
    assert executor.execution_time < 0.6

    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=3)
    assert await executor.run(tasks) == [0, 3, 1, 4, 6, 2, 7, 9, 5, 8]
    assert executor.execution_time > 0.4
    assert executor.execution_time < 0.5

    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=5)
    assert await executor.run(tasks) == [0, 3, 6, 1, 4, 7, 9, 2, 5, 8]
    assert executor.execution_time > 0.3
    assert executor.execution_time < 0.4

    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=10)
    assert await executor.run(tasks) == [0, 3, 6, 9, 1, 4, 7, 2, 5, 8]
    assert executor.execution_time > 0.2
    assert executor.execution_time < 0.3
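Note: each task above is a (coroutine function, args, kwargs) tuple; the same executors can be driven with a plain event loop outside pytest. A minimal sketch, assuming the maigret.executors import used in these tests:

    import asyncio
    import logging

    from maigret.executors import AsyncioProgressbarQueueExecutor

    async def fetch(n):
        # stand-in coroutine for a single site check
        await asyncio.sleep(0.1)
        return n

    logger = logging.getLogger(__name__)
    tasks = [(fetch, [n], {}) for n in range(5)]
    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=2)
    results = asyncio.get_event_loop().run_until_complete(executor.run(tasks))
    print(results, executor.execution_time)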
+138
-66
@@ -1,105 +1,177 @@
|
||||
"""Maigret main module test functions"""
|
||||
import asyncio
|
||||
import copy
|
||||
|
||||
import pytest
|
||||
from mock import Mock
|
||||
|
||||
from maigret.maigret import self_check
|
||||
from maigret.sites import MaigretDatabase, MaigretSite
|
||||
from maigret.maigret import self_check, maigret
|
||||
from maigret.maigret import (
|
||||
extract_ids_from_page,
|
||||
extract_ids_from_results,
|
||||
extract_ids_from_url,
|
||||
)
|
||||
from maigret.sites import MaigretSite
|
||||
from maigret.result import QueryResult, QueryStatus
|
||||
|
||||
EXAMPLE_DB = {
|
||||
'engines': {
|
||||
|
||||
RESULTS_EXAMPLE = {
|
||||
'Reddit': {
|
||||
'cookies': None,
|
||||
'parsing_enabled': False,
|
||||
'url_main': 'https://www.reddit.com/',
|
||||
'username': 'Facebook',
|
||||
},
|
||||
'GooglePlayStore': {
|
||||
'cookies': None,
|
||||
'http_status': 200,
|
||||
'is_similar': False,
|
||||
'parsing_enabled': False,
|
||||
'rank': 1,
|
||||
'url_main': 'https://play.google.com/store',
|
||||
'url_user': 'https://play.google.com/store/apps/developer?id=Facebook',
|
||||
'username': 'Facebook',
|
||||
},
|
||||
'sites': {
|
||||
"GooglePlayStore": {
|
||||
"tags": [
|
||||
"global",
|
||||
"us"
|
||||
],
|
||||
"disabled": False,
|
||||
"checkType": "status_code",
|
||||
"alexaRank": 1,
|
||||
"url": "https://play.google.com/store/apps/developer?id={username}",
|
||||
"urlMain": "https://play.google.com/store",
|
||||
"usernameClaimed": "Facebook_nosuchname",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
"Reddit": {
|
||||
"tags": [
|
||||
"news",
|
||||
"social",
|
||||
"us"
|
||||
],
|
||||
"checkType": "status_code",
|
||||
"presenseStrs": [
|
||||
"totalKarma"
|
||||
],
|
||||
"disabled": True,
|
||||
"alexaRank": 17,
|
||||
"url": "https://www.reddit.com/user/{username}",
|
||||
"urlMain": "https://www.reddit.com/",
|
||||
"usernameClaimed": "blue",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_self_check_db_positive_disable():
|
||||
def test_self_check_db_positive_disable(test_db):
|
||||
logger = Mock()
|
||||
db = MaigretDatabase()
|
||||
db.load_from_json(EXAMPLE_DB)
|
||||
|
||||
assert db.sites[0].disabled == False
|
||||
assert test_db.sites[0].disabled is False
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
|
||||
loop.run_until_complete(
|
||||
self_check(test_db, test_db.sites_dict, logger, silent=True)
|
||||
)
|
||||
|
||||
assert db.sites[0].disabled == True
|
||||
assert test_db.sites[0].disabled is True
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_self_check_db_positive_enable():
|
||||
def test_self_check_db_positive_enable(test_db):
|
||||
logger = Mock()
|
||||
db = MaigretDatabase()
|
||||
db.load_from_json(EXAMPLE_DB)
|
||||
|
||||
db.sites[0].disabled = True
|
||||
db.sites[0].username_claimed = 'Facebook'
|
||||
assert db.sites[0].disabled == True
|
||||
test_db.sites[0].disabled = True
|
||||
test_db.sites[0].username_claimed = 'Facebook'
|
||||
assert test_db.sites[0].disabled is True
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
|
||||
loop.run_until_complete(
|
||||
self_check(test_db, test_db.sites_dict, logger, silent=True)
|
||||
)
|
||||
|
||||
assert db.sites[0].disabled == False
|
||||
assert test_db.sites[0].disabled is False
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_self_check_db_negative_disabled():
|
||||
def test_self_check_db_negative_disabled(test_db):
|
||||
logger = Mock()
|
||||
db = MaigretDatabase()
|
||||
db.load_from_json(EXAMPLE_DB)
|
||||
|
||||
db.sites[0].disabled = True
|
||||
assert db.sites[0].disabled == True
|
||||
test_db.sites[0].disabled = True
|
||||
assert test_db.sites[0].disabled is True
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
|
||||
loop.run_until_complete(
|
||||
self_check(test_db, test_db.sites_dict, logger, silent=True)
|
||||
)
|
||||
|
||||
assert db.sites[0].disabled == True
|
||||
assert test_db.sites[0].disabled is True
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_self_check_db_negative_enabled():
|
||||
def test_self_check_db_negative_enabled(test_db):
|
||||
logger = Mock()
|
||||
db = MaigretDatabase()
|
||||
db.load_from_json(EXAMPLE_DB)
|
||||
|
||||
db.sites[0].disabled = False
|
||||
db.sites[0].username_claimed = 'Facebook'
|
||||
assert db.sites[0].disabled == False
|
||||
test_db.sites[0].disabled = False
|
||||
test_db.sites[0].username_claimed = 'Facebook'
|
||||
assert test_db.sites[0].disabled is False
|
||||
|
||||
loop = asyncio.get_event_loop()
|
||||
loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
|
||||
loop.run_until_complete(
|
||||
self_check(test_db, test_db.sites_dict, logger, silent=True)
|
||||
)
|
||||
|
||||
assert db.sites[0].disabled == False
|
||||
assert test_db.sites[0].disabled is False
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_maigret_results(test_db):
|
||||
logger = Mock()
|
||||
|
||||
username = 'Facebook'
|
||||
loop = asyncio.get_event_loop()
|
||||
results = loop.run_until_complete(
|
||||
maigret(username, site_dict=test_db.sites_dict, logger=logger, timeout=30)
|
||||
)
|
||||
|
||||
assert isinstance(results, dict)
|
||||
|
||||
reddit_site = results['Reddit']['site']
|
||||
assert isinstance(reddit_site, MaigretSite)
|
||||
|
||||
assert reddit_site.json == {
|
||||
'tags': ['news', 'social', 'us'],
|
||||
'checkType': 'status_code',
|
||||
'presenseStrs': ['totalKarma'],
|
||||
'disabled': True,
|
||||
'alexaRank': 17,
|
||||
'url': 'https://www.reddit.com/user/{username}',
|
||||
'urlMain': 'https://www.reddit.com/',
|
||||
'usernameClaimed': 'blue',
|
||||
'usernameUnclaimed': 'noonewouldeverusethis7',
|
||||
}
|
||||
|
||||
del results['Reddit']['site']
|
||||
del results['GooglePlayStore']['site']
|
||||
|
||||
reddit_status = results['Reddit']['status']
|
||||
assert isinstance(reddit_status, QueryResult)
|
||||
assert reddit_status.status == QueryStatus.ILLEGAL
|
||||
|
||||
playstore_status = results['GooglePlayStore']['status']
|
||||
assert isinstance(playstore_status, QueryResult)
|
||||
assert playstore_status.status == QueryStatus.CLAIMED
|
||||
|
||||
del results['Reddit']['status']
|
||||
del results['GooglePlayStore']['status']
|
||||
|
||||
assert results['Reddit'].get('future') is None
|
||||
del results['GooglePlayStore']['future']
|
||||
|
||||
assert results == RESULTS_EXAMPLE
|
||||
|
||||
|
||||
def test_extract_ids_from_url(default_db):
|
||||
assert extract_ids_from_url('https://www.reddit.com/user/test', default_db) == {
|
||||
'test': 'username'
|
||||
}
|
||||
assert extract_ids_from_url('https://vk.com/id123', default_db) == {'123': 'vk_id'}
|
||||
assert extract_ids_from_url('https://vk.com/ida123', default_db) == {
|
||||
'ida123': 'username'
|
||||
}
|
||||
assert extract_ids_from_url(
|
||||
'https://my.mail.ru/yandex.ru/dipres8904/', default_db
|
||||
) == {'dipres8904': 'username'}
|
||||
assert extract_ids_from_url(
|
||||
'https://reviews.yandex.ru/user/adbced123', default_db
|
||||
) == {'adbced123': 'yandex_public_id'}
|
||||
|
||||
|
||||
@pytest.mark.slow
|
||||
def test_extract_ids_from_page(test_db):
|
||||
logger = Mock()
|
||||
extract_ids_from_page('https://www.reddit.com/user/test', logger) == {
|
||||
'test': 'username'
|
||||
}
|
||||
|
||||
|
||||
def test_extract_ids_from_results(test_db):
|
||||
TEST_EXAMPLE = copy.deepcopy(RESULTS_EXAMPLE)
|
||||
TEST_EXAMPLE['Reddit']['ids_usernames'] = {'test1': 'yandex_public_id'}
|
||||
TEST_EXAMPLE['Reddit']['ids_links'] = ['https://www.reddit.com/user/test2']
|
||||
|
||||
extract_ids_from_results(TEST_EXAMPLE, test_db) == {
|
||||
'test1': 'yandex_public_id',
|
||||
'test2': 'username',
|
||||
}
|
||||
|
||||
@@ -0,0 +1,64 @@
from maigret.errors import CheckError
from maigret.notify import QueryNotifyPrint
from maigret.result import QueryStatus, QueryResult


def test_notify_illegal():
    n = QueryNotifyPrint(color=False)

    assert (
        n.update(
            QueryResult(
                username="test",
                status=QueryStatus.ILLEGAL,
                site_name="TEST_SITE",
                site_url_user="http://example.com/test",
            )
        )
        == "[-] TEST_SITE: Illegal Username Format For This Site!"
    )


def test_notify_claimed():
    n = QueryNotifyPrint(color=False)

    assert (
        n.update(
            QueryResult(
                username="test",
                status=QueryStatus.CLAIMED,
                site_name="TEST_SITE",
                site_url_user="http://example.com/test",
            )
        )
        == "[+] TEST_SITE: http://example.com/test"
    )


def test_notify_available():
    n = QueryNotifyPrint(color=False)

    assert (
        n.update(
            QueryResult(
                username="test",
                status=QueryStatus.AVAILABLE,
                site_name="TEST_SITE",
                site_url_user="http://example.com/test",
            )
        )
        == "[-] TEST_SITE: Not found!"
    )


def test_notify_unknown():
    n = QueryNotifyPrint(color=False)
    result = QueryResult(
        username="test",
        status=QueryStatus.UNKNOWN,
        site_name="TEST_SITE",
        site_url_user="http://example.com/test",
    )
    result.error = CheckError('Type', 'Reason')

    assert n.update(result) == "[?] TEST_SITE: Type error: Reason"
+235
-70
@@ -1,14 +1,28 @@
|
||||
"""Maigret reports test functions"""
|
||||
import copy
|
||||
import json
|
||||
import os
|
||||
from io import StringIO
|
||||
|
||||
import xmind
|
||||
from jinja2 import Template
|
||||
|
||||
from maigret.report import generate_csv_report, generate_txt_report, save_xmind_report, save_html_report, \
|
||||
save_pdf_report, generate_report_template, generate_report_context
|
||||
from maigret.report import (
|
||||
generate_csv_report,
|
||||
generate_txt_report,
|
||||
save_xmind_report,
|
||||
save_html_report,
|
||||
save_pdf_report,
|
||||
generate_report_template,
|
||||
generate_report_context,
|
||||
generate_json_report,
|
||||
)
|
||||
from maigret.result import QueryResult, QueryStatus
|
||||
from maigret.sites import MaigretSite
|
||||
|
||||
|
||||
GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
|
||||
BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
|
||||
|
||||
EXAMPLE_RESULTS = {
|
||||
'GitHub': {
|
||||
@@ -16,90 +30,212 @@ EXAMPLE_RESULTS = {
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://www.github.com/',
|
||||
'url_user': 'https://www.github.com/test',
|
||||
'status': QueryResult('test',
|
||||
'GitHub',
|
||||
'https://www.github.com/test',
|
||||
QueryStatus.CLAIMED,
|
||||
tags=['test_tag']),
|
||||
'status': QueryResult(
|
||||
'test',
|
||||
'GitHub',
|
||||
'https://www.github.com/test',
|
||||
QueryStatus.CLAIMED,
|
||||
tags=['test_tag'],
|
||||
),
|
||||
'http_status': 200,
|
||||
'is_similar': False,
|
||||
'rank': 78
|
||||
'rank': 78,
|
||||
'site': MaigretSite('test', {}),
|
||||
}
|
||||
}
|
||||
|
||||
GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
|
||||
BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
|
||||
|
||||
GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
|
||||
GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
|
||||
GOOD_500PX_RESULT.ids_data = {"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==", "legacy_id": "26403415",
|
||||
"username": "alexaimephotographycars", "name": "Alex Aim\u00e9",
|
||||
"website": "www.flickr.com/photos/alexaimephotography/",
|
||||
"facebook_link": " www.instagram.com/street.reality.photography/",
|
||||
"instagram_username": "alexaimephotography", "twitter_username": "Alexaimephotogr"}
|
||||
GOOD_500PX_RESULT.ids_data = {
|
||||
"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==",
|
||||
"legacy_id": "26403415",
|
||||
"username": "alexaimephotographycars",
|
||||
"name": "Alex Aim\u00e9",
|
||||
"website": "www.flickr.com/photos/alexaimephotography/",
|
||||
"facebook_link": " www.instagram.com/street.reality.photography/",
|
||||
"instagram_username": "alexaimephotography",
|
||||
"twitter_username": "Alexaimephotogr",
|
||||
}
|
||||
|
||||
GOOD_REDDIT_RESULT = copy.deepcopy(GOOD_RESULT)
|
||||
GOOD_REDDIT_RESULT.tags = ['news', 'us']
|
||||
GOOD_REDDIT_RESULT.ids_data = {"reddit_id": "t5_1nytpy", "reddit_username": "alexaimephotography",
|
||||
"fullname": "alexaimephotography",
|
||||
"image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e",
|
||||
"is_employee": "False", "is_nsfw": "False", "is_mod": "True", "is_following": "True",
|
||||
"has_user_profile": "True", "hide_from_robots": "False",
|
||||
"created_at": "2019-07-10 12:20:03", "total_karma": "53959", "post_karma": "52738"}
|
||||
GOOD_REDDIT_RESULT.ids_data = {
|
||||
"reddit_id": "t5_1nytpy",
|
||||
"reddit_username": "alexaimephotography",
|
||||
"fullname": "alexaimephotography",
|
||||
"image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e",
|
||||
"is_employee": "False",
|
||||
"is_nsfw": "False",
|
||||
"is_mod": "True",
|
||||
"is_following": "True",
|
||||
"has_user_profile": "True",
|
||||
"hide_from_robots": "False",
|
||||
"created_at": "2019-07-10 12:20:03",
|
||||
"total_karma": "53959",
|
||||
"post_karma": "52738",
|
||||
}
|
||||
|
||||
GOOD_IG_RESULT = copy.deepcopy(GOOD_RESULT)
|
||||
GOOD_IG_RESULT.tags = ['photo', 'global']
|
||||
GOOD_IG_RESULT.ids_data = {"instagram_username": "alexaimephotography", "fullname": "Alexaimephotography",
|
||||
"id": "6828488620",
|
||||
"image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F",
|
||||
"bio": "Photographer \nChild of fine street arts",
|
||||
"external_url": "https://www.flickr.com/photos/alexaimephotography2020/"}
|
||||
GOOD_IG_RESULT.ids_data = {
|
||||
"instagram_username": "alexaimephotography",
|
||||
"fullname": "Alexaimephotography",
|
||||
"id": "6828488620",
|
||||
"image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F",
|
||||
"bio": "Photographer \nChild of fine street arts",
|
||||
"external_url": "https://www.flickr.com/photos/alexaimephotography2020/",
|
||||
}
|
||||
|
||||
GOOD_TWITTER_RESULT = copy.deepcopy(GOOD_RESULT)
|
||||
GOOD_TWITTER_RESULT.tags = ['social', 'us']
|
||||
|
||||
TEST = [('alexaimephotographycars', 'username', {
|
||||
'500px': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
|
||||
'url_user': 'https://500px.com/p/alexaimephotographycars',
|
||||
'ids_usernames': {'alexaimephotographycars': 'username', 'alexaimephotography': 'username',
|
||||
'Alexaimephotogr': 'username'}, 'status': GOOD_500PX_RESULT, 'http_status': 200,
|
||||
'is_similar': False, 'rank': 2981},
|
||||
'Reddit': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
|
||||
'url_user': 'https://www.reddit.com/user/alexaimephotographycars', 'status': BAD_RESULT,
|
||||
'http_status': 404, 'is_similar': False, 'rank': 17},
|
||||
'Twitter': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
|
||||
'url_user': 'https://twitter.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 400,
|
||||
'is_similar': False, 'rank': 55},
|
||||
'Instagram': {'username': 'alexaimephotographycars', 'parsing_enabled': True,
|
||||
'url_main': 'https://www.instagram.com/',
|
||||
'url_user': 'https://www.instagram.com/alexaimephotographycars', 'status': BAD_RESULT,
|
||||
'http_status': 404, 'is_similar': False, 'rank': 29}}), ('alexaimephotography', 'username', {
|
||||
'500px': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
|
||||
'url_user': 'https://500px.com/p/alexaimephotography', 'status': BAD_RESULT, 'http_status': 200,
|
||||
'is_similar': False, 'rank': 2981},
|
||||
'Reddit': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
|
||||
'url_user': 'https://www.reddit.com/user/alexaimephotography',
|
||||
'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_REDDIT_RESULT, 'http_status': 200,
|
||||
'is_similar': False, 'rank': 17},
|
||||
'Twitter': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
|
||||
'url_user': 'https://twitter.com/alexaimephotography', 'status': BAD_RESULT, 'http_status': 400,
|
||||
'is_similar': False, 'rank': 55},
|
||||
'Instagram': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
|
||||
'url_user': 'https://www.instagram.com/alexaimephotography',
|
||||
'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_IG_RESULT, 'http_status': 200,
|
||||
'is_similar': False, 'rank': 29}}), ('Alexaimephotogr', 'username', {
|
||||
'500px': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
|
||||
'url_user': 'https://500px.com/p/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 200,
|
||||
'is_similar': False, 'rank': 2981},
|
||||
'Reddit': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
|
||||
'url_user': 'https://www.reddit.com/user/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
|
||||
'is_similar': False, 'rank': 17},
|
||||
'Twitter': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
|
||||
'url_user': 'https://twitter.com/Alexaimephotogr', 'status': GOOD_TWITTER_RESULT, 'http_status': 400,
|
||||
'is_similar': False, 'rank': 55},
|
||||
'Instagram': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
|
||||
'url_user': 'https://www.instagram.com/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
|
||||
'is_similar': False, 'rank': 29}})]
|
||||
TEST = [
|
||||
(
|
||||
'alexaimephotographycars',
|
||||
'username',
|
||||
{
|
||||
'500px': {
|
||||
'username': 'alexaimephotographycars',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://500px.com/',
|
||||
'url_user': 'https://500px.com/p/alexaimephotographycars',
|
||||
'ids_usernames': {
|
||||
'alexaimephotographycars': 'username',
|
||||
'alexaimephotography': 'username',
|
||||
'Alexaimephotogr': 'username',
|
||||
},
|
||||
'status': GOOD_500PX_RESULT,
|
||||
'http_status': 200,
|
||||
'is_similar': False,
|
||||
'rank': 2981,
|
||||
},
|
||||
'Reddit': {
|
||||
'username': 'alexaimephotographycars',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://www.reddit.com/',
|
||||
'url_user': 'https://www.reddit.com/user/alexaimephotographycars',
|
||||
'status': BAD_RESULT,
|
||||
'http_status': 404,
|
||||
'is_similar': False,
|
||||
'rank': 17,
|
||||
},
|
||||
'Twitter': {
|
||||
'username': 'alexaimephotographycars',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://www.twitter.com/',
|
||||
'url_user': 'https://twitter.com/alexaimephotographycars',
|
||||
'status': BAD_RESULT,
|
||||
'http_status': 400,
|
||||
'is_similar': False,
|
||||
'rank': 55,
|
||||
},
|
||||
'Instagram': {
|
||||
'username': 'alexaimephotographycars',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://www.instagram.com/',
|
||||
'url_user': 'https://www.instagram.com/alexaimephotographycars',
|
||||
'status': BAD_RESULT,
|
||||
'http_status': 404,
|
||||
'is_similar': False,
|
||||
'rank': 29,
|
||||
},
|
||||
},
|
||||
),
|
||||
(
|
||||
'alexaimephotography',
|
||||
'username',
|
||||
{
|
||||
'500px': {
|
||||
'username': 'alexaimephotography',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://500px.com/',
|
||||
'url_user': 'https://500px.com/p/alexaimephotography',
|
||||
'status': BAD_RESULT,
|
||||
'http_status': 200,
|
||||
'is_similar': False,
|
||||
'rank': 2981,
|
||||
},
|
||||
'Reddit': {
|
||||
'username': 'alexaimephotography',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://www.reddit.com/',
|
||||
'url_user': 'https://www.reddit.com/user/alexaimephotography',
|
||||
'ids_usernames': {'alexaimephotography': 'username'},
|
||||
'status': GOOD_REDDIT_RESULT,
|
||||
'http_status': 200,
|
||||
'is_similar': False,
|
||||
'rank': 17,
|
||||
},
|
||||
'Twitter': {
|
||||
'username': 'alexaimephotography',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://www.twitter.com/',
|
||||
'url_user': 'https://twitter.com/alexaimephotography',
|
||||
'status': BAD_RESULT,
|
||||
'http_status': 400,
|
||||
'is_similar': False,
|
||||
'rank': 55,
|
||||
},
|
||||
'Instagram': {
|
||||
'username': 'alexaimephotography',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://www.instagram.com/',
|
||||
'url_user': 'https://www.instagram.com/alexaimephotography',
|
||||
'ids_usernames': {'alexaimephotography': 'username'},
|
||||
'status': GOOD_IG_RESULT,
|
||||
'http_status': 200,
|
||||
'is_similar': False,
|
||||
'rank': 29,
|
||||
},
|
||||
},
|
||||
),
|
||||
(
|
||||
'Alexaimephotogr',
|
||||
'username',
|
||||
{
|
||||
'500px': {
|
||||
'username': 'Alexaimephotogr',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://500px.com/',
|
||||
'url_user': 'https://500px.com/p/Alexaimephotogr',
|
||||
'status': BAD_RESULT,
|
||||
'http_status': 200,
|
||||
'is_similar': False,
|
||||
'rank': 2981,
|
||||
},
|
||||
'Reddit': {
|
||||
'username': 'Alexaimephotogr',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://www.reddit.com/',
|
||||
'url_user': 'https://www.reddit.com/user/Alexaimephotogr',
|
||||
'status': BAD_RESULT,
|
||||
'http_status': 404,
|
||||
'is_similar': False,
|
||||
'rank': 17,
|
||||
},
|
||||
'Twitter': {
|
||||
'username': 'Alexaimephotogr',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://www.twitter.com/',
|
||||
'url_user': 'https://twitter.com/Alexaimephotogr',
|
||||
'status': GOOD_TWITTER_RESULT,
|
||||
'http_status': 400,
|
||||
'is_similar': False,
|
||||
'rank': 55,
|
||||
},
|
||||
'Instagram': {
|
||||
'username': 'Alexaimephotogr',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://www.instagram.com/',
|
||||
'url_user': 'https://www.instagram.com/Alexaimephotogr',
|
||||
'status': BAD_RESULT,
|
||||
'http_status': 404,
|
||||
'is_similar': False,
|
||||
'rank': 29,
|
||||
},
|
||||
},
|
||||
),
|
||||
]
|
||||
|
||||
SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""
|
||||
|
||||
@@ -146,6 +282,32 @@ def test_generate_txt_report():
|
||||
]
|
||||
|
||||
|
||||
def test_generate_json_simple_report():
|
||||
jsonfile = StringIO()
|
||||
MODIFIED_RESULTS = dict(EXAMPLE_RESULTS)
|
||||
MODIFIED_RESULTS['GitHub2'] = EXAMPLE_RESULTS['GitHub']
|
||||
generate_json_report('test', MODIFIED_RESULTS, jsonfile, 'simple')
|
||||
|
||||
jsonfile.seek(0)
|
||||
data = jsonfile.readlines()
|
||||
|
||||
assert len(data) == 1
|
||||
assert list(json.loads(data[0]).keys()) == ['GitHub', 'GitHub2']
|
||||
|
||||
|
||||
def test_generate_json_ndjson_report():
|
||||
jsonfile = StringIO()
|
||||
MODIFIED_RESULTS = dict(EXAMPLE_RESULTS)
|
||||
MODIFIED_RESULTS['GitHub2'] = EXAMPLE_RESULTS['GitHub']
|
||||
generate_json_report('test', MODIFIED_RESULTS, jsonfile, 'ndjson')
|
||||
|
||||
jsonfile.seek(0)
|
||||
data = jsonfile.readlines()
|
||||
|
||||
assert len(data) == 2
|
||||
assert json.loads(data[0])['sitename'] == 'GitHub'
|
||||
|
||||
|
||||
def test_save_xmind_report():
|
||||
filename = 'report_test.xmind'
|
||||
save_xmind_report(filename, 'test', EXAMPLE_RESULTS)
|
||||
@@ -160,7 +322,10 @@ def test_save_xmind_report():
|
||||
assert data['topic']['topics'][0]['title'] == 'Undefined'
|
||||
assert data['topic']['topics'][1]['title'] == 'test_tag'
|
||||
assert len(data['topic']['topics'][1]['topics']) == 1
|
||||
assert data['topic']['topics'][1]['topics'][0]['label'] == 'https://www.github.com/test'
|
||||
assert (
|
||||
data['topic']['topics'][1]['topics'][0]['label']
|
||||
== 'https://www.github.com/test'
|
||||
)
|
||||
|
||||
|
||||
def test_html_report():
|
||||
|
||||
+31
-20
@@ -1,35 +1,30 @@
|
||||
"""Maigret Database test functions"""
|
||||
from maigret.sites import MaigretDatabase, MaigretSite
|
||||
|
||||
|
||||
EXAMPLE_DB = {
|
||||
'engines': {
|
||||
"XenForo": {
|
||||
"presenseStrs": ["XenForo"],
|
||||
"site": {
|
||||
"absenceStrs": [
|
||||
"The specified member cannot be found. Please enter a member's entire name.",
|
||||
],
|
||||
"checkType": "message",
|
||||
"errors": {
|
||||
"You must be logged-in to do that.": "Login required"
|
||||
"presenseStrs": ["XenForo"],
|
||||
"site": {
|
||||
"absenceStrs": [
|
||||
"The specified member cannot be found. Please enter a member's entire name.",
|
||||
],
|
||||
"checkType": "message",
|
||||
"errors": {"You must be logged-in to do that.": "Login required"},
|
||||
"url": "{urlMain}{urlSubpath}/members/?username={username}",
|
||||
},
|
||||
"url": "{urlMain}{urlSubpath}/members/?username={username}"
|
||||
}
|
||||
},
|
||||
},
|
||||
'sites': {
|
||||
"Amperka": {
|
||||
"engine": "XenForo",
|
||||
"rank": 121613,
|
||||
"tags": [
|
||||
"ru"
|
||||
],
|
||||
"urlMain": "http://forum.amperka.ru",
|
||||
"usernameClaimed": "adam",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
||||
"engine": "XenForo",
|
||||
"rank": 121613,
|
||||
"tags": ["ru"],
|
||||
"urlMain": "http://forum.amperka.ru",
|
||||
"usernameClaimed": "adam",
|
||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||
},
|
||||
}
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
@@ -108,11 +103,26 @@ def test_saving_site_error():

    amperka = db.sites[0]
    assert len(amperka.errors) == 2
    assert len(amperka.errors_dict) == 2

    assert amperka.strip_engine_data().errors == {'error1': 'text1'}
    assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}


def test_site_url_detector():
    db = MaigretDatabase()
    db.load_from_json(EXAMPLE_DB)

    assert (
        db.sites[0].url_regexp.pattern
        == r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
    )
    assert (
        db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test')
        == 'test'
    )


def test_ranked_sites_dict():
    db = MaigretDatabase()
    db.update_site(MaigretSite('3', {'alexaRank': 1000, 'engine': 'ucoz'}))
@@ -159,6 +169,7 @@ def test_ranked_sites_dict_disabled():
    assert len(db.ranked_sites_dict()) == 2
    assert len(db.ranked_sites_dict(disabled=False)) == 1


def test_ranked_sites_dict_id_type():
    db = MaigretDatabase()
    db.update_site(MaigretSite('1', {}))
+116
-17
@@ -1,34 +1,133 @@
|
||||
"""Maigret utils test functions"""
|
||||
from maigret.utils import CaseConverter, is_country_tag, enrich_link_str
|
||||
import itertools
|
||||
import re
|
||||
|
||||
from maigret.utils import (
|
||||
CaseConverter,
|
||||
is_country_tag,
|
||||
enrich_link_str,
|
||||
URLMatcher,
|
||||
get_dict_ascii_tree,
|
||||
)
|
||||
|
||||
|
||||
def test_case_convert_camel_to_snake():
|
||||
a = 'SnakeCasedString'
|
||||
b = CaseConverter.camel_to_snake(a)
|
||||
a = 'SnakeCasedString'
|
||||
b = CaseConverter.camel_to_snake(a)
|
||||
|
||||
assert b == 'snake_cased_string'
|
||||
|
||||
assert b == 'snake_cased_string'
|
||||
|
||||
def test_case_convert_snake_to_camel():
|
||||
a = 'camel_cased_string'
|
||||
b = CaseConverter.snake_to_camel(a)
|
||||
a = 'camel_cased_string'
|
||||
b = CaseConverter.snake_to_camel(a)
|
||||
|
||||
assert b == 'camelCasedString'
|
||||
|
||||
assert b == 'camelCasedString'
|
||||
|
||||
def test_case_convert_snake_to_title():
|
||||
a = 'camel_cased_string'
|
||||
b = CaseConverter.snake_to_title(a)
|
||||
a = 'camel_cased_string'
|
||||
b = CaseConverter.snake_to_title(a)
|
||||
|
||||
assert b == 'Camel cased string'
|
||||
|
||||
|
||||
def test_case_convert_camel_with_digits_to_snake():
|
||||
a = 'ignore403'
|
||||
b = CaseConverter.camel_to_snake(a)
|
||||
|
||||
assert b == 'ignore403'
|
||||
|
||||
assert b == 'Camel cased string'
|
||||
|
||||
def test_is_country_tag():
|
||||
assert is_country_tag('ru') == True
|
||||
assert is_country_tag('FR') == True
|
||||
assert is_country_tag('ru') is True
|
||||
assert is_country_tag('FR') is True
|
||||
|
||||
assert is_country_tag('a1') == False
|
||||
assert is_country_tag('dating') == False
|
||||
assert is_country_tag('a1') is False
|
||||
assert is_country_tag('dating') is False
|
||||
|
||||
assert is_country_tag('global') is True
|
||||
|
||||
assert is_country_tag('global') == True
|
||||
|
||||
def test_enrich_link_str():
|
||||
assert enrich_link_str('test') == 'test'
|
||||
assert enrich_link_str(' www.flickr.com/photos/alexaimephotography/') == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
|
||||
assert enrich_link_str('test') == 'test'
|
||||
assert (
|
||||
enrich_link_str(' www.flickr.com/photos/alexaimephotography/')
|
||||
== '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
|
||||
)
|
||||
|
||||
|
||||
def test_url_extract_main_part():
    url_main_part = 'flickr.com/photos/alexaimephotography'

    parts = [
        ['http://', 'https://'],
        ['www.', ''],
        [url_main_part],
        ['/', ''],
    ]

    url_regexp = re.compile('^https?://(www.)?flickr.com/photos/(.+?)$')
    # combine parts variations
    for url_parts in itertools.product(*parts):
        url = ''.join(url_parts)
        # ensure all combinations give valid main part
        assert URLMatcher.extract_main_part(url) == url_main_part
        assert not url_regexp.match(url) is None


def test_url_make_profile_url_regexp():
    url_main_part = 'flickr.com/photos/{username}'

    parts = [
        ['http://', 'https://'],
        ['www.', ''],
        [url_main_part],
        ['/', ''],
    ]

    # combine parts variations
    for url_parts in itertools.product(*parts):
        url = ''.join(url_parts)
        # ensure all combinations match pattern
        assert (
            URLMatcher.make_profile_url_regexp(url).pattern
            == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
        )


def test_get_dict_ascii_tree():
    data = {
        'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==',
        'legacy_id': '26403415',
        'username': 'alexaimephotographycars',
        'name': 'Alex Aimé',
        'links': "['www.instagram.com/street.reality.photography/']",
        'created_at': '2018-05-04T10:17:01.000+0000',
        'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b',
        'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201',
        'website': 'www.instagram.com/street.reality.photography/',
        'facebook_link': ' www.instagram.com/street.reality.photography/',
        'instagram_username': 'Street.Reality.Photography',
        'twitter_username': 'Alexaimephotogr',
    }

    ascii_tree = get_dict_ascii_tree(data.items(), prepend=" ")

    assert (
        ascii_tree
        == """
 ┣╸uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==
 ┣╸legacy_id: 26403415
 ┣╸username: alexaimephotographycars
 ┣╸name: Alex Aimé
 ┣╸links:
 ┃ ┗╸ www.instagram.com/street.reality.photography/
 ┣╸created_at: 2018-05-04T10:17:01.000+0000
 ┣╸image: https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b
 ┣╸image_bg: https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201
 ┣╸website: www.instagram.com/street.reality.photography/
 ┣╸facebook_link: www.instagram.com/street.reality.photography/
 ┣╸instagram_username: Street.Reality.Photography
 ┗╸twitter_username: Alexaimephotogr"""
    )
Executable +57
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
import random
from argparse import ArgumentParser, RawDescriptionHelpFormatter

from maigret.maigret import MaigretDatabase
from maigret.submit import get_alexa_rank


def update_tags(site):
    tags = []
    if not site.tags:
        print(f'Site {site.name} doesn\'t have tags')
    else:
        tags = site.tags
        print(f'Site {site.name} tags: ' + ', '.join(tags))

    print(f'URL: {site.url_main}')

    new_tags = set(input('Enter new tags: ').split(', '))
    if "disabled" in new_tags:
        new_tags.remove("disabled")
        site.disabled = True

    print(f'Old alexa rank: {site.alexa_rank}')
    rank = get_alexa_rank(site.url_main)
    if rank:
        print(f'New alexa rank: {rank}')
        site.alexa_rank = rank

    site.tags = [x for x in list(new_tags) if x]


if __name__ == '__main__':
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
                            )
    parser.add_argument("--base","-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")

    pool = list()

    args = parser.parse_args()

    db = MaigretDatabase()
    db.load_from_file(args.base_file).sites

    while True:
        site = random.choice(db.sites)
        if site.engine == 'uCoz':
            continue

        if not 'in' in site.tags:
            continue

        update_tags(site)

        db.save_to_file(args.base_file)
+34 -21
@@ -20,8 +20,9 @@ RANKS.update({
    '5000': '5K',
    '10000': '10K',
    '100000': '100K',
    '10000000': '1M',
    '50000000': '10M',
    '10000000': '10M',
    '50000000': '50M',
    '100000000': '100M',
})

SEMAPHORE = threading.Semaphore(10)
@@ -36,15 +37,15 @@ def get_rank(domain_to_query, site, print_errors=True):
    try:
        #Get ranking for this site.
        site.alexa_rank = int(root.find('.//REACH').attrib['RANK'])
        country = root.find('.//COUNTRY')
        if not country is None and country.attrib:
            country_code = country.attrib['CODE']
            tags = set(site.tags)
            if country_code:
                tags.add(country_code.lower())
            site.tags = sorted(list(tags))
            if site.type != 'username':
                site.disabled = False
        # country = root.find('.//COUNTRY')
        # if not country is None and country.attrib:
        # country_code = country.attrib['CODE']
        # tags = set(site.tags)
        # if country_code:
        # tags.add(country_code.lower())
        # site.tags = sorted(list(tags))
        # if site.type != 'username':
        # site.disabled = False
    except Exception as e:
        if print_errors:
            logging.error(e)
@@ -58,8 +59,9 @@ def get_rank(domain_to_query, site, print_errors=True):
def get_step_rank(rank):
    def get_readable_rank(r):
        return RANKS[str(r)]

    valid_step_ranks = sorted(map(int, RANKS.keys()))
    if rank == 0:
    if rank == 0 or rank == sys.maxsize:
        return get_readable_rank(valid_step_ranks[-1])
    else:
        return get_readable_rank(list(filter(lambda x: x >= rank, valid_step_ranks))[0])
@@ -72,7 +74,10 @@ if __name__ == '__main__':
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")

    parser.add_argument('--with-rank', help='update with use of local data only', action='store_true')
    parser.add_argument('--empty-only', help='update only sites without rating', action='store_true')
    parser.add_argument('--exclude-engine', help='do not update score with certain engine',
                        action="append", dest="exclude_engine_list", default=[])

    pool = list()

@@ -83,26 +88,31 @@ if __name__ == '__main__':

    with open("sites.md", "w") as site_file:
        site_file.write(f"""
## List of supported sites: total {len(sites_subset)}\n
## List of supported sites (search methods): total {len(sites_subset)}\n
Rank data fetched from Alexa by domains.

""")

        for site in sites_subset:
            if not args.with_rank:
                break
            url_main = site.url_main
            if site.alexa_rank < sys.maxsize and args.empty_only:
                continue
            if args.exclude_engine_list and site.engine in args.exclude_engine_list:
                continue
            site.alexa_rank = 0
            th = threading.Thread(target=get_rank, args=(url_main, site))
            th = threading.Thread(target=get_rank, args=(url_main, site,))
            pool.append((site.name, url_main, th))
            th.start()

        index = 1
        for site_name, url_main, th in pool:
            th.join()
            sys.stdout.write("\r{0}".format(f"Updated {index} out of {len(sites_subset)} entries"))
            sys.stdout.flush()
            index = index + 1
        if args.with_rank:
            index = 1
            for site_name, url_main, th in pool:
                th.join()
                sys.stdout.write("\r{0}".format(f"Updated {index} out of {len(sites_subset)} entries"))
                sys.stdout.flush()
                index = index + 1

        sites_full_list = [(s, s.alexa_rank) for s in sites_subset]

@@ -117,11 +127,14 @@ Rank data fetched from Alexa by domains.
            url_main = site.url_main
            valid_rank = get_step_rank(rank)
            all_tags = site.tags
            all_tags.sort()
            tags = ', ' + ', '.join(all_tags) if all_tags else ''
            note = ''
            if site.disabled:
                note = ', search is disabled'
            site_file.write(f'1. [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n')

            favicon = f""
            site_file.write(f'1. {favicon} [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n')
            db.update_site(site)

        site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')
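Note: with the corrected RANKS steps above, get_step_rank picks the first step that is not below the raw Alexa rank, and both 0 and sys.maxsize (rank unknown) now fall back to the largest step. A small sketch of the expected behaviour, assuming '100000000' ('100M') stays the largest key in RANKS:

    import sys

    # unknown ranks are reported with the largest bucket
    assert get_step_rank(0) == '100M'
    assert get_step_rank(sys.maxsize) == '100M'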
@@ -0,0 +1,71 @@
#!/usr/bin/env python3
import asyncio
import logging
import maigret


# top popular sites from the Maigret database
TOP_SITES_COUNT = 300
# Maigret HTTP requests timeout
TIMEOUT = 10
# max parallel requests
MAX_CONNECTIONS = 50


if __name__ == '__main__':
    # setup logging and asyncio
    logger = logging.getLogger('maigret')
    logger.setLevel(logging.WARNING)
    loop = asyncio.get_event_loop()

    # setup Maigret
    db = maigret.MaigretDatabase().load_from_file('./maigret/resources/data.json')
    # also can be downloaded from web
    # db = MaigretDatabase().load_from_url(MAIGRET_DB_URL)

    # user input
    username = input('Enter username to search: ')

    sites_count_raw = input(
        f'Select the number of sites to search ({TOP_SITES_COUNT} for default, {len(db.sites_dict)} max): '
    )
    sites_count = int(sites_count_raw) or TOP_SITES_COUNT

    sites = db.ranked_sites_dict(top=sites_count)

    show_progressbar_raw = input('Do you want to show a progressbar? [Yn] ')
    show_progressbar = show_progressbar_raw.lower() != 'n'

    extract_info_raw = input(
        'Do you want to extract additional info from accounts\' pages? [Yn] '
    )
    extract_info = extract_info_raw.lower() != 'n'

    use_notifier_raw = input(
        'Do you want to use notifier for displaying results while searching? [Yn] '
    )
    use_notifier = use_notifier_raw.lower() != 'n'

    notifier = None
    if use_notifier:
        notifier = maigret.Notifier(print_found_only=True, skip_check_errors=True)

    # search!
    search_func = maigret.search(
        username=username,
        site_dict=sites,
        timeout=TIMEOUT,
        logger=logger,
        max_connections=MAX_CONNECTIONS,
        query_notify=notifier,
        no_progressbar=(not show_progressbar),
        is_parsing_enabled=extract_info,
    )

    results = loop.run_until_complete(search_func)

    input('Search completed. Press any key to show results.')

    for sitename, data in results.items():
        is_found = data['status'].is_found()
        print(f'{sitename} - {"Found!" if is_found else "Not found"}')
||||
Reference in New Issue
Block a user