Mirror of https://github.com/soxoj/maigret.git (synced 2026-05-08 07:34:30 +00:00)

Compare commits: 153 commits
| SHA1 |
|---|
| 37854a867b |
| 6480eebbdf |
| aad862b2ed |
| c6d0f332bd |
| f1c006159e |
| 69a09fcd94 |
| 9f948928e6 |
| a3034c11ff |
| d47c72b972 |
| 8062ec30e9 |
| 32000a1cfd |
| 8af6ce3af5 |
| 0dd1dd5d76 |
| 4aab21046b |
| 92ac9ec8b7 |
| ca2c8b3502 |
| 4362a41fca |
| c7977f1cdf |
| 49708da980 |
| bc1398061f |
| e8634c8c56 |
| dc59b93f38 |
| c727cbae27 |
| e6c6cc8f6d |
| c80e8b1207 |
| 6e78fdeb81 |
| 9c22e09808 |
| f057fd3a68 |
| 9b0acc092a |
| e6b4cdfa77 |
| eb721dc7e3 |
| eba0c4531c |
| b4a26c03fe |
| 9b7f36dc24 |
| 05167ad30c |
| cee6f0aa43 |
| 02cf330e37 |
| 5c8f7a3af0 |
| 13e1b6f4d1 |
| 5179cb56eb |
| 1a2c7e944a |
| f7eae046a1 |
| bdff08cb70 |
| a468cb1cd3 |
| 0fe933e8a1 |
| 5c3de91181 |
| 3356463102 |
| 7ac03cf5ca |
| 4aeacef07d |
| 8de1830cf3 |
| ba6169659e |
| 4a5c5c3f07 |
| 4ba7fcb1ff |
| a76f95858f |
| bea900dda0 |
| bb1bde833d |
| 5b405c6abb |
| 99fa58ceed |
| c71e404f63 |
| 2c04ccce57 |
| 435db7cdc9 |
| 413a0502a4 |
| 2aedcc3166 |
| 28835204f5 |
| b11a247dfd |
| c9219d91ec |
| aa6cd0eca9 |
| 38e5d5c664 |
| 8a562d06ae |
| aa50ee9672 |
| 51327f9647 |
| 4a368c9bb6 |
| 6fd5f6e33a |
| fa3db9c39c |
| 5912ad4fbc |
| ee36dc0187 |
| 9eb62e4e22 |
| ead048af93 |
| acc751ff98 |
| b7bdd71cf0 |
| 43f189f774 |
| 5bda7fb339 |
| 414523a8ac |
| 6d4e268706 |
| b696b982f4 |
| d4234036c0 |
| b57c70091c |
| e90df3560b |
| bc6ee48b8c |
| e70bdf3789 |
| 84f9d417cf |
| 4333c40be7 |
| 9e504c0094 |
| 2f752a0368 |
| 53e9dab677 |
| 11b70a2a48 |
| 960708ef2e |
| e6f6d8735d |
| f77d7d307a |
| 158f739a59 |
| b6a207d0e3 |
| d59867b0d9 |
| 2145027196 |
| 386e9eba4f |
| 0e9655c46a |
| 009d51c380 |
| 78e9688ece |
| 3cbb9df7b3 |
| 2fb1f19948 |
| 3b91a9cd31 |
| 9858e71349 |
| c88e194d07 |
| ad5c7fbc7d |
| 66d6c7a93c |
| bdfb4911ce |
| 951be44452 |
| 188edc1b7f |
| ec0d3a1f70 |
| a084203ee1 |
| 1afdda7336 |
| 252d12ff9e |
| 6afb17e24f |
| 7fdd965bb2 |
| 8e30e969f9 |
| 5ee91f6659 |
| 7fd4a2c516 |
| bfa6afac32 |
| bfaf276f6e |
| c9194b20ba |
| a30a012550 |
| 2cdc9bb276 |
| 99fc6c8a8f |
| b269c4a8e0 |
| f43dc5bd6f |
| 83cda9e37f |
| cc3df85690 |
| 8007e92021 |
| daaddbde4e |
| cea5073962 |
| b345512489 |
| 786cb59145 |
| 481baddec6 |
| ecb3d76581 |
| 8a8fab5bed |
| 2fee65fe4e |
| dabba859f3 |
| 74d4d40abd |
| d6f6d78d3f |
| 1b61c5085e |
| 01e20518c1 |
| 8477385289 |
| 491dd8f166 |
| c64b7a1c85 |
@@ -0,0 +1,13 @@
---
name: Add a site
about: I want to add a new site for Maigret checks
title: New site
labels: new-site
assignees: soxoj

---

Link to the site main page: https://example.com
Link to an existing account: https://example.com/users/john
Link to a nonexistent account: https://example.com/users/noonewouldeverusethis7
Tags: photo, us, ...
@@ -0,0 +1,32 @@
name: Build docker image and push to DockerHub

on:
  push:
    branches: [ main ]

jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
      -
        name: Set up QEMU
        uses: docker/setup-qemu-action@v1
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v1
      -
        name: Login to DockerHub
        uses: docker/login-action@v1
        with:
          username: ${{ secrets.DOCKER_HUB_USERNAME }}
          password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
      -
        name: Build and push
        id: docker_build
        uses: docker/build-push-action@v2
        with:
          push: true
          tags: ${{ secrets.DOCKER_HUB_USERNAME }}/maigret:latest
      -
        name: Image digest
        run: echo ${{ steps.docker_build.outputs.digest }}
@@ -26,7 +26,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        python -m pip install flake8 pytest pytest-rerunfailures
+        python -m pip install -r test-requirements.txt
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
     - name: Test with pytest
       run: |
+6 -3
@@ -22,9 +22,12 @@ src/
 # Comma-Separated Values (CSV) Reports
 *.csv

-# Excluded sites list
-tests/.excluded_sites
-
 # MacOS Folder Metadata File
 .DS_Store
 /reports/
+
+# Testing
+.coverage
+dist/
+htmlcov/
+/test_*
@@ -2,6 +2,47 @@

 ## [Unreleased]

+## [0.3.1] - 2021-10-31
+* fixed false positives
+* accelerated maigret start time by 3 times
+
+## [0.3.0] - 2021-06-02
+* added support of Tor and I2P sites
+* added experimental DNS checking feature
+* implemented sorting by data points for reports
+* reports fixes
+
+## [0.2.4] - 2021-05-18
+* CLI output report
+* various improvements
+
+## [0.2.3] - 2021-05-12
+* added Yelp and yelp_userid support
+* tags markup stabilization
+* improved errors detection
+
+## [0.2.2] - 2021-05-07
+* improved ids extractors
+* updated sites and engines
+* updated CLI options
+
+## [0.2.1] - 2021-05-02
+* fixed json reports generation bug, added tests
+
+## [0.2.0] - 2021-05-02
+* added `--retries` option
+* added `source` feature for sites' mirrors
+* improved `submit` mode
+* lots of style and logic fixes
+
+## [0.1.20] - 2021-05-02 [YANKED]
+
+## [0.1.19] - 2021-04-14
+* added `--no-progressbar` option
+* fixed ascii tree bug
+* fixed `python -m maigret` run
+* fixed requests freeze with timeout async tasks
+
 ## [0.1.18] - 2021-03-30
 * some API improvements
@@ -0,0 +1,128 @@
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
  overall community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or
  advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
  address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
https://t.me/soxoj.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series
of actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within
the community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.

Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see the FAQ at
https://www.contributor-covenant.org/faq. Translations are available at
https://www.contributor-covenant.org/translations.
@@ -0,0 +1,30 @@
# How to contribute

Hey! I'm really glad you're reading this. Maigret contains a lot of sites, and it is very hard to keep all of them operational. That's why any fix is important.

## How to add a new site

#### Beginner level

You can use Maigret **submit mode** (`maigret --submit URL`) to add a new site or update an existing one. In this mode Maigret does an automatic analysis of the given account URL or site main page URL to determine the site engine and the methods to check account presence. After checking, Maigret asks if you want to add the site; answering y/Y will rewrite the local database.

#### Advanced level

You can edit [the database JSON file](https://github.com/soxoj/maigret/blob/main/maigret/resources/data.json) (`./maigret/resources/data.json`) manually, as sketched after this section.

## Testing

There are CI checks for every PR to the Maigret repository, but it is better to run `make format`, `make lint`, and `make test` locally to ensure your changes are correct.

## Submitting changes

To submit your changes you must [send a GitHub PR](https://github.com/soxoj/maigret/pulls) to the Maigret project.
Always write a clear log message for your commits. One-line messages are fine for small changes, but bigger changes should look like this:

    $ git commit -m "A brief summary of the commit
    >
    > A paragraph describing what changed and its impact."

## Coding conventions

Start reading the code and you'll get the hang of it. ;)
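For the advanced path, a quick sanity check of `data.json` before opening a PR can save a CI round-trip. A minimal sketch using only the standard library (the top-level `sites` mapping and the example site name are assumptions about the database layout, not guaranteed by this page):

```python
import json

# Hedged sketch: inspect the local Maigret database after a manual edit.
# The top-level 'sites' key is an assumption about data.json's structure.
with open('maigret/resources/data.json', encoding='utf-8') as f:
    db = json.load(f)

sites = db.get('sites', {})
print(f'{len(sites)} sites in the database')

# Look up the entry you just added or edited by its name (example name).
entry = sites.get('GitHub')
if entry:
    print(json.dumps(entry, indent=2, ensure_ascii=False)[:500])
```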
+8 -17
@@ -1,25 +1,16 @@
-FROM python:3.7
-LABEL maintainer="Soxoj <soxoj@protonmail.com>"
+FROM python:3.9
+MAINTAINER Soxoj <soxoj@protonmail.com>

 WORKDIR /app

-ADD requirements.txt .
-
 RUN pip install --upgrade pip
-RUN apt update -y
-
-RUN apt install -y\
+RUN apt update && \
+    apt install -y \
     gcc \
     musl-dev \
     libxml2 \
     libxml2-dev \
-    libxslt-dev \
-    && YARL_NO_EXTENSIONS=1 python3 -m pip install maigret \
-    && rm -rf /var/cache/apk/* \
-    /tmp/* \
-    /var/tmp/*
+    libxslt-dev
+RUN apt clean \
+    && rm -rf /var/lib/apt/lists/* /tmp/*

 ADD . .
+RUN YARL_NO_EXTENSIONS=1 python3 -m pip install .
 ENTRYPOINT ["maigret"]
@@ -0,0 +1,35 @@
LINT_FILES=maigret wizard.py tests

test:
	coverage run --source=./maigret -m pytest tests
	coverage report -m
	coverage html

rerun-tests:
	pytest --lf -vv

lint:
	@echo 'syntax errors or undefined names'
	flake8 --count --select=E9,F63,F7,F82 --show-source --statistics ${LINT_FILES} maigret.py

	@echo 'warning'
	flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503 ${LINT_FILES} maigret.py

	@echo 'mypy'
	mypy ${LINT_FILES}

format:
	@echo 'black'
	black --skip-string-normalization ${LINT_FILES}

pull:
	git stash
	git checkout main
	git pull origin main
	git stash pop

clean:
	rm -rf reports htmlcov dist

install:
	pip3 install .
@@ -1,39 +1,55 @@
 # Maigret

-[](https://gitter.im/maigret-osint/community)
+<p align="center">
+  <a href="https://pypi.org/project/maigret/">
+    <img alt="PyPI" src="https://img.shields.io/pypi/v/maigret?style=flat-square">
+  </a>
+  <a href="https://pypi.org/project/maigret/">
+    <img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dw/maigret?style=flat-square">
+  </a>
+  <a href="https://pypi.org/project/maigret/">
+    <img alt="Views" src="https://komarev.com/ghpvc/?username=maigret&color=brightgreen&label=views&style=flat-square">
+  </a>
+</p>
 <p align="center">
-  <img src="./static/maigret.png" />
+  <img src="https://raw.githubusercontent.com/soxoj/maigret/main/static/maigret.png" height="200"/>
 </p>

 <i>The Commissioner Jules Maigret is a fictional French police detective, created by Georges Simenon. His investigation method is based on understanding the personality of different people and their interactions.</i>

 ## About

-Purpose of Maigret - **collect a dossier on a person by username only**, checking for accounts on a huge number of sites.
+**Maigret** collects a dossier on a person **by username only**, checking for accounts on a huge number of sites and gathering all the available information from web pages. No API keys are required. Maigret is an easy-to-use and powerful fork of [Sherlock](https://github.com/sherlock-project/sherlock).

-This is a [sherlock](https://github.com/sherlock-project/) fork with cool features under heavy development.
-*Don't forget to regularly update source code from repo*.
-
-Currently supported more than 2000 sites ([full list](./sites.md)), by default search is launched against 500 popular sites in descending order of popularity.
+More than 2000 sites are currently supported ([full list](https://raw.githubusercontent.com/soxoj/maigret/main/sites.md)); by default the search is launched against the 500 most popular sites in descending order of popularity. Checking of Tor sites, I2P sites, and domains (via DNS resolving) is also supported.

 ## Main features

-* Profile pages parsing, [extracting](https://github.com/soxoj/socid_extractor) personal info, links to other profiles, etc.
-* Recursive search by new usernames found
+* Profile pages parsing, [extraction](https://github.com/soxoj/socid_extractor) of personal info, links to other profiles, etc.
+* Recursive search by new usernames and other ids found
 * Search by tags (site categories, countries)
 * Censorship and captcha detection
-* Very few false positives
+* Requests retries
+
+See the full description of Maigret features [in the Wiki](https://github.com/soxoj/maigret/wiki/Features).

 ## Installation

-**NOTE**: Python 3.6 or higher and pip is required.
-
-**Python 3.8 is recommended.**
+Maigret can be installed using pip, Docker, or simply launched from the cloned repo.
+You can also run Maigret using cloud shells and Jupyter notebooks (see the buttons below).
+
+[](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md)
+<a href="https://repl.it/github/soxoj/maigret"><img src="https://user-images.githubusercontent.com/27065646/92304596-bf719b00-ef7f-11ea-987f-2c1f3c323088.png" alt="Run on Repl.it" height="50"></a>
+<a href="https://colab.research.google.com/gist/soxoj/879b51bc3b2f8b695abb054090645000/maigret-collab.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="45"></a>
+<a href="https://mybinder.org/v2/gist/soxoj/9d65c2f4d3bec5dd25949197ea73cf3a/HEAD"><img src="https://mybinder.org/badge_logo.svg" alt="Open In Binder" height="45"></a>

 ### Package installing

+**NOTE**: Python 3.6 or higher and pip are required; **Python 3.8 is recommended.**
+
 ```bash
 # install from pypi
 pip3 install maigret
@@ -41,34 +57,36 @@ pip3 install maigret

 # or clone and install manually
 git clone https://github.com/soxoj/maigret && cd maigret
 pip3 install .
+
+# usage
+maigret username
 ```

 ### Cloning a repository

 ```bash
 git clone https://github.com/soxoj/maigret && cd maigret
-```
-
-You can use a free virtual machine, the repo will be automatically cloned:
-
-[](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md) [](https://repl.it/github/soxoj/maigret)
-<a href="https://colab.research.google.com/gist//soxoj/879b51bc3b2f8b695abb054090645000/maigret.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="40"></a>
-
-```bash
 pip3 install -r requirements.txt
+
+# usage
+./maigret.py username
 ```

-## Using examples
+### Docker

 ```bash
-# for a cloned repo
-./maigret.py user
+# official image
+docker pull soxoj/maigret

-# for a package
-maigret user
+# usage
+docker run soxoj/maigret:latest username
+
+# manual build
+docker build -t maigret .
 ```

-Features:
+## Usage examples

 ```bash
 # make HTML and PDF reports
 maigret user --html --pdf
@@ -76,35 +94,25 @@ maigret user --html --pdf

 # search on sites marked with tags photo & dating
 maigret user --tags photo,dating

 # search for three usernames on all available sites
 maigret user1 user2 user3 -a
 ```

-Run `maigret --help` to get arguments description. Also options are documented in [the Maigret Wiki](https://github.com/soxoj/maigret/wiki/Command-line-options).
+Use `maigret --help` to get a full description of the options. The options are also documented in [the Maigret Wiki](https://github.com/soxoj/maigret/wiki/Command-line-options).

-With Docker:
-
-```
-# manual build
-docker build -t maigret . && docker run maigret user
-
-# official image
-docker run soxoj/maigret:latest user
-```
-
 ## Demo with page parsing and recursive username search

-[PDF report](./static/report_alexaimephotographycars.pdf), [HTML report](https://htmlpreview.github.io/?https://raw.githubusercontent.com/soxoj/maigret/main/static/report_alexaimephotographycars.html)
+[PDF report](https://raw.githubusercontent.com/soxoj/maigret/main/static/report_alexaimephotographycars.pdf), [HTML report](https://htmlpreview.github.io/?https://raw.githubusercontent.com/soxoj/maigret/main/static/report_alexaimephotographycars.html)

 (demo screenshots)

-[Full console output](./static/recursive_search.md)
+[Full console output](https://raw.githubusercontent.com/soxoj/maigret/main/static/recursive_search.md)

 ## License
@@ -0,0 +1,68 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "8v6PEfyXb0Gx"
   },
   "outputs": [],
   "source": [
    "# clone the repo\n",
    "!git clone https://github.com/soxoj/maigret\n",
    "!pip3 install -r maigret/requirements.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "cXOQUAhDchkl"
   },
   "outputs": [],
   "source": [
    "# help\n",
    "!python3 maigret/maigret.py --help"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "SjDmpN4QGnJu"
   },
   "outputs": [],
   "source": [
    "# search\n",
    "!python3 maigret/maigret.py user"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "include_colab_link": true,
   "name": "maigret.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
@@ -1,5 +1,12 @@
 """Maigret"""

+__title__ = 'Maigret'
+__package__ = 'maigret'
+__author__ = 'Soxoj'
+__author_email__ = 'soxoj@protonmail.com'
+
+
+from .__version__ import __version__
 from .checking import maigret as search
 from .sites import MaigretEngine, MaigretSite, MaigretDatabase
 from .notify import QueryNotifyPrint as Notifier
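These re-exports define the package's import surface, and the version string is now single-sourced from `__version__.py`. A minimal consumption sketch (only the names come from the diff above; `search` is a coroutine whose signature is not shown here, so the call is left commented as a placeholder):

```python
import asyncio  # needed to drive the search() coroutine

import maigret

print(maigret.__version__)  # '0.3.1', re-exported from __version__.py

# MaigretDatabase is re-exported from .sites; constructing it without
# arguments is an assumption, since the class body is not in this diff.
db = maigret.MaigretDatabase()

# The search() coroutine is re-exported from .checking; its parameters are
# not shown in this diff, so this call is a placeholder, not a documented API:
# results = asyncio.run(maigret.search(...))
```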
+2 -2
@@ -6,7 +6,7 @@ Maigret entrypoint

 import asyncio

-import maigret
+from .maigret import main

 if __name__ == "__main__":
-    asyncio.run(maigret.main())
+    asyncio.run(main())
@@ -0,0 +1,3 @@
"""Maigret version file"""

__version__ = '0.3.1'
+17 -33
@@ -9,49 +9,33 @@ class ParsingActivator:
     @staticmethod
     def twitter(site, logger, cookies={}):
         headers = dict(site.headers)
-        del headers['x-guest-token']
-        r = requests.post(site.activation['url'], headers=headers)
+        del headers["x-guest-token"]
+        r = requests.post(site.activation["url"], headers=headers)
         logger.info(r)
         j = r.json()
-        guest_token = j[site.activation['src']]
-        site.headers['x-guest-token'] = guest_token
+        guest_token = j[site.activation["src"]]
+        site.headers["x-guest-token"] = guest_token

     @staticmethod
     def vimeo(site, logger, cookies={}):
         headers = dict(site.headers)
-        if 'Authorization' in headers:
-            del headers['Authorization']
-        r = requests.get(site.activation['url'], headers=headers)
-        jwt_token = r.json()['jwt']
-        site.headers['Authorization'] = 'jwt ' + jwt_token
+        if "Authorization" in headers:
+            del headers["Authorization"]
+        r = requests.get(site.activation["url"], headers=headers)
+        jwt_token = r.json()["jwt"]
+        site.headers["Authorization"] = "jwt " + jwt_token

     @staticmethod
     def spotify(site, logger, cookies={}):
         headers = dict(site.headers)
-        if 'Authorization' in headers:
-            del headers['Authorization']
-        r = requests.get(site.activation['url'])
-        bearer_token = r.json()['accessToken']
-        site.headers['authorization'] = f'Bearer {bearer_token}'
-
-    @staticmethod
-    def xssis(site, logger, cookies={}):
-        if not cookies:
-            logger.debug('You must have cookies to activate xss.is parsing!')
-            return
-
-        headers = dict(site.headers)
-        post_data = {
-            '_xfResponseType': 'json',
-            '_xfToken': '1611177919,a2710362e45dad9aa1da381e21941a38'
-        }
-        headers['content-type'] = 'application/x-www-form-urlencoded; charset=UTF-8'
-        r = requests.post(site.activation['url'], headers=headers, cookies=cookies, data=post_data)
-        csrf = r.json()['csrf']
-        site.get_params['_xfToken'] = csrf
-
-
-async def import_aiohttp_cookies(cookiestxt_filename):
+        if "Authorization" in headers:
+            del headers["Authorization"]
+        r = requests.get(site.activation["url"])
+        bearer_token = r.json()["accessToken"]
+        site.headers["authorization"] = f"Bearer {bearer_token}"
+
+
+def import_aiohttp_cookies(cookiestxt_filename):
     cookies_obj = MozillaCookieJar(cookiestxt_filename)
     cookies_obj.load(ignore_discard=True, ignore_expires=True)

@@ -62,8 +46,8 @@ async def import_aiohttp_cookies(cookiestxt_filename):
     for key, cookie in list(domain.values())[0].items():
         c = Morsel()
         c.set(key, cookie.value, cookie.value)
-        c['domain'] = cookie.domain
-        c['path'] = cookie.path
+        c["domain"] = cookie.domain
+        c["path"] = cookie.path
         cookies_list.append((key, c))

     cookies.update_cookies(cookies_list)
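The loader above turns a Netscape-format cookies.txt into cookie objects and, now being synchronous, can be called without an event loop. A hedged sketch of wiring it into a session (the function's return value is assumed to be an aiohttp-compatible cookie jar; its final lines fall outside this hunk):

```python
import aiohttp

from maigret.activation import import_aiohttp_cookies  # module path assumed

jar = import_aiohttp_cookies('cookies.txt')  # Netscape/Mozilla cookies.txt
# Assumption: the (not fully shown) function returns a jar object that
# aiohttp.ClientSession accepts via the cookie_jar argument.
session = aiohttp.ClientSession(cookie_jar=jar)
```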
+582 -419
File diff suppressed because it is too large
@@ -0,0 +1,130 @@
from typing import Dict, List, Any

from .result import QueryResult
from .types import QueryResultWrapper


# error got as a result of a completed search query
class CheckError:
    _type = 'Unknown'
    _desc = ''

    def __init__(self, typename, desc=''):
        self._type = typename
        self._desc = desc

    def __str__(self):
        if not self._desc:
            return f'{self._type} error'

        return f'{self._type} error: {self._desc}'

    @property
    def type(self):
        return self._type

    @property
    def desc(self):
        return self._desc


COMMON_ERRORS = {
    '<title>Attention Required! | Cloudflare</title>': CheckError(
        'Captcha', 'Cloudflare'
    ),
    'Please stand by, while we are checking your browser': CheckError(
        'Bot protection', 'Cloudflare'
    ),
    '<span data-translate="checking_browser">Checking your browser before accessing</span>': CheckError(
        'Bot protection', 'Cloudflare'
    ),
    'This website is using a security service to protect itself from online attacks.': CheckError(
        'Access denied', 'Cloudflare'
    ),
    '<title>Доступ ограничен</title>': CheckError('Censorship', 'Rostelecom'),
    'document.getElementById(\'validate_form_submit\').disabled=true': CheckError(
        'Captcha', 'Mail.ru'
    ),
    'Verifying your browser, please wait...<br>DDoS Protection by</font> Blazingfast.io': CheckError(
        'Bot protection', 'Blazingfast'
    ),
    '404</h1><p class="error-card__description">Мы не нашли страницу': CheckError(
        'Resolving', 'MegaFon 404 page'
    ),
    'Доступ к информационному ресурсу ограничен на основании Федерального закона': CheckError(
        'Censorship', 'MGTS'
    ),
    'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
    'Сайт заблокирован хостинг-провайдером': CheckError(
        'Site-specific', 'Site is disabled (Beget)'
    ),
}

ERRORS_TYPES = {
    'Captcha': 'Try to switch to another IP address or to use service cookies',
    'Bot protection': 'Try to switch to another IP address',
    'Censorship': 'Switch to another internet service provider',
    'Request timeout': 'Try to increase timeout or to switch to another internet service provider',
}

# TODO: checking for reason
ERRORS_REASONS = {
    'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)',
}

TEMPORARY_ERRORS_TYPES = [
    'Request timeout',
    'Unknown',
    'Request failed',
    'Connecting failure',
    'HTTP',
    'Proxy',
    'Interrupted',
    'Connection lost',
]

THRESHOLD = 3  # percent


def is_important(err_data):
    return err_data['perc'] >= THRESHOLD


def is_permanent(err_type):
    return err_type not in TEMPORARY_ERRORS_TYPES


def detect(text):
    for flag, err in COMMON_ERRORS.items():
        if flag in text:
            return err
    return None


def solution_of(err_type) -> str:
    return ERRORS_TYPES.get(err_type, '')


def extract_and_group(search_res: QueryResultWrapper) -> List[Dict[str, Any]]:
    errors_counts: Dict[str, int] = {}
    for r in search_res.values():
        if r and isinstance(r, dict) and r.get('status'):
            if not isinstance(r['status'], QueryResult):
                continue

            err = r['status'].error
            if not err:
                continue
            errors_counts[err.type] = errors_counts.get(err.type, 0) + 1

    counts = []
    for err, count in sorted(errors_counts.items(), key=lambda x: x[1], reverse=True):
        counts.append(
            {
                'err': err,
                'count': count,
                'perc': round(count / len(search_res), 2) * 100,
            }
        )

    return counts
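A minimal sketch of how the helpers above compose (the import path `maigret.errors` is assumed from the file's position in this changeset):

```python
from maigret.errors import detect, is_permanent, solution_of

html = '<html><title>Attention Required! | Cloudflare</title>...'
err = detect(html)                 # substring match against COMMON_ERRORS
if err:
    print(err)                     # "Captcha error: Cloudflare"
    print(is_permanent(err.type))  # True: 'Captcha' is not a temporary type
    print(solution_of(err.type))   # advice string looked up in ERRORS_TYPES
```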
@@ -0,0 +1,118 @@
import asyncio
import time
import tqdm
import sys
from typing import Iterable, Any, List

from .types import QueryDraft


def create_task_func():
    if sys.version_info.minor > 6:
        create_asyncio_task = asyncio.create_task
    else:
        loop = asyncio.get_event_loop()
        create_asyncio_task = loop.create_task
    return create_asyncio_task


class AsyncExecutor:
    def __init__(self, *args, **kwargs):
        self.logger = kwargs['logger']

    async def run(self, tasks: Iterable[QueryDraft]):
        start_time = time.time()
        results = await self._run(tasks)
        self.execution_time = time.time() - start_time
        self.logger.debug(f'Spent time: {self.execution_time}')
        return results

    async def _run(self, tasks: Iterable[QueryDraft]):
        await asyncio.sleep(0)


class AsyncioSimpleExecutor(AsyncExecutor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    async def _run(self, tasks: Iterable[QueryDraft]):
        futures = [f(*args, **kwargs) for f, args, kwargs in tasks]
        return await asyncio.gather(*futures)


class AsyncioProgressbarExecutor(AsyncExecutor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    async def _run(self, tasks: Iterable[QueryDraft]):
        futures = [f(*args, **kwargs) for f, args, kwargs in tasks]
        results = []
        for f in tqdm.asyncio.tqdm.as_completed(futures):
            results.append(await f)
        return results


class AsyncioProgressbarSemaphoreExecutor(AsyncExecutor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.semaphore = asyncio.Semaphore(kwargs.get('in_parallel', 1))

    async def _run(self, tasks: Iterable[QueryDraft]):
        async def _wrap_query(q: QueryDraft):
            async with self.semaphore:
                f, args, kwargs = q
                return await f(*args, **kwargs)

        async def semaphore_gather(tasks: Iterable[QueryDraft]):
            coros = [_wrap_query(q) for q in tasks]
            results = []
            for f in tqdm.asyncio.tqdm.as_completed(coros):
                results.append(await f)
            return results

        return await semaphore_gather(tasks)


class AsyncioProgressbarQueueExecutor(AsyncExecutor):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.workers_count = kwargs.get('in_parallel', 10)
        self.progress_func = kwargs.get('progress_func', tqdm.tqdm)
        self.queue = asyncio.Queue(self.workers_count)
        self.timeout = kwargs.get('timeout')

    async def worker(self):
        while True:
            try:
                f, args, kwargs = self.queue.get_nowait()
            except asyncio.QueueEmpty:
                return

            query_future = f(*args, **kwargs)
            query_task = create_task_func()(query_future)
            try:
                result = await asyncio.wait_for(query_task, timeout=self.timeout)
            except asyncio.TimeoutError:
                result = kwargs.get('default')

            self.results.append(result)
            self.progress.update(1)
            self.queue.task_done()

    async def _run(self, queries: Iterable[QueryDraft]):
        self.results: List[Any] = []

        queries_list = list(queries)

        min_workers = min(len(queries_list), self.workers_count)

        workers = [create_task_func()(self.worker()) for _ in range(min_workers)]

        self.progress = self.progress_func(total=len(queries_list))
        for t in queries_list:
            await self.queue.put(t)
        await self.queue.join()
        for w in workers:
            w.cancel()
        self.progress.close()
        return self.results
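A minimal usage sketch for the simplest executor above (import path `maigret.executors` assumed). Each task draft is a `(coroutine function, args, kwargs)` triple, exactly as the `_run` implementations unpack it:

```python
import asyncio
import logging

from maigret.executors import AsyncioSimpleExecutor  # module path assumed


async def fetch(name, delay=0.0):
    await asyncio.sleep(delay)  # stand-in for a real site check
    return name


async def demo():
    executor = AsyncioSimpleExecutor(logger=logging.getLogger('demo'))
    tasks = [(fetch, (n,), {'delay': 0.1}) for n in ('alice', 'bob', 'carol')]
    results = await executor.run(tasks)  # also records executor.execution_time
    print(results, round(executor.execution_time, 2))


asyncio.run(demo())
```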
+529
-239
@@ -1,196 +1,467 @@
|
|||||||
"""
|
"""
|
||||||
Maigret main module
|
Maigret main module
|
||||||
"""
|
"""
|
||||||
import aiohttp
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import platform
|
import platform
|
||||||
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||||
|
from typing import List, Tuple
|
||||||
|
|
||||||
import requests
|
from socid_extractor import extract, parse
|
||||||
from socid_extractor import extract, parse, __version__ as socid_version
|
|
||||||
|
|
||||||
from .checking import timeout_check, supported_recursive_search_ids, self_check, unsupported_characters, maigret
|
from .__version__ import __version__
|
||||||
|
from .checking import (
|
||||||
|
timeout_check,
|
||||||
|
SUPPORTED_IDS,
|
||||||
|
self_check,
|
||||||
|
BAD_CHARS,
|
||||||
|
maigret,
|
||||||
|
)
|
||||||
|
from . import errors
|
||||||
from .notify import QueryNotifyPrint
|
from .notify import QueryNotifyPrint
|
||||||
from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \
|
from .report import (
|
||||||
generate_report_context, save_txt_report, SUPPORTED_JSON_REPORT_FORMATS, check_supported_json_format, \
|
save_csv_report,
|
||||||
save_json_report
|
save_xmind_report,
|
||||||
|
save_html_report,
|
||||||
|
save_pdf_report,
|
||||||
|
generate_report_context,
|
||||||
|
save_txt_report,
|
||||||
|
SUPPORTED_JSON_REPORT_FORMATS,
|
||||||
|
save_json_report,
|
||||||
|
get_plaintext_report,
|
||||||
|
sort_report_by_data_points,
|
||||||
|
save_graph_report,
|
||||||
|
)
|
||||||
from .sites import MaigretDatabase
|
from .sites import MaigretDatabase
|
||||||
from .submit import submit_dialog
|
from .submit import Submitter
|
||||||
|
from .types import QueryResultWrapper
|
||||||
from .utils import get_dict_ascii_tree
|
from .utils import get_dict_ascii_tree
|
||||||
|
from .settings import Settings
|
||||||
|
|
||||||
__version__ = '0.1.18'
|
|
||||||
|
def notify_about_errors(search_results: QueryResultWrapper, query_notify):
|
||||||
|
errs = errors.extract_and_group(search_results)
|
||||||
|
was_errs_displayed = False
|
||||||
|
for e in errs:
|
||||||
|
if not errors.is_important(e):
|
||||||
|
continue
|
||||||
|
text = f'Too many errors of type "{e["err"]}" ({e["perc"]}%)'
|
||||||
|
solution = errors.solution_of(e['err'])
|
||||||
|
if solution:
|
||||||
|
text = '. '.join([text, solution.capitalize()])
|
||||||
|
|
||||||
|
query_notify.warning(text, '!')
|
||||||
|
was_errs_displayed = True
|
||||||
|
|
||||||
|
if was_errs_displayed:
|
||||||
|
query_notify.warning(
|
||||||
|
'You can see detailed site check errors with a flag `--print-errors`'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ids_from_page(url, logger, timeout=5) -> dict:
|
||||||
|
results = {}
|
||||||
|
# url, headers
|
||||||
|
reqs: List[Tuple[str, set]] = [(url, set())]
|
||||||
|
try:
|
||||||
|
# temporary workaround for URL mutations MVP
|
||||||
|
from socid_extractor import mutate_url
|
||||||
|
|
||||||
|
reqs += list(mutate_url(url))
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(e)
|
||||||
|
|
||||||
|
for req in reqs:
|
||||||
|
url, headers = req
|
||||||
|
print(f'Scanning webpage by URL {url}...')
|
||||||
|
page, _ = parse(url, cookies_str='', headers=headers, timeout=timeout)
|
||||||
|
logger.debug(page)
|
||||||
|
info = extract(page)
|
||||||
|
if not info:
|
||||||
|
print('Nothing extracted')
|
||||||
|
else:
|
||||||
|
print(get_dict_ascii_tree(info.items(), new_line=False), ' ')
|
||||||
|
for k, v in info.items():
|
||||||
|
if 'username' in k:
|
||||||
|
results[v] = 'username'
|
||||||
|
if k in SUPPORTED_IDS:
|
||||||
|
results[v] = k
|
||||||
|
|
||||||
|
return results
|
||||||
|
|
||||||
|
|
||||||
|
def extract_ids_from_results(results: QueryResultWrapper, db: MaigretDatabase) -> dict:
|
||||||
|
ids_results = {}
|
||||||
|
for website_name in results:
|
||||||
|
dictionary = results[website_name]
|
||||||
|
# TODO: fix no site data issue
|
||||||
|
if not dictionary:
|
||||||
|
continue
|
||||||
|
|
||||||
|
new_usernames = dictionary.get('ids_usernames')
|
||||||
|
if new_usernames:
|
||||||
|
for u, utype in new_usernames.items():
|
||||||
|
ids_results[u] = utype
|
||||||
|
|
||||||
|
for url in dictionary.get('ids_links', []):
|
||||||
|
ids_results.update(db.extract_ids_from_url(url))
|
||||||
|
|
||||||
|
return ids_results
|
||||||
|
|
||||||
|
|
||||||
|
def setup_arguments_parser():
|
||||||
|
from aiohttp import __version__ as aiohttp_version
|
||||||
|
from requests import __version__ as requests_version
|
||||||
|
from socid_extractor import __version__ as socid_version
|
||||||
|
|
||||||
|
version_string = '\n'.join(
|
||||||
|
[
|
||||||
|
f'%(prog)s {__version__}',
|
||||||
|
f'Socid-extractor: {socid_version}',
|
||||||
|
f'Aiohttp: {aiohttp_version}',
|
||||||
|
f'Requests: {requests_version}',
|
||||||
|
f'Python: {platform.python_version()}',
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
parser = ArgumentParser(
|
||||||
|
formatter_class=RawDescriptionHelpFormatter,
|
||||||
|
description=f"Maigret v{__version__}",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"username",
|
||||||
|
nargs='*',
|
||||||
|
metavar="USERNAMES",
|
||||||
|
help="One or more usernames to search by.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--version",
|
||||||
|
action="version",
|
||||||
|
version=version_string,
|
||||||
|
help="Display version information and dependencies.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--timeout",
|
||||||
|
action="store",
|
||||||
|
metavar='TIMEOUT',
|
||||||
|
dest="timeout",
|
||||||
|
type=timeout_check,
|
||||||
|
default=30,
|
||||||
|
help="Time in seconds to wait for response to requests. "
|
||||||
|
"Default timeout of 30.0s. "
|
||||||
|
"A longer timeout will be more likely to get results from slow sites. "
|
||||||
|
"On the other hand, this may cause a long delay to gather all results. ",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--retries",
|
||||||
|
action="store",
|
||||||
|
type=int,
|
||||||
|
metavar='RETRIES',
|
||||||
|
default=1,
|
||||||
|
help="Attempts to restart temporarily failed requests.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-n",
|
||||||
|
"--max-connections",
|
||||||
|
action="store",
|
||||||
|
type=int,
|
||||||
|
dest="connections",
|
||||||
|
default=100,
|
||||||
|
help="Allowed number of concurrent connections.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--no-recursion",
|
||||||
|
action="store_true",
|
||||||
|
dest="disable_recursive_search",
|
||||||
|
default=False,
|
||||||
|
help="Disable recursive search by additional data extracted from pages.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--no-extracting",
|
||||||
|
action="store_true",
|
||||||
|
dest="disable_extracting",
|
||||||
|
default=False,
|
||||||
|
help="Disable parsing pages for additional data and other usernames.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--id-type",
|
||||||
|
dest="id_type",
|
||||||
|
default='username',
|
||||||
|
choices=SUPPORTED_IDS,
|
||||||
|
help="Specify identifier(s) type (default: username).",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--db",
|
||||||
|
metavar="DB_FILE",
|
||||||
|
dest="db_file",
|
||||||
|
default=None,
|
||||||
|
help="Load Maigret database from a JSON file or HTTP web resource.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--cookies-jar-file",
|
||||||
|
metavar="COOKIE_FILE",
|
||||||
|
dest="cookie_file",
|
||||||
|
default=None,
|
||||||
|
help="File with cookies.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--ignore-ids",
|
||||||
|
action="append",
|
||||||
|
metavar='IGNORED_IDS',
|
||||||
|
dest="ignore_ids_list",
|
||||||
|
default=[],
|
||||||
|
help="Do not make search by the specified username or other ids.",
|
||||||
|
)
|
||||||
|
# reports options
|
||||||
|
parser.add_argument(
|
||||||
|
"--folderoutput",
|
||||||
|
"-fo",
|
||||||
|
dest="folderoutput",
|
||||||
|
default="reports",
|
||||||
|
metavar="PATH",
|
||||||
|
help="If using multiple usernames, the output of the results will be saved to this folder.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--proxy",
|
||||||
|
"-p",
|
||||||
|
metavar='PROXY_URL',
|
||||||
|
action="store",
|
||||||
|
dest="proxy",
|
||||||
|
default=None,
|
||||||
|
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--tor-proxy",
|
||||||
|
metavar='TOR_PROXY_URL',
|
||||||
|
action="store",
|
||||||
|
default='socks5://127.0.0.1:9050',
|
||||||
|
help="Specify URL of your Tor gateway. Default is socks5://127.0.0.1:9050",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--i2p-proxy",
|
||||||
|
metavar='I2P_PROXY_URL',
|
||||||
|
action="store",
|
||||||
|
default='http://127.0.0.1:4444',
|
||||||
|
help="Specify URL of your I2P gateway. Default is http://127.0.0.1:4444",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--with-domains",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Enable (experimental) feature of checking domains on usernames.",
|
||||||
|
)
|
||||||
|
|
||||||
|
filter_group = parser.add_argument_group(
|
||||||
|
'Site filtering', 'Options to set site search scope'
|
||||||
|
)
|
||||||
|
filter_group.add_argument(
|
||||||
|
"-a",
|
||||||
|
"--all-sites",
|
||||||
|
action="store_true",
|
||||||
|
dest="all_sites",
|
||||||
|
default=False,
|
||||||
|
help="Use all sites for scan.",
|
||||||
|
)
|
||||||
|
filter_group.add_argument(
|
||||||
|
"--top-sites",
|
||||||
|
action="store",
|
||||||
|
default=500,
|
||||||
|
metavar="N",
|
||||||
|
type=int,
|
||||||
|
help="Count of sites for scan ranked by Alexa Top (default: 500).",
|
||||||
|
)
|
||||||
|
filter_group.add_argument(
|
||||||
|
"--tags", dest="tags", default='', help="Specify tags of sites (see `--stats`)."
|
||||||
|
)
|
||||||
|
filter_group.add_argument(
|
||||||
|
"--site",
|
||||||
|
action="append",
|
||||||
|
metavar='SITE_NAME',
|
||||||
|
dest="site_list",
|
||||||
|
default=[],
|
||||||
|
help="Limit analysis to just the specified sites (multiple option).",
|
||||||
|
)
|
||||||
|
filter_group.add_argument(
|
||||||
|
"--use-disabled-sites",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Use disabled sites to search (may cause many false positives).",
|
||||||
|
)
|
||||||
|
|
||||||
|
modes_group = parser.add_argument_group(
|
||||||
|
'Operating modes',
|
||||||
|
'Various functions except the default search by a username. '
|
||||||
|
'Modes are executed sequentially in the order of declaration.',
|
||||||
|
)
|
||||||
|
modes_group.add_argument(
|
||||||
|
"--parse",
|
||||||
|
dest="parse_url",
|
||||||
|
default='',
|
||||||
|
metavar='URL',
|
||||||
|
help="Parse page by URL and extract username and IDs to use for search.",
|
||||||
|
)
|
||||||
|
modes_group.add_argument(
|
||||||
|
"--submit",
|
||||||
|
metavar='URL',
|
||||||
|
type=str,
|
||||||
|
dest="new_site_to_submit",
|
||||||
|
default=False,
|
||||||
|
help="URL of existing profile in new site to submit.",
|
||||||
|
)
|
||||||
|
modes_group.add_argument(
|
||||||
|
"--self-check",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Do self check for sites and database and disable non-working ones.",
|
||||||
|
)
|
||||||
|
modes_group.add_argument(
|
||||||
|
"--stats",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Show database statistics (most frequent sites engines and tags).",
|
||||||
|
)
|
||||||
|
|
||||||
|
output_group = parser.add_argument_group(
|
||||||
|
'Output options', 'Options to change verbosity and view of the console output'
|
||||||
|
)
|
||||||
|
output_group.add_argument(
|
||||||
|
"--print-not-found",
|
||||||
|
action="store_true",
|
||||||
|
dest="print_not_found",
|
||||||
|
default=False,
|
||||||
|
help="Print sites where the username was not found.",
|
||||||
|
)
|
||||||
|
output_group.add_argument(
|
||||||
|
"--print-errors",
|
||||||
|
action="store_true",
|
||||||
|
dest="print_check_errors",
|
||||||
|
default=False,
|
||||||
|
help="Print errors messages: connection, captcha, site country ban, etc.",
|
||||||
|
)
|
||||||
|
output_group.add_argument(
|
||||||
|
"--verbose",
|
||||||
|
"-v",
|
||||||
|
action="store_true",
|
||||||
|
dest="verbose",
|
||||||
|
default=False,
|
||||||
|
help="Display extra information and metrics.",
|
||||||
|
)
|
||||||
|
output_group.add_argument(
|
||||||
|
"--info",
|
||||||
|
"-vv",
|
||||||
|
action="store_true",
|
||||||
|
dest="info",
|
||||||
|
default=False,
|
||||||
|
help="Display extra/service information and metrics.",
|
||||||
|
)
|
||||||
|
output_group.add_argument(
|
||||||
|
"--debug",
|
||||||
|
"-vvv",
|
||||||
|
"-d",
|
||||||
|
action="store_true",
|
||||||
|
dest="debug",
|
||||||
|
default=False,
|
||||||
|
help="Display extra/service/debug information and metrics, save responses in debug.log.",
|
||||||
|
)
|
||||||
|
output_group.add_argument(
|
||||||
|
"--no-color",
|
||||||
|
action="store_true",
|
||||||
|
dest="no_color",
|
||||||
|
default=False,
|
||||||
|
help="Don't color terminal output",
|
||||||
|
)
|
||||||
|
output_group.add_argument(
|
||||||
|
"--no-progressbar",
|
||||||
|
action="store_true",
|
||||||
|
dest="no_progressbar",
|
||||||
|
default=False,
|
||||||
|
help="Don't show progressbar.",
|
||||||
|
)
|
||||||
|
|
||||||
|
report_group = parser.add_argument_group(
|
||||||
|
'Report formats', 'Supported formats of report files'
|
||||||
|
)
|
||||||
|
report_group.add_argument(
|
||||||
|
"-T",
|
||||||
|
"--txt",
|
||||||
|
action="store_true",
|
||||||
|
dest="txt",
|
||||||
|
default=False,
|
||||||
|
help="Create a TXT report (one report per username).",
|
||||||
|
)
|
||||||
|
report_group.add_argument(
|
||||||
|
"-C",
|
||||||
|
"--csv",
|
||||||
|
action="store_true",
|
||||||
|
dest="csv",
|
||||||
|
default=False,
|
||||||
|
help="Create a CSV report (one report per username).",
|
||||||
|
)
|
||||||
|
report_group.add_argument(
|
||||||
|
"-H",
|
||||||
|
"--html",
|
||||||
|
action="store_true",
|
||||||
|
dest="html",
|
||||||
|
default=False,
|
||||||
|
help="Create an HTML report file (general report on all usernames).",
|
||||||
|
)
|
||||||
|
report_group.add_argument(
|
||||||
|
"-X",
|
||||||
|
"--xmind",
|
||||||
|
action="store_true",
|
||||||
|
dest="xmind",
|
||||||
|
default=False,
|
||||||
|
help="Generate an XMind 8 mindmap report (one report per username).",
|
||||||
|
)
|
||||||
|
report_group.add_argument(
|
||||||
|
"-P",
|
||||||
|
"--pdf",
|
||||||
|
action="store_true",
|
||||||
|
dest="pdf",
|
||||||
|
default=False,
|
||||||
|
help="Generate a PDF report (general report on all usernames).",
|
||||||
|
)
|
||||||
|
report_group.add_argument(
|
||||||
|
"-G",
|
||||||
|
"--graph",
|
||||||
|
action="store_true",
|
||||||
|
dest="graph",
|
||||||
|
default=False,
|
||||||
|
help="Generate a graph report (general report on all usernames).",
|
||||||
|
)
|
||||||
|
report_group.add_argument(
|
||||||
|
"-J",
|
||||||
|
"--json",
|
||||||
|
action="store",
|
||||||
|
metavar='TYPE',
|
||||||
|
dest="json",
|
||||||
|
default='',
|
||||||
|
choices=SUPPORTED_JSON_REPORT_FORMATS,
|
||||||
|
help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
|
||||||
|
" (one report per username).",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--reports-sorting",
|
||||||
|
default='default',
|
||||||
|
choices=('default', 'data'),
|
||||||
|
help="Method of results sorting in reports (default: in order of getting the result)",
|
||||||
|
)
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
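The grouped parser above is returned by `setup_arguments_parser()` and consumed in `main()` below. A minimal sketch of exercising it in isolation; the option names come from the hunks above, while the positional `username` argument is defined earlier in the same function and only assumed here:

    parser = setup_arguments_parser()
    # Hypothetical invocation, for illustration only:
    args = parser.parse_args(["-J", "simple", "--no-color", "alice"])
    assert args.json == "simple" and args.no_color and args.username == ["alice"]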
 async def main():
-    version_string = '\n'.join([
-        f'%(prog)s {__version__}',
-        f'Socid-extractor: {socid_version}',
-        f'Aiohttp: {aiohttp.__version__}',
-        f'Requests: {requests.__version__}',
-        f'Python: {platform.python_version()}',
-    ])
-
-    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter,
-                            description=f"Maigret v{__version__}"
-                            )
-    parser.add_argument("--version",
-                        action="version", version=version_string,
-                        help="Display version information and dependencies."
-                        )
-    parser.add_argument("--info", "-vv",
-                        action="store_true", dest="info", default=False,
-                        help="Display service information."
-                        )
-    parser.add_argument("--verbose", "-v",
-                        action="store_true", dest="verbose", default=False,
-                        help="Display extra information and metrics."
-                        )
-    parser.add_argument("-d", "--debug", "-vvv",
-                        action="store_true", dest="debug", default=False,
-                        help="Saving debugging information and sites responses in debug.txt."
-                        )
-    parser.add_argument("--site",
-                        action="append", metavar='SITE_NAME',
-                        dest="site_list", default=[],
-                        help="Limit analysis to just the listed sites (use several times to specify more than one)"
-                        )
-    parser.add_argument("--proxy", "-p", metavar='PROXY_URL',
-                        action="store", dest="proxy", default=None,
-                        help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080"
-                        )
-    parser.add_argument("--db", metavar="DB_FILE",
-                        dest="db_file", default=None,
-                        help="Load Maigret database from a JSON file or an online, valid, JSON file.")
-    parser.add_argument("--cookies-jar-file", metavar="COOKIE_FILE",
-                        dest="cookie_file", default=None,
-                        help="File with cookies.")
-    parser.add_argument("--timeout",
-                        action="store", metavar='TIMEOUT',
-                        dest="timeout", type=timeout_check, default=10,
-                        help="Time (in seconds) to wait for response to requests."
-                             "Default timeout of 10.0s. "
-                             "A longer timeout will be more likely to get results from slow sites."
-                             "On the other hand, this may cause a long delay to gather all results."
-                        )
-    parser.add_argument("-n", "--max-connections",
-                        action="store", type=int,
-                        dest="connections", default=100,
-                        help="Allowed number of concurrent connections."
-                        )
-    parser.add_argument("-a", "--all-sites",
-                        action="store_true", dest="all_sites", default=False,
-                        help="Use all sites for scan."
-                        )
-    parser.add_argument("--top-sites",
-                        action="store", default=500, type=int,
-                        help="Count of sites for scan ranked by Alexa Top (default: 500)."
-                        )
-    parser.add_argument("--print-not-found",
-                        action="store_true", dest="print_not_found", default=False,
-                        help="Print sites where the username was not found."
-                        )
-    parser.add_argument("--print-errors",
-                        action="store_true", dest="print_check_errors", default=False,
-                        help="Print errors messages: connection, captcha, site country ban, etc."
-                        )
-    parser.add_argument("--submit", metavar='EXISTING_USER_URL',
-                        type=str, dest="new_site_to_submit", default=False,
-                        help="URL of existing profile in new site to submit."
-                        )
-    parser.add_argument("--no-color",
-                        action="store_true", dest="no_color", default=False,
-                        help="Don't color terminal output"
-                        )
-    parser.add_argument("--browse", "-b",
-                        action="store_true", dest="browse", default=False,
-                        help="Browse to all results on default bowser."
-                        )
-    parser.add_argument("--no-recursion",
-                        action="store_true", dest="disable_recursive_search", default=False,
-                        help="Disable recursive search by additional data extracted from pages."
-                        )
-    parser.add_argument("--no-extracting",
-                        action="store_true", dest="disable_extracting", default=False,
-                        help="Disable parsing pages for additional data and other usernames."
-                        )
-    parser.add_argument("--self-check",
-                        action="store_true", default=False,
-                        help="Do self check for sites and database and disable non-working ones."
-                        )
-    parser.add_argument("--stats",
-                        action="store_true", default=False,
-                        help="Show database statistics."
-                        )
-    parser.add_argument("--use-disabled-sites",
-                        action="store_true", default=False,
-                        help="Use disabled sites to search (may cause many false positives)."
-                        )
-    parser.add_argument("--parse",
-                        dest="parse_url", default='',
-                        help="Parse page by URL and extract username and IDs to use for search."
-                        )
-    parser.add_argument("--id-type",
-                        dest="id_type", default='username',
-                        help="Specify identifier(s) type (default: username)."
-                        )
-    parser.add_argument("--ignore-ids",
-                        action="append", metavar='IGNORED_IDS',
-                        dest="ignore_ids_list", default=[],
-                        help="Do not make search by the specified username or other ids."
-                        )
-    parser.add_argument("username",
-                        nargs='+', metavar='USERNAMES',
-                        action="store",
-                        help="One or more usernames to check with social networks."
-                        )
-    parser.add_argument("--tags",
-                        dest="tags", default='',
-                        help="Specify tags of sites."
-                        )
-    # reports options
-    parser.add_argument("--folderoutput", "-fo", dest="folderoutput", default="reports",
-                        help="If using multiple usernames, the output of the results will be saved to this folder."
-                        )
-    parser.add_argument("-T", "--txt",
-                        action="store_true", dest="txt", default=False,
-                        help="Create a TXT report (one report per username)."
-                        )
-    parser.add_argument("-C", "--csv",
-                        action="store_true", dest="csv", default=False,
-                        help="Create a CSV report (one report per username)."
-                        )
-    parser.add_argument("-H", "--html",
-                        action="store_true", dest="html", default=False,
-                        help="Create an HTML report file (general report on all usernames)."
-                        )
-    parser.add_argument("-X", "--xmind",
-                        action="store_true",
-                        dest="xmind", default=False,
-                        help="Generate an XMind 8 mindmap report (one report per username)."
-                        )
-    parser.add_argument("-P", "--pdf",
-                        action="store_true",
-                        dest="pdf", default=False,
-                        help="Generate a PDF report (general report on all usernames)."
-                        )
-    parser.add_argument("-J", "--json",
-                        action="store", metavar='REPORT_TYPE',
-                        dest="json", default='', type=check_supported_json_format,
-                        help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
-                             " (one report per username)."
-                        )
-
-    args = parser.parse_args()
+    arg_parser = setup_arguments_parser()
+    args = arg_parser.parse_args()

     # Logging
     log_level = logging.ERROR
     logging.basicConfig(
         format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
         datefmt='%H:%M:%S',
-        level=log_level
+        level=log_level,
     )

     if args.debug:
@@ -207,8 +478,7 @@ async def main():
     usernames = {
         u: args.id_type
         for u in args.username
-        if u not in ['-']
-        and u not in args.ignore_ids_list
+        if u and u not in ['-'] and u not in args.ignore_ids_list
     }

     parsing_enabled = not args.disable_extracting
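The rewritten comprehension above folds the ignore-list check into one condition and also drops empty strings. A tiny worked example with made-up values:

    # Stand-alone illustration of the new filter (values are hypothetical):
    username_args = ['-', '', 'alice', 'bob']
    ignore_ids_list = ['bob']
    usernames = {
        u: 'username'
        for u in username_args
        if u and u not in ['-'] and u not in ignore_ids_list
    }
    assert usernames == {'alice': 'username'}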
@@ -219,74 +489,78 @@ async def main():
         print("Using the proxy: " + args.proxy)

     if args.parse_url:
-        # url, headers
-        reqs = [(args.parse_url, set())]
-        try:
-            # temporary workaround for URL mutations MVP
-            from socid_extractor import mutate_url
-            reqs += list(mutate_url(args.parse_url))
-        except:
-            pass
-
-        for req in reqs:
-            url, headers = req
-            print(f'Scanning webpage by URL {url}...')
-            page, _ = parse(url, cookies_str='', headers=headers)
-            info = extract(page)
-            if not info:
-                print('Nothing extracted')
-            else:
-                print(get_dict_ascii_tree(info.items(), new_line=False), ' ')
-            for k, v in info.items():
-                if 'username' in k:
-                    usernames[v] = 'username'
-                if k in supported_recursive_search_ids:
-                    usernames[v] = k
+        extracted_ids = extract_ids_from_page(
+            args.parse_url, logger, timeout=args.timeout
+        )
+        usernames.update(extracted_ids)

     if args.tags:
         args.tags = list(set(str(args.tags).split(',')))

+    settings = Settings(
+        os.path.join(
+            os.path.dirname(os.path.realpath(__file__)), "resources/settings.json"
+        )
+    )
+
     if args.db_file is None:
-        args.db_file = \
-            os.path.join(os.path.dirname(os.path.realpath(__file__)),
-                         "resources/data.json"
+        args.db_file = os.path.join(
+            os.path.dirname(os.path.realpath(__file__)), "resources/data.json"
         )

     if args.top_sites == 0 or args.all_sites:
         args.top_sites = sys.maxsize

     # Create notify object for query results.
-    query_notify = QueryNotifyPrint(result=None,
+    query_notify = QueryNotifyPrint(
+        result=None,
         verbose=args.verbose,
         print_found_only=not args.print_not_found,
         skip_check_errors=not args.print_check_errors,
-        color=not args.no_color)
+        color=not args.no_color,
+    )

     # Create object with all information about sites we are aware of.
-    db = MaigretDatabase().load_from_file(args.db_file)
-    get_top_sites_for_id = lambda x: db.ranked_sites_dict(top=args.top_sites, tags=args.tags,
+    db = MaigretDatabase().load_from_path(args.db_file)
+    get_top_sites_for_id = lambda x: db.ranked_sites_dict(
+        top=args.top_sites,
+        tags=args.tags,
         names=args.site_list,
-        disabled=False, id_type=x)
+        disabled=args.use_disabled_sites,
+        id_type=x,
+    )

     site_data = get_top_sites_for_id(args.id_type)

     if args.new_site_to_submit:
-        is_submitted = await submit_dialog(db, args.new_site_to_submit, args.cookie_file)
+        submitter = Submitter(db=db, logger=logger, settings=settings)
+        is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
         if is_submitted:
             db.save_to_file(args.db_file)

     # Database self-checking
     if args.self_check:
         print('Maigret sites database self-checking...')
-        is_need_update = await self_check(db, site_data, logger, max_connections=args.connections)
+        is_need_update = await self_check(
+            db,
+            site_data,
+            logger,
+            max_connections=args.connections,
+            tor_proxy=args.tor_proxy,
+            i2p_proxy=args.i2p_proxy,
+        )
         if is_need_update:
-            if input('Do you want to save changes permanently? [Yn]\n').lower() == 'y':
+            if input('Do you want to save changes permanently? [Yn]\n').lower() in (
+                'y',
+                '',
+            ):
                 db.save_to_file(args.db_file)
                 print('Database was successfully updated.')
             else:
                 print('Updates will be applied only for current search session.')
-        print(db.get_scan_stats(site_data))
+        print('Scan sessions flags stats: ' + str(db.get_scan_stats(site_data)))

+    # Database statistics
     if args.stats:
         print(db.get_db_stats(db.sites_dict))

@@ -296,11 +570,6 @@ async def main():
     # Define one report filename template
     report_filepath_tpl = os.path.join(args.folderoutput, 'report_{username}{postfix}')

-    # Database stats
-    # TODO: verbose info about filtered sites
-    # enabled_count = len(list(filter(lambda x: not x.disabled, site_data.values())))
-    # print(f'Sites in database, enabled/total: {enabled_count}/{len(site_data)}')
-
     if usernames == {}:
         # magic params to exit after init
         query_notify.warning('No usernames to check, exiting.')
@@ -309,10 +578,14 @@ async def main():
     if not site_data:
         query_notify.warning('No sites to check, exiting!')
         sys.exit(2)
-    else:
-        query_notify.warning(f'Starting a search on top {len(site_data)} sites from the Maigret database...')
+    query_notify.warning(
+        f'Starting a search on top {len(site_data)} sites from the Maigret database...'
+    )
     if not args.all_sites:
-        query_notify.warning(f'You can run search by full list of sites with flag `-a`', '!')
+        query_notify.warning(
+            'You can run search by full list of sites with flag `-a`', '!'
+        )

     already_checked = set()
     general_results = []
@@ -323,28 +596,35 @@ async def main():

         if username.lower() in already_checked:
             continue
-        else:
-            already_checked.add(username.lower())
+        already_checked.add(username.lower())

         if username in args.ignore_ids_list:
-            query_notify.warning(f'Skip a search by username {username} cause it\'s marked as ignored.')
+            query_notify.warning(
+                f'Skip a search by username {username} cause it\'s marked as ignored.'
+            )
             continue

         # check for characters do not supported by sites generally
-        found_unsupported_chars = set(unsupported_characters).intersection(set(username))
+        found_unsupported_chars = set(BAD_CHARS).intersection(set(username))

         if found_unsupported_chars:
-            pretty_chars_str = ','.join(map(lambda s: f'"{s}"', found_unsupported_chars))
+            pretty_chars_str = ','.join(
+                map(lambda s: f'"{s}"', found_unsupported_chars)
+            )
             query_notify.warning(
-                f'Found unsupported URL characters: {pretty_chars_str}, skip search by username "{username}"')
+                f'Found unsupported URL characters: {pretty_chars_str}, skip search by username "{username}"'
+            )
             continue

         sites_to_check = get_top_sites_for_id(id_type)

-        results = await maigret(username=username,
+        results = await maigret(
+            username=username,
             site_dict=dict(sites_to_check),
             query_notify=query_notify,
             proxy=args.proxy,
+            tor_proxy=args.tor_proxy,
+            i2p_proxy=args.i2p_proxy,
             timeout=args.timeout,
             is_parsing_enabled=parsing_enabled,
             id_type=id_type,
@@ -353,27 +633,22 @@ async def main():
             cookies=args.cookie_file,
             forced=args.use_disabled_sites,
             max_connections=args.connections,
+            no_progressbar=args.no_progressbar,
+            retries=args.retries,
+            check_domains=args.with_domains,
         )

+        notify_about_errors(results, query_notify)
+
+        if args.reports_sorting == "data":
+            results = sort_report_by_data_points(results)
+
         general_results.append((username, id_type, results))

         # TODO: tests
-        for website_name in results:
-            dictionary = results[website_name]
-            # TODO: fix no site data issue
-            if not dictionary or not recursive_search_enabled:
-                continue
-
-            new_usernames = dictionary.get('ids_usernames')
-            if new_usernames:
-                for u, utype in new_usernames.items():
-                    usernames[u] = utype
-
-            for url in dictionary.get('ids_links', []):
-                for s in db.sites:
-                    u = s.detect_username(url)
-                    if u:
-                        usernames[u] = 'username'
+        if recursive_search_enabled:
+            extracted_ids = extract_ids_from_results(results, db)
+            usernames.update(extracted_ids)

         # reporting for a one username
         if args.xmind:
@@ -392,9 +667,13 @@ async def main():
             query_notify.warning(f'TXT report for {username} saved in {filename}')

         if args.json:
-            filename = report_filepath_tpl.format(username=username, postfix=f'_{args.json}.json')
+            filename = report_filepath_tpl.format(
+                username=username, postfix=f'_{args.json}.json'
+            )
             save_json_report(filename, username, results, report_type=args.json)
-            query_notify.warning(f'JSON {args.json} report for {username} saved in {filename}')
+            query_notify.warning(
+                f'JSON {args.json} report for {username} saved in {filename}'
+            )

     # reporting for all the result
     if general_results:
@@ -413,6 +692,17 @@ async def main():
             filename = report_filepath_tpl.format(username=username, postfix='.pdf')
             save_pdf_report(filename, report_context)
             query_notify.warning(f'PDF report on all usernames saved in {filename}')

+        if args.graph:
+            filename = report_filepath_tpl.format(username=username, postfix='.html')
+            save_graph_report(filename, general_results, db)
+            query_notify.warning(f'Graph report on all usernames saved in {filename}')
+
+        text_report = get_plaintext_report(report_context)
+        if text_report:
+            query_notify.info('Short text report:')
+            print(text_report)
+
     # update database
     db.save_to_file(args.db_file)

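Taken together, the hunks above reshape the per-username cycle: scan, surface check errors, optionally re-sort results by data points, and feed extracted IDs back into the work queue. A hedged sketch of that shape, using only names that appear in the diff; argument handling, reporting, and the already-checked guard are elided, and maigret() accepts more keyword arguments than shown:

    # Sketch only, not the exact control flow of main():
    while usernames:
        username, id_type = usernames.popitem()
        results = await maigret(
            username=username,
            site_dict=dict(get_top_sites_for_id(id_type)),
            query_notify=query_notify,
            id_type=id_type,
        )
        notify_about_errors(results, query_notify)
        if args.reports_sorting == "data":
            results = sort_report_by_data_points(results)
        general_results.append((username, id_type, results))
        if recursive_search_enabled:
            usernames.update(extract_ids_from_results(results, db))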
+88 -60
@@ -11,7 +11,7 @@ from .result import QueryStatus
 from .utils import get_dict_ascii_tree


-class QueryNotify():
+class QueryNotify:
     """Query Notify Object.

     Base class that describes methods available to notify the results of
@@ -39,7 +39,7 @@ class QueryNotify():

         return

-    def start(self, message=None, id_type='username'):
+    def start(self, message=None, id_type="username"):
         """Notify Start.

         Notify method for start of query. This method will be called before
@@ -116,8 +116,14 @@ class QueryNotifyPrint(QueryNotify):
     Query notify class that prints results.
     """

-    def __init__(self, result=None, verbose=False, print_found_only=False,
-                 skip_check_errors=False, color=True):
+    def __init__(
+        self,
+        result=None,
+        verbose=False,
+        print_found_only=False,
+        skip_check_errors=False,
+        color=True,
+    ):
         """Create Query Notify Print Object.

         Contains information about a specific method of notifying the results
@@ -146,6 +152,27 @@ class QueryNotifyPrint(QueryNotify):

         return

+    def make_colored_terminal_notify(
+        self, status, text, status_color, text_color, appendix
+    ):
+        text = [
+            f"{Style.BRIGHT}{Fore.WHITE}[{status_color}{status}{Fore.WHITE}]"
+            + f"{text_color} {text}: {Style.RESET_ALL}"
+            + f"{appendix}"
+        ]
+        return "".join(text)
+
+    def make_simple_terminal_notify(
+        self, status, text, status_color, text_color, appendix
+    ):
+        return f"[{status}] {text}: {appendix}"
+
+    def make_terminal_notify(self, *args):
+        if self.color:
+            return self.make_colored_terminal_notify(*args)
+        else:
+            return self.make_simple_terminal_notify(*args)
+
     def start(self, message, id_type):
         """Notify Start.

@@ -162,21 +189,35 @@ class QueryNotifyPrint(QueryNotify):

         title = f"Checking {id_type}"
         if self.color:
-            print(Style.BRIGHT + Fore.GREEN + "[" +
-                  Fore.YELLOW + "*" +
-                  Fore.GREEN + f"] {title}" +
-                  Fore.WHITE + f" {message}" +
-                  Fore.GREEN + " on:")
+            print(
+                Style.BRIGHT
+                + Fore.GREEN
+                + "["
+                + Fore.YELLOW
+                + "*"
+                + Fore.GREEN
+                + f"] {title}"
+                + Fore.WHITE
+                + f" {message}"
+                + Fore.GREEN
+                + " on:"
+            )
         else:
             print(f"[*] {title} {message} on:")

-    def warning(self, message, symbol='-'):
-        msg = f'[{symbol}] {message}'
+    def _colored_print(self, fore_color, msg):
         if self.color:
-            print(Style.BRIGHT + Fore.YELLOW + msg)
+            print(Style.BRIGHT + fore_color + msg)
         else:
             print(msg)

+    def warning(self, message, symbol="-"):
+        msg = f"[{symbol}] {message}"
+        self._colored_print(Fore.YELLOW, msg)
+
+    def info(self, message, symbol="*"):
+        msg = f"[{symbol}] {message}"
+        self._colored_print(Fore.BLUE, msg)
+
     def update(self, result, is_similar=False):
         """Notify Update.
@@ -191,77 +232,64 @@ class QueryNotifyPrint(QueryNotify):
         Return Value:
         Nothing.
         """
+        notify = None
         self.result = result

-        if not self.result.ids_data:
-            ids_data_text = ""
-        else:
-            ids_data_text = get_dict_ascii_tree(self.result.ids_data.items(), ' ')
-
-        def make_colored_terminal_notify(status, text, status_color, text_color, appendix):
-            text = [
-                f'{Style.BRIGHT}{Fore.WHITE}[{status_color}{status}{Fore.WHITE}]' +
-                f'{text_color} {text}: {Style.RESET_ALL}' +
-                f'{appendix}'
-            ]
-            return ''.join(text)
-
-        def make_simple_terminal_notify(status, text, appendix):
-            return f'[{status}] {text}: {appendix}'
-
-        def make_terminal_notify(is_colored=True, *args):
-            if is_colored:
-                return make_colored_terminal_notify(*args)
-            else:
-                return make_simple_terminal_notify(*args)
-
-        notify = None
+        ids_data_text = ""
+        if self.result.ids_data:
+            ids_data_text = get_dict_ascii_tree(self.result.ids_data.items(), " ")

         # Output to the terminal is desired.
         if result.status == QueryStatus.CLAIMED:
             color = Fore.BLUE if is_similar else Fore.GREEN
-            status = '?' if is_similar else '+'
-            notify = make_terminal_notify(
-                self.color,
-                status, result.site_name,
-                color, color,
-                result.site_url_user + ids_data_text
+            status = "?" if is_similar else "+"
+            notify = self.make_terminal_notify(
+                status,
+                result.site_name,
+                color,
+                color,
+                result.site_url_user + ids_data_text,
             )
         elif result.status == QueryStatus.AVAILABLE:
             if not self.print_found_only:
-                notify = make_terminal_notify(
-                    self.color,
-                    '-', result.site_name,
-                    Fore.RED, Fore.YELLOW,
-                    'Not found!' + ids_data_text
+                notify = self.make_terminal_notify(
+                    "-",
+                    result.site_name,
+                    Fore.RED,
+                    Fore.YELLOW,
+                    "Not found!" + ids_data_text,
                 )
         elif result.status == QueryStatus.UNKNOWN:
             if not self.skip_check_errors:
-                notify = make_terminal_notify(
-                    self.color,
-                    '?', result.site_name,
-                    Fore.RED, Fore.RED,
-                    self.result.context + ids_data_text
+                notify = self.make_terminal_notify(
+                    "?",
+                    result.site_name,
+                    Fore.RED,
+                    Fore.RED,
+                    str(self.result.error) + ids_data_text,
                 )
         elif result.status == QueryStatus.ILLEGAL:
             if not self.print_found_only:
-                text = 'Illegal Username Format For This Site!'
-                notify = make_terminal_notify(
-                    self.color,
-                    '-', result.site_name,
-                    Fore.RED, Fore.YELLOW,
-                    text + ids_data_text
+                text = "Illegal Username Format For This Site!"
+                notify = self.make_terminal_notify(
+                    "-",
+                    result.site_name,
+                    Fore.RED,
+                    Fore.YELLOW,
+                    text + ids_data_text,
                 )
         else:
             # It should be impossible to ever get here...
-            raise ValueError(f"Unknown Query Status '{str(result.status)}' for "
-                             f"site '{self.result.site_name}'")
+            raise ValueError(
+                f"Unknown Query Status '{str(result.status)}' for "
+                f"site '{self.result.site_name}'"
+            )

         if notify:
-            sys.stdout.write('\x1b[1K\r')
+            sys.stdout.write("\x1b[1K\r")
             print(notify)

-        return
+        return notify

     def __str__(self):
         """Convert Object To String.
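With the notify helpers promoted to methods, plain output can be produced without colorama markup by constructing the printer with color disabled; a small usage sketch based only on the methods shown above:

    notifier = QueryNotifyPrint(print_found_only=True, color=False)
    # make_terminal_notify dispatches to the simple variant when color is off;
    # the two color arguments are ignored there:
    line = notifier.make_terminal_notify(
        "+", "GitHub", None, None, "https://github.com/alice"
    )
    print(line)  # [+] GitHub: https://github.com/alice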
+303 -131
@@ -1,76 +1,231 @@
+import ast
 import csv
 import io
 import json
 import logging
 import os
-from argparse import ArgumentTypeError
 from datetime import datetime
+from typing import Dict, Any

-import pycountry
 import xmind
 from dateutil.parser import parse as parse_datetime_str
 from jinja2 import Template
-from xhtml2pdf import pisa

+from .checking import SUPPORTED_IDS
 from .result import QueryStatus
+from .sites import MaigretDatabase
 from .utils import is_country_tag, CaseConverter, enrich_link_str

 SUPPORTED_JSON_REPORT_FORMATS = [
-    'simple',
-    'ndjson',
+    "simple",
+    "ndjson",
 ]

-'''
+"""
 UTILS
-'''
+"""


 def filter_supposed_data(data):
-    ### interesting fields
-    allowed_fields = ['fullname', 'gender', 'location', 'age']
-    filtered_supposed_data = {CaseConverter.snake_to_title(k): v[0]
-                              for k, v in data.items()
-                              if k in allowed_fields}
+    # interesting fields
+    allowed_fields = ["fullname", "gender", "location", "age"]
+    filtered_supposed_data = {
+        CaseConverter.snake_to_title(k): v[0]
+        for k, v in data.items()
+        if k in allowed_fields
+    }
     return filtered_supposed_data


-'''
+def sort_report_by_data_points(results):
+    return dict(
+        sorted(
+            results.items(),
+            key=lambda x: len(
+                (x[1].get('status') and x[1]['status'].ids_data or {}).keys()
+            ),
+            reverse=True,
+        )
+    )
+
+
+"""
 REPORTS SAVING
-'''
+"""


 def save_csv_report(filename: str, username: str, results: dict):
-    with open(filename, 'w', newline='', encoding='utf-8') as f:
+    with open(filename, "w", newline="", encoding="utf-8") as f:
         generate_csv_report(username, results, f)


 def save_txt_report(filename: str, username: str, results: dict):
-    with open(filename, 'w', encoding='utf-8') as f:
+    with open(filename, "w", encoding="utf-8") as f:
         generate_txt_report(username, results, f)


 def save_html_report(filename: str, context: dict):
     template, _ = generate_report_template(is_pdf=False)
     filled_template = template.render(**context)
-    with open(filename, 'w') as f:
+    with open(filename, "w") as f:
         f.write(filled_template)


 def save_pdf_report(filename: str, context: dict):
     template, css = generate_report_template(is_pdf=True)
     filled_template = template.render(**context)
-    with open(filename, 'w+b') as f:
+
+    # moved here to speed up the launch of Maigret
+    from xhtml2pdf import pisa
+
+    with open(filename, "w+b") as f:
         pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css)


 def save_json_report(filename: str, username: str, results: dict, report_type: str):
-    with open(filename, 'w', encoding='utf-8') as f:
+    with open(filename, "w", encoding="utf-8") as f:
         generate_json_report(username, results, f, report_type=report_type)


-'''
+class MaigretGraph:
+    other_params = {'size': 10, 'group': 3}
+    site_params = {'size': 15, 'group': 2}
+    username_params = {'size': 20, 'group': 1}
+
+    def __init__(self, graph):
+        self.G = graph
+
+    def add_node(self, key, value):
+        node_name = f'{key}: {value}'
+
+        params = self.other_params
+        if key in SUPPORTED_IDS:
+            params = self.username_params
+        elif value.startswith('http'):
+            params = self.site_params
+
+        self.G.add_node(node_name, title=node_name, **params)
+
+        if value != value.lower():
+            normalized_node_name = self.add_node(key, value.lower())
+            self.link(node_name, normalized_node_name)
+
+        return node_name
+
+    def link(self, node1_name, node2_name):
+        self.G.add_edge(node1_name, node2_name, weight=2)
+
+
+def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
+    # moved here to speed up the launch of Maigret
+    import networkx as nx
+
+    G = nx.Graph()
+    graph = MaigretGraph(G)
+
+    for username, id_type, results in username_results:
+        username_node_name = graph.add_node(id_type, username)
+
+        for website_name in results:
+            dictionary = results[website_name]
+            # TODO: fix no site data issue
+            if not dictionary:
+                continue
+
+            if dictionary.get("is_similar"):
+                continue
+
+            status = dictionary.get("status")
+            if not status:  # FIXME: currently in case of timeout
+                continue
+
+            if dictionary["status"].status != QueryStatus.CLAIMED:
+                continue
+
+            site_fallback_name = dictionary.get(
+                'url_user', f'{website_name}: {username.lower()}'
+            )
+            # site_node_name = dictionary.get('url_user', f'{website_name}: {username.lower()}')
+            site_node_name = graph.add_node('site', site_fallback_name)
+            graph.link(username_node_name, site_node_name)
+
+            def process_ids(parent_node, ids):
+                for k, v in ids.items():
+                    if k.endswith('_count') or k.startswith('is_') or k.endswith('_at'):
+                        continue
+                    if k in 'image':
+                        continue
+
+                    v_data = v
+                    if v.startswith('['):
+                        try:
+                            v_data = ast.literal_eval(v)
+                        except Exception as e:
+                            logging.error(e)
+
+                    # value is a list
+                    if isinstance(v_data, list):
+                        list_node_name = graph.add_node(k, site_fallback_name)
+                        for vv in v_data:
+                            data_node_name = graph.add_node(vv, site_fallback_name)
+                            graph.link(list_node_name, data_node_name)
+
+                            add_ids = {
+                                a: b for b, a in db.extract_ids_from_url(vv).items()
+                            }
+                            if add_ids:
+                                process_ids(data_node_name, add_ids)
+                    else:
+                        # value is just a string
+                        # ids_data_name = f'{k}: {v}'
+                        # if ids_data_name == parent_node:
+                        #     continue
+
+                        ids_data_name = graph.add_node(k, v)
+                        # G.add_node(ids_data_name, size=10, title=ids_data_name, group=3)
+                        graph.link(parent_node, ids_data_name)
+
+                        # check for username
+                        if 'username' in k or k in SUPPORTED_IDS:
+                            new_username_node_name = graph.add_node('username', v)
+                            graph.link(ids_data_name, new_username_node_name)
+
+                        add_ids = {k: v for v, k in db.extract_ids_from_url(v).items()}
+                        if add_ids:
+                            process_ids(ids_data_name, add_ids)
+
+            if status.ids_data:
+                process_ids(site_node_name, status.ids_data)
+
+    nodes_to_remove = []
+    for node in G.nodes:
+        if len(str(node)) > 100:
+            nodes_to_remove.append(node)
+
+    [G.remove_node(node) for node in nodes_to_remove]
+
+    # moved here to speed up the launch of Maigret
+    from pyvis.network import Network
+
+    nt = Network(notebook=True, height="750px", width="100%")
+    nt.from_nx(G)
+    nt.show(filename)
+
+
+def get_plaintext_report(context: dict) -> str:
+    output = (context['brief'] + " ").replace('. ', '.\n')
+    interests = list(map(lambda x: x[0], context.get('interests_tuple_list', [])))
+    countries = list(map(lambda x: x[0], context.get('countries_tuple_list', [])))
+    if countries:
+        output += f'Countries: {", ".join(countries)}\n'
+    if interests:
+        output += f'Interests (tags): {", ".join(interests)}\n'
+    return output.strip()
+
+
+"""
 REPORTS GENERATING
-'''
+"""


 def generate_report_template(is_pdf: bool):
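The new sort_report_by_data_points above orders sites by how many extracted data points their status carries; an illustration with stand-in objects (real values are query result objects, but only the ids_data attribute matters for the sort key):

    class FakeStatus:
        # Hypothetical stand-in for the real status object:
        def __init__(self, ids_data):
            self.ids_data = ids_data

    results = {
        'SiteA': {'status': FakeStatus({'fullname': 'A'})},
        'SiteB': {'status': FakeStatus({'fullname': 'B', 'location': 'X'})},
    }
    # SiteB carries two data points, SiteA one, so SiteB comes first:
    assert list(sort_report_by_data_points(results)) == ['SiteB', 'SiteA']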
@@ -79,20 +234,20 @@ def generate_report_template(is_pdf: bool):
     """

     def get_resource_content(filename):
-        return open(os.path.join(maigret_path, 'resources', filename)).read()
+        return open(os.path.join(maigret_path, "resources", filename)).read()

     maigret_path = os.path.dirname(os.path.realpath(__file__))

     if is_pdf:
-        template_content = get_resource_content('simple_report_pdf.tpl')
-        css_content = get_resource_content('simple_report_pdf.css')
+        template_content = get_resource_content("simple_report_pdf.tpl")
+        css_content = get_resource_content("simple_report_pdf.css")
     else:
-        template_content = get_resource_content('simple_report.tpl')
+        template_content = get_resource_content("simple_report.tpl")
         css_content = None

     template = Template(template_content)
-    template.globals['title'] = CaseConverter.snake_to_title
-    template.globals['detect_link'] = enrich_link_str
+    template.globals["title"] = CaseConverter.snake_to_title  # type: ignore
+    template.globals["detect_link"] = enrich_link_str  # type: ignore
     return template, css_content


@@ -100,15 +255,18 @@ def generate_report_context(username_results: list):
     brief_text = []
     usernames = {}
     extended_info_count = 0
-    tags = {}
-    supposed_data = {}
+    tags: Dict[str, int] = {}
+    supposed_data: Dict[str, Any] = {}

     first_seen = None

+    # moved here to speed up the launch of Maigret
+    import pycountry
+
     for username, id_type, results in username_results:
         found_accounts = 0
         new_ids = []
-        usernames[username] = {'type': id_type}
+        usernames[username] = {"type": id_type}

         for website_name in results:
             dictionary = results[website_name]
@@ -116,19 +274,19 @@ def generate_report_context(username_results: list):
             if not dictionary:
                 continue

-            if dictionary.get('is_similar'):
+            if dictionary.get("is_similar"):
                 continue

-            status = dictionary.get('status')
+            status = dictionary.get("status")
             if not status:  # FIXME: currently in case of timeout
                 continue

             if status.ids_data:
-                dictionary['ids_data'] = status.ids_data
+                dictionary["ids_data"] = status.ids_data
                 extended_info_count += 1

                 # detect first seen
-                created_at = status.ids_data.get('created_at')
+                created_at = status.ids_data.get("created_at")
                 if created_at:
                     if first_seen is None:
                         first_seen = created_at
@@ -138,37 +296,46 @@ def generate_report_context(username_results: list):
                         new_time = parse_datetime_str(created_at)
                         if new_time < known_time:
                             first_seen = created_at
-                    except:
-                        logging.debug('Problems with converting datetime %s/%s', first_seen, created_at)
+                    except Exception as e:
+                        logging.debug(
+                            "Problems with converting datetime %s/%s: %s",
+                            first_seen,
+                            created_at,
+                            str(e),
+                        )

                 for k, v in status.ids_data.items():
                     # suppose target data
-                    field = 'fullname' if k == 'name' else k
-                    if not field in supposed_data:
+                    field = "fullname" if k == "name" else k
+                    if field not in supposed_data:
                         supposed_data[field] = []
                     supposed_data[field].append(v)
                     # suppose country
-                    if k in ['country', 'locale']:
+                    if k in ["country", "locale"]:
                         try:
                             if is_country_tag(k):
                                 tag = pycountry.countries.get(alpha_2=v).alpha_2.lower()
                             else:
-                                tag = pycountry.countries.search_fuzzy(v)[0].alpha_2.lower()
+                                tag = pycountry.countries.search_fuzzy(v)[
+                                    0
+                                ].alpha_2.lower()
                             # TODO: move countries to another struct
                             tags[tag] = tags.get(tag, 0) + 1
                         except Exception as e:
-                            logging.debug('pycountry exception', exc_info=True)
+                            logging.debug(
+                                "Pycountry exception: %s", str(e), exc_info=True
+                            )

-            new_usernames = dictionary.get('ids_usernames')
+            new_usernames = dictionary.get("ids_usernames")
             if new_usernames:
                 for u, utype in new_usernames.items():
-                    if not u in usernames:
+                    if u not in usernames:
                         new_ids.append((u, utype))
-                        usernames[u] = {'type': utype}
+                        usernames[u] = {"type": utype}

             if status.status == QueryStatus.CLAIMED:
                 found_accounts += 1
-                dictionary['found'] = True
+                dictionary["found"] = True
             else:
                 continue

@@ -177,22 +344,24 @@ def generate_report_context(username_results: list):
             for t in status.tags:
                 tags[t] = tags.get(t, 0) + 1

-        brief_text.append(f'Search by {id_type} {username} returned {found_accounts} accounts.')
+        brief_text.append(
+            f"Search by {id_type} {username} returned {found_accounts} accounts."
+        )

         if new_ids:
             ids_list = []
             for u, t in new_ids:
-                ids_list.append(f'{u} ({t})' if t != 'username' else u)
-            brief_text.append(f'Found target\'s other IDs: ' + ', '.join(ids_list) + '.')
+                ids_list.append(f"{u} ({t})" if t != "username" else u)
+            brief_text.append("Found target's other IDs: " + ", ".join(ids_list) + ".")

-    brief_text.append(f'Extended info extracted from {extended_info_count} accounts.')
+    brief_text.append(f"Extended info extracted from {extended_info_count} accounts.")

-    brief = ' '.join(brief_text).strip()
+    brief = " ".join(brief_text).strip()
     tuple_sort = lambda d: sorted(d, key=lambda x: x[1], reverse=True)

-    if 'global' in tags:
+    if "global" in tags:
         # remove tag 'global' useless for country detection
-        del tags['global']
+        del tags["global"]

     first_username = username_results[0][0]
     countries_lists = list(filter(lambda x: is_country_tag(x[0]), tags.items()))
@@ -201,35 +370,38 @@ def generate_report_context(username_results: list):
     filtered_supposed_data = filter_supposed_data(supposed_data)

     return {
-        'username': first_username,
-        'brief': brief,
-        'results': username_results,
-        'first_seen': first_seen,
-        'interests_tuple_list': tuple_sort(interests_list),
-        'countries_tuple_list': tuple_sort(countries_lists),
-        'supposed_data': filtered_supposed_data,
-        'generated_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
+        "username": first_username,
+        # TODO: return brief list
+        "brief": brief,
+        "results": username_results,
+        "first_seen": first_seen,
+        "interests_tuple_list": tuple_sort(interests_list),
+        "countries_tuple_list": tuple_sort(countries_lists),
+        "supposed_data": filtered_supposed_data,
+        "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
     }

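The context dict assembled above drives the HTML/PDF templates as well as get_plaintext_report; a hedged sketch of consuming it, where `results` stands for a per-site results dict produced by a scan and is not defined here:

    # Sketch only; 'results' comes from a prior maigret scan.
    context = generate_report_context([('alice', 'username', results)])
    print(context['brief'])
    print(get_plaintext_report(context))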
def generate_csv_report(username: str, results: dict, csvfile):
|
def generate_csv_report(username: str, results: dict, csvfile):
|
||||||
writer = csv.writer(csvfile)
|
writer = csv.writer(csvfile)
|
||||||
writer.writerow(['username',
|
writer.writerow(
|
||||||
'name',
|
["username", "name", "url_main", "url_user", "exists", "http_status"]
|
||||||
'url_main',
|
|
||||||
'url_user',
|
|
||||||
'exists',
|
|
||||||
'http_status'
|
|
||||||
]
|
|
||||||
)
|
)
|
||||||
for site in results:
|
for site in results:
|
||||||
writer.writerow([username,
|
# TODO: fix the reason
|
||||||
|
status = 'Unknown'
|
||||||
|
if "status" in results[site]:
|
||||||
|
status = str(results[site]["status"].status)
|
||||||
|
writer.writerow(
|
||||||
|
[
|
||||||
|
username,
|
||||||
site,
|
site,
|
||||||
results[site]['url_main'],
|
results[site].get("url_main", ""),
|
||||||
results[site]['url_user'],
|
results[site].get("url_user", ""),
|
||||||
str(results[site]['status'].status),
|
status,
|
||||||
results[site]['http_status'],
|
results[site].get("http_status", 0),
|
||||||
])
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def generate_txt_report(username: str, results: dict, file):
|
def generate_txt_report(username: str, results: dict, file):
|
||||||
@@ -239,29 +411,38 @@ def generate_txt_report(username: str, results: dict, file):
|
|||||||
# TODO: fix no site data issue
|
# TODO: fix no site data issue
|
||||||
if not dictionary:
|
if not dictionary:
|
||||||
continue
|
continue
|
||||||
if dictionary.get("status").status == QueryStatus.CLAIMED:
|
if (
|
||||||
|
dictionary.get("status")
|
||||||
|
and dictionary["status"].status == QueryStatus.CLAIMED
|
||||||
|
):
|
||||||
exists_counter += 1
|
exists_counter += 1
|
||||||
file.write(dictionary["url_user"] + "\n")
|
file.write(dictionary["url_user"] + "\n")
|
||||||
file.write(f'Total Websites Username Detected On : {exists_counter}')
|
file.write(f"Total Websites Username Detected On : {exists_counter}")
|
||||||
|
|
||||||
|
|
||||||
def generate_json_report(username: str, results: dict, file, report_type):
|
def generate_json_report(username: str, results: dict, file, report_type):
|
||||||
exists_counter = 0
|
is_report_per_line = report_type.startswith("ndjson")
|
||||||
is_report_per_line = report_type.startswith('ndjson')
|
|
||||||
all_json = {}
|
all_json = {}
|
||||||
|
|
||||||
for sitename in results:
|
for sitename in results:
|
||||||
site_result = results[sitename]
|
site_result = results[sitename]
|
||||||
# TODO: fix no site data issue
|
# TODO: fix no site data issue
|
||||||
if not site_result or site_result.get("status").status != QueryStatus.CLAIMED:
|
if not site_result or not site_result.get("status"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if site_result["status"].status != QueryStatus.CLAIMED:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
data = dict(site_result)
|
data = dict(site_result)
|
||||||
data['status'] = data['status'].json()
|
data["status"] = data["status"].json()
|
||||||
|
data["site"] = data["site"].json
|
||||||
|
for field in ["future", "checker"]:
|
||||||
|
if field in data:
|
||||||
|
del data[field]
|
||||||
|
|
||||||
if is_report_per_line:
|
if is_report_per_line:
|
||||||
data['sitename'] = sitename
|
data["sitename"] = sitename
|
||||||
file.write(json.dumps(data) + '\n')
|
file.write(json.dumps(data) + "\n")
|
||||||
else:
|
else:
|
||||||
all_json[sitename] = data
|
all_json[sitename] = data
|
||||||
|
|
||||||
@@ -269,9 +450,9 @@ def generate_json_report(username: str, results: dict, file, report_type):
     file.write(json.dumps(all_json))
 
 
-'''
+"""
 XMIND 8 Functions
-'''
+"""
 
 
 def save_xmind_report(filename, username, results):
@@ -279,12 +460,20 @@ def save_xmind_report(filename, username, results):
         os.remove(filename)
     workbook = xmind.load(filename)
     sheet = workbook.getPrimarySheet()
-    design_sheet(sheet, username, results)
+    design_xmind_sheet(sheet, username, results)
     xmind.save(workbook, path=filename)
 
 
-def design_sheet(sheet, username, results):
-    ##all tag list
+def add_xmind_subtopic(userlink, k, v, supposed_data):
+    currentsublabel = userlink.addSubTopic()
+    field = "fullname" if k == "name" else k
+    if field not in supposed_data:
+        supposed_data[field] = []
+    supposed_data[field].append(v)
+    currentsublabel.setTitle("%s: %s" % (k, v))
+
+
+def design_xmind_sheet(sheet, username, results):
     alltags = {}
     supposed_data = {}
 
@@ -298,62 +487,45 @@ def design_sheet(sheet, username, results):
 
     for website_name in results:
         dictionary = results[website_name]
-        if dictionary.get("status").status == QueryStatus.CLAIMED:
-            ## firsttime I found that entry
-            for tag in dictionary.get("status").tags:
-                if tag.strip() == "":
-                    continue
-                if tag not in alltags.keys():
-                    if not is_country_tag(tag):
-                        tagsection = root_topic1.addSubTopic()
-                        tagsection.setTitle(tag)
-                        alltags[tag] = tagsection
-
-            category = None
-            for tag in dictionary.get("status").tags:
-                if tag.strip() == "":
-                    continue
-                if not is_country_tag(tag):
-                    category = tag
-
-            if category is None:
-                userlink = undefinedsection.addSubTopic()
-                userlink.addLabel(dictionary.get("status").site_url_user)
-            else:
-                userlink = alltags[category].addSubTopic()
-                userlink.addLabel(dictionary.get("status").site_url_user)
-
-            if dictionary.get("status").ids_data:
-                for k, v in dictionary.get("status").ids_data.items():
-                    # suppose target data
-                    if not isinstance(v, list):
-                        currentsublabel = userlink.addSubTopic()
-                        field = 'fullname' if k == 'name' else k
-                        if not field in supposed_data:
-                            supposed_data[field] = []
-                        supposed_data[field].append(v)
-                        currentsublabel.setTitle("%s: %s" % (k, v))
-                    else:
-                        for currentval in v:
-                            currentsublabel = userlink.addSubTopic()
-                            field = 'fullname' if k == 'name' else k
-                            if not field in supposed_data:
-                                supposed_data[field] = []
-                            supposed_data[field].append(currentval)
-                            currentsublabel.setTitle("%s: %s" % (k, currentval))
+        if not dictionary:
+            continue
+
+        result_status = dictionary.get("status")
+        # TODO: fix the reason
+        if not result_status or result_status.status != QueryStatus.CLAIMED:
+            continue
+
+        stripped_tags = list(map(lambda x: x.strip(), result_status.tags))
+        normalized_tags = list(
+            filter(lambda x: x and not is_country_tag(x), stripped_tags)
+        )
+
+        category = None
+        for tag in normalized_tags:
+            if tag in alltags.keys():
+                continue
+            tagsection = root_topic1.addSubTopic()
+            tagsection.setTitle(tag)
+            alltags[tag] = tagsection
+            category = tag
+
+        section = alltags[category] if category else undefinedsection
+        userlink = section.addSubTopic()
+        userlink.addLabel(result_status.site_url_user)
+
+        ids_data = result_status.ids_data or {}
+        for k, v in ids_data.items():
+            # suppose target data
+            if isinstance(v, list):
+                for currentval in v:
+                    add_xmind_subtopic(userlink, k, currentval, supposed_data)
+            else:
+                add_xmind_subtopic(userlink, k, v, supposed_data)
 
-    ### Add Supposed DATA
-    filterede_supposed_data = filter_supposed_data(supposed_data)
-    if (len(filterede_supposed_data) > 0):
+    # add supposed data
+    filtered_supposed_data = filter_supposed_data(supposed_data)
+    if len(filtered_supposed_data) > 0:
         undefinedsection = root_topic1.addSubTopic()
         undefinedsection.setTitle("SUPPOSED DATA")
-        for k, v in filterede_supposed_data.items():
+        for k, v in filtered_supposed_data.items():
             currentsublabel = undefinedsection.addSubTopic()
             currentsublabel.setTitle("%s: %s" % (k, v))
 
 
-def check_supported_json_format(value):
-    if value and not value in SUPPORTED_JSON_REPORT_FORMATS:
-        raise ArgumentTypeError(f'JSON report type must be one of the following types: '
-                                + ', '.join(SUPPORTED_JSON_REPORT_FORMATS))
-    return value
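For orientation: the extracted `add_xmind_subtopic` both renders a subtopic title and accumulates values into `supposed_data`, which the tail of `design_xmind_sheet` later prints under a SUPPOSED DATA node. A minimal sketch of the accumulation side — `FakeTopic` is a hypothetical stand-in for the xmind topic API (`addSubTopic`/`setTitle`) used above:

    supposed_data = {}

    class FakeTopic:
        # stand-in for an xmind topic: addSubTopic() returns a child, setTitle() renders it
        def addSubTopic(self):
            return FakeTopic()

        def setTitle(self, title):
            print(title)

    userlink = FakeTopic()
    add_xmind_subtopic(userlink, "name", "John Smith", supposed_data)  # prints "name: John Smith"
    add_xmind_subtopic(userlink, "gender", "male", supposed_data)      # prints "gender: male"
    print(supposed_data)  # {'fullname': ['John Smith'], 'gender': ['male']}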
+9017 -3988
File diff suppressed because it is too large
@@ -0,0 +1,17 @@
+{
+    "presence_strings": [
+        "username",
+        "not found",
+        "пользователь",
+        "profile",
+        "lastname",
+        "firstname",
+        "biography",
+        "birthday",
+        "репутация",
+        "информация",
+        "e-mail"
+    ],
+    "supposed_usernames": [
+        "alex", "god", "admin", "red", "blue", "john"]
+}
@@ -68,7 +68,7 @@
       <div class="row-mb">
         <div class="col-md">
           <div class="card flex-md-row mb-4 box-shadow h-md-250">
-            <img class="card-img-right flex-auto d-md-block" alt="Photo" style="width: 200px; height: 200px; object-fit: scale-down;" src="{{ v.status.ids_data.image or 'https://i.imgur.com/040fmbw.png' }}" data-holder-rendered="true">
+            <img class="card-img-right flex-auto d-md-block" alt="Photo" style="width: 200px; height: 200px; object-fit: scale-down;" src="{{ v.status and v.status.ids_data and v.status.ids_data.image or 'https://i.imgur.com/040fmbw.png' }}" data-holder-rendered="true">
             <div class="card-body d-flex flex-column align-items-start" style="padding-top: 0;">
               <h3 class="mb-0" style="padding-top: 1rem;">
                 <a class="text-dark" href="{{ v.url_main }}" target="_blank">{{ k }}</a>
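The template fix above leans on Jinja2's short-circuiting `and`/`or`: each `and` returns its first falsy operand, so a missing `status` or `ids_data` no longer raises an attribute error, and `or` supplies the placeholder image. The same pattern in isolation (the context value is illustrative):

    from jinja2 import Template

    # v.status is None here; the and-chain yields None, and or-falls back to the placeholder
    tpl = Template("{{ v.status and v.status.ids_data and v.status.ids_data.image or 'placeholder.png' }}")
    print(tpl.render(v={"status": None}))  # -> placeholder.png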
+21 -9
@@ -10,6 +10,7 @@ class QueryStatus(Enum):
 
     Describes status of query about a given username.
     """
+
     CLAIMED = "Claimed"  # Username Detected
     AVAILABLE = "Available"  # Username Not Detected
     UNKNOWN = "Unknown"  # Error Occurred While Trying To Detect Username
@@ -27,14 +28,24 @@ class QueryStatus(Enum):
         return self.value
 
 
-class QueryResult():
+class QueryResult:
     """Query Result Object.
 
     Describes result of query about a given username.
     """
 
-    def __init__(self, username, site_name, site_url_user, status, ids_data=None,
-                 query_time=None, context=None, tags=[]):
+    def __init__(
+        self,
+        username,
+        site_name,
+        site_url_user,
+        status,
+        ids_data=None,
+        query_time=None,
+        context=None,
+        error=None,
+        tags=[],
+    ):
        """Create Query Result Object.
 
        Contains information about a specific method of detecting usernames on
@@ -73,15 +84,16 @@ class QueryResult():
         self.context = context
         self.ids_data = ids_data
         self.tags = tags
+        self.error = error
 
     def json(self):
         return {
-            'username': self.username,
-            'site_name': self.site_name,
-            'url': self.site_url_user,
-            'status': str(self.status),
-            'ids': self.ids_data or {},
-            'tags': self.tags,
+            "username": self.username,
+            "site_name": self.site_name,
+            "url": self.site_url_user,
+            "status": str(self.status),
+            "ids": self.ids_data or {},
+            "tags": self.tags,
         }
 
     def is_found(self):
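A quick sketch of the widened constructor in use — the keyword set matches the diff above, the concrete values are illustrative:

    from maigret.result import QueryResult, QueryStatus

    result = QueryResult(
        username="john",
        site_name="GitHub",
        site_url_user="https://github.com/john",
        status=QueryStatus.CLAIMED,
        error=None,  # the new field travels with the result for error reporting
    )
    # str(QueryStatus.CLAIMED) serializes to "Claimed" via the enum's value
    print(result.json())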
@@ -0,0 +1,29 @@
+import json
+
+
+class Settings:
+    presence_strings: list
+    supposed_usernames: list
+
+    def __init__(self, filename):
+        data = {}
+
+        try:
+            with open(filename, "r", encoding="utf-8") as file:
+                try:
+                    data = json.load(file)
+                except Exception as error:
+                    raise ValueError(
+                        f"Problem with parsing json contents of "
+                        f"settings file '{filename}': {str(error)}."
+                    )
+        except FileNotFoundError as error:
+            raise FileNotFoundError(
+                f"Problem while attempting to access settings file '{filename}'."
+            ) from error
+
+        self.__dict__.update(data)
+
+    @property
+    def json(self):
+        return self.__dict__
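Read together with the new resources file above: `Settings` simply maps the JSON keys onto attributes via `__dict__.update`, so `presence_strings` and `supposed_usernames` become plain attributes. A minimal sketch (the path is an assumption; maigret keeps its defaults under maigret/resources/):

    from maigret.settings import Settings

    settings = Settings("maigret/resources/settings.json")
    print(settings.supposed_usernames)    # ['alex', 'god', 'admin', 'red', 'blue', 'john']
    print(settings.presence_strings[:2])  # ['username', 'not found']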
+201 -118
@@ -1,28 +1,20 @@
-# -*- coding: future_annotations -*-
+# ****************************** -*-
 """Maigret Sites Information"""
 import copy
 import json
 import sys
+from typing import Optional, List, Dict, Any, Tuple
+
 import requests
 
 from .utils import CaseConverter, URLMatcher, is_country_tag
 
-# TODO: move to data.json
-SUPPORTED_TAGS = [
-    'gaming', 'coding', 'photo', 'music', 'blog', 'finance', 'freelance', 'dating',
-    'tech', 'forum', 'porn', 'erotic', 'webcam', 'video', 'movies', 'hacking', 'art',
-    'discussion', 'sharing', 'writing', 'wiki', 'business', 'shopping', 'sport',
-    'books', 'news', 'documents', 'travel', 'maps', 'hobby', 'apps', 'classified',
-    'career', 'geosocial', 'streaming', 'education', 'networking', 'torrent',
-    'science', 'medicine',
-]
-
 
 class MaigretEngine:
+    site: Dict[str, Any] = {}
+
     def __init__(self, name, data):
         self.name = name
-        self.site = {}
         self.__dict__.update(data)
 
     @property
@@ -32,43 +24,51 @@ class MaigretEngine:
 
 class MaigretSite:
     NOT_SERIALIZABLE_FIELDS = [
-        'name',
-        'engineData',
-        'requestFuture',
-        'detectedEngine',
-        'engineObj',
-        'stats',
-        'urlRegexp',
+        "name",
+        "engineData",
+        "requestFuture",
+        "detectedEngine",
+        "engineObj",
+        "stats",
+        "urlRegexp",
     ]
 
+    username_claimed = ""
+    username_unclaimed = ""
+    url_subpath = ""
+    url_main = ""
+    url = ""
+    disabled = False
+    similar_search = False
+    ignore403 = False
+    tags: List[str] = []
+
+    type = "username"
+    headers: Dict[str, str] = {}
+    errors: Dict[str, str] = {}
+    activation: Dict[str, Any] = {}
+    regex_check = None
+    url_probe = None
+    check_type = ""
+    request_head_only = ""
+    get_params: Dict[str, Any] = {}
+
+    presense_strs: List[str] = []
+    absence_strs: List[str] = []
+    stats: Dict[str, Any] = {}
+
+    engine = None
+    engine_data: Dict[str, Any] = {}
+    engine_obj: Optional["MaigretEngine"] = None
+    request_future = None
+    alexa_rank = None
+    source = None
+
+    protocol = ''
+
     def __init__(self, name, information):
         self.name = name
-        self.disabled = False
-        self.similar_search = False
-        self.ignore403 = False
-        self.tags = []
-
-        self.type = 'username'
-        self.headers = {}
-        self.errors = {}
-        self.activation = {}
-        self.url_subpath = ''
-        self.regex_check = None
-        self.url_probe = None
-        self.check_type = ''
-        self.request_head_only = ''
-        self.get_params = {}
-
-        self.presense_strs = []
-        self.absence_strs = []
-        self.stats = {}
-
-        self.engine = None
-        self.engine_data = {}
-        self.engine_obj = None
-        self.request_future = None
-        self.alexa_rank = None
+        self.url_subpath = ""
 
         for k, v in information.items():
             self.__dict__[CaseConverter.camel_to_snake(k)] = v
@@ -83,22 +83,44 @@ class MaigretSite:
         return f"{self.name} ({self.url_main})"
 
     def update_detectors(self):
-        if 'url' in self.__dict__:
+        if "url" in self.__dict__:
             url = self.url
-            for group in ['urlMain', 'urlSubpath']:
+            for group in ["urlMain", "urlSubpath"]:
                 if group in url:
-                    url = url.replace('{' + group + '}', self.__dict__[CaseConverter.camel_to_snake(group)])
+                    url = url.replace(
+                        "{" + group + "}",
+                        self.__dict__[CaseConverter.camel_to_snake(group)],
+                    )
 
             self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check)
 
-    def detect_username(self, url: str) -> str:
+    def detect_username(self, url: str) -> Optional[str]:
         if self.url_regexp:
             match_groups = self.url_regexp.match(url)
             if match_groups:
-                return match_groups.groups()[-1].rstrip('/')
+                return match_groups.groups()[-1].rstrip("/")
 
         return None
 
+    def extract_id_from_url(self, url: str) -> Optional[Tuple[str, str]]:
+        if not self.url_regexp:
+            return None
+
+        match_groups = self.url_regexp.match(url)
+        if not match_groups:
+            return None
+
+        _id = match_groups.groups()[-1].rstrip("/")
+        _type = self.type
+
+        return _id, _type
+
+    @property
+    def pretty_name(self):
+        if self.source:
+            return f"{self.name} [{self.source}]"
+        return self.name
+
     @property
     def json(self):
         result = {}
@@ -106,7 +128,7 @@ class MaigretSite:
             # convert to camelCase
             field = CaseConverter.snake_to_camel(k)
             # strip empty elements
-            if v in (False, '', [], {}, None, sys.maxsize, 'username'):
+            if v in (False, "", [], {}, None, sys.maxsize, "username"):
                 continue
             if field in self.NOT_SERIALIZABLE_FIELDS:
                 continue
@@ -114,13 +136,32 @@ class MaigretSite:
 
         return result
 
-    def update(self, updates: dict) -> MaigretSite:
+    @property
+    def errors_dict(self) -> dict:
+        errors: Dict[str, str] = {}
+        if self.engine_obj:
+            errors.update(self.engine_obj.site.get('errors', {}))
+        errors.update(self.errors)
+        return errors
+
+    def get_url_template(self) -> str:
+        url = URLMatcher.extract_main_part(self.url)
+        if url.startswith("{username}"):
+            url = "SUBDOMAIN"
+        elif url == "":
+            url = f"{self.url} ({self.engine or 'no engine'})"
+        else:
+            parts = url.split("/")
+            url = "/" + "/".join(parts[1:])
+        return url
+
+    def update(self, updates: "dict") -> "MaigretSite":
         self.__dict__.update(updates)
         self.update_detectors()
 
         return self
 
-    def update_from_engine(self, engine: MaigretEngine) -> MaigretSite:
+    def update_from_engine(self, engine: MaigretEngine) -> "MaigretSite":
         engine_data = engine.site
         for k, v in engine_data.items():
             field = CaseConverter.camel_to_snake(k)
@@ -138,7 +179,7 @@ class MaigretSite:
 
         return self
 
-    def strip_engine_data(self) -> MaigretSite:
+    def strip_engine_data(self) -> "MaigretSite":
         if not self.engine_obj:
             return self
 
@@ -146,7 +187,7 @@ class MaigretSite:
         self.url_regexp = None
 
         self_copy = copy.deepcopy(self)
-        engine_data = self_copy.engine_obj.site
+        engine_data = self_copy.engine_obj and self_copy.engine_obj.site or {}
         site_data_keys = list(self_copy.__dict__.keys())
 
         for k in engine_data.keys():
@@ -172,8 +213,9 @@ class MaigretSite:
 
 class MaigretDatabase:
     def __init__(self):
-        self._sites = []
-        self._engines = []
+        self._tags: list = []
+        self._sites: list = []
+        self._engines: list = []
 
     @property
     def sites(self):
@@ -183,8 +225,15 @@ class MaigretDatabase:
     def sites_dict(self):
         return {site.name: site for site in self._sites}
 
-    def ranked_sites_dict(self, reverse=False, top=sys.maxsize, tags=[], names=[],
-                          disabled=True, id_type='username'):
+    def ranked_sites_dict(
+        self,
+        reverse=False,
+        top=sys.maxsize,
+        tags=[],
+        names=[],
+        disabled=True,
+        id_type="username",
+    ):
         """
         Ranking and filtering of the sites list
         """
@@ -192,20 +241,37 @@ class MaigretDatabase:
         normalized_tags = list(map(str.lower, tags))
 
         is_name_ok = lambda x: x.name.lower() in normalized_names
-        is_engine_ok = lambda x: isinstance(x.engine, str) and x.engine.lower() in normalized_tags
+        is_source_ok = lambda x: x.source and x.source.lower() in normalized_names
+        is_engine_ok = (
+            lambda x: isinstance(x.engine, str) and x.engine.lower() in normalized_tags
+        )
         is_tags_ok = lambda x: set(x.tags).intersection(set(normalized_tags))
-        is_disabled_needed = lambda x: not x.disabled or ('disabled' in tags or disabled)
+        is_protocol_in_tags = lambda x: x.protocol and x.protocol in normalized_tags
+        is_disabled_needed = lambda x: not x.disabled or (
+            "disabled" in tags or disabled
+        )
         is_id_type_ok = lambda x: x.type == id_type
 
-        filter_tags_engines_fun = lambda x: not tags or is_engine_ok(x) or is_tags_ok(x)
-        filter_names_fun = lambda x: not names or is_name_ok(x)
+        filter_tags_engines_fun = (
+            lambda x: not tags
+            or is_engine_ok(x)
+            or is_tags_ok(x)
+            or is_protocol_in_tags(x)
+        )
+        filter_names_fun = lambda x: not names or is_name_ok(x) or is_source_ok(x)
 
-        filter_fun = lambda x: filter_tags_engines_fun(x) and filter_names_fun(x) \
-                               and is_disabled_needed(x) and is_id_type_ok(x)
+        filter_fun = (
+            lambda x: filter_tags_engines_fun(x)
+            and filter_names_fun(x)
+            and is_disabled_needed(x)
+            and is_id_type_ok(x)
+        )
 
         filtered_list = [s for s in self.sites if filter_fun(s)]
 
-        sorted_list = sorted(filtered_list, key=lambda x: x.alexa_rank, reverse=reverse)[:top]
+        sorted_list = sorted(
+            filtered_list, key=lambda x: x.alexa_rank, reverse=reverse
+        )[:top]
         return {site.name: site for site in sorted_list}
 
     @property
@@ -216,7 +282,7 @@ class MaigretDatabase:
     def engines_dict(self):
         return {engine.name: engine for engine in self._engines}
 
-    def update_site(self, site: MaigretSite) -> MaigretDatabase:
+    def update_site(self, site: MaigretSite) -> "MaigretDatabase":
         for s in self._sites:
             if s.name == site.name:
                 s = site
@@ -225,23 +291,30 @@ class MaigretDatabase:
         self._sites.append(site)
         return self
 
-    def save_to_file(self, filename: str) -> MaigretDatabase:
+    def save_to_file(self, filename: str) -> "MaigretDatabase":
+        if '://' in filename:
+            return self
+
         db_data = {
-            'sites': {site.name: site.strip_engine_data().json for site in self._sites},
-            'engines': {engine.name: engine.json for engine in self._engines},
+            "sites": {site.name: site.strip_engine_data().json for site in self._sites},
+            "engines": {engine.name: engine.json for engine in self._engines},
+            "tags": self._tags,
         }
 
         json_data = json.dumps(db_data, indent=4)
 
-        with open(filename, 'w') as f:
+        with open(filename, "w") as f:
             f.write(json_data)
 
         return self
 
-    def load_from_json(self, json_data: dict) -> MaigretDatabase:
+    def load_from_json(self, json_data: dict) -> "MaigretDatabase":
         # Add all of site information from the json file to internal site list.
         site_data = json_data.get("sites", {})
         engines_data = json_data.get("engines", {})
+        tags = json_data.get("tags", [])
+
+        self._tags += tags
 
         for engine_name in engines_data:
             self._engines.append(MaigretEngine(engine_name, engines_data[engine_name]))
@@ -250,30 +323,38 @@ class MaigretDatabase:
             try:
                 maigret_site = MaigretSite(site_name, site_data[site_name])
 
-                engine = site_data[site_name].get('engine')
+                engine = site_data[site_name].get("engine")
                 if engine:
                     maigret_site.update_from_engine(self.engines_dict[engine])
 
                 self._sites.append(maigret_site)
             except KeyError as error:
-                raise ValueError(f"Problem parsing json content for site {site_name}: "
+                raise ValueError(
+                    f"Problem parsing json content for site {site_name}: "
                     f"Missing attribute {str(error)}."
                 )
 
         return self
 
-    def load_from_str(self, db_str: str) -> MaigretDatabase:
+    def load_from_str(self, db_str: "str") -> "MaigretDatabase":
         try:
             data = json.loads(db_str)
         except Exception as error:
-            raise ValueError(f"Problem parsing json contents from str"
+            raise ValueError(
+                f"Problem parsing json contents from str"
                 f"'{db_str[:50]}'...: {str(error)}."
             )
 
         return self.load_from_json(data)
 
-    def load_from_url(self, url: str) -> MaigretDatabase:
-        is_url_valid = url.startswith('http://') or url.startswith('https://')
+    def load_from_path(self, path: str) -> "MaigretDatabase":
+        if '://' in path:
+            return self.load_from_http(path)
+        else:
+            return self.load_from_file(path)
+
+    def load_from_http(self, url: str) -> "MaigretDatabase":
+        is_url_valid = url.startswith("http://") or url.startswith("https://")
 
         if not is_url_valid:
             raise FileNotFoundError(f"Invalid data file URL '{url}'.")
@@ -281,7 +362,8 @@ class MaigretDatabase:
         try:
             response = requests.get(url=url)
         except Exception as error:
-            raise FileNotFoundError(f"Problem while attempting to access "
+            raise FileNotFoundError(
+                f"Problem while attempting to access "
                 f"data file URL '{url}': "
                 f"{str(error)}"
             )
@@ -290,29 +372,30 @@ class MaigretDatabase:
         try:
             data = response.json()
         except Exception as error:
-            raise ValueError(f"Problem parsing json contents at "
-                f"'{url}': {str(error)}."
+            raise ValueError(
+                f"Problem parsing json contents at " f"'{url}': {str(error)}."
             )
         else:
-            raise FileNotFoundError(f"Bad response while accessing "
-                f"data file URL '{url}'."
+            raise FileNotFoundError(
+                f"Bad response while accessing " f"data file URL '{url}'."
             )
 
         return self.load_from_json(data)
 
-    def load_from_file(self, filename: str) -> MaigretDatabase:
+    def load_from_file(self, filename: "str") -> "MaigretDatabase":
         try:
-            with open(filename, 'r', encoding='utf-8') as file:
+            with open(filename, "r", encoding="utf-8") as file:
                 try:
                     data = json.load(file)
                 except Exception as error:
-                    raise ValueError(f"Problem parsing json contents from "
+                    raise ValueError(
+                        f"Problem parsing json contents from "
                         f"file '{filename}': {str(error)}."
                     )
         except FileNotFoundError as error:
-            raise FileNotFoundError(f"Problem while attempting to access "
-                f"data file '{filename}'."
-            )
+            raise FileNotFoundError(
+                f"Problem while attempting to access " f"data file '{filename}'."
            ) from error
 
         return self.load_from_json(data)
 
@@ -320,57 +403,57 @@ class MaigretDatabase:
         sites = sites_dict or self.sites_dict
         found_flags = {}
         for _, s in sites.items():
-            if 'presense_flag' in s.stats:
-                flag = s.stats['presense_flag']
+            if "presense_flag" in s.stats:
+                flag = s.stats["presense_flag"]
                 found_flags[flag] = found_flags.get(flag, 0) + 1
 
         return found_flags
 
+    def extract_ids_from_url(self, url: str) -> dict:
+        results = {}
+        for s in self._sites:
+            result = s.extract_id_from_url(url)
+            if not result:
+                continue
+            _id, _type = result
+            results[_id] = _type
+        return results
+
     def get_db_stats(self, sites_dict):
         if not sites_dict:
             sites_dict = self.sites_dict()
 
-        output = ''
-        disabled_count = 0
-        total_count = len(sites_dict)
         urls = {}
         tags = {}
+        output = ""
+        disabled_count = 0
+        total_count = len(sites_dict)
 
         for _, site in sites_dict.items():
            if site.disabled:
                disabled_count += 1
 
-            url = URLMatcher.extract_main_part(site.url)
-            if url.startswith('{username}'):
-                url = 'SUBDOMAIN'
-            elif url == '':
-                url = f'{site.url} ({site.engine})'
-            else:
-                parts = url.split('/')
-                url = '/' + '/'.join(parts[1:])
-
-            urls[url] = urls.get(url, 0) + 1
+            url_type = site.get_url_template()
+            urls[url_type] = urls.get(url_type, 0) + 1
 
             if not site.tags:
-                tags['NO_TAGS'] = tags.get('NO_TAGS', 0) + 1
+                tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1
 
-            for tag in site.tags:
-                if is_country_tag(tag):
-                    # currenty do not display country tags
-                    continue
+            for tag in filter(lambda x: not is_country_tag(x), site.tags):
                 tags[tag] = tags.get(tag, 0) + 1
 
-        output += f'Enabled/total sites: {total_count - disabled_count}/{total_count}\n'
-        output += 'Top sites\' profile URLs:\n'
+        output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n"
+        output += "Top profile URLs:\n"
         for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:20]:
             if count == 1:
                 break
-            output += f'{count}\t{url}\n'
-        output += 'Top sites\' tags:\n'
-        for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True):
-            mark = ''
-            if not tag in SUPPORTED_TAGS:
-                mark = ' (non-standard)'
-            output += f'{count}\t{tag}{mark}\n'
+            output += f"{count}\t{url}\n"
 
+        output += "Top tags:\n"
+        for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
+            mark = ""
+            if tag not in self._tags:
+                mark = " (non-standard)"
+            output += f"{count}\t{tag}{mark}\n"
 
         return output
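A short sketch of how the new database entry points compose — `load_from_path` dispatches on "://", and `extract_ids_from_url` fans a URL out over every site's profile-URL regexp (the path and URL are illustrative):

    from maigret.sites import MaigretDatabase

    db = MaigretDatabase().load_from_path("maigret/resources/data.json")

    # each site whose regexp matches contributes an {id: id_type} pair
    print(db.extract_ids_from_url("https://github.com/john"))  # e.g. {'john': 'username'}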
+267 -116
@@ -1,34 +1,55 @@
-import difflib
+import asyncio
+import json
+import re
+from typing import List
+import xml.etree.ElementTree as ET
 import requests
 
-from .checking import *
+from .activation import import_aiohttp_cookies
+from .checking import maigret
+from .result import QueryStatus
+from .settings import Settings
+from .sites import MaigretDatabase, MaigretSite, MaigretEngine
+from .utils import get_random_user_agent, get_match_ratio
 
 
-DESIRED_STRINGS = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
-                   "birthday", "репутация", "информация", "e-mail"]
-
-SUPPOSED_USERNAMES = ['alex', 'god', 'admin', 'red', 'blue', 'john']
-
-RATIO = 0.6
-TOP_FEATURES = 5
-URL_RE = re.compile(r'https?://(www\.)?')
-
-
-def get_match_ratio(x):
-    return round(max([
-        difflib.SequenceMatcher(a=x.lower(), b=y).ratio()
-        for y in DESIRED_STRINGS
-    ]), 2)
+class Submitter:
+    HEADERS = {
+        "User-Agent": get_random_user_agent(),
+    }
+
+    SEPARATORS = "\"'"
+
+    RATIO = 0.6
+    TOP_FEATURES = 5
+    URL_RE = re.compile(r"https?://(www\.)?")
+
+    def __init__(self, db: MaigretDatabase, settings: Settings, logger):
+        self.settings = settings
+        self.db = db
+        self.logger = logger
+
+    @staticmethod
+    def get_alexa_rank(site_url_main):
+        url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
+        xml_data = requests.get(url).text
+        root = ET.fromstring(xml_data)
+        alexa_rank = 0
+
+        try:
+            alexa_rank = int(root.find('.//REACH').attrib['RANK'])
+        except Exception:
+            pass
+
+        return alexa_rank
 
-
-def extract_mainpage_url(url):
-    return '/'.join(url.split('/', 3)[:3])
+    @staticmethod
+    def extract_mainpage_url(url):
+        return "/".join(url.split("/", 3)[:3])
 
-
-async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
-    changes = {
-        'disabled': False,
-    }
+    async def site_self_check(self, site, semaphore, silent=False):
+        changes = {
+            "disabled": False,
+        }
 
-    check_data = [
+        check_data = [
@@ -36,13 +57,13 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
             (site.username_unclaimed, QueryStatus.AVAILABLE),
         ]
 
-    logger.info(f'Checking {site.name}...')
+        self.logger.info(f"Checking {site.name}...")
 
         for username, status in check_data:
             results_dict = await maigret(
                 username=username,
                 site_dict={site.name: site},
-                logger=logger,
+                logger=self.logger,
                 timeout=30,
                 id_type=site.type,
                 forced=True,
@@ -52,11 +73,11 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
             # don't disable entries with other ids types
             # TODO: make normal checking
             if site.name not in results_dict:
-                logger.info(results_dict)
-                changes['disabled'] = True
+                self.logger.info(results_dict)
+                changes["disabled"] = True
                 continue
 
-            result = results_dict[site.name]['status']
+            result = results_dict[site.name]["status"]
 
             site_status = result.status
 
@@ -64,169 +85,299 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
             if site_status == QueryStatus.UNKNOWN:
                 msgs = site.absence_strs
                 etype = site.check_type
-                logger.warning(
-                    f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}')
+                self.logger.warning(
+                    "Error while searching '%s' in %s: %s, %s, check type %s",
+                    username,
+                    site.name,
+                    result.context,
+                    msgs,
+                    etype,
+                )
                 # don't disable in case of available username
                 if status == QueryStatus.CLAIMED:
-                    changes['disabled'] = True
+                    changes["disabled"] = True
             elif status == QueryStatus.CLAIMED:
-                logger.warning(f'Not found `{username}` in {site.name}, must be claimed')
-                logger.info(results_dict[site.name])
-                changes['disabled'] = True
+                self.logger.warning(
+                    f"Not found `{username}` in {site.name}, must be claimed"
+                )
+                self.logger.info(results_dict[site.name])
+                changes["disabled"] = True
             else:
-                logger.warning(f'Found `{username}` in {site.name}, must be available')
-                logger.info(results_dict[site.name])
-                changes['disabled'] = True
+                self.logger.warning(
+                    f"Found `{username}` in {site.name}, must be available"
+                )
+                self.logger.info(results_dict[site.name])
+                changes["disabled"] = True
 
-    logger.info(f'Site {site.name} checking is finished')
+        self.logger.info(f"Site {site.name} checking is finished")
 
         return changes
 
-
-async def detect_known_engine(db, url_exists, url_mainpage):
+    def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog):
+        fields = {}
+        if 'urlSubpath' in engine.site.get('url', ''):
+            msg = (
+                'Detected engine suppose additional URL subpath using (/forum/, /blog/, etc). '
+                'Enter in manually if it exists: '
+            )
+            subpath = input(msg).strip('/')
+            if subpath:
+                fields['urlSubpath'] = f'/{subpath}'
+        return fields
+
+    async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]:
         try:
             r = requests.get(url_mainpage)
+            self.logger.debug(r.text)
         except Exception as e:
-        print(e)
-        print('Some error while checking main page')
-        return None
+            self.logger.warning(e)
+            print("Some error while checking main page")
+            return []
 
-    for e in db.engines:
-        strs_to_check = e.__dict__.get('presenseStrs')
+        for engine in self.db.engines:
+            strs_to_check = engine.__dict__.get("presenseStrs")
             if strs_to_check and r and r.text:
                 all_strs_in_response = True
                 for s in strs_to_check:
-                    if not s in r.text:
+                    if s not in r.text:
                         all_strs_in_response = False
-            if all_strs_in_response:
-                engine_name = e.__dict__.get('name')
-                print(f'Detected engine {engine_name} for site {url_mainpage}')
 
                 sites = []
-                for u in SUPPOSED_USERNAMES:
-                    site_data = {
-                        'urlMain': url_mainpage,
-                        'name': url_mainpage.split('//')[0],
-                        'engine': engine_name,
-                        'usernameClaimed': u,
-                        'usernameUnclaimed': 'noonewouldeverusethis7',
-                    }
-
-                    maigret_site = MaigretSite(url_mainpage.split('/')[-1], site_data)
-                    maigret_site.update_from_engine(db.engines_dict[engine_name])
-                    sites.append(maigret_site)
-
-                return sites
+                if all_strs_in_response:
+                    engine_name = engine.__dict__.get("name")
+
+                    print(f"Detected engine {engine_name} for site {url_mainpage}")
+
+                    usernames_to_check = self.settings.supposed_usernames
+                    supposed_username = self.extract_username_dialog(url_exists)
+                    if supposed_username:
+                        usernames_to_check = [supposed_username] + usernames_to_check
+
+                    add_fields = self.generate_additional_fields_dialog(
+                        engine, url_exists
+                    )
+
+                    for u in usernames_to_check:
+                        site_data = {
+                            "urlMain": url_mainpage,
+                            "name": url_mainpage.split("//")[1],
+                            "engine": engine_name,
+                            "usernameClaimed": u,
+                            "usernameUnclaimed": "noonewouldeverusethis7",
+                            **add_fields,
+                        }
+                        self.logger.info(site_data)
+
+                        maigret_site = MaigretSite(
+                            url_mainpage.split("/")[-1], site_data
+                        )
+                        maigret_site.update_from_engine(
+                            self.db.engines_dict[engine_name]
+                        )
+                        sites.append(maigret_site)
+
+                    return sites
 
-    return None
+        return []
 
+    def extract_username_dialog(self, url):
+        url_parts = url.rstrip("/").split("/")
+        supposed_username = url_parts[-1].strip('@')
+        entered_username = input(
+            f'Is "{supposed_username}" a valid username? If not, write it manually: '
+        )
+        return entered_username if entered_username else supposed_username
 
-async def check_features_manually(db, url_exists, url_mainpage, cookie_file):
-    url_parts = url_exists.split('/')
-    supposed_username = url_parts[-1]
-    new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
-    if new_name:
-        supposed_username = new_name
-    non_exist_username = 'noonewouldeverusethis7'
-
-    url_user = url_exists.replace(supposed_username, '{username}')
+    async def check_features_manually(
+        self, url_exists, url_mainpage, cookie_file, redirects=False
+    ):
+        custom_headers = {}
+        while True:
+            header_key = input(
+                'Specify custom header if you need or just press Enter to skip. Header name: '
+            )
+            if not header_key:
+                break
+            header_value = input('Header value: ')
+            custom_headers[header_key.strip()] = header_value.strip()
+
+        supposed_username = self.extract_username_dialog(url_exists)
+        non_exist_username = "noonewouldeverusethis7"
+
+        url_user = url_exists.replace(supposed_username, "{username}")
         url_not_exists = url_exists.replace(supposed_username, non_exist_username)
 
+        headers = dict(self.HEADERS)
+        headers.update(custom_headers)
+
         # cookies
         cookie_dict = None
         if cookie_file:
-        cookie_jar = await import_aiohttp_cookies(cookie_file)
+            self.logger.info(f'Use {cookie_file} for cookies')
+            cookie_jar = import_aiohttp_cookies(cookie_file)
             cookie_dict = {c.key: c.value for c in cookie_jar}
 
-    a = requests.get(url_exists, cookies=cookie_dict).text
-    b = requests.get(url_not_exists, cookies=cookie_dict).text
+        exists_resp = requests.get(
+            url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
+        )
+        self.logger.debug(url_exists)
+        self.logger.debug(exists_resp.status_code)
+        self.logger.debug(exists_resp.text)
 
-    tokens_a = set(a.split('"'))
-    tokens_b = set(b.split('"'))
+        non_exists_resp = requests.get(
+            url_not_exists,
+            cookies=cookie_dict,
+            headers=headers,
+            allow_redirects=redirects,
+        )
+        self.logger.debug(url_not_exists)
+        self.logger.debug(non_exists_resp.status_code)
+        self.logger.debug(non_exists_resp.text)
+
+        a = exists_resp.text
+        b = non_exists_resp.text
+
+        tokens_a = set(re.split(f'[{self.SEPARATORS}]', a))
+        tokens_b = set(re.split(f'[{self.SEPARATORS}]', b))
 
         a_minus_b = tokens_a.difference(tokens_b)
         b_minus_a = tokens_b.difference(tokens_a)
 
-    top_features_count = int(input(f'Specify count of features to extract [default {TOP_FEATURES}]: ') or TOP_FEATURES)
-
-    presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[:top_features_count]
-
-    print('Detected text features of existing account: ' + ', '.join(presence_list))
-    features = input('If features was not detected correctly, write it manually: ')
+        if len(a_minus_b) == len(b_minus_a) == 0:
+            print("The pages for existing and non-existing account are the same!")
+
+        top_features_count = int(
+            input(
+                f"Specify count of features to extract [default {self.TOP_FEATURES}]: "
+            )
+            or self.TOP_FEATURES
+        )
+
+        match_fun = get_match_ratio(self.settings.presence_strings)
+
+        presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[
+            :top_features_count
+        ]
+
+        print("Detected text features of existing account: " + ", ".join(presence_list))
+        features = input("If features was not detected correctly, write it manually: ")
 
         if features:
-        presence_list = features.split(',')
+            presence_list = list(map(str.strip, features.split(",")))
 
-    absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[:top_features_count]
-    print('Detected text features of non-existing account: ' + ', '.join(absence_list))
-    features = input('If features was not detected correctly, write it manually: ')
+        absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[
+            :top_features_count
+        ]
+        print(
+            "Detected text features of non-existing account: " + ", ".join(absence_list)
+        )
+        features = input("If features was not detected correctly, write it manually: ")
 
         if features:
-        absence_list = features.split(',')
+            absence_list = list(map(str.strip, features.split(",")))
 
         site_data = {
-        'absenceStrs': absence_list,
-        'presenseStrs': presence_list,
-        'url': url_user,
-        'urlMain': url_mainpage,
-        'usernameClaimed': supposed_username,
-        'usernameUnclaimed': non_exist_username,
-        'checkType': 'message',
+            "absenceStrs": absence_list,
+            "presenseStrs": presence_list,
+            "url": url_user,
+            "urlMain": url_mainpage,
+            "usernameClaimed": supposed_username,
+            "usernameUnclaimed": non_exist_username,
+            "checkType": "message",
        }
 
-    site = MaigretSite(url_mainpage.split('/')[-1], site_data)
+        if headers != self.HEADERS:
+            site_data['headers'] = headers
+
+        site = MaigretSite(url_mainpage.split("/")[-1], site_data)
         return site
 
-async def submit_dialog(db, url_exists, cookie_file):
-    domain_raw = URL_RE.sub('', url_exists).strip().strip('/')
-    domain_raw = domain_raw.split('/')[0]
+    async def dialog(self, url_exists, cookie_file):
+        domain_raw = self.URL_RE.sub("", url_exists).strip().strip("/")
+        domain_raw = domain_raw.split("/")[0]
+        self.logger.info('Domain is %s', domain_raw)
 
         # check for existence
-    matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
+        matched_sites = list(
+            filter(lambda x: domain_raw in x.url_main + x.url, self.db.sites)
+        )
 
         if matched_sites:
-        print(f'Sites with domain "{domain_raw}" already exists in the Maigret database!')
-        status = lambda s: '(disabled)' if s.disabled else ''
-        url_block = lambda s: f'\n\t{s.url_main}\n\t{s.url}'
-        print('\n'.join([f'{site.name} {status(site)}{url_block(site)}' for site in matched_sites]))
-
-        return False
+            print(
+                f'Sites with domain "{domain_raw}" already exists in the Maigret database!'
+            )
+            status = lambda s: "(disabled)" if s.disabled else ""
+            url_block = lambda s: f"\n\t{s.url_main}\n\t{s.url}"
+            print(
+                "\n".join(
+                    [
+                        f"{site.name} {status(site)}{url_block(site)}"
+                        for site in matched_sites
+                    ]
+                )
+            )
+
+            if input("Do you want to continue? [yN] ").lower() in "n":
+                return False
 
-    url_mainpage = extract_mainpage_url(url_exists)
+        url_mainpage = self.extract_mainpage_url(url_exists)
 
-    sites = await detect_known_engine(db, url_exists, url_mainpage)
+        print('Detecting site engine, please wait...')
+        sites = []
+        try:
+            sites = await self.detect_known_engine(url_exists, url_mainpage)
+        except KeyboardInterrupt:
+            print('Engine detect process is interrupted.')
+
         if not sites:
-        print('Unable to detect site engine, lets generate checking features')
-        sites = [await check_features_manually(db, url_exists, url_mainpage, cookie_file)]
+            print("Unable to detect site engine, lets generate checking features")
+            sites = [
+                await self.check_features_manually(
+                    url_exists, url_mainpage, cookie_file
+                )
+            ]
 
-    print(sites[0].__dict__)
+        self.logger.debug(sites[0].__dict__)
 
         sem = asyncio.Semaphore(1)
-    log_level = logging.INFO
-    logging.basicConfig(
-        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
-        datefmt='%H:%M:%S',
-        level=log_level
-    )
-    logger = logging.getLogger('site-submit')
-    logger.setLevel(log_level)
 
+        print("Checking, please wait...")
         found = False
         chosen_site = None
         for s in sites:
             chosen_site = s
-        result = await site_self_check(s, logger, sem, db)
-        if not result['disabled']:
+            result = await self.site_self_check(s, sem)
+            if not result["disabled"]:
                 found = True
                 break
 
         if not found:
-        print(f'Sorry, we couldn\'t find params to detect account presence/absence in {chosen_site.name}.')
-        print('Try to run this mode again and increase features count or choose others.')
-    else:
-        if input(f'Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] ').lower() in 'y':
-            print(chosen_site.json)
-            site_data = chosen_site.strip_engine_data()
-            print(site_data.json)
-            db.update_site(site_data)
-            return True
-
-    return False
+            print(
+                f"Sorry, we couldn't find params to detect account presence/absence in {chosen_site.name}."
+            )
+            print(
+                "Try to run this mode again and increase features count or choose others."
+            )
+            self.logger.debug(json.dumps(chosen_site.json))
+
+            return False
+        else:
+            if (
+                input(
+                    f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
+                )
+                .lower()
+                .strip("y")
+            ):
+                return False
+
+            chosen_site.name = input("Change site name if you want: ") or chosen_site.name
+            chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
+            rank = Submitter.get_alexa_rank(chosen_site.url_main)
+            if rank:
+                print(f'New alexa rank: {rank}')
+                chosen_site.alexa_rank = rank
+
+            self.logger.debug(chosen_site.json)
+            site_data = chosen_site.strip_engine_data()
+            self.logger.debug(site_data.json)
+            self.db.update_site(site_data)
+            return True
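The refactor turns a bag of module-level functions into a class whose collaborators are injected once. A hypothetical driver (the logger setup is illustrative — the old code configured its own inside submit_dialog):

    import asyncio
    import logging

    from maigret.settings import Settings
    from maigret.sites import MaigretDatabase
    from maigret.submit import Submitter

    logger = logging.getLogger("maigret")
    db = MaigretDatabase().load_from_path("maigret/resources/data.json")
    settings = Settings("maigret/resources/settings.json")

    submitter = Submitter(db=db, settings=settings, logger=logger)
    # dialog() walks through engine detection, feature extraction and self-check
    saved = asyncio.run(submitter.dialog("https://example.com/users/john", cookie_file=None))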
@@ -0,0 +1,11 @@
+from typing import Callable, List, Dict, Tuple, Any
+
+
+# search query
+QueryDraft = Tuple[Callable, List, Dict]
+
+# options dict
+QueryOptions = Dict[str, Any]
+
+# TODO: throw out
+QueryResultWrapper = Dict[str, Any]
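The aliases only name recurring shapes; a hypothetical sketch of how a caller might annotate with them (the helper itself is not part of the diff):

    from maigret.types import QueryDraft

    def make_draft(fun, *args, **kwargs) -> QueryDraft:
        # a callable plus its positional and keyword arguments, queued for later execution
        return fun, list(args), dict(kwargs)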
+58
-23
@@ -1,78 +1,113 @@
|
|||||||
|
import ast
|
||||||
|
import difflib
|
||||||
import re
|
import re
|
||||||
|
import random
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_USER_AGENTS = [
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
class CaseConverter:
|
class CaseConverter:
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def camel_to_snake(camelcased_string: str) -> str:
|
def camel_to_snake(camelcased_string: str) -> str:
|
||||||
return re.sub(r'(?<!^)(?=[A-Z])', '_', camelcased_string).lower()
|
return re.sub(r"(?<!^)(?=[A-Z])", "_", camelcased_string).lower()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def snake_to_camel(snakecased_string: str) -> str:
|
def snake_to_camel(snakecased_string: str) -> str:
|
||||||
formatted = ''.join(word.title() for word in snakecased_string.split('_'))
|
formatted = "".join(word.title() for word in snakecased_string.split("_"))
|
||||||
result = formatted[0].lower() + formatted[1:]
|
result = formatted[0].lower() + formatted[1:]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def snake_to_title(snakecased_string: str) -> str:
|
def snake_to_title(snakecased_string: str) -> str:
|
||||||
words = snakecased_string.split('_')
|
words = snakecased_string.split("_")
|
||||||
words[0] = words[0].title()
|
words[0] = words[0].title()
|
||||||
return ' '.join(words)
|
return " ".join(words)
|
||||||
|
|
||||||
|
|
||||||
def is_country_tag(tag: str) -> bool:
|
def is_country_tag(tag: str) -> bool:
|
||||||
"""detect if tag represent a country"""
|
"""detect if tag represent a country"""
|
||||||
return bool(re.match("^([a-zA-Z]){2}$", tag)) or tag == 'global'
|
return bool(re.match("^([a-zA-Z]){2}$", tag)) or tag == "global"
|
||||||
|
|
||||||
|
|
||||||
def enrich_link_str(link: str) -> str:
|
def enrich_link_str(link: str) -> str:
|
||||||
link = link.strip()
|
link = link.strip()
|
||||||
if link.startswith('www.') or (link.startswith('http') and '//' in link):
|
if link.startswith("www.") or (link.startswith("http") and "//" in link):
|
||||||
return f'<a class="auto-link" href="{link}">{link}</a>'
|
return f'<a class="auto-link" href="{link}">{link}</a>'
|
||||||
return link
|
return link
|
||||||
|
|
||||||
|
|
||||||
class URLMatcher:
|
class URLMatcher:
|
||||||
_HTTP_URL_RE_STR = '^https?://(www.)?(.+)$'
|
     _HTTP_URL_RE_STR = "^https?://(www.)?(.+)$"
     HTTP_URL_RE = re.compile(_HTTP_URL_RE_STR)
-    UNSAFE_SYMBOLS = '.?'
+    UNSAFE_SYMBOLS = ".?"

     @classmethod
     def extract_main_part(self, url: str) -> str:
         match = self.HTTP_URL_RE.search(url)
         if match and match.group(2):
-            return match.group(2).rstrip('/')
+            return match.group(2).rstrip("/")

-        return ''
+        return ""

     @classmethod
-    def make_profile_url_regexp(self, url: str, username_regexp: str = ''):
+    def make_profile_url_regexp(self, url: str, username_regexp: str = ""):
         url_main_part = self.extract_main_part(url)
         for c in self.UNSAFE_SYMBOLS:
-            url_main_part = url_main_part.replace(c, f'\\{c}')
-        username_regexp = username_regexp or '.+?'
+            url_main_part = url_main_part.replace(c, f"\\{c}")
+        prepared_username_regexp = (username_regexp or ".+?").lstrip('^').rstrip('$')

-        url_regexp = url_main_part.replace('{username}', f'({username_regexp})')
-        regexp_str = self._HTTP_URL_RE_STR.replace('(.+)', url_regexp)
+        url_regexp = url_main_part.replace(
+            "{username}", f"({prepared_username_regexp})"
+        )
+        regexp_str = self._HTTP_URL_RE_STR.replace("(.+)", url_regexp)

         return re.compile(regexp_str)


-def get_dict_ascii_tree(items, prepend='', new_line=True):
-    text = ''
+def ascii_data_display(data: str) -> Any:
+    return ast.literal_eval(data)
+
+
+def get_dict_ascii_tree(items, prepend="", new_line=True):
+    text = ""
     for num, item in enumerate(items):
-        box_symbol = '┣╸' if num != len(items) - 1 else '┗╸'
+        box_symbol = "┣╸" if num != len(items) - 1 else "┗╸"

         if type(item) == tuple:
             field_name, field_value = item
-            if field_value.startswith('[\''):
+            if field_value.startswith("['"):
                 is_last_item = num == len(items) - 1
-                prepend_symbols = ' ' * 3 if is_last_item else ' ┃ '
-                field_value = print_ascii_tree(eval(field_value), prepend_symbols)
-            text += f'\n{prepend}{box_symbol}{field_name}: {field_value}'
+                prepend_symbols = " " * 3 if is_last_item else " ┃ "
+                data = ascii_data_display(field_value)
+                field_value = get_dict_ascii_tree(data, prepend_symbols)
+            text += f"\n{prepend}{box_symbol}{field_name}: {field_value}"
         else:
-            text += f'\n{prepend}{box_symbol} {item}'
+            text += f"\n{prepend}{box_symbol} {item}"

     if not new_line:
         text = text[1:]

     return text


+def get_random_user_agent():
+    return random.choice(DEFAULT_USER_AGENTS)
+
+
+def get_match_ratio(base_strs: list):
+    def get_match_inner(s: str):
+        return round(
+            max(
+                [
+                    difflib.SequenceMatcher(a=s.lower(), b=s2.lower()).ratio()
+                    for s2 in base_strs
+                ]
+            ),
+            2,
+        )
+
+    return get_match_inner
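In passing, a usage sketch of the two behaviors changed above: anchor stripping in make_profile_url_regexp and the new get_match_ratio closure. The example.com URL and sample names are illustrative, not from the repository:

from maigret.utils import URLMatcher, get_match_ratio

# '^' and '$' are now stripped from a site's username regexp before it is
# embedded, so an anchored pattern no longer corrupts the composed URL regexp.
url_re = URLMatcher.make_profile_url_regexp(
    'https://example.com/users/{username}', r'^[a-z]+$'
)
match = url_re.search('https://example.com/users/john')
assert match and match.group(2) == 'john'

# get_match_ratio() returns a closure that scores case-insensitive
# similarity against the base strings, rounded to two digits.
ratio_of = get_match_ratio(['john', 'johnny'])
assert ratio_of('JOHN') == 1.0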
+4 -1
@@ -1,3 +1,4 @@
+aiodns==3.0.0
 aiohttp==3.7.4
 aiohttp-socks==0.5.5
 arabic-reshaper==2.1.1
@@ -26,7 +27,7 @@ python-socks==1.1.2
 requests>=2.24.0
 requests-futures==1.0.0
 six==1.15.0
-socid-extractor>=0.0.16
+socid-extractor>=0.0.21
 soupsieve==2.1
 stem==1.8.0
 torrequest==0.1.0
@@ -36,3 +37,5 @@ webencodings==0.5.1
 xhtml2pdf==0.2.5
 XMind==1.2.0
 yarl==1.6.3
+networkx==2.5.1
+pyvis==0.1.9
@@ -1,3 +1,9 @@
 [egg_info]
 tag_build =
 tag_date = 0
+
+[flake8]
+per-file-ignores = __init__.py:F401
+
+[mypy]
+ignore_missing_imports = True
@@ -5,14 +5,13 @@ from setuptools import (

 with open('README.md') as fh:
-    readme = fh.read()
-long_description = readme.replace('./', 'https://raw.githubusercontent.com/soxoj/maigret/main/')
+    long_description = fh.read()

 with open('requirements.txt') as rf:
     requires = rf.read().splitlines()

 setup(name='maigret',
-      version='0.1.18',
+      version='0.3.1',
       description='Collect a dossier on a person by username from a huge number of sites',
       long_description=long_description,
       long_description_content_type="text/markdown",
@@ -0,0 +1,6 @@
+flake8==3.8.4
+pytest==6.2.4
+pytest-asyncio==0.14.0
+pytest-cov==2.10.1
+pytest-httpserver==1.0.0
+pytest-rerunfailures==9.1.1
+27 -4
@@ -6,10 +6,14 @@ import pytest
 from _pytest.mark import Mark

 from maigret.sites import MaigretDatabase
+from maigret.maigret import setup_arguments_parser


 CUR_PATH = os.path.dirname(os.path.realpath(__file__))
 JSON_FILE = os.path.join(CUR_PATH, '../maigret/resources/data.json')
-empty_mark = Mark('', [], {})
+TEST_JSON_FILE = os.path.join(CUR_PATH, 'db.json')
+LOCAL_TEST_JSON_FILE = os.path.join(CUR_PATH, 'local.json')
+empty_mark = Mark('', (), {})


 def by_slow_marker(item):
@@ -26,15 +30,24 @@ def get_test_reports_filenames():

 def remove_test_reports():
     reports_list = get_test_reports_filenames()
-    for f in reports_list: os.remove(f)
+    for f in reports_list:
+        os.remove(f)
     logging.error(f'Removed test reports {reports_list}')


 @pytest.fixture(scope='session')
 def default_db():
-    db = MaigretDatabase().load_from_file(JSON_FILE)
-
-    return db
+    return MaigretDatabase().load_from_file(JSON_FILE)
+
+
+@pytest.fixture(scope='function')
+def test_db():
+    return MaigretDatabase().load_from_file(TEST_JSON_FILE)
+
+
+@pytest.fixture(scope='function')
+def local_test_db():
+    return MaigretDatabase().load_from_file(LOCAL_TEST_JSON_FILE)


 @pytest.fixture(autouse=True)
@@ -42,3 +55,13 @@ def reports_autoclean():
     remove_test_reports()
     yield
     remove_test_reports()
+
+
+@pytest.fixture(scope='session')
+def argparser():
+    return setup_arguments_parser()
+
+
+@pytest.fixture(scope="session")
+def httpserver_listen_address():
+    return ("localhost", 8989)
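A sketch of a test built on the fixtures above (the request expectation is illustrative). Every site in local.json targets http://localhost:8989/, which is exactly where httpserver_listen_address pins pytest-httpserver, so nothing leaves the machine:

def test_local_claimed_profile(httpserver, local_test_db):
    httpserver.expect_request('/url', query_string='id=claimed').respond_with_data(
        'user profile', status=200
    )
    sites = local_test_db.sites_dict  # fresh database per test (scope='function')
    assert 'StatusCode' in sites and 'Message' in sites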
@@ -0,0 +1,26 @@
+{
+    "engines": {},
+    "sites": {
+        "GooglePlayStore": {
+            "tags": ["global", "us"],
+            "disabled": false,
+            "checkType": "status_code",
+            "alexaRank": 1,
+            "url": "https://play.google.com/store/apps/developer?id={username}",
+            "urlMain": "https://play.google.com/store",
+            "usernameClaimed": "Facebook_nosuchname",
+            "usernameUnclaimed": "noonewouldeverusethis7"
+        },
+        "Reddit": {
+            "tags": ["news", "social", "us"],
+            "checkType": "status_code",
+            "presenseStrs": ["totalKarma"],
+            "disabled": true,
+            "alexaRank": 17,
+            "url": "https://www.reddit.com/user/{username}",
+            "urlMain": "https://www.reddit.com/",
+            "usernameClaimed": "blue",
+            "usernameUnclaimed": "noonewouldeverusethis7"
+        }
+    }
+}
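This fixture deliberately pairs an enabled site whose claimed username does not exist with a disabled site whose claimed username does, so the self-check tests further down can exercise both transitions. A sanity sketch of that intent (the path assumes these files live under tests/, as the conftest paths suggest):

import json
import os

db = json.load(open(os.path.join('tests', 'db.json')))

# Enabled, but "Facebook_nosuchname" does not exist -> self_check should disable.
assert db['sites']['GooglePlayStore']['disabled'] is False

# Disabled, but "blue" does exist -> self_check should re-enable.
assert db['sites']['Reddit']['disabled'] is True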
@@ -0,0 +1,21 @@
+{
+    "engines": {},
+    "sites": {
+        "StatusCode": {
+            "checkType": "status_code",
+            "url": "http://localhost:8989/url?id={username}",
+            "urlMain": "http://localhost:8989/",
+            "usernameClaimed": "claimed",
+            "usernameUnclaimed": "unclaimed"
+        },
+        "Message": {
+            "checkType": "message",
+            "url": "http://localhost:8989/url?id={username}",
+            "urlMain": "http://localhost:8989/",
+            "presenseStrs": ["user", "profile"],
+            "absenseStrs": ["not found", "404"],
+            "usernameClaimed": "claimed",
+            "usernameUnclaimed": "unclaimed"
+        }
+    }
+}
@@ -22,6 +22,7 @@ httpbin.org FALSE / FALSE 0 a b
 """


+@pytest.mark.skip(reason="periodically fails")
 @pytest.mark.slow
 def test_twitter_activation(default_db):
     twitter_site = default_db.sites_dict['Twitter']
@@ -39,13 +40,14 @@ async def test_import_aiohttp_cookies():
     with open(cookies_filename, 'w') as f:
         f.write(COOKIES_TXT)

-    cookie_jar = await import_aiohttp_cookies(cookies_filename)
+    cookie_jar = import_aiohttp_cookies(cookies_filename)
     assert list(cookie_jar._cookies.keys()) == ['xss.is', 'httpbin.org']

     url = 'https://httpbin.org/cookies'
     connector = aiohttp.TCPConnector(ssl=False)
-    session = aiohttp.ClientSession(connector=connector, trust_env=True,
-                                    cookie_jar=cookie_jar)
+    session = aiohttp.ClientSession(
+        connector=connector, trust_env=True, cookie_jar=cookie_jar
+    )

     response = await session.get(url=url)
     result = json.loads(await response.content.read())
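Note the dropped await: import_aiohttp_cookies is now an ordinary synchronous function, so a jar can be built before any event loop exists. A minimal sketch; the module path is an assumption (the function's import is not shown in this hunk), and cookies.txt is a Netscape-format file:

import aiohttp

from maigret.checking import import_aiohttp_cookies  # assumed location

cookie_jar = import_aiohttp_cookies('cookies.txt')  # plain call, no await
session = aiohttp.ClientSession(cookie_jar=cookie_jar)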
+51 -48
@@ -1,66 +1,69 @@
-"""Maigret checking logic test functions"""
+from mock import Mock
 import pytest
-import asyncio
-import logging
-from maigret.checking import AsyncioSimpleExecutor, AsyncioProgressbarExecutor, AsyncioProgressbarSemaphoreExecutor, AsyncioProgressbarQueueExecutor

-logger = logging.getLogger(__name__)
+from maigret import search

-async def func(n):
-    await asyncio.sleep(0.1 * (n % 3))
-    return n

+def site_result_except(server, username, **kwargs):
+    query = f'id={username}'
+    server.expect_request('/url', query_string=query).respond_with_data(**kwargs)


+@pytest.mark.slow
 @pytest.mark.asyncio
-async def test_simple_asyncio_executor():
-    tasks = [(func, [n], {}) for n in range(10)]
-    executor = AsyncioSimpleExecutor(logger=logger)
-    assert await executor.run(tasks) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
-    assert executor.execution_time > 0.2
-    assert executor.execution_time < 0.3
+async def test_checking_by_status_code(httpserver, local_test_db):
+    sites_dict = local_test_db.sites_dict
+
+    site_result_except(httpserver, 'claimed', status=200)
+    site_result_except(httpserver, 'unclaimed', status=404)
+
+    result = await search('claimed', site_dict=sites_dict, logger=Mock())
+    assert result['StatusCode']['status'].is_found() is True
+
+    result = await search('unclaimed', site_dict=sites_dict, logger=Mock())
+    assert result['StatusCode']['status'].is_found() is False


+@pytest.mark.slow
 @pytest.mark.asyncio
-async def test_asyncio_progressbar_executor():
-    tasks = [(func, [n], {}) for n in range(10)]
-
-    executor = AsyncioProgressbarExecutor(logger=logger)
-    # no guarantees for the results order
-    assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
-    assert executor.execution_time > 0.2
-    assert executor.execution_time < 0.3
+async def test_checking_by_message_positive_full(httpserver, local_test_db):
+    sites_dict = local_test_db.sites_dict
+
+    site_result_except(httpserver, 'claimed', response_data="user profile")
+    site_result_except(httpserver, 'unclaimed', response_data="404 not found")
+
+    result = await search('claimed', site_dict=sites_dict, logger=Mock())
+    assert result['Message']['status'].is_found() is True
+
+    result = await search('unclaimed', site_dict=sites_dict, logger=Mock())
+    assert result['Message']['status'].is_found() is False


+@pytest.mark.slow
 @pytest.mark.asyncio
-async def test_asyncio_progressbar_semaphore_executor():
-    tasks = [(func, [n], {}) for n in range(10)]
-
-    executor = AsyncioProgressbarSemaphoreExecutor(logger=logger, in_parallel=5)
-    # no guarantees for the results order
-    assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
-    assert executor.execution_time > 0.2
-    assert executor.execution_time < 0.4
+async def test_checking_by_message_positive_part(httpserver, local_test_db):
+    sites_dict = local_test_db.sites_dict
+
+    site_result_except(httpserver, 'claimed', response_data="profile")
+    site_result_except(httpserver, 'unclaimed', response_data="404")
+
+    result = await search('claimed', site_dict=sites_dict, logger=Mock())
+    assert result['Message']['status'].is_found() is True
+
+    result = await search('unclaimed', site_dict=sites_dict, logger=Mock())
+    assert result['Message']['status'].is_found() is False


+@pytest.mark.slow
 @pytest.mark.asyncio
-async def test_asyncio_progressbar_queue_executor():
-    tasks = [(func, [n], {}) for n in range(10)]
-
-    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=2)
-    assert await executor.run(tasks) == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8]
-    assert executor.execution_time > 0.5
-    assert executor.execution_time < 0.6
-
-    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=3)
-    assert await executor.run(tasks) == [0, 3, 1, 4, 6, 2, 7, 9, 5, 8]
-    assert executor.execution_time > 0.4
-    assert executor.execution_time < 0.5
-
-    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=5)
-    assert await executor.run(tasks) == [0, 3, 6, 1, 4, 7, 9, 2, 5, 8]
-    assert executor.execution_time > 0.3
-    assert executor.execution_time < 0.4
-
-    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=10)
-    assert await executor.run(tasks) == [0, 3, 6, 9, 1, 4, 7, 2, 5, 8]
-    assert executor.execution_time > 0.2
-    assert executor.execution_time < 0.3
+async def test_checking_by_message_negative(httpserver, local_test_db):
+    sites_dict = local_test_db.sites_dict
+
+    site_result_except(httpserver, 'claimed', response_data="")
+    site_result_except(httpserver, 'unclaimed', response_data="user 404")
+
+    result = await search('claimed', site_dict=sites_dict, logger=Mock())
+    assert result['Message']['status'].is_found() is False
+
+    result = await search('unclaimed', site_dict=sites_dict, logger=Mock())
+    assert result['Message']['status'].is_found() is True
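Taken together, the six message-check cases pin down a decision rule: is_found() matches "any presence marker occurs in the body" in every case, even when an absence marker is also present ("user 404" counts as found). The absense strings serve to confirm a miss rather than to veto a hit. Condensed (a simplification; the real checker also tracks errors and statuses):

def is_found_by_message(body: str, presense_strs: list) -> bool:
    # Decision rule consistent with all six cases above.
    return any(marker in body for marker in presense_strs)

assert is_found_by_message('user profile', ['user', 'profile']) is True
assert is_found_by_message('404 not found', ['user', 'profile']) is False
assert is_found_by_message('user 404', ['user', 'profile']) is True  # presence wins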
@@ -0,0 +1,98 @@
+"""Maigret command-line arguments parsing tests"""
+from argparse import Namespace
+from typing import Dict, Any
+
+DEFAULT_ARGS: Dict[str, Any] = {
+    'all_sites': False,
+    'connections': 100,
+    'cookie_file': None,
+    'csv': False,
+    'db_file': None,
+    'debug': False,
+    'disable_extracting': False,
+    'disable_recursive_search': False,
+    'folderoutput': 'reports',
+    'html': False,
+    'graph': False,
+    'id_type': 'username',
+    'ignore_ids_list': [],
+    'info': False,
+    'json': '',
+    'new_site_to_submit': False,
+    'no_color': False,
+    'no_progressbar': False,
+    'parse_url': '',
+    'pdf': False,
+    'print_check_errors': False,
+    'print_not_found': False,
+    'proxy': None,
+    'reports_sorting': 'default',
+    'retries': 1,
+    'self_check': False,
+    'site_list': [],
+    'stats': False,
+    'tags': '',
+    'timeout': 30,
+    'tor_proxy': 'socks5://127.0.0.1:9050',
+    'i2p_proxy': 'http://127.0.0.1:4444',
+    'top_sites': 500,
+    'txt': False,
+    'use_disabled_sites': False,
+    'username': [],
+    'verbose': False,
+    'with_domains': False,
+    'xmind': False,
+}
+
+
+def test_args_search_mode(argparser):
+    args = argparser.parse_args('username'.split())
+
+    assert args.username == ['username']
+
+    want_args = dict(DEFAULT_ARGS)
+    want_args.update({'username': ['username']})
+
+    assert args == Namespace(**want_args)
+
+
+def test_args_search_mode_several_usernames(argparser):
+    args = argparser.parse_args('username1 username2'.split())
+
+    assert args.username == ['username1', 'username2']
+
+    want_args = dict(DEFAULT_ARGS)
+    want_args.update({'username': ['username1', 'username2']})
+
+    assert args == Namespace(**want_args)
+
+
+def test_args_self_check_mode(argparser):
+    args = argparser.parse_args('--self-check --site GitHub'.split())
+
+    want_args = dict(DEFAULT_ARGS)
+    want_args.update(
+        {
+            'self_check': True,
+            'site_list': ['GitHub'],
+            'username': [],
+        }
+    )
+
+    assert args == Namespace(**want_args)
+
+
+def test_args_multiple_sites(argparser):
+    args = argparser.parse_args(
+        '--site GitHub VK --site PornHub --site Taringa,Steam'.split()
+    )
+
+    want_args = dict(DEFAULT_ARGS)
+    want_args.update(
+        {
+            'site_list': ['GitHub', 'PornHub', 'Taringa,Steam'],
+            'username': ['VK'],
+        }
+    )
+
+    assert args == Namespace(**want_args)
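The whole-object comparisons above work because argparse.Namespace defines equality over its attribute dict, which makes "copy DEFAULT_ARGS, override a few keys, compare everything" a full-surface check. A two-line demonstration:

from argparse import Namespace

a = Namespace(x=1, y=2)
b = Namespace(y=2, x=1)
assert a == b                        # order-insensitive attribute comparison
assert vars(a) == {'x': 1, 'y': 2}   # equality is defined over this dict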
@@ -0,0 +1,16 @@
+"""Maigret data test functions"""
+
+from maigret.utils import is_country_tag
+
+
+def test_tags_validity(default_db):
+    unknown_tags = set()
+
+    tags = default_db._tags
+
+    for site in default_db.sites:
+        for tag in filter(lambda x: not is_country_tag(x), site.tags):
+            if tag not in tags:
+                unknown_tags.add(tag)
+
+    assert unknown_tags == set()
@@ -0,0 +1,73 @@
+"""Maigret checking logic test functions"""
+import pytest
+import asyncio
+import logging
+from maigret.executors import (
+    AsyncioSimpleExecutor,
+    AsyncioProgressbarExecutor,
+    AsyncioProgressbarSemaphoreExecutor,
+    AsyncioProgressbarQueueExecutor,
+)
+
+logger = logging.getLogger(__name__)
+
+
+async def func(n):
+    await asyncio.sleep(0.1 * (n % 3))
+    return n
+
+
+@pytest.mark.asyncio
+async def test_simple_asyncio_executor():
+    tasks = [(func, [n], {}) for n in range(10)]
+    executor = AsyncioSimpleExecutor(logger=logger)
+    assert await executor.run(tasks) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    assert executor.execution_time > 0.2
+    assert executor.execution_time < 0.3
+
+
+@pytest.mark.asyncio
+async def test_asyncio_progressbar_executor():
+    tasks = [(func, [n], {}) for n in range(10)]
+
+    executor = AsyncioProgressbarExecutor(logger=logger)
+    # no guarantees for the results order
+    assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    assert executor.execution_time > 0.2
+    assert executor.execution_time < 0.3
+
+
+@pytest.mark.asyncio
+async def test_asyncio_progressbar_semaphore_executor():
+    tasks = [(func, [n], {}) for n in range(10)]
+
+    executor = AsyncioProgressbarSemaphoreExecutor(logger=logger, in_parallel=5)
+    # no guarantees for the results order
+    assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
+    assert executor.execution_time > 0.2
+    assert executor.execution_time < 0.4
+
+
+@pytest.mark.asyncio
+async def test_asyncio_progressbar_queue_executor():
+    tasks = [(func, [n], {}) for n in range(10)]
+
+    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=2)
+    assert await executor.run(tasks) == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8]
+    assert executor.execution_time > 0.5
+    assert executor.execution_time < 0.6
+
+    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=3)
+    assert await executor.run(tasks) == [0, 3, 1, 4, 6, 2, 7, 9, 5, 8]
+    assert executor.execution_time > 0.4
+    assert executor.execution_time < 0.5
+
+    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=5)
+    assert await executor.run(tasks) == [0, 3, 6, 1, 4, 7, 9, 2, 5, 8]
+    assert executor.execution_time > 0.3
+    assert executor.execution_time < 0.4
+
+    executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=10)
+    assert await executor.run(tasks) == [0, 3, 6, 9, 1, 4, 7, 2, 5, 8]
+    assert executor.execution_time > 0.2
+    assert executor.execution_time < 0.3
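The queue-executor orderings asserted above look magical but follow from the sleep pattern 0.1 * (n % 3): tasks with n divisible by 3 return at once, the rest after 0.1 s or 0.2 s. A standard-library toy that mimics N workers pulling from one shared queue reproduces the in_parallel=2 ordering (ties can depend on the event loop, hence "expected"):

import asyncio

async def func(n):
    await asyncio.sleep(0.1 * (n % 3))
    return n

async def run_pool(in_parallel):
    # Toy analogue of a queue executor: workers pull task ids from a
    # shared queue and append results in completion order.
    queue: asyncio.Queue = asyncio.Queue()
    results = []
    for n in range(10):
        queue.put_nowait(n)

    async def worker():
        while not queue.empty():
            results.append(await func(queue.get_nowait()))

    await asyncio.gather(*(worker() for _ in range(in_parallel)))
    return results

print(asyncio.run(run_pool(2)))  # expected: [0, 1, 3, 2, 4, 6, 7, 5, 9, 8]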
+163 -92
@@ -1,106 +1,177 @@
 """Maigret main module test functions"""
 import asyncio
+import copy

 import pytest
 from mock import Mock

-from maigret.maigret import self_check
-from maigret.sites import MaigretDatabase
-
-EXAMPLE_DB = {
-    'engines': {
-    },
-    'sites': {
-        "GooglePlayStore": {
-            "tags": [
-                "global",
-                "us"
-            ],
-            "disabled": False,
-            "checkType": "status_code",
-            "alexaRank": 1,
-            "url": "https://play.google.com/store/apps/developer?id={username}",
-            "urlMain": "https://play.google.com/store",
-            "usernameClaimed": "Facebook_nosuchname",
-            "usernameUnclaimed": "noonewouldeverusethis7"
-        },
-        "Reddit": {
-            "tags": [
-                "news",
-                "social",
-                "us"
-            ],
-            "checkType": "status_code",
-            "presenseStrs": [
-                "totalKarma"
-            ],
-            "disabled": True,
-            "alexaRank": 17,
-            "url": "https://www.reddit.com/user/{username}",
-            "urlMain": "https://www.reddit.com/",
-            "usernameClaimed": "blue",
-            "usernameUnclaimed": "noonewouldeverusethis7"
-        },
-    }
-}
+from maigret.maigret import self_check, maigret
+from maigret.maigret import (
+    extract_ids_from_page,
+    extract_ids_from_results,
+)
+from maigret.sites import MaigretSite
+from maigret.result import QueryResult, QueryStatus
+
+RESULTS_EXAMPLE = {
+    'Reddit': {
+        'cookies': None,
+        'parsing_enabled': False,
+        'url_main': 'https://www.reddit.com/',
+        'username': 'Facebook',
+    },
+    'GooglePlayStore': {
+        'cookies': None,
+        'http_status': 200,
+        'is_similar': False,
+        'parsing_enabled': False,
+        'rank': 1,
+        'url_main': 'https://play.google.com/store',
+        'url_user': 'https://play.google.com/store/apps/developer?id=Facebook',
+        'username': 'Facebook',
+    },
+}


 @pytest.mark.slow
-def test_self_check_db_positive_disable():
+def test_self_check_db_positive_disable(test_db):
     logger = Mock()
-    db = MaigretDatabase()
-    db.load_from_json(EXAMPLE_DB)
-
-    assert db.sites[0].disabled == False
+    assert test_db.sites[0].disabled is False

     loop = asyncio.get_event_loop()
-    loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
+    loop.run_until_complete(
+        self_check(test_db, test_db.sites_dict, logger, silent=True)
+    )

-    assert db.sites[0].disabled == True
+    assert test_db.sites[0].disabled is True


 @pytest.mark.slow
-def test_self_check_db_positive_enable():
+def test_self_check_db_positive_enable(test_db):
     logger = Mock()
-    db = MaigretDatabase()
-    db.load_from_json(EXAMPLE_DB)

-    db.sites[0].disabled = True
-    db.sites[0].username_claimed = 'Facebook'
-    assert db.sites[0].disabled == True
+    test_db.sites[0].disabled = True
+    test_db.sites[0].username_claimed = 'Facebook'
+    assert test_db.sites[0].disabled is True

     loop = asyncio.get_event_loop()
-    loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
+    loop.run_until_complete(
+        self_check(test_db, test_db.sites_dict, logger, silent=True)
+    )

-    assert db.sites[0].disabled == False
+    assert test_db.sites[0].disabled is False


 @pytest.mark.slow
-def test_self_check_db_negative_disabled():
+def test_self_check_db_negative_disabled(test_db):
     logger = Mock()
-    db = MaigretDatabase()
-    db.load_from_json(EXAMPLE_DB)

-    db.sites[0].disabled = True
-    assert db.sites[0].disabled == True
+    test_db.sites[0].disabled = True
+    assert test_db.sites[0].disabled is True

     loop = asyncio.get_event_loop()
-    loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
+    loop.run_until_complete(
+        self_check(test_db, test_db.sites_dict, logger, silent=True)
+    )

-    assert db.sites[0].disabled == True
+    assert test_db.sites[0].disabled is True


 @pytest.mark.slow
-def test_self_check_db_negative_enabled():
+def test_self_check_db_negative_enabled(test_db):
     logger = Mock()
-    db = MaigretDatabase()
-    db.load_from_json(EXAMPLE_DB)

-    db.sites[0].disabled = False
-    db.sites[0].username_claimed = 'Facebook'
-    assert db.sites[0].disabled == False
+    test_db.sites[0].disabled = False
+    test_db.sites[0].username_claimed = 'Facebook'
+    assert test_db.sites[0].disabled is False

     loop = asyncio.get_event_loop()
-    loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
+    loop.run_until_complete(
+        self_check(test_db, test_db.sites_dict, logger, silent=True)
+    )

-    assert db.sites[0].disabled == False
+    assert test_db.sites[0].disabled is False
+
+
+@pytest.mark.slow
+def test_maigret_results(test_db):
+    logger = Mock()
+
+    username = 'Facebook'
+    loop = asyncio.get_event_loop()
+    results = loop.run_until_complete(
+        maigret(username, site_dict=test_db.sites_dict, logger=logger, timeout=30)
+    )
+
+    assert isinstance(results, dict)
+
+    reddit_site = results['Reddit']['site']
+    assert isinstance(reddit_site, MaigretSite)
+
+    assert reddit_site.json == {
+        'tags': ['news', 'social', 'us'],
+        'checkType': 'status_code',
+        'presenseStrs': ['totalKarma'],
+        'disabled': True,
+        'alexaRank': 17,
+        'url': 'https://www.reddit.com/user/{username}',
+        'urlMain': 'https://www.reddit.com/',
+        'usernameClaimed': 'blue',
+        'usernameUnclaimed': 'noonewouldeverusethis7',
+    }
+
+    del results['Reddit']['site']
+    del results['GooglePlayStore']['site']
+
+    reddit_status = results['Reddit']['status']
+    assert isinstance(reddit_status, QueryResult)
+    assert reddit_status.status == QueryStatus.ILLEGAL
+
+    playstore_status = results['GooglePlayStore']['status']
+    assert isinstance(playstore_status, QueryResult)
+    assert playstore_status.status == QueryStatus.CLAIMED
+
+    del results['Reddit']['status']
+    del results['GooglePlayStore']['status']
+
+    assert results['Reddit'].get('future') is None
+    del results['GooglePlayStore']['future']
+    del results['GooglePlayStore']['checker']
+
+    assert results == RESULTS_EXAMPLE
+
+
+def test_extract_ids_from_url(default_db):
+    assert default_db.extract_ids_from_url('https://www.reddit.com/user/test') == {
+        'test': 'username'
+    }
+    assert default_db.extract_ids_from_url('https://vk.com/id123') == {'123': 'vk_id'}
+    assert default_db.extract_ids_from_url('https://vk.com/ida123') == {
+        'ida123': 'username'
+    }
+    assert default_db.extract_ids_from_url(
+        'https://my.mail.ru/yandex.ru/dipres8904/'
+    ) == {'dipres8904': 'username'}
+    assert default_db.extract_ids_from_url(
+        'https://reviews.yandex.ru/user/adbced123'
+    ) == {'adbced123': 'yandex_public_id'}
+
+
+@pytest.mark.slow
+def test_extract_ids_from_page(test_db):
+    logger = Mock()
+    extract_ids_from_page('https://www.reddit.com/user/test', logger) == {
+        'test': 'username'
+    }
+
+
+@pytest.mark.slow
+def test_extract_ids_from_results(test_db):
+    TEST_EXAMPLE = copy.deepcopy(RESULTS_EXAMPLE)
+    TEST_EXAMPLE['Reddit']['ids_usernames'] = {'test1': 'yandex_public_id'}
+    TEST_EXAMPLE['Reddit']['ids_links'] = ['https://www.reddit.com/user/test2']
+
+    extract_ids_from_results(TEST_EXAMPLE, test_db) == {
+        'test1': 'yandex_public_id',
+        'test2': 'username',
+    }
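These synchronous tests drive coroutines through get_event_loop()/run_until_complete. Outside pytest, the one-call equivalent is asyncio.run; a minimal sketch with the same call shape as above (run_self_check is a hypothetical wrapper name):

import asyncio
from mock import Mock

from maigret.maigret import self_check

def run_self_check(db):
    # asyncio.run() creates, runs, and closes the loop in one call.
    asyncio.run(self_check(db, db.sites_dict, Mock(), silent=True))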
@@ -0,0 +1,64 @@
+from maigret.errors import CheckError
+from maigret.notify import QueryNotifyPrint
+from maigret.result import QueryStatus, QueryResult
+
+
+def test_notify_illegal():
+    n = QueryNotifyPrint(color=False)
+
+    assert (
+        n.update(
+            QueryResult(
+                username="test",
+                status=QueryStatus.ILLEGAL,
+                site_name="TEST_SITE",
+                site_url_user="http://example.com/test",
+            )
+        )
+        == "[-] TEST_SITE: Illegal Username Format For This Site!"
+    )
+
+
+def test_notify_claimed():
+    n = QueryNotifyPrint(color=False)
+
+    assert (
+        n.update(
+            QueryResult(
+                username="test",
+                status=QueryStatus.CLAIMED,
+                site_name="TEST_SITE",
+                site_url_user="http://example.com/test",
+            )
+        )
+        == "[+] TEST_SITE: http://example.com/test"
+    )
+
+
+def test_notify_available():
+    n = QueryNotifyPrint(color=False)
+
+    assert (
+        n.update(
+            QueryResult(
+                username="test",
+                status=QueryStatus.AVAILABLE,
+                site_name="TEST_SITE",
+                site_url_user="http://example.com/test",
+            )
+        )
+        == "[-] TEST_SITE: Not found!"
+    )
+
+
+def test_notify_unknown():
+    n = QueryNotifyPrint(color=False)
+    result = QueryResult(
+        username="test",
+        status=QueryStatus.UNKNOWN,
+        site_name="TEST_SITE",
+        site_url_user="http://example.com/test",
+    )
+    result.error = CheckError('Type', 'Reason')
+
+    assert n.update(result) == "[?] TEST_SITE: Type error: Reason"
+302 -57
@@ -7,9 +7,23 @@ from io import StringIO
 import xmind
 from jinja2 import Template

-from maigret.report import generate_csv_report, generate_txt_report, save_xmind_report, save_html_report, \
-    save_pdf_report, generate_report_template, generate_report_context, generate_json_report
+from maigret.report import (
+    generate_csv_report,
+    generate_txt_report,
+    save_xmind_report,
+    save_html_report,
+    save_pdf_report,
+    generate_report_template,
+    generate_report_context,
+    generate_json_report,
+    get_plaintext_report,
+)
 from maigret.result import QueryResult, QueryStatus
+from maigret.sites import MaigretSite
+
+
+GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
+BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)

 EXAMPLE_RESULTS = {
     'GitHub': {
@@ -17,96 +31,234 @@ EXAMPLE_RESULTS = {
         'parsing_enabled': True,
         'url_main': 'https://www.github.com/',
         'url_user': 'https://www.github.com/test',
-        'status': QueryResult('test',
-                              'GitHub',
-                              'https://www.github.com/test',
-                              QueryStatus.CLAIMED,
-                              tags=['test_tag']),
+        'status': QueryResult(
+            'test',
+            'GitHub',
+            'https://www.github.com/test',
+            QueryStatus.CLAIMED,
+            tags=['test_tag'],
+        ),
         'http_status': 200,
         'is_similar': False,
-        'rank': 78
+        'rank': 78,
+        'site': MaigretSite('test', {}),
     }
 }

-GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
-BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
+BROKEN_RESULTS = {
+    'GitHub': {
+        'username': 'test',
+        'parsing_enabled': True,
+        'url_main': 'https://www.github.com/',
+        'url_user': 'https://www.github.com/test',
+        'http_status': 200,
+        'is_similar': False,
+        'rank': 78,
+        'site': MaigretSite('test', {}),
+    }
+}

 GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
 GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
-GOOD_500PX_RESULT.ids_data = {"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==", "legacy_id": "26403415",
-                              "username": "alexaimephotographycars", "name": "Alex Aim\u00e9",
-                              "website": "www.flickr.com/photos/alexaimephotography/",
-                              "facebook_link": " www.instagram.com/street.reality.photography/",
-                              "instagram_username": "alexaimephotography", "twitter_username": "Alexaimephotogr"}
+GOOD_500PX_RESULT.ids_data = {
+    "uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==",
+    "legacy_id": "26403415",
+    "username": "alexaimephotographycars",
+    "name": "Alex Aim\u00e9",
+    "website": "www.flickr.com/photos/alexaimephotography/",
+    "facebook_link": " www.instagram.com/street.reality.photography/",
+    "instagram_username": "alexaimephotography",
+    "twitter_username": "Alexaimephotogr",
+}

 GOOD_REDDIT_RESULT = copy.deepcopy(GOOD_RESULT)
 GOOD_REDDIT_RESULT.tags = ['news', 'us']
-GOOD_REDDIT_RESULT.ids_data = {"reddit_id": "t5_1nytpy", "reddit_username": "alexaimephotography",
-                               "fullname": "alexaimephotography",
-                               "image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e",
-                               "is_employee": "False", "is_nsfw": "False", "is_mod": "True", "is_following": "True",
-                               "has_user_profile": "True", "hide_from_robots": "False",
-                               "created_at": "2019-07-10 12:20:03", "total_karma": "53959", "post_karma": "52738"}
+GOOD_REDDIT_RESULT.ids_data = {
+    "reddit_id": "t5_1nytpy",
+    "reddit_username": "alexaimephotography",
+    "fullname": "alexaimephotography",
+    "image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e",
+    "is_employee": "False",
+    "is_nsfw": "False",
+    "is_mod": "True",
+    "is_following": "True",
+    "has_user_profile": "True",
+    "hide_from_robots": "False",
+    "created_at": "2019-07-10 12:20:03",
+    "total_karma": "53959",
+    "post_karma": "52738",
+}

 GOOD_IG_RESULT = copy.deepcopy(GOOD_RESULT)
 GOOD_IG_RESULT.tags = ['photo', 'global']
-GOOD_IG_RESULT.ids_data = {"instagram_username": "alexaimephotography", "fullname": "Alexaimephotography",
-                           "id": "6828488620",
-                           "image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F",
-                           "bio": "Photographer \nChild of fine street arts",
-                           "external_url": "https://www.flickr.com/photos/alexaimephotography2020/"}
+GOOD_IG_RESULT.ids_data = {
+    "instagram_username": "alexaimephotography",
+    "fullname": "Alexaimephotography",
+    "id": "6828488620",
+    "image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F",
+    "bio": "Photographer \nChild of fine street arts",
+    "external_url": "https://www.flickr.com/photos/alexaimephotography2020/",
+}

 GOOD_TWITTER_RESULT = copy.deepcopy(GOOD_RESULT)
 GOOD_TWITTER_RESULT.tags = ['social', 'us']

-TEST = [('alexaimephotographycars', 'username', {
-    '500px': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
-              'url_user': 'https://500px.com/p/alexaimephotographycars',
-              'ids_usernames': {'alexaimephotographycars': 'username', 'alexaimephotography': 'username',
-                                'Alexaimephotogr': 'username'}, 'status': GOOD_500PX_RESULT, 'http_status': 200,
-              'is_similar': False, 'rank': 2981},
-    'Reddit': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
-               'url_user': 'https://www.reddit.com/user/alexaimephotographycars', 'status': BAD_RESULT,
-               'http_status': 404, 'is_similar': False, 'rank': 17},
-    'Twitter': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
-                'url_user': 'https://twitter.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 400,
-                'is_similar': False, 'rank': 55},
-    'Instagram': {'username': 'alexaimephotographycars', 'parsing_enabled': True,
-                  'url_main': 'https://www.instagram.com/',
-                  'url_user': 'https://www.instagram.com/alexaimephotographycars', 'status': BAD_RESULT,
-                  'http_status': 404, 'is_similar': False, 'rank': 29}}), ('alexaimephotography', 'username', {
-    '500px': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
-              'url_user': 'https://500px.com/p/alexaimephotography', 'status': BAD_RESULT, 'http_status': 200,
-              'is_similar': False, 'rank': 2981},
-    'Reddit': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
-               'url_user': 'https://www.reddit.com/user/alexaimephotography',
-               'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_REDDIT_RESULT, 'http_status': 200,
-               'is_similar': False, 'rank': 17},
-    'Twitter': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
-                'url_user': 'https://twitter.com/alexaimephotography', 'status': BAD_RESULT, 'http_status': 400,
-                'is_similar': False, 'rank': 55},
-    'Instagram': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
-                  'url_user': 'https://www.instagram.com/alexaimephotography',
-                  'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_IG_RESULT, 'http_status': 200,
-                  'is_similar': False, 'rank': 29}}), ('Alexaimephotogr', 'username', {
-    '500px': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
-              'url_user': 'https://500px.com/p/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 200,
-              'is_similar': False, 'rank': 2981},
-    'Reddit': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
-               'url_user': 'https://www.reddit.com/user/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
-               'is_similar': False, 'rank': 17},
-    'Twitter': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
-                'url_user': 'https://twitter.com/Alexaimephotogr', 'status': GOOD_TWITTER_RESULT, 'http_status': 400,
-                'is_similar': False, 'rank': 55},
-    'Instagram': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
-                  'url_user': 'https://www.instagram.com/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
-                  'is_similar': False, 'rank': 29}})]
+TEST = [
+    (
+        'alexaimephotographycars',
+        'username',
+        {
+            '500px': {
+                'username': 'alexaimephotographycars',
+                'parsing_enabled': True,
+                'url_main': 'https://500px.com/',
+                'url_user': 'https://500px.com/p/alexaimephotographycars',
+                'ids_usernames': {
+                    'alexaimephotographycars': 'username',
+                    'alexaimephotography': 'username',
+                    'Alexaimephotogr': 'username',
+                },
+                'status': GOOD_500PX_RESULT,
+                'http_status': 200,
+                'is_similar': False,
+                'rank': 2981,
+            },
+            'Reddit': {
+                'username': 'alexaimephotographycars',
+                'parsing_enabled': True,
+                'url_main': 'https://www.reddit.com/',
+                'url_user': 'https://www.reddit.com/user/alexaimephotographycars',
+                'status': BAD_RESULT,
+                'http_status': 404,
+                'is_similar': False,
+                'rank': 17,
+            },
+            'Twitter': {
+                'username': 'alexaimephotographycars',
+                'parsing_enabled': True,
+                'url_main': 'https://www.twitter.com/',
+                'url_user': 'https://twitter.com/alexaimephotographycars',
+                'status': BAD_RESULT,
+                'http_status': 400,
+                'is_similar': False,
+                'rank': 55,
+            },
+            'Instagram': {
+                'username': 'alexaimephotographycars',
+                'parsing_enabled': True,
+                'url_main': 'https://www.instagram.com/',
+                'url_user': 'https://www.instagram.com/alexaimephotographycars',
+                'status': BAD_RESULT,
+                'http_status': 404,
+                'is_similar': False,
+                'rank': 29,
+            },
+        },
+    ),
+    (
+        'alexaimephotography',
+        'username',
+        {
+            '500px': {
+                'username': 'alexaimephotography',
+                'parsing_enabled': True,
+                'url_main': 'https://500px.com/',
+                'url_user': 'https://500px.com/p/alexaimephotography',
+                'status': BAD_RESULT,
+                'http_status': 200,
+                'is_similar': False,
+                'rank': 2981,
+            },
+            'Reddit': {
+                'username': 'alexaimephotography',
+                'parsing_enabled': True,
+                'url_main': 'https://www.reddit.com/',
+                'url_user': 'https://www.reddit.com/user/alexaimephotography',
+                'ids_usernames': {'alexaimephotography': 'username'},
+                'status': GOOD_REDDIT_RESULT,
+                'http_status': 200,
+                'is_similar': False,
+                'rank': 17,
+            },
+            'Twitter': {
+                'username': 'alexaimephotography',
+                'parsing_enabled': True,
+                'url_main': 'https://www.twitter.com/',
+                'url_user': 'https://twitter.com/alexaimephotography',
+                'status': BAD_RESULT,
+                'http_status': 400,
+                'is_similar': False,
+                'rank': 55,
+            },
+            'Instagram': {
+                'username': 'alexaimephotography',
+                'parsing_enabled': True,
+                'url_main': 'https://www.instagram.com/',
+                'url_user': 'https://www.instagram.com/alexaimephotography',
+                'ids_usernames': {'alexaimephotography': 'username'},
+                'status': GOOD_IG_RESULT,
+                'http_status': 200,
+                'is_similar': False,
+                'rank': 29,
+            },
+        },
+    ),
+    (
+        'Alexaimephotogr',
+        'username',
+        {
+            '500px': {
+                'username': 'Alexaimephotogr',
+                'parsing_enabled': True,
+                'url_main': 'https://500px.com/',
+                'url_user': 'https://500px.com/p/Alexaimephotogr',
+                'status': BAD_RESULT,
+                'http_status': 200,
+                'is_similar': False,
+                'rank': 2981,
+            },
+            'Reddit': {
+                'username': 'Alexaimephotogr',
+                'parsing_enabled': True,
+                'url_main': 'https://www.reddit.com/',
+                'url_user': 'https://www.reddit.com/user/Alexaimephotogr',
+                'status': BAD_RESULT,
+                'http_status': 404,
+                'is_similar': False,
+                'rank': 17,
+            },
+            'Twitter': {
+                'username': 'Alexaimephotogr',
+                'parsing_enabled': True,
+                'url_main': 'https://www.twitter.com/',
+                'url_user': 'https://twitter.com/Alexaimephotogr',
+                'status': GOOD_TWITTER_RESULT,
+                'http_status': 400,
+                'is_similar': False,
+                'rank': 55,
+            },
+            'Instagram': {
+                'username': 'Alexaimephotogr',
+                'parsing_enabled': True,
+                'url_main': 'https://www.instagram.com/',
+                'url_user': 'https://www.instagram.com/Alexaimephotogr',
+                'status': BAD_RESULT,
+                'http_status': 404,
+                'is_similar': False,
+                'rank': 29,
+            },
+        },
+    ),
+]

 SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""
+SUPPOSED_BROKEN_BRIEF = """Search by username alexaimephotographycars returned 0 accounts. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 2 accounts."""
-SUPPOSED_INTERESTS = "Interests: photo <span class=\"text-muted\">(2)</span>, news <span class=\"text-muted\">(1)</span>, social <span class=\"text-muted\">(1)</span>"

 SUPPOSED_GEO = "Geo: us <span class=\"text-muted\">(3)</span>"
+SUPPOSED_BROKEN_GEO = "Geo: us <span class=\"text-muted\">(2)</span>"
+
+SUPPOSED_INTERESTS = "Interests: photo <span class=\"text-muted\">(2)</span>, news <span class=\"text-muted\">(1)</span>, social <span class=\"text-muted\">(1)</span>"
+SUPPOSED_BROKEN_INTERESTS = "Interests: news <span class=\"text-muted\">(1)</span>, photo <span class=\"text-muted\">(1)</span>, social <span class=\"text-muted\">(1)</span>"


 def test_generate_report_template():
@@ -134,6 +286,19 @@ def test_generate_csv_report():
     ]


+def test_generate_csv_report_broken():
+    csvfile = StringIO()
+    generate_csv_report('test', BROKEN_RESULTS, csvfile)
+
+    csvfile.seek(0)
+    data = csvfile.readlines()
+
+    assert data == [
+        'username,name,url_main,url_user,exists,http_status\r\n',
+        'test,GitHub,https://www.github.com/,https://www.github.com/test,Unknown,200\r\n',
+    ]
+
+
 def test_generate_txt_report():
     txtfile = StringIO()
     generate_txt_report('test', EXAMPLE_RESULTS, txtfile)
@@ -147,6 +312,18 @@ def test_generate_txt_report():
     ]


+def test_generate_txt_report_broken():
+    txtfile = StringIO()
+    generate_txt_report('test', BROKEN_RESULTS, txtfile)
+
+    txtfile.seek(0)
+    data = txtfile.readlines()
+
+    assert data == [
+        'Total Websites Username Detected On : 0',
+    ]
+
+
 def test_generate_json_simple_report():
     jsonfile = StringIO()
     MODIFIED_RESULTS = dict(EXAMPLE_RESULTS)
@@ -160,6 +337,19 @@ def test_generate_json_simple_report():
     assert list(json.loads(data[0]).keys()) == ['GitHub', 'GitHub2']


+def test_generate_json_simple_report_broken():
+    jsonfile = StringIO()
+    MODIFIED_RESULTS = dict(BROKEN_RESULTS)
+    MODIFIED_RESULTS['GitHub2'] = BROKEN_RESULTS['GitHub']
+    generate_json_report('test', BROKEN_RESULTS, jsonfile, 'simple')
+
+    jsonfile.seek(0)
+    data = jsonfile.readlines()
+
+    assert len(data) == 1
+    assert list(json.loads(data[0]).keys()) == []
+
+
 def test_generate_json_ndjson_report():
     jsonfile = StringIO()
     MODIFIED_RESULTS = dict(EXAMPLE_RESULTS)
@@ -187,7 +377,24 @@ def test_save_xmind_report():
     assert data['topic']['topics'][0]['title'] == 'Undefined'
     assert data['topic']['topics'][1]['title'] == 'test_tag'
     assert len(data['topic']['topics'][1]['topics']) == 1
-    assert data['topic']['topics'][1]['topics'][0]['label'] == 'https://www.github.com/test'
+    assert (
+        data['topic']['topics'][1]['topics'][0]['label']
+        == 'https://www.github.com/test'
+    )
+
+
+def test_save_xmind_report_broken():
+    filename = 'report_test.xmind'
+    save_xmind_report(filename, 'test', BROKEN_RESULTS)
+
+    workbook = xmind.load(filename)
+    sheet = workbook.getPrimarySheet()
+    data = sheet.getData()
+
+    assert data['title'] == 'test Analysis'
+    assert data['topic']['title'] == 'test'
+    assert len(data['topic']['topics']) == 1
+    assert data['topic']['topics'][0]['title'] == 'Undefined'


 def test_html_report():
@@ -202,9 +409,47 @@ def test_html_report():
     assert SUPPOSED_INTERESTS in report_text


+def test_html_report_broken():
+    report_name = 'report_test_broken.html'
+    BROKEN_DATA = copy.deepcopy(TEST)
+    BROKEN_DATA[0][2]['500px']['status'] = None
+
+    context = generate_report_context(BROKEN_DATA)
+    save_html_report(report_name, context)
+
+    report_text = open(report_name).read()
+
+    assert SUPPOSED_BROKEN_BRIEF in report_text
+    assert SUPPOSED_BROKEN_GEO in report_text
+    assert SUPPOSED_BROKEN_INTERESTS in report_text
+
+
 def test_pdf_report():
     report_name = 'report_test.pdf'
     context = generate_report_context(TEST)
     save_pdf_report(report_name, context)

     assert os.path.exists(report_name)
+
+
+def test_text_report():
+    context = generate_report_context(TEST)
+    report_text = get_plaintext_report(context)
+
+    for brief_part in SUPPOSED_BRIEF.split():
+        assert brief_part in report_text
+    assert 'us' in report_text
+    assert 'photo' in report_text
+
+
+def test_text_report_broken():
+    BROKEN_DATA = copy.deepcopy(TEST)
+    BROKEN_DATA[0][2]['500px']['status'] = None
+
+    context = generate_report_context(BROKEN_DATA)
+    report_text = get_plaintext_report(context)
+
+    for brief_part in SUPPOSED_BROKEN_BRIEF.split():
+        assert brief_part in report_text
+    assert 'us' in report_text
+    assert 'photo' in report_text
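All of the report tests above exercise a two-stage pipeline: generate_report_context() folds the (username, id_type, results) tuples into one context, and each renderer consumes either that context or a raw results dict. A compact sketch of the flow, reusing only names from the imports above (render is a hypothetical wrapper):

from io import StringIO

from maigret.report import (
    generate_csv_report,
    generate_report_context,
    get_plaintext_report,
)

def render(search_results):
    # search_results has the shape of TEST above:
    # [(username, id_type, {site_name: result_dict}), ...]
    context = generate_report_context(search_results)
    text = get_plaintext_report(context)  # context-based renderer

    csvfile = StringIO()
    username, _, results = search_results[0]
    generate_csv_report(username, results, csvfile)  # results-based renderer
    return text, csvfile.getvalue()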
+38
-11
@@ -1,5 +1,6 @@
|
|||||||
"""Maigret Database test functions"""
|
"""Maigret Database test functions"""
|
||||||
from maigret.sites import MaigretDatabase, MaigretSite
|
from maigret.sites import MaigretDatabase, MaigretSite
|
||||||
|
from maigret.utils import URLMatcher
|
||||||
|
|
||||||
EXAMPLE_DB = {
|
EXAMPLE_DB = {
|
||||||
'engines': {
|
'engines': {
|
||||||
@@ -10,25 +11,21 @@ EXAMPLE_DB = {
|
|||||||
"The specified member cannot be found. Please enter a member's entire name.",
|
"The specified member cannot be found. Please enter a member's entire name.",
|
||||||
],
|
],
|
||||||
"checkType": "message",
|
"checkType": "message",
|
||||||
"errors": {
|
"errors": {"You must be logged-in to do that.": "Login required"},
|
||||||
"You must be logged-in to do that.": "Login required"
|
"url": "{urlMain}{urlSubpath}/members/?username={username}",
|
||||||
},
|
},
|
||||||
"url": "{urlMain}{urlSubpath}/members/?username={username}"
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
'sites': {
|
'sites': {
|
||||||
"Amperka": {
|
"Amperka": {
|
||||||
"engine": "XenForo",
|
"engine": "XenForo",
|
||||||
"rank": 121613,
|
"rank": 121613,
|
||||||
"tags": [
|
"tags": ["ru"],
|
||||||
"ru"
|
|
||||||
],
|
|
||||||
"urlMain": "http://forum.amperka.ru",
|
"urlMain": "http://forum.amperka.ru",
|
||||||
"usernameClaimed": "adam",
|
"usernameClaimed": "adam",
|
||||||
"usernameUnclaimed": "noonewouldeverusethis7"
|
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||||
|
},
|
||||||
},
|
},
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -107,6 +104,7 @@ def test_saving_site_error():
|
|||||||
|
|
||||||
amperka = db.sites[0]
|
amperka = db.sites[0]
|
||||||
assert len(amperka.errors) == 2
|
assert len(amperka.errors) == 2
|
||||||
|
assert len(amperka.errors_dict) == 2
|
||||||
|
|
||||||
assert amperka.strip_engine_data().errors == {'error1': 'text1'}
|
assert amperka.strip_engine_data().errors == {'error1': 'text1'}
|
||||||
assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
|
assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
|
||||||
@@ -116,8 +114,14 @@ def test_site_url_detector():
|
|||||||
db = MaigretDatabase()
|
db = MaigretDatabase()
|
||||||
db.load_from_json(EXAMPLE_DB)
|
db.load_from_json(EXAMPLE_DB)
|
||||||
|
|
||||||
assert db.sites[0].url_regexp.pattern == r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
|
assert (
|
||||||
assert db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test') == 'test'
|
db.sites[0].url_regexp.pattern
|
||||||
|
== r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test')
|
||||||
|
== 'test'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_ranked_sites_dict():
|
def test_ranked_sites_dict():
|
||||||
@@ -176,3 +180,26 @@ def test_ranked_sites_dict_id_type():
|
|||||||
assert len(db.ranked_sites_dict()) == 2
|
assert len(db.ranked_sites_dict()) == 2
|
||||||
assert len(db.ranked_sites_dict(id_type='username')) == 2
|
assert len(db.ranked_sites_dict(id_type='username')) == 2
|
||||||
assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
|
assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_url_template():
|
||||||
|
site = MaigretSite(
|
||||||
|
"test",
|
||||||
|
{
|
||||||
|
"urlMain": "https://ya.ru/",
|
||||||
|
"url": "{urlMain}{urlSubpath}/members/?username={username}",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
site.get_url_template()
|
||||||
|
== "{urlMain}{urlSubpath}/members/?username={username} (no engine)"
|
||||||
|
)
|
||||||
|
|
||||||
|
site = MaigretSite(
|
||||||
|
"test",
|
||||||
|
{
|
||||||
|
"urlMain": "https://ya.ru/",
|
||||||
|
"url": "https://{username}.ya.ru",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert site.get_url_template() == "SUBDOMAIN"
|
||||||
|
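The new test_get_url_template cases pin down two behaviors of MaigretSite.get_url_template(): subdomain-style URLs collapse to the marker "SUBDOMAIN", and template URLs on engine-less sites get a " (no engine)" suffix. A hypothetical restatement of just that logic; the real method is not shown in this diff:

def url_template_kind(url: str, engine: str = None) -> str:
    # username encoded in the subdomain, as in "https://{username}.ya.ru"
    if url.startswith('https://{username}.') or url.startswith('http://{username}.'):
        return 'SUBDOMAIN'
    # engine-less sites keep the raw template, marked explicitly
    if engine is None:
        return url + ' (no engine)'
    return url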
+57 -12
@@ -2,7 +2,14 @@
 import itertools
 import re
 
-from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher, get_dict_ascii_tree
+from maigret.utils import (
+    CaseConverter,
+    is_country_tag,
+    enrich_link_str,
+    URLMatcher,
+    get_dict_ascii_tree,
+    get_match_ratio,
+)
 
 
 def test_case_convert_camel_to_snake():
@@ -34,19 +41,26 @@ def test_case_convert_camel_with_digits_to_snake():
 
 
 def test_is_country_tag():
-    assert is_country_tag('ru') == True
-    assert is_country_tag('FR') == True
+    assert is_country_tag('ru') is True
+    assert is_country_tag('FR') is True
 
-    assert is_country_tag('a1') == False
-    assert is_country_tag('dating') == False
+    assert is_country_tag('a1') is False
+    assert is_country_tag('dating') is False
 
-    assert is_country_tag('global') == True
+    assert is_country_tag('global') is True
 
 
 def test_enrich_link_str():
     assert enrich_link_str('test') == 'test'
-    assert enrich_link_str(
-        ' www.flickr.com/photos/alexaimephotography/') == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
+    assert (
+        enrich_link_str(' www.flickr.com/photos/alexaimephotography/')
+        == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
+    )
+
+
+def test_url_extract_main_part_negative():
+    url_main_part = 'None'
+    assert URLMatcher.extract_main_part(url_main_part) == ''
 
 
 def test_url_extract_main_part():
@@ -60,8 +74,10 @@ def test_url_extract_main_part():
     ]
 
     url_regexp = re.compile('^https?://(www.)?flickr.com/photos/(.+?)$')
+    # combine parts variations
     for url_parts in itertools.product(*parts):
         url = ''.join(url_parts)
+        # ensure all combinations give valid main part
         assert URLMatcher.extract_main_part(url) == url_main_part
         assert not url_regexp.match(url) is None
 
@@ -76,21 +92,43 @@ def test_url_make_profile_url_regexp():
         ['/', ''],
     ]
 
+    # combine parts variations
    for url_parts in itertools.product(*parts):
         url = ''.join(url_parts)
-        assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
+        # ensure all combinations match pattern
+        assert (
+            URLMatcher.make_profile_url_regexp(url).pattern
+            == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
+        )
 
 
 def test_get_dict_ascii_tree():
-    data = {'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==', 'legacy_id': '26403415', 'username': 'alexaimephotographycars', 'name': 'Alex Aimé', 'created_at': '2018-05-04T10:17:01.000+0000', 'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b', 'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201', 'website': 'www.instagram.com/street.reality.photography/', 'facebook_link': ' www.instagram.com/street.reality.photography/', 'instagram_username': 'Street.Reality.Photography', 'twitter_username': 'Alexaimephotogr'}
+    data = {
+        'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==',
+        'legacy_id': '26403415',
+        'username': 'alexaimephotographycars',
+        'name': 'Alex Aimé',
+        'links': "['www.instagram.com/street.reality.photography/']",
+        'created_at': '2018-05-04T10:17:01.000+0000',
+        'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b',
+        'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201',
+        'website': 'www.instagram.com/street.reality.photography/',
+        'facebook_link': ' www.instagram.com/street.reality.photography/',
+        'instagram_username': 'Street.Reality.Photography',
+        'twitter_username': 'Alexaimephotogr',
+    }
 
-    ascii_tree = get_dict_ascii_tree(data.items())
+    ascii_tree = get_dict_ascii_tree(data.items(), prepend=" ")
 
-    assert ascii_tree == """
+    assert (
+        ascii_tree
+        == """
 ┣╸uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==
 ┣╸legacy_id: 26403415
 ┣╸username: alexaimephotographycars
 ┣╸name: Alex Aimé
+┣╸links:
+┃ ┗╸ www.instagram.com/street.reality.photography/
 ┣╸created_at: 2018-05-04T10:17:01.000+0000
 ┣╸image: https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b
 ┣╸image_bg: https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201
@@ -98,3 +136,10 @@ def test_get_dict_ascii_tree():
 ┣╸facebook_link: www.instagram.com/street.reality.photography/
 ┣╸instagram_username: Street.Reality.Photography
 ┗╸twitter_username: Alexaimephotogr"""
+    )
+
+
+def test_get_match_ratio():
+    fun = get_match_ratio(["test", "maigret", "username"])
+
+    assert fun("test") == 1
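The only constraint the new test places on get_match_ratio is that it returns a callable which scores an exact match as 1. One common way such a helper is written, shown here as an assumption rather than maigret's actual implementation:

from difflib import SequenceMatcher

def get_match_ratio(base_strs):
    def get_ratio(word: str) -> float:
        # best similarity between the candidate and any base string;
        # an exact match such as "test" vs "test" scores 1.0
        return max(
            SequenceMatcher(a=word.lower(), b=s.lower()).ratio()
            for s in base_strs
        )
    return get_ratio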
Executable
+57
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+import random
+from argparse import ArgumentParser, RawDescriptionHelpFormatter
+
+from maigret.maigret import MaigretDatabase
+from maigret.submit import get_alexa_rank
+
+
+def update_tags(site):
+    tags = []
+    if not site.tags:
+        print(f'Site {site.name} doesn\'t have tags')
+    else:
+        tags = site.tags
+        print(f'Site {site.name} tags: ' + ', '.join(tags))
+
+    print(f'URL: {site.url_main}')
+
+    new_tags = set(input('Enter new tags: ').split(', '))
+    if "disabled" in new_tags:
+        new_tags.remove("disabled")
+        site.disabled = True
+
+    print(f'Old alexa rank: {site.alexa_rank}')
+    rank = get_alexa_rank(site.url_main)
+    if rank:
+        print(f'New alexa rank: {rank}')
+        site.alexa_rank = rank
+
+    site.tags = [x for x in list(new_tags) if x]
+
+
+if __name__ == '__main__':
+    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
+    parser.add_argument("--base", "-b", metavar="BASE_FILE",
+                        dest="base_file", default="maigret/resources/data.json",
+                        help="JSON file with sites data to update.")
+
+    args = parser.parse_args()
+
+    db = MaigretDatabase()
+    db.load_from_file(args.base_file)
+
+    while True:
+        site = random.choice(db.sites)
+        if site.engine == 'uCoz':
+            continue
+
+        if 'in' not in site.tags:
+            continue
+
+        update_tags(site)
+
+        db.save_to_file(args.base_file)
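The MaigretDatabase calls this script relies on (load_from_file, sites, save_to_file) can also be driven non-interactively. A small sketch under the same assumptions, with the site choice and the added tag purely illustrative:

from maigret.maigret import MaigretDatabase

db = MaigretDatabase()
db.load_from_file('maigret/resources/data.json')

site = db.sites[0]                           # instead of random.choice(db.sites)
site.tags = sorted(set(site.tags) | {'us'})  # add a tag without the input() prompt
db.save_to_file('maigret/resources/data.json')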
+16 -11
@@ -37,15 +37,15 @@ def get_rank(domain_to_query, site, print_errors=True):
     try:
         # Get ranking for this site.
         site.alexa_rank = int(root.find('.//REACH').attrib['RANK'])
-        country = root.find('.//COUNTRY')
-        if not country is None and country.attrib:
-            country_code = country.attrib['CODE']
-            tags = set(site.tags)
-            if country_code:
-                tags.add(country_code.lower())
-            site.tags = sorted(list(tags))
-            if site.type != 'username':
-                site.disabled = False
+        # country = root.find('.//COUNTRY')
+        # if not country is None and country.attrib:
+        #     country_code = country.attrib['CODE']
+        #     tags = set(site.tags)
+        #     if country_code:
+        #         tags.add(country_code.lower())
+        #     site.tags = sorted(list(tags))
+        #     if site.type != 'username':
+        #         site.disabled = False
     except Exception as e:
         if print_errors:
             logging.error(e)
@@ -74,6 +74,7 @@ if __name__ == '__main__':
                         dest="base_file", default="maigret/resources/data.json",
                         help="JSON file with sites data to update.")
 
+    parser.add_argument('--with-rank', help='fetch fresh rank data (default: use local data only)', action='store_true')
     parser.add_argument('--empty-only', help='update only sites without rating', action='store_true')
     parser.add_argument('--exclude-engine', help='do not update score with certain engine',
                         action="append", dest="exclude_engine_list", default=[])
@@ -87,22 +88,25 @@ if __name__ == '__main__':
 
     with open("sites.md", "w") as site_file:
         site_file.write(f"""
-## List of supported sites: total {len(sites_subset)}\n
+## List of supported sites (search methods): total {len(sites_subset)}\n
 Rank data fetched from Alexa by domains.
 
 """)
 
     for site in sites_subset:
+        if not args.with_rank:
+            break
         url_main = site.url_main
         if site.alexa_rank < sys.maxsize and args.empty_only:
             continue
         if args.exclude_engine_list and site.engine in args.exclude_engine_list:
             continue
         site.alexa_rank = 0
-        th = threading.Thread(target=get_rank, args=(url_main, site))
+        th = threading.Thread(target=get_rank, args=(url_main, site,))
         pool.append((site.name, url_main, th))
         th.start()
 
+    if args.with_rank:
         index = 1
         for site_name, url_main, th in pool:
             th.join()
@@ -123,6 +127,7 @@ Rank data fetched from Alexa by domains.
         url_main = site.url_main
         valid_rank = get_step_rank(rank)
         all_tags = site.tags
+        all_tags.sort()
         tags = ', ' + ', '.join(all_tags) if all_tags else ''
         note = ''
         if site.disabled:
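The rank update above fans work out to one thread per site, records (name, url, thread) tuples in a pool, and joins them afterwards. The same skeleton in isolation, with fetch_rank as a stand-in since get_rank's body is only partially visible in this diff:

import threading

class Site:
    def __init__(self, name, url_main):
        self.name, self.url_main, self.alexa_rank = name, url_main, 0

def fetch_rank(url, site):
    site.alexa_rank = len(url)  # placeholder for the real Alexa lookup

sites = [Site('a', 'https://a.example'), Site('b', 'https://b.example')]

pool = []
for site in sites:  # fan out: one worker thread per site
    th = threading.Thread(target=fetch_rank, args=(site.url_main, site))
    pool.append((site.name, site.url_main, th))
    th.start()

for name, url, th in pool:  # fan in: join before reading site.alexa_rank
    th.join()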
@@ -26,7 +26,9 @@ if __name__ == '__main__':
     # user input
     username = input('Enter username to search: ')
 
-    sites_count_raw = input(f'Select the number of sites to search ({TOP_SITES_COUNT} for default, {len(db.sites_dict)} max): ')
+    sites_count_raw = input(
+        f'Select the number of sites to search ({TOP_SITES_COUNT} for default, {len(db.sites_dict)} max): '
+    )
     sites_count = int(sites_count_raw) or TOP_SITES_COUNT
 
     sites = db.ranked_sites_dict(top=sites_count)
@@ -34,10 +36,14 @@ if __name__ == '__main__':
     show_progressbar_raw = input('Do you want to show a progressbar? [Yn] ')
     show_progressbar = show_progressbar_raw.lower() != 'n'
 
-    extract_info_raw = input('Do you want to extract additional info from accounts\' pages? [Yn] ')
+    extract_info_raw = input(
+        'Do you want to extract additional info from accounts\' pages? [Yn] '
+    )
     extract_info = extract_info_raw.lower() != 'n'
 
-    use_notifier_raw = input('Do you want to use notifier for displaying results while searching? [Yn] ')
+    use_notifier_raw = input(
+        'Do you want to use notifier for displaying results while searching? [Yn] '
+    )
     use_notifier = use_notifier_raw.lower() != 'n'
 
     notifier = None
@@ -45,7 +51,8 @@ if __name__ == '__main__':
         notifier = maigret.Notifier(print_found_only=True, skip_check_errors=True)
 
     # search!
-    search_func = maigret.search(username=username,
+    search_func = maigret.search(
+        username=username,
         site_dict=sites,
         timeout=TIMEOUT,
         logger=logger,