mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-07 23:27:43 +00:00
Compare commits
54 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 37854a867b | |||
| 6480eebbdf | |||
| aad862b2ed | |||
| c6d0f332bd | |||
| f1c006159e | |||
| 69a09fcd94 | |||
| 9f948928e6 | |||
| a3034c11ff | |||
| d47c72b972 | |||
| 8062ec30e9 | |||
| 32000a1cfd | |||
| 8af6ce3af5 | |||
| 0dd1dd5d76 | |||
| 4aab21046b | |||
| 92ac9ec8b7 | |||
| ca2c8b3502 | |||
| 4362a41fca | |||
| c7977f1cdf | |||
| 49708da980 | |||
| bc1398061f | |||
| e8634c8c56 | |||
| dc59b93f38 | |||
| c727cbae27 | |||
| e6c6cc8f6d | |||
| c80e8b1207 | |||
| 6e78fdeb81 | |||
| 9c22e09808 | |||
| f057fd3a68 | |||
| 9b0acc092a | |||
| e6b4cdfa77 | |||
| eb721dc7e3 | |||
| eba0c4531c | |||
| b4a26c03fe | |||
| 9b7f36dc24 | |||
| 05167ad30c | |||
| cee6f0aa43 | |||
| 02cf330e37 | |||
| 5c8f7a3af0 | |||
| 13e1b6f4d1 | |||
| 5179cb56eb | |||
| 1a2c7e944a | |||
| f7eae046a1 | |||
| bdff08cb70 | |||
| a468cb1cd3 | |||
| 0fe933e8a1 | |||
| 5c3de91181 | |||
| 3356463102 | |||
| 7ac03cf5ca | |||
| 4aeacef07d | |||
| 8de1830cf3 | |||
| ba6169659e | |||
| 4a5c5c3f07 | |||
| 4ba7fcb1ff | |||
| a76f95858f |
@@ -0,0 +1,13 @@
|
|||||||
|
---
|
||||||
|
name: Add a site
|
||||||
|
about: I want to add a new site for Maigret checks
|
||||||
|
title: New site
|
||||||
|
labels: new-site
|
||||||
|
assignees: soxoj
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Link to the site main page: https://example.com
|
||||||
|
Link to an existing account: https://example.com/users/john
|
||||||
|
Link to a nonexistent account: https://example.com/users/noonewouldeverusethis7
|
||||||
|
Tags: photo, us, ...
|
||||||
@@ -2,6 +2,16 @@
|
|||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
## [0.3.1] - 2021-10-31
|
||||||
|
* fixed false positives
|
||||||
|
* accelerated maigret start time by 3 times
|
||||||
|
|
||||||
|
## [0.3.0] - 2021-06-02
|
||||||
|
* added support of Tor and I2P sites
|
||||||
|
* added experimental DNS checking feature
|
||||||
|
* implemented sorting by data points for reports
|
||||||
|
* reports fixes
|
||||||
|
|
||||||
## [0.2.4] - 2021-05-18
|
## [0.2.4] - 2021-05-18
|
||||||
* cli output report
|
* cli output report
|
||||||
* various improvements
|
* various improvements
|
||||||
|
|||||||
@@ -0,0 +1,128 @@
|
|||||||
|
# Contributor Covenant Code of Conduct
|
||||||
|
|
||||||
|
## Our Pledge
|
||||||
|
|
||||||
|
We as members, contributors, and leaders pledge to make participation in our
|
||||||
|
community a harassment-free experience for everyone, regardless of age, body
|
||||||
|
size, visible or invisible disability, ethnicity, sex characteristics, gender
|
||||||
|
identity and expression, level of experience, education, socio-economic status,
|
||||||
|
nationality, personal appearance, race, religion, or sexual identity
|
||||||
|
and orientation.
|
||||||
|
|
||||||
|
We pledge to act and interact in ways that contribute to an open, welcoming,
|
||||||
|
diverse, inclusive, and healthy community.
|
||||||
|
|
||||||
|
## Our Standards
|
||||||
|
|
||||||
|
Examples of behavior that contributes to a positive environment for our
|
||||||
|
community include:
|
||||||
|
|
||||||
|
* Demonstrating empathy and kindness toward other people
|
||||||
|
* Being respectful of differing opinions, viewpoints, and experiences
|
||||||
|
* Giving and gracefully accepting constructive feedback
|
||||||
|
* Accepting responsibility and apologizing to those affected by our mistakes,
|
||||||
|
and learning from the experience
|
||||||
|
* Focusing on what is best not just for us as individuals, but for the
|
||||||
|
overall community
|
||||||
|
|
||||||
|
Examples of unacceptable behavior include:
|
||||||
|
|
||||||
|
* The use of sexualized language or imagery, and sexual attention or
|
||||||
|
advances of any kind
|
||||||
|
* Trolling, insulting or derogatory comments, and personal or political attacks
|
||||||
|
* Public or private harassment
|
||||||
|
* Publishing others' private information, such as a physical or email
|
||||||
|
address, without their explicit permission
|
||||||
|
* Other conduct which could reasonably be considered inappropriate in a
|
||||||
|
professional setting
|
||||||
|
|
||||||
|
## Enforcement Responsibilities
|
||||||
|
|
||||||
|
Community leaders are responsible for clarifying and enforcing our standards of
|
||||||
|
acceptable behavior and will take appropriate and fair corrective action in
|
||||||
|
response to any behavior that they deem inappropriate, threatening, offensive,
|
||||||
|
or harmful.
|
||||||
|
|
||||||
|
Community leaders have the right and responsibility to remove, edit, or reject
|
||||||
|
comments, commits, code, wiki edits, issues, and other contributions that are
|
||||||
|
not aligned to this Code of Conduct, and will communicate reasons for moderation
|
||||||
|
decisions when appropriate.
|
||||||
|
|
||||||
|
## Scope
|
||||||
|
|
||||||
|
This Code of Conduct applies within all community spaces, and also applies when
|
||||||
|
an individual is officially representing the community in public spaces.
|
||||||
|
Examples of representing our community include using an official e-mail address,
|
||||||
|
posting via an official social media account, or acting as an appointed
|
||||||
|
representative at an online or offline event.
|
||||||
|
|
||||||
|
## Enforcement
|
||||||
|
|
||||||
|
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
||||||
|
reported to the community leaders responsible for enforcement at
|
||||||
|
https://t.me/soxoj.
|
||||||
|
All complaints will be reviewed and investigated promptly and fairly.
|
||||||
|
|
||||||
|
All community leaders are obligated to respect the privacy and security of the
|
||||||
|
reporter of any incident.
|
||||||
|
|
||||||
|
## Enforcement Guidelines
|
||||||
|
|
||||||
|
Community leaders will follow these Community Impact Guidelines in determining
|
||||||
|
the consequences for any action they deem in violation of this Code of Conduct:
|
||||||
|
|
||||||
|
### 1. Correction
|
||||||
|
|
||||||
|
**Community Impact**: Use of inappropriate language or other behavior deemed
|
||||||
|
unprofessional or unwelcome in the community.
|
||||||
|
|
||||||
|
**Consequence**: A private, written warning from community leaders, providing
|
||||||
|
clarity around the nature of the violation and an explanation of why the
|
||||||
|
behavior was inappropriate. A public apology may be requested.
|
||||||
|
|
||||||
|
### 2. Warning
|
||||||
|
|
||||||
|
**Community Impact**: A violation through a single incident or series
|
||||||
|
of actions.
|
||||||
|
|
||||||
|
**Consequence**: A warning with consequences for continued behavior. No
|
||||||
|
interaction with the people involved, including unsolicited interaction with
|
||||||
|
those enforcing the Code of Conduct, for a specified period of time. This
|
||||||
|
includes avoiding interactions in community spaces as well as external channels
|
||||||
|
like social media. Violating these terms may lead to a temporary or
|
||||||
|
permanent ban.
|
||||||
|
|
||||||
|
### 3. Temporary Ban
|
||||||
|
|
||||||
|
**Community Impact**: A serious violation of community standards, including
|
||||||
|
sustained inappropriate behavior.
|
||||||
|
|
||||||
|
**Consequence**: A temporary ban from any sort of interaction or public
|
||||||
|
communication with the community for a specified period of time. No public or
|
||||||
|
private interaction with the people involved, including unsolicited interaction
|
||||||
|
with those enforcing the Code of Conduct, is allowed during this period.
|
||||||
|
Violating these terms may lead to a permanent ban.
|
||||||
|
|
||||||
|
### 4. Permanent Ban
|
||||||
|
|
||||||
|
**Community Impact**: Demonstrating a pattern of violation of community
|
||||||
|
standards, including sustained inappropriate behavior, harassment of an
|
||||||
|
individual, or aggression toward or disparagement of classes of individuals.
|
||||||
|
|
||||||
|
**Consequence**: A permanent ban from any sort of public interaction within
|
||||||
|
the community.
|
||||||
|
|
||||||
|
## Attribution
|
||||||
|
|
||||||
|
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
|
||||||
|
version 2.0, available at
|
||||||
|
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
|
||||||
|
|
||||||
|
Community Impact Guidelines were inspired by [Mozilla's code of conduct
|
||||||
|
enforcement ladder](https://github.com/mozilla/diversity).
|
||||||
|
|
||||||
|
[homepage]: https://www.contributor-covenant.org
|
||||||
|
|
||||||
|
For answers to common questions about this code of conduct, see the FAQ at
|
||||||
|
https://www.contributor-covenant.org/faq. Translations are available at
|
||||||
|
https://www.contributor-covenant.org/translations.
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
# How to contribute
|
||||||
|
|
||||||
|
Hey! I'm really glad you're reading this. Maigret contains a lot of sites, and it is very hard to keep all the sites operational. That's why any fix is important.
|
||||||
|
|
||||||
|
## How to add a new site
|
||||||
|
|
||||||
|
#### Beginner level
|
||||||
|
|
||||||
|
You can use Maigret **submit mode** (`maigret --submit URL`) to add a new site or update an existing site. In this mode, Maigret does an automatic analysis of the given account URL or site main page URL to determine the site engine and methods to check account presence. After checking, Maigret asks if you want to add the site; answering y/Y will rewrite the local database.
|
||||||
|
|
||||||
|
#### Advanced level
|
||||||
|
|
||||||
|
You can edit [the database JSON file](https://github.com/soxoj/maigret/blob/main/maigret/resources/data.json) (`./maigret/resources/data.json`) manually.
|
||||||
|
|
||||||
|
## Testing
|
||||||
|
|
||||||
|
There are CI checks for every PR to the Maigret repository. But it is better to run `make format`, `make lint` and `make test` beforehand to ensure you've made correct changes.
|
||||||
|
|
||||||
|
## Submitting changes
|
||||||
|
|
||||||
|
To submit your changes you must [send a GitHub PR](https://github.com/soxoj/maigret/pulls) to the Maigret project.
|
||||||
|
Always write a clear log message for your commits. One-line messages are fine for small changes, but bigger changes should look like this:
|
||||||
|
|
||||||
|
$ git commit -m "A brief summary of the commit
|
||||||
|
>
|
||||||
|
> A paragraph describing what changed and its impact."
|
||||||
|
|
||||||
|
## Coding conventions
|
||||||
|
|
||||||
|
Start reading the code and you'll get the hang of it. ;)
|
||||||
+8
-17
@@ -1,25 +1,16 @@
|
|||||||
FROM python:3.7
|
FROM python:3.9
|
||||||
LABEL maintainer="Soxoj <soxoj@protonmail.com>"
|
MAINTAINER Soxoj <soxoj@protonmail.com>
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
ADD requirements.txt .
|
|
||||||
|
|
||||||
RUN pip install --upgrade pip
|
RUN pip install --upgrade pip
|
||||||
|
RUN apt update && \
|
||||||
RUN apt update -y
|
apt install -y \
|
||||||
|
|
||||||
RUN apt install -y\
|
|
||||||
gcc \
|
gcc \
|
||||||
musl-dev \
|
musl-dev \
|
||||||
libxml2 \
|
libxml2 \
|
||||||
libxml2-dev \
|
libxml2-dev \
|
||||||
libxslt-dev \
|
libxslt-dev
|
||||||
&& YARL_NO_EXTENSIONS=1 python3 -m pip install maigret \
|
RUN apt clean \
|
||||||
&& rm -rf /var/cache/apk/* \
|
&& rm -rf /var/lib/apt/lists/* /tmp/*
|
||||||
/tmp/* \
|
|
||||||
/var/tmp/*
|
|
||||||
|
|
||||||
ADD . .
|
ADD . .
|
||||||
|
RUN YARL_NO_EXTENSIONS=1 python3 -m pip install .
|
||||||
ENTRYPOINT ["maigret"]
|
ENTRYPOINT ["maigret"]
|
||||||
|
|||||||
@@ -0,0 +1,35 @@
|
|||||||
|
LINT_FILES=maigret wizard.py tests
|
||||||
|
|
||||||
|
test:
|
||||||
|
coverage run --source=./maigret -m pytest tests
|
||||||
|
coverage report -m
|
||||||
|
coverage html
|
||||||
|
|
||||||
|
rerun-tests:
|
||||||
|
pytest --lf -vv
|
||||||
|
|
||||||
|
lint:
|
||||||
|
@echo 'syntax errors or undefined names'
|
||||||
|
flake8 --count --select=E9,F63,F7,F82 --show-source --statistics ${LINT_FILES} maigret.py
|
||||||
|
|
||||||
|
@echo 'warning'
|
||||||
|
flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503 ${LINT_FILES} maigret.py
|
||||||
|
|
||||||
|
@echo 'mypy'
|
||||||
|
mypy ${LINT_FILES}
|
||||||
|
|
||||||
|
format:
|
||||||
|
@echo 'black'
|
||||||
|
black --skip-string-normalization ${LINT_FILES}
|
||||||
|
|
||||||
|
pull:
|
||||||
|
git stash
|
||||||
|
git checkout main
|
||||||
|
git pull origin main
|
||||||
|
git stash pop
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf reports htmcov dist
|
||||||
|
|
||||||
|
install:
|
||||||
|
pip3 install .
|
||||||
@@ -8,15 +8,12 @@
|
|||||||
<a href="https://pypi.org/project/maigret/">
|
<a href="https://pypi.org/project/maigret/">
|
||||||
<img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dw/maigret?style=flat-square">
|
<img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dw/maigret?style=flat-square">
|
||||||
</a>
|
</a>
|
||||||
<a href="https://gitter.im/maigret-osint/community">
|
<a href="https://pypi.org/project/maigret/">
|
||||||
<img alt="Chat - Gitter" src="./static/chat_gitter.svg" />
|
<img alt="Views" src="https://komarev.com/ghpvc/?username=maigret&color=brightgreen&label=views&style=flat-square">
|
||||||
</a>
|
|
||||||
<a href="https://twitter.com/intent/follow?screen_name=sox0j">
|
|
||||||
<img src="https://img.shields.io/twitter/follow/sox0j?label=Follow%20sox0j&style=social&color=blue" alt="Follow @sox0j" />
|
|
||||||
</a>
|
</a>
|
||||||
</p>
|
</p>
|
||||||
<p align="center">
|
<p align="center">
|
||||||
<img src="./static/maigret.png" height="200"/>
|
<img src="https://raw.githubusercontent.com/soxoj/maigret/main/static/maigret.png" height="200"/>
|
||||||
</p>
|
</p>
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
@@ -24,9 +21,9 @@
|
|||||||
|
|
||||||
## About
|
## About
|
||||||
|
|
||||||
**Maigret** collect a dossier on a person **by username only**, checking for accounts on a huge number of sites and gathering all the available information from web pages. Maigret is an easy-to-use and powerful fork of [Sherlock](https://github.com/sherlock-project/sherlock).
|
**Maigret** collects a dossier on a person **by username only**, checking for accounts on a huge number of sites and gathering all the available information from web pages. No API keys required. Maigret is an easy-to-use and powerful fork of [Sherlock](https://github.com/sherlock-project/sherlock).
|
||||||
|
|
||||||
Currently supported more than 2000 sites ([full list](./sites.md)), search is launched against 500 popular sites in descending order of popularity by default.
|
More than 2000 sites are currently supported ([full list](https://raw.githubusercontent.com/soxoj/maigret/main/sites.md)); by default, the search is launched against the 500 most popular sites in descending order of popularity. Checking of Tor sites, I2P sites, and domains (via DNS resolving) is also supported.
|
||||||
|
|
||||||
## Main features
|
## Main features
|
||||||
|
|
||||||
@@ -41,10 +38,13 @@ See full description of Maigret features [in the Wiki](https://github.com/soxoj/
|
|||||||
## Installation
|
## Installation
|
||||||
|
|
||||||
Maigret can be installed using pip, Docker, or simply can be launched from the cloned repo.
|
Maigret can be installed using pip, Docker, or simply can be launched from the cloned repo.
|
||||||
Also you can run Maigret using cloud shells (see buttons below).
|
Also you can run Maigret using cloud shells and Jupyter notebooks (see buttons below).
|
||||||
|
|
||||||
[](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md) [](https://repl.it/github/soxoj/maigret)
|
[](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md)
|
||||||
<a href="https://colab.research.google.com/gist//soxoj/879b51bc3b2f8b695abb054090645000/maigret.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="40"></a>
|
<a href="https://repl.it/github/soxoj/maigret"><img src="https://user-images.githubusercontent.com/27065646/92304596-bf719b00-ef7f-11ea-987f-2c1f3c323088.png" alt="Run on Repl.it" height="50"></a>
|
||||||
|
|
||||||
|
<a href="https://colab.research.google.com/gist/soxoj/879b51bc3b2f8b695abb054090645000/maigret-collab.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="45"></a>
|
||||||
|
<a href="https://mybinder.org/v2/gist/soxoj/9d65c2f4d3bec5dd25949197ea73cf3a/HEAD"><img src="https://mybinder.org/badge_logo.svg" alt="Open In Binder" height="45"></a>
|
||||||
|
|
||||||
### Package installing
|
### Package installing
|
||||||
|
|
||||||
@@ -103,16 +103,16 @@ Use `maigret --help` to get full options description. Also options are documente
|
|||||||
|
|
||||||
## Demo with page parsing and recursive username search
|
## Demo with page parsing and recursive username search
|
||||||
|
|
||||||
[PDF report](./static/report_alexaimephotographycars.pdf), [HTML report](https://htmlpreview.github.io/?https://raw.githubusercontent.com/soxoj/maigret/main/static/report_alexaimephotographycars.html)
|
[PDF report](https://raw.githubusercontent.com/soxoj/maigret/main/static/report_alexaimephotographycars.pdf), [HTML report](https://htmlpreview.github.io/?https://raw.githubusercontent.com/soxoj/maigret/main/static/report_alexaimephotographycars.html)
|
||||||
|
|
||||||

|

|
||||||
|
|
||||||

|

|
||||||
|
|
||||||

|

|
||||||
|
|
||||||
|
|
||||||
[Full console output](./static/recursive_search.md)
|
[Full console output](https://raw.githubusercontent.com/soxoj/maigret/main/static/recursive_search.md)
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,68 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "8v6PEfyXb0Gx"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# clone the repo\n",
|
||||||
|
"!git clone https://github.com/soxoj/maigret\n",
|
||||||
|
"!pip3 install -r maigret/requirements.txt"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "cXOQUAhDchkl"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# help\n",
|
||||||
|
"!python3 maigret/maigret.py --help"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {
|
||||||
|
"id": "SjDmpN4QGnJu"
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# search\n",
|
||||||
|
"!python3 maigret/maigret.py user"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"colab": {
|
||||||
|
"collapsed_sections": [],
|
||||||
|
"include_colab_link": true,
|
||||||
|
"name": "maigret.ipynb",
|
||||||
|
"provenance": []
|
||||||
|
},
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.7.10"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 1
|
||||||
|
}
|
||||||
@@ -1,5 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
FILES="maigret wizard.py maigret.py tests"
|
|
||||||
|
|
||||||
echo 'black'
|
|
||||||
black --skip-string-normalization $FILES
|
|
||||||
@@ -1,11 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
FILES="maigret wizard.py maigret.py tests"
|
|
||||||
|
|
||||||
echo 'syntax errors or undefined names'
|
|
||||||
flake8 --count --select=E9,F63,F7,F82 --show-source --statistics $FILES
|
|
||||||
|
|
||||||
echo 'warning'
|
|
||||||
flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503 $FILES
|
|
||||||
|
|
||||||
echo 'mypy'
|
|
||||||
mypy ./maigret ./wizard.py ./tests
|
|
||||||
@@ -1,3 +1,3 @@
|
|||||||
"""Maigret version file"""
|
"""Maigret version file"""
|
||||||
|
|
||||||
__version__ = '0.2.4'
|
__version__ = '0.3.1'
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ class ParsingActivator:
|
|||||||
site.headers["authorization"] = f"Bearer {bearer_token}"
|
site.headers["authorization"] = f"Bearer {bearer_token}"
|
||||||
|
|
||||||
|
|
||||||
async def import_aiohttp_cookies(cookiestxt_filename):
|
def import_aiohttp_cookies(cookiestxt_filename):
|
||||||
cookies_obj = MozillaCookieJar(cookiestxt_filename)
|
cookies_obj = MozillaCookieJar(cookiestxt_filename)
|
||||||
cookies_obj.load(ignore_discard=True, ignore_expires=True)
|
cookies_obj.load(ignore_discard=True, ignore_expires=True)
|
||||||
|
|
||||||
|
|||||||
+184
-31
@@ -1,6 +1,11 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
|
try:
|
||||||
from mock import Mock
|
from mock import Mock
|
||||||
|
except ImportError:
|
||||||
|
from unittest.mock import Mock
|
||||||
|
|
||||||
import re
|
import re
|
||||||
import ssl
|
import ssl
|
||||||
import sys
|
import sys
|
||||||
@@ -8,11 +13,11 @@ import tqdm
|
|||||||
from typing import Tuple, Optional, Dict, List
|
from typing import Tuple, Optional, Dict, List
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
|
|
||||||
import aiohttp
|
import aiodns
|
||||||
import tqdm.asyncio
|
import tqdm.asyncio
|
||||||
from aiohttp_socks import ProxyConnector
|
|
||||||
from python_socks import _errors as proxy_errors
|
from python_socks import _errors as proxy_errors
|
||||||
from socid_extractor import extract
|
from socid_extractor import extract
|
||||||
|
from aiohttp import TCPConnector, ClientSession, http_exceptions
|
||||||
from aiohttp.client_exceptions import ServerDisconnectedError, ClientConnectorError
|
from aiohttp.client_exceptions import ServerDisconnectedError, ClientConnectorError
|
||||||
|
|
||||||
from .activation import ParsingActivator, import_aiohttp_cookies
|
from .activation import ParsingActivator, import_aiohttp_cookies
|
||||||
@@ -30,6 +35,7 @@ from .utils import get_random_user_agent, ascii_data_display
|
|||||||
|
|
||||||
|
|
||||||
SUPPORTED_IDS = (
|
SUPPORTED_IDS = (
|
||||||
|
"username",
|
||||||
"yandex_public_id",
|
"yandex_public_id",
|
||||||
"gaia_id",
|
"gaia_id",
|
||||||
"vk_id",
|
"vk_id",
|
||||||
@@ -43,13 +49,53 @@ SUPPORTED_IDS = (
|
|||||||
BAD_CHARS = "#"
|
BAD_CHARS = "#"
|
||||||
|
|
||||||
|
|
||||||
async def get_response(request_future, logger) -> Tuple[str, int, Optional[CheckError]]:
|
class CheckerBase:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class SimpleAiohttpChecker(CheckerBase):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
proxy = kwargs.get('proxy')
|
||||||
|
cookie_jar = kwargs.get('cookie_jar')
|
||||||
|
self.logger = kwargs.get('logger', Mock())
|
||||||
|
|
||||||
|
# moved here to speed up the launch of Maigret
|
||||||
|
from aiohttp_socks import ProxyConnector
|
||||||
|
|
||||||
|
# make http client session
|
||||||
|
connector = (
|
||||||
|
ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=False)
|
||||||
|
)
|
||||||
|
connector.verify_ssl = False
|
||||||
|
self.session = ClientSession(
|
||||||
|
connector=connector, trust_env=True, cookie_jar=cookie_jar
|
||||||
|
)
|
||||||
|
|
||||||
|
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||||
|
if method == 'get':
|
||||||
|
request_method = self.session.get
|
||||||
|
else:
|
||||||
|
request_method = self.session.head
|
||||||
|
|
||||||
|
future = request_method(
|
||||||
|
url=url,
|
||||||
|
headers=headers,
|
||||||
|
allow_redirects=allow_redirects,
|
||||||
|
timeout=timeout,
|
||||||
|
)
|
||||||
|
|
||||||
|
return future
|
||||||
|
|
||||||
|
async def close(self):
|
||||||
|
await self.session.close()
|
||||||
|
|
||||||
|
async def check(self, future) -> Tuple[str, int, Optional[CheckError]]:
|
||||||
html_text = None
|
html_text = None
|
||||||
status_code = 0
|
status_code = 0
|
||||||
error: Optional[CheckError] = CheckError("Unknown")
|
error: Optional[CheckError] = CheckError("Unknown")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
response = await request_future
|
response = await future
|
||||||
|
|
||||||
status_code = response.status
|
status_code = response.status
|
||||||
response_content = await response.content.read()
|
response_content = await response.content.read()
|
||||||
@@ -61,7 +107,7 @@ async def get_response(request_future, logger) -> Tuple[str, int, Optional[Check
|
|||||||
if status_code == 0:
|
if status_code == 0:
|
||||||
error = CheckError("Connection lost")
|
error = CheckError("Connection lost")
|
||||||
|
|
||||||
logger.debug(html_text)
|
self.logger.debug(html_text)
|
||||||
|
|
||||||
except asyncio.TimeoutError as e:
|
except asyncio.TimeoutError as e:
|
||||||
error = CheckError("Request timeout", str(e))
|
error = CheckError("Request timeout", str(e))
|
||||||
@@ -69,7 +115,7 @@ async def get_response(request_future, logger) -> Tuple[str, int, Optional[Check
|
|||||||
error = CheckError("Connecting failure", str(e))
|
error = CheckError("Connecting failure", str(e))
|
||||||
except ServerDisconnectedError as e:
|
except ServerDisconnectedError as e:
|
||||||
error = CheckError("Server disconnected", str(e))
|
error = CheckError("Server disconnected", str(e))
|
||||||
except aiohttp.http_exceptions.BadHttpMessage as e:
|
except http_exceptions.BadHttpMessage as e:
|
||||||
error = CheckError("HTTP", str(e))
|
error = CheckError("HTTP", str(e))
|
||||||
except proxy_errors.ProxyError as e:
|
except proxy_errors.ProxyError as e:
|
||||||
error = CheckError("Proxy", str(e))
|
error = CheckError("Proxy", str(e))
|
||||||
@@ -78,16 +124,75 @@ async def get_response(request_future, logger) -> Tuple[str, int, Optional[Check
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
# python-specific exceptions
|
# python-specific exceptions
|
||||||
if sys.version_info.minor > 6 and (
|
if sys.version_info.minor > 6 and (
|
||||||
isinstance(e, ssl.SSLCertVerificationError) or isinstance(e, ssl.SSLError)
|
isinstance(e, ssl.SSLCertVerificationError)
|
||||||
|
or isinstance(e, ssl.SSLError)
|
||||||
):
|
):
|
||||||
error = CheckError("SSL", str(e))
|
error = CheckError("SSL", str(e))
|
||||||
else:
|
else:
|
||||||
logger.debug(e, exc_info=True)
|
self.logger.debug(e, exc_info=True)
|
||||||
error = CheckError("Unexpected", str(e))
|
error = CheckError("Unexpected", str(e))
|
||||||
|
|
||||||
return str(html_text), status_code, error
|
return str(html_text), status_code, error
|
||||||
|
|
||||||
|
|
||||||
|
class ProxiedAiohttpChecker(SimpleAiohttpChecker):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
proxy = kwargs.get('proxy')
|
||||||
|
cookie_jar = kwargs.get('cookie_jar')
|
||||||
|
self.logger = kwargs.get('logger', Mock())
|
||||||
|
|
||||||
|
# moved here to speed up the launch of Maigret
|
||||||
|
from aiohttp_socks import ProxyConnector
|
||||||
|
|
||||||
|
connector = ProxyConnector.from_url(proxy)
|
||||||
|
connector.verify_ssl = False
|
||||||
|
self.session = ClientSession(
|
||||||
|
connector=connector, trust_env=True, cookie_jar=cookie_jar
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class AiodnsDomainResolver(CheckerBase):
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
self.logger = kwargs.get('logger', Mock())
|
||||||
|
self.resolver = aiodns.DNSResolver(loop=loop)
|
||||||
|
|
||||||
|
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||||
|
return self.resolver.query(url, 'A')
|
||||||
|
|
||||||
|
async def check(self, future) -> Tuple[str, int, Optional[CheckError]]:
|
||||||
|
status = 404
|
||||||
|
error = None
|
||||||
|
text = ''
|
||||||
|
|
||||||
|
try:
|
||||||
|
res = await future
|
||||||
|
text = str(res[0].host)
|
||||||
|
status = 200
|
||||||
|
except aiodns.error.DNSError:
|
||||||
|
pass
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.error(e, exc_info=True)
|
||||||
|
error = CheckError('DNS resolve error', str(e))
|
||||||
|
|
||||||
|
return text, status, error
|
||||||
|
|
||||||
|
|
||||||
|
class CheckerMock:
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
pass
|
||||||
|
|
||||||
|
def prepare(self, url, headers=None, allow_redirects=True, timeout=0, method='get'):
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def check(self, future) -> Tuple[str, int, Optional[CheckError]]:
|
||||||
|
await asyncio.sleep(0)
|
||||||
|
return '', 0, None
|
||||||
|
|
||||||
|
async def close(self):
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
# TODO: move to separate class
|
# TODO: move to separate class
|
||||||
def detect_error_page(
|
def detect_error_page(
|
||||||
html_text, status_code, fail_flags, ignore_403
|
html_text, status_code, fail_flags, ignore_403
|
||||||
@@ -322,7 +427,8 @@ def make_site_result(
|
|||||||
# workaround to prevent slash errors
|
# workaround to prevent slash errors
|
||||||
url = re.sub("(?<!:)/+", "/", url)
|
url = re.sub("(?<!:)/+", "/", url)
|
||||||
|
|
||||||
session = options['session']
|
# always clearweb_checker for now
|
||||||
|
checker = options["checkers"][site.protocol]
|
||||||
|
|
||||||
# site check is disabled
|
# site check is disabled
|
||||||
if site.disabled and not options['forced']:
|
if site.disabled and not options['forced']:
|
||||||
@@ -381,12 +487,12 @@ def make_site_result(
|
|||||||
# In most cases when we are detecting by status code,
|
# In most cases when we are detecting by status code,
|
||||||
# it is not necessary to get the entire body: we can
|
# it is not necessary to get the entire body: we can
|
||||||
# detect fine with just the HEAD response.
|
# detect fine with just the HEAD response.
|
||||||
request_method = session.head
|
request_method = 'head'
|
||||||
else:
|
else:
|
||||||
# Either this detect method needs the content associated
|
# Either this detect method needs the content associated
|
||||||
# with the GET response, or this specific website will
|
# with the GET response, or this specific website will
|
||||||
# not respond properly unless we request the whole page.
|
# not respond properly unless we request the whole page.
|
||||||
request_method = session.get
|
request_method = 'get'
|
||||||
|
|
||||||
if site.check_type == "response_url":
|
if site.check_type == "response_url":
|
||||||
# Site forwards request to a different URL if username not
|
# Site forwards request to a different URL if username not
|
||||||
@@ -398,7 +504,8 @@ def make_site_result(
|
|||||||
# The final result of the request will be what is available.
|
# The final result of the request will be what is available.
|
||||||
allow_redirects = True
|
allow_redirects = True
|
||||||
|
|
||||||
future = request_method(
|
future = checker.prepare(
|
||||||
|
method=request_method,
|
||||||
url=url_probe,
|
url=url_probe,
|
||||||
headers=headers,
|
headers=headers,
|
||||||
allow_redirects=allow_redirects,
|
allow_redirects=allow_redirects,
|
||||||
@@ -407,6 +514,7 @@ def make_site_result(
|
|||||||
|
|
||||||
# Store future request object in the results object
|
# Store future request object in the results object
|
||||||
results_site["future"] = future
|
results_site["future"] = future
|
||||||
|
results_site["checker"] = checker
|
||||||
|
|
||||||
return results_site
|
return results_site
|
||||||
|
|
||||||
@@ -419,7 +527,9 @@ async def check_site_for_username(
|
|||||||
if not future:
|
if not future:
|
||||||
return site.name, default_result
|
return site.name, default_result
|
||||||
|
|
||||||
response = await get_response(request_future=future, logger=logger)
|
checker = default_result["checker"]
|
||||||
|
|
||||||
|
response = await checker.check(future=future)
|
||||||
|
|
||||||
response_result = process_site_result(
|
response_result = process_site_result(
|
||||||
response, query_notify, logger, default_result, site
|
response, query_notify, logger, default_result, site
|
||||||
@@ -430,9 +540,9 @@ async def check_site_for_username(
|
|||||||
return site.name, response_result
|
return site.name, response_result
|
||||||
|
|
||||||
|
|
||||||
async def debug_ip_request(session, logger):
|
async def debug_ip_request(checker, logger):
|
||||||
future = session.get(url="https://icanhazip.com")
|
future = checker.prepare(url="https://icanhazip.com")
|
||||||
ip, status, check_error = await get_response(future, logger)
|
ip, status, check_error = await checker.check(future)
|
||||||
if ip:
|
if ip:
|
||||||
logger.debug(f"My IP is: {ip.strip()}")
|
logger.debug(f"My IP is: {ip.strip()}")
|
||||||
else:
|
else:
|
||||||
@@ -456,6 +566,8 @@ async def maigret(
|
|||||||
logger,
|
logger,
|
||||||
query_notify=None,
|
query_notify=None,
|
||||||
proxy=None,
|
proxy=None,
|
||||||
|
tor_proxy=None,
|
||||||
|
i2p_proxy=None,
|
||||||
timeout=3,
|
timeout=3,
|
||||||
is_parsing_enabled=False,
|
is_parsing_enabled=False,
|
||||||
id_type="username",
|
id_type="username",
|
||||||
@@ -465,6 +577,7 @@ async def maigret(
|
|||||||
no_progressbar=False,
|
no_progressbar=False,
|
||||||
cookies=None,
|
cookies=None,
|
||||||
retries=0,
|
retries=0,
|
||||||
|
check_domains=False,
|
||||||
) -> QueryResultWrapper:
|
) -> QueryResultWrapper:
|
||||||
"""Main search func
|
"""Main search func
|
||||||
|
|
||||||
@@ -508,23 +621,36 @@ async def maigret(
|
|||||||
|
|
||||||
query_notify.start(username, id_type)
|
query_notify.start(username, id_type)
|
||||||
|
|
||||||
# make http client session
|
|
||||||
connector = (
|
|
||||||
ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
|
|
||||||
)
|
|
||||||
connector.verify_ssl = False
|
|
||||||
|
|
||||||
cookie_jar = None
|
cookie_jar = None
|
||||||
if cookies:
|
if cookies:
|
||||||
logger.debug(f"Using cookies jar file {cookies}")
|
logger.debug(f"Using cookies jar file {cookies}")
|
||||||
cookie_jar = await import_aiohttp_cookies(cookies)
|
cookie_jar = import_aiohttp_cookies(cookies)
|
||||||
|
|
||||||
session = aiohttp.ClientSession(
|
clearweb_checker = SimpleAiohttpChecker(
|
||||||
connector=connector, trust_env=True, cookie_jar=cookie_jar
|
proxy=proxy, cookie_jar=cookie_jar, logger=logger
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# TODO
|
||||||
|
tor_checker = CheckerMock()
|
||||||
|
if tor_proxy:
|
||||||
|
tor_checker = ProxiedAiohttpChecker( # type: ignore
|
||||||
|
proxy=tor_proxy, cookie_jar=cookie_jar, logger=logger
|
||||||
|
)
|
||||||
|
|
||||||
|
# TODO
|
||||||
|
i2p_checker = CheckerMock()
|
||||||
|
if i2p_proxy:
|
||||||
|
i2p_checker = ProxiedAiohttpChecker( # type: ignore
|
||||||
|
proxy=i2p_proxy, cookie_jar=cookie_jar, logger=logger
|
||||||
|
)
|
||||||
|
|
||||||
|
# TODO
|
||||||
|
dns_checker = CheckerMock()
|
||||||
|
if check_domains:
|
||||||
|
dns_checker = AiodnsDomainResolver(logger=logger) # type: ignore
|
||||||
|
|
||||||
if logger.level == logging.DEBUG:
|
if logger.level == logging.DEBUG:
|
||||||
await debug_ip_request(session, logger)
|
await debug_ip_request(clearweb_checker, logger)
|
||||||
|
|
||||||
# setup parallel executor
|
# setup parallel executor
|
||||||
executor: Optional[AsyncExecutor] = None
|
executor: Optional[AsyncExecutor] = None
|
||||||
@@ -538,7 +664,12 @@ async def maigret(
|
|||||||
# make options objects for all the requests
|
# make options objects for all the requests
|
||||||
options: QueryOptions = {}
|
options: QueryOptions = {}
|
||||||
options["cookies"] = cookie_jar
|
options["cookies"] = cookie_jar
|
||||||
options["session"] = session
|
options["checkers"] = {
|
||||||
|
'': clearweb_checker,
|
||||||
|
'tor': tor_checker,
|
||||||
|
'dns': dns_checker,
|
||||||
|
'i2p': i2p_checker,
|
||||||
|
}
|
||||||
options["parsing"] = is_parsing_enabled
|
options["parsing"] = is_parsing_enabled
|
||||||
options["timeout"] = timeout
|
options["timeout"] = timeout
|
||||||
options["id_type"] = id_type
|
options["id_type"] = id_type
|
||||||
@@ -591,7 +722,11 @@ async def maigret(
|
|||||||
)
|
)
|
||||||
|
|
||||||
# closing http client session
|
# closing http client session
|
||||||
await session.close()
|
await clearweb_checker.close()
|
||||||
|
if tor_proxy:
|
||||||
|
await tor_checker.close()
|
||||||
|
if i2p_proxy:
|
||||||
|
await i2p_checker.close()
|
||||||
|
|
||||||
# notify caller that all queries are finished
|
# notify caller that all queries are finished
|
||||||
query_notify.finish()
|
query_notify.finish()
|
||||||
@@ -625,7 +760,13 @@ def timeout_check(value):
|
|||||||
|
|
||||||
|
|
||||||
async def site_self_check(
|
async def site_self_check(
|
||||||
site: MaigretSite, logger, semaphore, db: MaigretDatabase, silent=False
|
site: MaigretSite,
|
||||||
|
logger,
|
||||||
|
semaphore,
|
||||||
|
db: MaigretDatabase,
|
||||||
|
silent=False,
|
||||||
|
tor_proxy=None,
|
||||||
|
i2p_proxy=None,
|
||||||
):
|
):
|
||||||
changes = {
|
changes = {
|
||||||
"disabled": False,
|
"disabled": False,
|
||||||
@@ -649,6 +790,8 @@ async def site_self_check(
|
|||||||
forced=True,
|
forced=True,
|
||||||
no_progressbar=True,
|
no_progressbar=True,
|
||||||
retries=1,
|
retries=1,
|
||||||
|
tor_proxy=tor_proxy,
|
||||||
|
i2p_proxy=i2p_proxy,
|
||||||
)
|
)
|
||||||
|
|
||||||
# don't disable entries with other ids types
|
# don't disable entries with other ids types
|
||||||
@@ -658,6 +801,8 @@ async def site_self_check(
|
|||||||
changes["disabled"] = True
|
changes["disabled"] = True
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
logger.debug(results_dict)
|
||||||
|
|
||||||
result = results_dict[site.name]["status"]
|
result = results_dict[site.name]["status"]
|
||||||
|
|
||||||
site_status = result.status
|
site_status = result.status
|
||||||
@@ -696,7 +841,13 @@ async def site_self_check(
|
|||||||
|
|
||||||
|
|
||||||
async def self_check(
|
async def self_check(
|
||||||
db: MaigretDatabase, site_data: dict, logger, silent=False, max_connections=10
|
db: MaigretDatabase,
|
||||||
|
site_data: dict,
|
||||||
|
logger,
|
||||||
|
silent=False,
|
||||||
|
max_connections=10,
|
||||||
|
tor_proxy=None,
|
||||||
|
i2p_proxy=None,
|
||||||
) -> bool:
|
) -> bool:
|
||||||
sem = asyncio.Semaphore(max_connections)
|
sem = asyncio.Semaphore(max_connections)
|
||||||
tasks = []
|
tasks = []
|
||||||
@@ -708,7 +859,9 @@ async def self_check(
|
|||||||
disabled_old_count = disabled_count(all_sites.values())
|
disabled_old_count = disabled_count(all_sites.values())
|
||||||
|
|
||||||
for _, site in all_sites.items():
|
for _, site in all_sites.items():
|
||||||
check_coro = site_self_check(site, logger, sem, db, silent)
|
check_coro = site_self_check(
|
||||||
|
site, logger, sem, db, silent, tor_proxy, i2p_proxy
|
||||||
|
)
|
||||||
future = asyncio.ensure_future(check_coro)
|
future = asyncio.ensure_future(check_coro)
|
||||||
tasks.append(future)
|
tasks.append(future)
|
||||||
|
|
||||||
|
|||||||
+74
-24
@@ -1,7 +1,6 @@
|
|||||||
"""
|
"""
|
||||||
Maigret main module
|
Maigret main module
|
||||||
"""
|
"""
|
||||||
import aiohttp
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@@ -10,8 +9,7 @@ import platform
|
|||||||
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
|
|
||||||
import requests
|
from socid_extractor import extract, parse
|
||||||
from socid_extractor import extract, parse, __version__ as socid_version
|
|
||||||
|
|
||||||
from .__version__ import __version__
|
from .__version__ import __version__
|
||||||
from .checking import (
|
from .checking import (
|
||||||
@@ -33,11 +31,14 @@ from .report import (
|
|||||||
SUPPORTED_JSON_REPORT_FORMATS,
|
SUPPORTED_JSON_REPORT_FORMATS,
|
||||||
save_json_report,
|
save_json_report,
|
||||||
get_plaintext_report,
|
get_plaintext_report,
|
||||||
|
sort_report_by_data_points,
|
||||||
|
save_graph_report,
|
||||||
)
|
)
|
||||||
from .sites import MaigretDatabase
|
from .sites import MaigretDatabase
|
||||||
from .submit import submit_dialog
|
from .submit import Submitter
|
||||||
from .types import QueryResultWrapper
|
from .types import QueryResultWrapper
|
||||||
from .utils import get_dict_ascii_tree
|
from .utils import get_dict_ascii_tree
|
||||||
|
from .settings import Settings
|
||||||
|
|
||||||
|
|
||||||
def notify_about_errors(search_results: QueryResultWrapper, query_notify):
|
def notify_about_errors(search_results: QueryResultWrapper, query_notify):
|
||||||
@@ -60,17 +61,6 @@ def notify_about_errors(search_results: QueryResultWrapper, query_notify):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def extract_ids_from_url(url: str, db: MaigretDatabase) -> dict:
|
|
||||||
results = {}
|
|
||||||
for s in db.sites:
|
|
||||||
result = s.extract_id_from_url(url)
|
|
||||||
if not result:
|
|
||||||
continue
|
|
||||||
_id, _type = result
|
|
||||||
results[_id] = _type
|
|
||||||
return results
|
|
||||||
|
|
||||||
|
|
||||||
def extract_ids_from_page(url, logger, timeout=5) -> dict:
|
def extract_ids_from_page(url, logger, timeout=5) -> dict:
|
||||||
results = {}
|
results = {}
|
||||||
# url, headers
|
# url, headers
|
||||||
@@ -116,18 +106,22 @@ def extract_ids_from_results(results: QueryResultWrapper, db: MaigretDatabase) -
|
|||||||
ids_results[u] = utype
|
ids_results[u] = utype
|
||||||
|
|
||||||
for url in dictionary.get('ids_links', []):
|
for url in dictionary.get('ids_links', []):
|
||||||
ids_results.update(extract_ids_from_url(url, db))
|
ids_results.update(db.extract_ids_from_url(url))
|
||||||
|
|
||||||
return ids_results
|
return ids_results
|
||||||
|
|
||||||
|
|
||||||
def setup_arguments_parser():
|
def setup_arguments_parser():
|
||||||
|
from aiohttp import __version__ as aiohttp_version
|
||||||
|
from requests import __version__ as requests_version
|
||||||
|
from socid_extractor import __version__ as socid_version
|
||||||
|
|
||||||
version_string = '\n'.join(
|
version_string = '\n'.join(
|
||||||
[
|
[
|
||||||
f'%(prog)s {__version__}',
|
f'%(prog)s {__version__}',
|
||||||
f'Socid-extractor: {socid_version}',
|
f'Socid-extractor: {socid_version}',
|
||||||
f'Aiohttp: {aiohttp.__version__}',
|
f'Aiohttp: {aiohttp_version}',
|
||||||
f'Requests: {requests.__version__}',
|
f'Requests: {requests_version}',
|
||||||
f'Python: {platform.python_version()}',
|
f'Python: {platform.python_version()}',
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
@@ -203,7 +197,7 @@ def setup_arguments_parser():
|
|||||||
metavar="DB_FILE",
|
metavar="DB_FILE",
|
||||||
dest="db_file",
|
dest="db_file",
|
||||||
default=None,
|
default=None,
|
||||||
help="Load Maigret database from a JSON file or an online, valid, JSON file.",
|
help="Load Maigret database from a JSON file or HTTP web resource.",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--cookies-jar-file",
|
"--cookies-jar-file",
|
||||||
@@ -238,6 +232,26 @@ def setup_arguments_parser():
|
|||||||
default=None,
|
default=None,
|
||||||
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080",
|
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--tor-proxy",
|
||||||
|
metavar='TOR_PROXY_URL',
|
||||||
|
action="store",
|
||||||
|
default='socks5://127.0.0.1:9050',
|
||||||
|
help="Specify URL of your Tor gateway. Default is socks5://127.0.0.1:9050",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--i2p-proxy",
|
||||||
|
metavar='I2P_PROXY_URL',
|
||||||
|
action="store",
|
||||||
|
default='http://127.0.0.1:4444',
|
||||||
|
help="Specify URL of your I2P gateway. Default is http://127.0.0.1:4444",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--with-domains",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Enable (experimental) feature of checking domains on usernames.",
|
||||||
|
)
|
||||||
|
|
||||||
filter_group = parser.add_argument_group(
|
filter_group = parser.add_argument_group(
|
||||||
'Site filtering', 'Options to set site search scope'
|
'Site filtering', 'Options to set site search scope'
|
||||||
@@ -409,6 +423,14 @@ def setup_arguments_parser():
|
|||||||
default=False,
|
default=False,
|
||||||
help="Generate a PDF report (general report on all usernames).",
|
help="Generate a PDF report (general report on all usernames).",
|
||||||
)
|
)
|
||||||
|
report_group.add_argument(
|
||||||
|
"-G",
|
||||||
|
"--graph",
|
||||||
|
action="store_true",
|
||||||
|
dest="graph",
|
||||||
|
default=False,
|
||||||
|
help="Generate a graph report (general report on all usernames).",
|
||||||
|
)
|
||||||
report_group.add_argument(
|
report_group.add_argument(
|
||||||
"-J",
|
"-J",
|
||||||
"--json",
|
"--json",
|
||||||
@@ -420,6 +442,13 @@ def setup_arguments_parser():
|
|||||||
help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
|
help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
|
||||||
" (one report per username).",
|
" (one report per username).",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--reports-sorting",
|
||||||
|
default='default',
|
||||||
|
choices=('default', 'data'),
|
||||||
|
help="Method of results sorting in reports (default: in order of getting the result)",
|
||||||
|
)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
@@ -468,6 +497,12 @@ async def main():
|
|||||||
if args.tags:
|
if args.tags:
|
||||||
args.tags = list(set(str(args.tags).split(',')))
|
args.tags = list(set(str(args.tags).split(',')))
|
||||||
|
|
||||||
|
settings = Settings(
|
||||||
|
os.path.join(
|
||||||
|
os.path.dirname(os.path.realpath(__file__)), "resources/settings.json"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
if args.db_file is None:
|
if args.db_file is None:
|
||||||
args.db_file = os.path.join(
|
args.db_file = os.path.join(
|
||||||
os.path.dirname(os.path.realpath(__file__)), "resources/data.json"
|
os.path.dirname(os.path.realpath(__file__)), "resources/data.json"
|
||||||
@@ -486,7 +521,7 @@ async def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Create object with all information about sites we are aware of.
|
# Create object with all information about sites we are aware of.
|
||||||
db = MaigretDatabase().load_from_file(args.db_file)
|
db = MaigretDatabase().load_from_path(args.db_file)
|
||||||
get_top_sites_for_id = lambda x: db.ranked_sites_dict(
|
get_top_sites_for_id = lambda x: db.ranked_sites_dict(
|
||||||
top=args.top_sites,
|
top=args.top_sites,
|
||||||
tags=args.tags,
|
tags=args.tags,
|
||||||
@@ -498,9 +533,8 @@ async def main():
|
|||||||
site_data = get_top_sites_for_id(args.id_type)
|
site_data = get_top_sites_for_id(args.id_type)
|
||||||
|
|
||||||
if args.new_site_to_submit:
|
if args.new_site_to_submit:
|
||||||
is_submitted = await submit_dialog(
|
submitter = Submitter(db=db, logger=logger, settings=settings)
|
||||||
db, args.new_site_to_submit, args.cookie_file, logger
|
is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
|
||||||
)
|
|
||||||
if is_submitted:
|
if is_submitted:
|
||||||
db.save_to_file(args.db_file)
|
db.save_to_file(args.db_file)
|
||||||
|
|
||||||
@@ -508,7 +542,12 @@ async def main():
|
|||||||
if args.self_check:
|
if args.self_check:
|
||||||
print('Maigret sites database self-checking...')
|
print('Maigret sites database self-checking...')
|
||||||
is_need_update = await self_check(
|
is_need_update = await self_check(
|
||||||
db, site_data, logger, max_connections=args.connections
|
db,
|
||||||
|
site_data,
|
||||||
|
logger,
|
||||||
|
max_connections=args.connections,
|
||||||
|
tor_proxy=args.tor_proxy,
|
||||||
|
i2p_proxy=args.i2p_proxy,
|
||||||
)
|
)
|
||||||
if is_need_update:
|
if is_need_update:
|
||||||
if input('Do you want to save changes permanently? [Yn]\n').lower() in (
|
if input('Do you want to save changes permanently? [Yn]\n').lower() in (
|
||||||
@@ -584,6 +623,8 @@ async def main():
|
|||||||
site_dict=dict(sites_to_check),
|
site_dict=dict(sites_to_check),
|
||||||
query_notify=query_notify,
|
query_notify=query_notify,
|
||||||
proxy=args.proxy,
|
proxy=args.proxy,
|
||||||
|
tor_proxy=args.tor_proxy,
|
||||||
|
i2p_proxy=args.i2p_proxy,
|
||||||
timeout=args.timeout,
|
timeout=args.timeout,
|
||||||
is_parsing_enabled=parsing_enabled,
|
is_parsing_enabled=parsing_enabled,
|
||||||
id_type=id_type,
|
id_type=id_type,
|
||||||
@@ -594,10 +635,14 @@ async def main():
|
|||||||
max_connections=args.connections,
|
max_connections=args.connections,
|
||||||
no_progressbar=args.no_progressbar,
|
no_progressbar=args.no_progressbar,
|
||||||
retries=args.retries,
|
retries=args.retries,
|
||||||
|
check_domains=args.with_domains,
|
||||||
)
|
)
|
||||||
|
|
||||||
notify_about_errors(results, query_notify)
|
notify_about_errors(results, query_notify)
|
||||||
|
|
||||||
|
if args.reports_sorting == "data":
|
||||||
|
results = sort_report_by_data_points(results)
|
||||||
|
|
||||||
general_results.append((username, id_type, results))
|
general_results.append((username, id_type, results))
|
||||||
|
|
||||||
# TODO: tests
|
# TODO: tests
|
||||||
@@ -648,6 +693,11 @@ async def main():
|
|||||||
save_pdf_report(filename, report_context)
|
save_pdf_report(filename, report_context)
|
||||||
query_notify.warning(f'PDF report on all usernames saved in {filename}')
|
query_notify.warning(f'PDF report on all usernames saved in {filename}')
|
||||||
|
|
||||||
|
if args.graph:
|
||||||
|
filename = report_filepath_tpl.format(username=username, postfix='.html')
|
||||||
|
save_graph_report(filename, general_results, db)
|
||||||
|
query_notify.warning(f'Graph report on all usernames saved in {filename}')
|
||||||
|
|
||||||
text_report = get_plaintext_report(report_context)
|
text_report = get_plaintext_report(report_context)
|
||||||
if text_report:
|
if text_report:
|
||||||
query_notify.info('Short text report:')
|
query_notify.info('Short text report:')
|
||||||
|
|||||||
+170
-11
@@ -1,3 +1,4 @@
|
|||||||
|
import ast
|
||||||
import csv
|
import csv
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
@@ -6,13 +7,13 @@ import os
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Dict, Any
|
from typing import Dict, Any
|
||||||
|
|
||||||
import pycountry
|
|
||||||
import xmind
|
import xmind
|
||||||
from dateutil.parser import parse as parse_datetime_str
|
from dateutil.parser import parse as parse_datetime_str
|
||||||
from jinja2 import Template
|
from jinja2 import Template
|
||||||
from xhtml2pdf import pisa
|
|
||||||
|
|
||||||
|
from .checking import SUPPORTED_IDS
|
||||||
from .result import QueryStatus
|
from .result import QueryStatus
|
||||||
|
from .sites import MaigretDatabase
|
||||||
from .utils import is_country_tag, CaseConverter, enrich_link_str
|
from .utils import is_country_tag, CaseConverter, enrich_link_str
|
||||||
|
|
||||||
SUPPORTED_JSON_REPORT_FORMATS = [
|
SUPPORTED_JSON_REPORT_FORMATS = [
|
||||||
@@ -36,6 +37,18 @@ def filter_supposed_data(data):
|
|||||||
return filtered_supposed_data
|
return filtered_supposed_data
|
||||||
|
|
||||||
|
|
||||||
|
def sort_report_by_data_points(results):
    """Return a copy of *results* ordered by the amount of extracted data.

    The sort key for every site is the number of entries in the
    ``ids_data`` attribute of its ``'status'`` value; sites with the most
    data points come first.  Entries that are empty/``None``, have no
    ``'status'`` or have no ``ids_data`` count as zero data points.
    Because the sort is stable, sites with equal counts keep their
    original relative order.

    :param results: mapping of site name to the per-site result dict.
    :return: a new ``dict`` with the same items in sorted order.
    """

    def count_data_points(item):
        # item is a (site_name, site_result) pair; a site may have no
        # result at all, so fall back to an empty dict before ``.get``
        site_result = item[1] or {}
        status = site_result.get('status')
        ids_data = status.ids_data if status else None
        return len(ids_data or {})

    return dict(sorted(results.items(), key=count_data_points, reverse=True))
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
REPORTS SAVING
|
REPORTS SAVING
|
||||||
"""
|
"""
|
||||||
@@ -61,6 +74,10 @@ def save_html_report(filename: str, context: dict):
|
|||||||
def save_pdf_report(filename: str, context: dict):
|
def save_pdf_report(filename: str, context: dict):
|
||||||
template, css = generate_report_template(is_pdf=True)
|
template, css = generate_report_template(is_pdf=True)
|
||||||
filled_template = template.render(**context)
|
filled_template = template.render(**context)
|
||||||
|
|
||||||
|
# moved here to speed up the launch of Maigret
|
||||||
|
from xhtml2pdf import pisa
|
||||||
|
|
||||||
with open(filename, "w+b") as f:
|
with open(filename, "w+b") as f:
|
||||||
pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css)
|
pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css)
|
||||||
|
|
||||||
@@ -70,6 +87,131 @@ def save_json_report(filename: str, username: str, results: dict, report_type: s
|
|||||||
generate_json_report(username, results, f, report_type=report_type)
|
generate_json_report(username, results, f, report_type=report_type)
|
||||||
|
|
||||||
|
|
||||||
|
class MaigretGraph:
    """Helper that adds styled nodes and edges to a wrapped graph object.

    The wrapped object only has to provide ``add_node(name, **attrs)`` and
    ``add_edge(a, b, **attrs)``.
    """

    # keyword arguments forwarded to ``add_node`` for each node category
    other_params = {'size': 10, 'group': 3}
    site_params = {'size': 15, 'group': 2}
    username_params = {'size': 20, 'group': 1}

    def __init__(self, graph):
        """Remember the graph object that all nodes and edges are added to."""
        self.G = graph

    def add_node(self, key, value):
        """Add a node labelled ``'<key>: <value>'`` and return that label.

        The node style is chosen from the key/value: identifier types listed
        in ``SUPPORTED_IDS`` use the username style, values starting with
        ``'http'`` use the site style, everything else the default style.
        When *value* contains upper-case characters, a second node for the
        lower-cased value is added as well and linked to the first one.
        """
        label = f'{key}: {value}'

        if key in SUPPORTED_IDS:
            style = self.username_params
        elif value.startswith('http'):
            style = self.site_params
        else:
            style = self.other_params

        self.G.add_node(label, title=label, **style)

        # connect the node to its case-normalized twin
        lowered = value.lower()
        if lowered != value:
            self.link(label, self.add_node(key, lowered))

        return label

    def link(self, node1_name, node2_name):
        """Connect two existing nodes with an edge of weight 2."""
        self.G.add_edge(node1_name, node2_name, weight=2)
|
||||||
|
|
||||||
|
|
||||||
|
def _add_ids_to_graph(graph, db, parent_node, ids, site_fallback_name):
    """Recursively add the identifiers in *ids* to *graph* below *parent_node*."""
    for k, v in ids.items():
        if k.endswith('_count') or k.startswith('is_') or k.endswith('_at'):
            continue
        # exact comparison: the former substring test (``k in 'image'``)
        # also dropped unrelated keys such as 'age'
        if k == 'image':
            continue

        v_data = v
        # some values are lists serialized to a string, e.g. "['a', 'b']";
        # non-string values must not be probed with ``startswith``
        if isinstance(v, str) and v.startswith('['):
            try:
                v_data = ast.literal_eval(v)
            except (
                ValueError,
                TypeError,
                SyntaxError,
                MemoryError,
                RecursionError,
            ) as e:
                # best effort: keep the raw string when it is not a literal
                logging.error(e)

        if isinstance(v_data, list):
            # value is a list
            # NOTE(review): the list node is never linked to parent_node;
            # confirm whether that is intentional
            list_node_name = graph.add_node(k, site_fallback_name)
            for vv in v_data:
                data_node_name = graph.add_node(vv, site_fallback_name)
                graph.link(list_node_name, data_node_name)

                # extract_ids_from_url maps id -> type; flip it to type -> id
                add_ids = {
                    id_type: id_value
                    for id_value, id_type in db.extract_ids_from_url(vv).items()
                }
                if add_ids:
                    _add_ids_to_graph(
                        graph, db, data_node_name, add_ids, site_fallback_name
                    )
        else:
            # value is just a string
            ids_data_name = graph.add_node(k, v)
            graph.link(parent_node, ids_data_name)

            # check for username
            if 'username' in k or k in SUPPORTED_IDS:
                new_username_node_name = graph.add_node('username', v)
                graph.link(ids_data_name, new_username_node_name)

            add_ids = {
                id_type: id_value
                for id_value, id_type in db.extract_ids_from_url(v).items()
            }
            if add_ids:
                _add_ids_to_graph(
                    graph, db, ids_data_name, add_ids, site_fallback_name
                )


def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
    """Build a graph of usernames, claimed accounts and their data; save it.

    For every ``(username, id_type, results)`` tuple a username node is
    created, every site whose status is ``QueryStatus.CLAIMED`` is linked to
    it, and the site's ``ids_data`` entries are attached recursively.  Nodes
    whose label is longer than 100 characters are removed before the graph
    is rendered to *filename* with pyvis.

    :param filename: path of the report file to write.
    :param username_results: list of ``(username, id_type, results)`` tuples.
    :param db: sites database used to extract further ids from URL values.
    """
    # moved here to speed up the launch of Maigret
    import networkx as nx

    G = nx.Graph()
    graph = MaigretGraph(G)

    for username, id_type, results in username_results:
        username_node_name = graph.add_node(id_type, username)

        for website_name, dictionary in results.items():
            # TODO: fix no site data issue
            if not dictionary:
                continue

            if dictionary.get("is_similar"):
                continue

            status = dictionary.get("status")
            if not status:  # FIXME: currently in case of timeout
                continue

            if status.status != QueryStatus.CLAIMED:
                continue

            site_fallback_name = dictionary.get(
                'url_user', f'{website_name}: {username.lower()}'
            )
            site_node_name = graph.add_node('site', site_fallback_name)
            graph.link(username_node_name, site_node_name)

            if status.ids_data:
                _add_ids_to_graph(
                    graph, db, site_node_name, status.ids_data, site_fallback_name
                )

    # drop nodes whose label is longer than 100 characters
    nodes_to_remove = [node for node in G.nodes if len(str(node)) > 100]
    G.remove_nodes_from(nodes_to_remove)

    # moved here to speed up the launch of Maigret
    from pyvis.network import Network

    nt = Network(notebook=True, height="750px", width="100%")
    nt.from_nx(G)
    nt.show(filename)
|
||||||
|
|
||||||
|
|
||||||
def get_plaintext_report(context: dict) -> str:
|
def get_plaintext_report(context: dict) -> str:
|
||||||
output = (context['brief'] + " ").replace('. ', '.\n')
|
output = (context['brief'] + " ").replace('. ', '.\n')
|
||||||
interests = list(map(lambda x: x[0], context.get('interests_tuple_list', [])))
|
interests = list(map(lambda x: x[0], context.get('interests_tuple_list', [])))
|
||||||
@@ -118,6 +260,9 @@ def generate_report_context(username_results: list):
|
|||||||
|
|
||||||
first_seen = None
|
first_seen = None
|
||||||
|
|
||||||
|
# moved here to speed up the launch of Maigret
|
||||||
|
import pycountry
|
||||||
|
|
||||||
for username, id_type, results in username_results:
|
for username, id_type, results in username_results:
|
||||||
found_accounts = 0
|
found_accounts = 0
|
||||||
new_ids = []
|
new_ids = []
|
||||||
@@ -243,14 +388,18 @@ def generate_csv_report(username: str, results: dict, csvfile):
|
|||||||
["username", "name", "url_main", "url_user", "exists", "http_status"]
|
["username", "name", "url_main", "url_user", "exists", "http_status"]
|
||||||
)
|
)
|
||||||
for site in results:
|
for site in results:
|
||||||
|
# TODO: fix the reason
|
||||||
|
status = 'Unknown'
|
||||||
|
if "status" in results[site]:
|
||||||
|
status = str(results[site]["status"].status)
|
||||||
writer.writerow(
|
writer.writerow(
|
||||||
[
|
[
|
||||||
username,
|
username,
|
||||||
site,
|
site,
|
||||||
results[site]["url_main"],
|
results[site].get("url_main", ""),
|
||||||
results[site]["url_user"],
|
results[site].get("url_user", ""),
|
||||||
str(results[site]["status"].status),
|
status,
|
||||||
results[site]["http_status"],
|
results[site].get("http_status", 0),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -262,7 +411,10 @@ def generate_txt_report(username: str, results: dict, file):
|
|||||||
# TODO: fix no site data issue
|
# TODO: fix no site data issue
|
||||||
if not dictionary:
|
if not dictionary:
|
||||||
continue
|
continue
|
||||||
if dictionary.get("status").status == QueryStatus.CLAIMED:
|
if (
|
||||||
|
dictionary.get("status")
|
||||||
|
and dictionary["status"].status == QueryStatus.CLAIMED
|
||||||
|
):
|
||||||
exists_counter += 1
|
exists_counter += 1
|
||||||
file.write(dictionary["url_user"] + "\n")
|
file.write(dictionary["url_user"] + "\n")
|
||||||
file.write(f"Total Websites Username Detected On : {exists_counter}")
|
file.write(f"Total Websites Username Detected On : {exists_counter}")
|
||||||
@@ -275,14 +427,18 @@ def generate_json_report(username: str, results: dict, file, report_type):
|
|||||||
for sitename in results:
|
for sitename in results:
|
||||||
site_result = results[sitename]
|
site_result = results[sitename]
|
||||||
# TODO: fix no site data issue
|
# TODO: fix no site data issue
|
||||||
if not site_result or site_result.get("status").status != QueryStatus.CLAIMED:
|
if not site_result or not site_result.get("status"):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if site_result["status"].status != QueryStatus.CLAIMED:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
data = dict(site_result)
|
data = dict(site_result)
|
||||||
data["status"] = data["status"].json()
|
data["status"] = data["status"].json()
|
||||||
data["site"] = data["site"].json
|
data["site"] = data["site"].json
|
||||||
if "future" in data:
|
for field in ["future", "checker"]:
|
||||||
del data["future"]
|
if field in data:
|
||||||
|
del data[field]
|
||||||
|
|
||||||
if is_report_per_line:
|
if is_report_per_line:
|
||||||
data["sitename"] = sitename
|
data["sitename"] = sitename
|
||||||
@@ -331,8 +487,11 @@ def design_xmind_sheet(sheet, username, results):
|
|||||||
|
|
||||||
for website_name in results:
|
for website_name in results:
|
||||||
dictionary = results[website_name]
|
dictionary = results[website_name]
|
||||||
|
if not dictionary:
|
||||||
|
continue
|
||||||
result_status = dictionary.get("status")
|
result_status = dictionary.get("status")
|
||||||
if result_status.status != QueryStatus.CLAIMED:
|
# TODO: fix the reason
|
||||||
|
if not result_status or result_status.status != QueryStatus.CLAIMED:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
stripped_tags = list(map(lambda x: x.strip(), result_status.tags))
|
stripped_tags = list(map(lambda x: x.strip(), result_status.tags))
|
||||||
|
|||||||
+773
-53
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,17 @@
|
|||||||
|
{
|
||||||
|
"presence_strings": [
|
||||||
|
"username",
|
||||||
|
"not found",
|
||||||
|
"пользователь",
|
||||||
|
"profile",
|
||||||
|
"lastname",
|
||||||
|
"firstname",
|
||||||
|
"biography",
|
||||||
|
"birthday",
|
||||||
|
"репутация",
|
||||||
|
"информация",
|
||||||
|
"e-mail"
|
||||||
|
],
|
||||||
|
"supposed_usernames": [
|
||||||
|
"alex", "god", "admin", "red", "blue", "john"]
|
||||||
|
}
|
||||||
@@ -68,7 +68,7 @@
|
|||||||
<div class="row-mb">
|
<div class="row-mb">
|
||||||
<div class="col-md">
|
<div class="col-md">
|
||||||
<div class="card flex-md-row mb-4 box-shadow h-md-250">
|
<div class="card flex-md-row mb-4 box-shadow h-md-250">
|
||||||
<img class="card-img-right flex-auto d-md-block" alt="Photo" style="width: 200px; height: 200px; object-fit: scale-down;" src="{{ v.status.ids_data.image or 'https://i.imgur.com/040fmbw.png' }}" data-holder-rendered="true">
|
<img class="card-img-right flex-auto d-md-block" alt="Photo" style="width: 200px; height: 200px; object-fit: scale-down;" src="{{ v.status and v.status.ids_data and v.status.ids_data.image or 'https://i.imgur.com/040fmbw.png' }}" data-holder-rendered="true">
|
||||||
<div class="card-body d-flex flex-column align-items-start" style="padding-top: 0;">
|
<div class="card-body d-flex flex-column align-items-start" style="padding-top: 0;">
|
||||||
<h3 class="mb-0" style="padding-top: 1rem;">
|
<h3 class="mb-0" style="padding-top: 1rem;">
|
||||||
<a class="text-dark" href="{{ v.url_main }}" target="_blank">{{ k }}</a>
|
<a class="text-dark" href="{{ v.url_main }}" target="_blank">{{ k }}</a>
|
||||||
|
|||||||
@@ -0,0 +1,29 @@
|
|||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
class Settings:
    """Settings loaded from a JSON file.

    Every top-level key of the JSON object becomes an attribute of the
    instance (the loaded mapping is merged into the instance ``__dict__``).
    """

    # Attributes expected to be supplied by the settings file.
    presence_strings: list
    supposed_usernames: list

    def __init__(self, filename):
        """Read *filename* as UTF-8 JSON and expose its keys as attributes.

        Raises:
            FileNotFoundError: the settings file does not exist.
            ValueError: the file cannot be parsed as JSON, or its top-level
                value is not a JSON object.
        """
        try:
            with open(filename, "r", encoding="utf-8") as file:
                try:
                    data = json.load(file)
                except Exception as error:
                    # Chain the original error so the real parsing failure
                    # stays visible in the traceback.
                    raise ValueError(
                        f"Problem with parsing json contents of "
                        f"settings file '{filename}': {str(error)}."
                    ) from error
        except FileNotFoundError as error:
            raise FileNotFoundError(
                f"Problem while attempting to access settings file '{filename}'."
            ) from error

        # Only a JSON object can be merged into the attribute dictionary;
        # reject lists, strings and numbers with a clear message instead of
        # letting dict.update() fail with an unrelated error.
        if not isinstance(data, dict):
            raise ValueError(
                f"Problem with parsing json contents of "
                f"settings file '{filename}': top-level value must be a JSON object."
            )

        self.__dict__.update(data)

    @property
    def json(self):
        """Return the instance attribute dictionary."""
        return self.__dict__
|
||||||
+40
-66
@@ -9,64 +9,6 @@ import requests
|
|||||||
|
|
||||||
from .utils import CaseConverter, URLMatcher, is_country_tag
|
from .utils import CaseConverter, URLMatcher, is_country_tag
|
||||||
|
|
||||||
# TODO: move to data.json
|
|
||||||
SUPPORTED_TAGS = [
|
|
||||||
"gaming",
|
|
||||||
"coding",
|
|
||||||
"photo",
|
|
||||||
"music",
|
|
||||||
"blog",
|
|
||||||
"finance",
|
|
||||||
"freelance",
|
|
||||||
"dating",
|
|
||||||
"tech",
|
|
||||||
"forum",
|
|
||||||
"porn",
|
|
||||||
"erotic",
|
|
||||||
"webcam",
|
|
||||||
"video",
|
|
||||||
"movies",
|
|
||||||
"hacking",
|
|
||||||
"art",
|
|
||||||
"discussion",
|
|
||||||
"sharing",
|
|
||||||
"writing",
|
|
||||||
"wiki",
|
|
||||||
"business",
|
|
||||||
"shopping",
|
|
||||||
"sport",
|
|
||||||
"books",
|
|
||||||
"news",
|
|
||||||
"documents",
|
|
||||||
"travel",
|
|
||||||
"maps",
|
|
||||||
"hobby",
|
|
||||||
"apps",
|
|
||||||
"classified",
|
|
||||||
"career",
|
|
||||||
"geosocial",
|
|
||||||
"streaming",
|
|
||||||
"education",
|
|
||||||
"networking",
|
|
||||||
"torrent",
|
|
||||||
"science",
|
|
||||||
"medicine",
|
|
||||||
"reading",
|
|
||||||
"stock",
|
|
||||||
"messaging",
|
|
||||||
"trading",
|
|
||||||
"links",
|
|
||||||
"fashion",
|
|
||||||
"tasks",
|
|
||||||
"military",
|
|
||||||
"auto",
|
|
||||||
"gambling",
|
|
||||||
"cybercriminal",
|
|
||||||
"review",
|
|
||||||
"bookmarks",
|
|
||||||
"design",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class MaigretEngine:
|
class MaigretEngine:
|
||||||
site: Dict[str, Any] = {}
|
site: Dict[str, Any] = {}
|
||||||
@@ -122,6 +64,8 @@ class MaigretSite:
|
|||||||
alexa_rank = None
|
alexa_rank = None
|
||||||
source = None
|
source = None
|
||||||
|
|
||||||
|
protocol = ''
|
||||||
|
|
||||||
def __init__(self, name, information):
|
def __init__(self, name, information):
|
||||||
self.name = name
|
self.name = name
|
||||||
self.url_subpath = ""
|
self.url_subpath = ""
|
||||||
@@ -200,12 +144,12 @@ class MaigretSite:
|
|||||||
errors.update(self.errors)
|
errors.update(self.errors)
|
||||||
return errors
|
return errors
|
||||||
|
|
||||||
def get_url_type(self) -> str:
|
def get_url_template(self) -> str:
|
||||||
url = URLMatcher.extract_main_part(self.url)
|
url = URLMatcher.extract_main_part(self.url)
|
||||||
if url.startswith("{username}"):
|
if url.startswith("{username}"):
|
||||||
url = "SUBDOMAIN"
|
url = "SUBDOMAIN"
|
||||||
elif url == "":
|
elif url == "":
|
||||||
url = f"{self.url} ({self.engine})"
|
url = f"{self.url} ({self.engine or 'no engine'})"
|
||||||
else:
|
else:
|
||||||
parts = url.split("/")
|
parts = url.split("/")
|
||||||
url = "/" + "/".join(parts[1:])
|
url = "/" + "/".join(parts[1:])
|
||||||
@@ -269,8 +213,9 @@ class MaigretSite:
|
|||||||
|
|
||||||
class MaigretDatabase:
|
class MaigretDatabase:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self._sites = []
|
self._tags: list = []
|
||||||
self._engines = []
|
self._sites: list = []
|
||||||
|
self._engines: list = []
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def sites(self):
|
def sites(self):
|
||||||
@@ -301,12 +246,18 @@ class MaigretDatabase:
|
|||||||
lambda x: isinstance(x.engine, str) and x.engine.lower() in normalized_tags
|
lambda x: isinstance(x.engine, str) and x.engine.lower() in normalized_tags
|
||||||
)
|
)
|
||||||
is_tags_ok = lambda x: set(x.tags).intersection(set(normalized_tags))
|
is_tags_ok = lambda x: set(x.tags).intersection(set(normalized_tags))
|
||||||
|
is_protocol_in_tags = lambda x: x.protocol and x.protocol in normalized_tags
|
||||||
is_disabled_needed = lambda x: not x.disabled or (
|
is_disabled_needed = lambda x: not x.disabled or (
|
||||||
"disabled" in tags or disabled
|
"disabled" in tags or disabled
|
||||||
)
|
)
|
||||||
is_id_type_ok = lambda x: x.type == id_type
|
is_id_type_ok = lambda x: x.type == id_type
|
||||||
|
|
||||||
filter_tags_engines_fun = lambda x: not tags or is_engine_ok(x) or is_tags_ok(x)
|
filter_tags_engines_fun = (
|
||||||
|
lambda x: not tags
|
||||||
|
or is_engine_ok(x)
|
||||||
|
or is_tags_ok(x)
|
||||||
|
or is_protocol_in_tags(x)
|
||||||
|
)
|
||||||
filter_names_fun = lambda x: not names or is_name_ok(x) or is_source_ok(x)
|
filter_names_fun = lambda x: not names or is_name_ok(x) or is_source_ok(x)
|
||||||
|
|
||||||
filter_fun = (
|
filter_fun = (
|
||||||
@@ -341,9 +292,13 @@ class MaigretDatabase:
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
def save_to_file(self, filename: str) -> "MaigretDatabase":
|
def save_to_file(self, filename: str) -> "MaigretDatabase":
|
||||||
|
if '://' in filename:
|
||||||
|
return self
|
||||||
|
|
||||||
db_data = {
|
db_data = {
|
||||||
"sites": {site.name: site.strip_engine_data().json for site in self._sites},
|
"sites": {site.name: site.strip_engine_data().json for site in self._sites},
|
||||||
"engines": {engine.name: engine.json for engine in self._engines},
|
"engines": {engine.name: engine.json for engine in self._engines},
|
||||||
|
"tags": self._tags,
|
||||||
}
|
}
|
||||||
|
|
||||||
json_data = json.dumps(db_data, indent=4)
|
json_data = json.dumps(db_data, indent=4)
|
||||||
@@ -357,6 +312,9 @@ class MaigretDatabase:
|
|||||||
# Add all of site information from the json file to internal site list.
|
# Add all of site information from the json file to internal site list.
|
||||||
site_data = json_data.get("sites", {})
|
site_data = json_data.get("sites", {})
|
||||||
engines_data = json_data.get("engines", {})
|
engines_data = json_data.get("engines", {})
|
||||||
|
tags = json_data.get("tags", [])
|
||||||
|
|
||||||
|
self._tags += tags
|
||||||
|
|
||||||
for engine_name in engines_data:
|
for engine_name in engines_data:
|
||||||
self._engines.append(MaigretEngine(engine_name, engines_data[engine_name]))
|
self._engines.append(MaigretEngine(engine_name, engines_data[engine_name]))
|
||||||
@@ -389,7 +347,13 @@ class MaigretDatabase:
|
|||||||
|
|
||||||
return self.load_from_json(data)
|
return self.load_from_json(data)
|
||||||
|
|
||||||
def load_from_url(self, url: str) -> "MaigretDatabase":
|
def load_from_path(self, path: str) -> "MaigretDatabase":
    """Load the database from *path*, which may be a URL or a local file.

    A path containing ``://`` is passed to ``load_from_http``; anything
    else is passed to ``load_from_file``.
    """
    loader = self.load_from_http if '://' in path else self.load_from_file
    return loader(path)
|
||||||
|
|
||||||
|
def load_from_http(self, url: str) -> "MaigretDatabase":
|
||||||
is_url_valid = url.startswith("http://") or url.startswith("https://")
|
is_url_valid = url.startswith("http://") or url.startswith("https://")
|
||||||
|
|
||||||
if not is_url_valid:
|
if not is_url_valid:
|
||||||
@@ -445,6 +409,16 @@ class MaigretDatabase:
|
|||||||
|
|
||||||
return found_flags
|
return found_flags
|
||||||
|
|
||||||
|
def extract_ids_from_url(self, url: str) -> dict:
    """Collect identifiers that the known sites can extract from *url*.

    Each site in ``self._sites`` is asked to extract an ``(id, type)`` pair;
    falsy answers are ignored. Returns a dict mapping each extracted id to
    its type; when several sites yield the same id, the last one wins.
    """
    matches = (site.extract_id_from_url(url) for site in self._sites)
    return {found_id: id_type for found_id, id_type in filter(None, matches)}
|
||||||
|
|
||||||
def get_db_stats(self, sites_dict):
|
def get_db_stats(self, sites_dict):
|
||||||
if not sites_dict:
|
if not sites_dict:
|
||||||
sites_dict = self.sites_dict()
|
sites_dict = self.sites_dict()
|
||||||
@@ -459,7 +433,7 @@ class MaigretDatabase:
|
|||||||
if site.disabled:
|
if site.disabled:
|
||||||
disabled_count += 1
|
disabled_count += 1
|
||||||
|
|
||||||
url_type = site.get_url_type()
|
url_type = site.get_url_template()
|
||||||
urls[url_type] = urls.get(url_type, 0) + 1
|
urls[url_type] = urls.get(url_type, 0) + 1
|
||||||
|
|
||||||
if not site.tags:
|
if not site.tags:
|
||||||
@@ -478,7 +452,7 @@ class MaigretDatabase:
|
|||||||
output += "Top tags:\n"
|
output += "Top tags:\n"
|
||||||
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
|
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
|
||||||
mark = ""
|
mark = ""
|
||||||
if tag not in SUPPORTED_TAGS:
|
if tag not in self._tags:
|
||||||
mark = " (non-standard)"
|
mark = " (non-standard)"
|
||||||
output += f"{count}\t{tag}{mark}\n"
|
output += f"{count}\t{tag}{mark}\n"
|
||||||
|
|
||||||
|
|||||||
+104
-89
@@ -1,5 +1,5 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import difflib
|
import json
|
||||||
import re
|
import re
|
||||||
from typing import List
|
from typing import List
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
@@ -8,44 +8,28 @@ import requests
|
|||||||
from .activation import import_aiohttp_cookies
|
from .activation import import_aiohttp_cookies
|
||||||
from .checking import maigret
|
from .checking import maigret
|
||||||
from .result import QueryStatus
|
from .result import QueryStatus
|
||||||
|
from .settings import Settings
|
||||||
from .sites import MaigretDatabase, MaigretSite, MaigretEngine
|
from .sites import MaigretDatabase, MaigretSite, MaigretEngine
|
||||||
from .utils import get_random_user_agent
|
from .utils import get_random_user_agent, get_match_ratio
|
||||||
|
|
||||||
|
|
||||||
DESIRED_STRINGS = [
|
class Submitter:
|
||||||
"username",
|
|
||||||
"not found",
|
|
||||||
"пользователь",
|
|
||||||
"profile",
|
|
||||||
"lastname",
|
|
||||||
"firstname",
|
|
||||||
"biography",
|
|
||||||
"birthday",
|
|
||||||
"репутация",
|
|
||||||
"информация",
|
|
||||||
"e-mail",
|
|
||||||
]
|
|
||||||
|
|
||||||
SUPPOSED_USERNAMES = ["alex", "god", "admin", "red", "blue", "john"]
|
|
||||||
|
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent": get_random_user_agent(),
|
"User-Agent": get_random_user_agent(),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
SEPARATORS = "\"'"
|
||||||
|
|
||||||
RATIO = 0.6
|
RATIO = 0.6
|
||||||
TOP_FEATURES = 5
|
TOP_FEATURES = 5
|
||||||
URL_RE = re.compile(r"https?://(www\.)?")
|
URL_RE = re.compile(r"https?://(www\.)?")
|
||||||
|
|
||||||
|
def __init__(self, db: MaigretDatabase, settings: Settings, logger):
    """Keep references to the site database, settings and logger."""
    self.db, self.settings, self.logger = db, settings, logger
|
||||||
|
|
||||||
def get_match_ratio(x):
|
@staticmethod
|
||||||
return round(
|
|
||||||
max(
|
|
||||||
[difflib.SequenceMatcher(a=x.lower(), b=y).ratio() for y in DESIRED_STRINGS]
|
|
||||||
),
|
|
||||||
2,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_alexa_rank(site_url_main):
|
def get_alexa_rank(site_url_main):
|
||||||
url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
|
url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
|
||||||
xml_data = requests.get(url).text
|
xml_data = requests.get(url).text
|
||||||
@@ -59,12 +43,11 @@ def get_alexa_rank(site_url_main):
|
|||||||
|
|
||||||
return alexa_rank
|
return alexa_rank
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
def extract_mainpage_url(url):
|
def extract_mainpage_url(url):
|
||||||
return "/".join(url.split("/", 3)[:3])
|
return "/".join(url.split("/", 3)[:3])
|
||||||
|
|
||||||
|
async def site_self_check(self, site, semaphore, silent=False):
|
||||||
async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
|
|
||||||
changes = {
|
changes = {
|
||||||
"disabled": False,
|
"disabled": False,
|
||||||
}
|
}
|
||||||
@@ -74,13 +57,13 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
|
|||||||
(site.username_unclaimed, QueryStatus.AVAILABLE),
|
(site.username_unclaimed, QueryStatus.AVAILABLE),
|
||||||
]
|
]
|
||||||
|
|
||||||
logger.info(f"Checking {site.name}...")
|
self.logger.info(f"Checking {site.name}...")
|
||||||
|
|
||||||
for username, status in check_data:
|
for username, status in check_data:
|
||||||
results_dict = await maigret(
|
results_dict = await maigret(
|
||||||
username=username,
|
username=username,
|
||||||
site_dict={site.name: site},
|
site_dict={site.name: site},
|
||||||
logger=logger,
|
logger=self.logger,
|
||||||
timeout=30,
|
timeout=30,
|
||||||
id_type=site.type,
|
id_type=site.type,
|
||||||
forced=True,
|
forced=True,
|
||||||
@@ -90,7 +73,7 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
|
|||||||
# don't disable entries with other ids types
|
# don't disable entries with other ids types
|
||||||
# TODO: make normal checking
|
# TODO: make normal checking
|
||||||
if site.name not in results_dict:
|
if site.name not in results_dict:
|
||||||
logger.info(results_dict)
|
self.logger.info(results_dict)
|
||||||
changes["disabled"] = True
|
changes["disabled"] = True
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -102,7 +85,7 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
|
|||||||
if site_status == QueryStatus.UNKNOWN:
|
if site_status == QueryStatus.UNKNOWN:
|
||||||
msgs = site.absence_strs
|
msgs = site.absence_strs
|
||||||
etype = site.check_type
|
etype = site.check_type
|
||||||
logger.warning(
|
self.logger.warning(
|
||||||
"Error while searching '%s' in %s: %s, %s, check type %s",
|
"Error while searching '%s' in %s: %s, %s, check type %s",
|
||||||
username,
|
username,
|
||||||
site.name,
|
site.name,
|
||||||
@@ -114,22 +97,23 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
|
|||||||
if status == QueryStatus.CLAIMED:
|
if status == QueryStatus.CLAIMED:
|
||||||
changes["disabled"] = True
|
changes["disabled"] = True
|
||||||
elif status == QueryStatus.CLAIMED:
|
elif status == QueryStatus.CLAIMED:
|
||||||
logger.warning(
|
self.logger.warning(
|
||||||
f"Not found `{username}` in {site.name}, must be claimed"
|
f"Not found `{username}` in {site.name}, must be claimed"
|
||||||
)
|
)
|
||||||
logger.info(results_dict[site.name])
|
self.logger.info(results_dict[site.name])
|
||||||
changes["disabled"] = True
|
changes["disabled"] = True
|
||||||
else:
|
else:
|
||||||
logger.warning(f"Found `{username}` in {site.name}, must be available")
|
self.logger.warning(
|
||||||
logger.info(results_dict[site.name])
|
f"Found `{username}` in {site.name}, must be available"
|
||||||
|
)
|
||||||
|
self.logger.info(results_dict[site.name])
|
||||||
changes["disabled"] = True
|
changes["disabled"] = True
|
||||||
|
|
||||||
logger.info(f"Site {site.name} checking is finished")
|
self.logger.info(f"Site {site.name} checking is finished")
|
||||||
|
|
||||||
return changes
|
return changes
|
||||||
|
|
||||||
|
def generate_additional_fields_dialog(self, engine: MaigretEngine, dialog):
|
||||||
def generate_additional_fields_dialog(engine: MaigretEngine, dialog):
|
|
||||||
fields = {}
|
fields = {}
|
||||||
if 'urlSubpath' in engine.site.get('url', ''):
|
if 'urlSubpath' in engine.site.get('url', ''):
|
||||||
msg = (
|
msg = (
|
||||||
@@ -141,19 +125,16 @@ def generate_additional_fields_dialog(engine: MaigretEngine, dialog):
|
|||||||
fields['urlSubpath'] = f'/{subpath}'
|
fields['urlSubpath'] = f'/{subpath}'
|
||||||
return fields
|
return fields
|
||||||
|
|
||||||
|
async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]:
|
||||||
async def detect_known_engine(
|
|
||||||
db, url_exists, url_mainpage, logger
|
|
||||||
) -> List[MaigretSite]:
|
|
||||||
try:
|
try:
|
||||||
r = requests.get(url_mainpage)
|
r = requests.get(url_mainpage)
|
||||||
logger.debug(r.text)
|
self.logger.debug(r.text)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(e)
|
self.logger.warning(e)
|
||||||
print("Some error while checking main page")
|
print("Some error while checking main page")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
for engine in db.engines:
|
for engine in self.db.engines:
|
||||||
strs_to_check = engine.__dict__.get("presenseStrs")
|
strs_to_check = engine.__dict__.get("presenseStrs")
|
||||||
if strs_to_check and r and r.text:
|
if strs_to_check and r and r.text:
|
||||||
all_strs_in_response = True
|
all_strs_in_response = True
|
||||||
@@ -166,12 +147,14 @@ async def detect_known_engine(
|
|||||||
|
|
||||||
print(f"Detected engine {engine_name} for site {url_mainpage}")
|
print(f"Detected engine {engine_name} for site {url_mainpage}")
|
||||||
|
|
||||||
usernames_to_check = SUPPOSED_USERNAMES
|
usernames_to_check = self.settings.supposed_usernames
|
||||||
supposed_username = extract_username_dialog(url_exists)
|
supposed_username = self.extract_username_dialog(url_exists)
|
||||||
if supposed_username:
|
if supposed_username:
|
||||||
usernames_to_check = [supposed_username] + usernames_to_check
|
usernames_to_check = [supposed_username] + usernames_to_check
|
||||||
|
|
||||||
add_fields = generate_additional_fields_dialog(engine, url_exists)
|
add_fields = self.generate_additional_fields_dialog(
|
||||||
|
engine, url_exists
|
||||||
|
)
|
||||||
|
|
||||||
for u in usernames_to_check:
|
for u in usernames_to_check:
|
||||||
site_data = {
|
site_data = {
|
||||||
@@ -182,59 +165,79 @@ async def detect_known_engine(
|
|||||||
"usernameUnclaimed": "noonewouldeverusethis7",
|
"usernameUnclaimed": "noonewouldeverusethis7",
|
||||||
**add_fields,
|
**add_fields,
|
||||||
}
|
}
|
||||||
logger.info(site_data)
|
self.logger.info(site_data)
|
||||||
|
|
||||||
maigret_site = MaigretSite(url_mainpage.split("/")[-1], site_data)
|
maigret_site = MaigretSite(
|
||||||
maigret_site.update_from_engine(db.engines_dict[engine_name])
|
url_mainpage.split("/")[-1], site_data
|
||||||
|
)
|
||||||
|
maigret_site.update_from_engine(
|
||||||
|
self.db.engines_dict[engine_name]
|
||||||
|
)
|
||||||
sites.append(maigret_site)
|
sites.append(maigret_site)
|
||||||
|
|
||||||
return sites
|
return sites
|
||||||
|
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
def extract_username_dialog(self, url):
|
||||||
def extract_username_dialog(url):
|
|
||||||
url_parts = url.rstrip("/").split("/")
|
url_parts = url.rstrip("/").split("/")
|
||||||
supposed_username = url_parts[-1]
|
supposed_username = url_parts[-1].strip('@')
|
||||||
entered_username = input(
|
entered_username = input(
|
||||||
f'Is "{supposed_username}" a valid username? If not, write it manually: '
|
f'Is "{supposed_username}" a valid username? If not, write it manually: '
|
||||||
)
|
)
|
||||||
return entered_username if entered_username else supposed_username
|
return entered_username if entered_username else supposed_username
|
||||||
|
|
||||||
|
|
||||||
async def check_features_manually(
|
async def check_features_manually(
|
||||||
db, url_exists, url_mainpage, cookie_file, logger, redirects=True
|
self, url_exists, url_mainpage, cookie_file, redirects=False
|
||||||
):
|
):
|
||||||
supposed_username = extract_username_dialog(url_exists)
|
custom_headers = {}
|
||||||
|
while True:
|
||||||
|
header_key = input(
|
||||||
|
'Specify custom header if you need or just press Enter to skip. Header name: '
|
||||||
|
)
|
||||||
|
if not header_key:
|
||||||
|
break
|
||||||
|
header_value = input('Header value: ')
|
||||||
|
custom_headers[header_key.strip()] = header_value.strip()
|
||||||
|
|
||||||
|
supposed_username = self.extract_username_dialog(url_exists)
|
||||||
non_exist_username = "noonewouldeverusethis7"
|
non_exist_username = "noonewouldeverusethis7"
|
||||||
|
|
||||||
url_user = url_exists.replace(supposed_username, "{username}")
|
url_user = url_exists.replace(supposed_username, "{username}")
|
||||||
url_not_exists = url_exists.replace(supposed_username, non_exist_username)
|
url_not_exists = url_exists.replace(supposed_username, non_exist_username)
|
||||||
|
|
||||||
|
headers = dict(self.HEADERS)
|
||||||
|
headers.update(custom_headers)
|
||||||
|
|
||||||
# cookies
|
# cookies
|
||||||
cookie_dict = None
|
cookie_dict = None
|
||||||
if cookie_file:
|
if cookie_file:
|
||||||
logger.info(f'Use {cookie_file} for cookies')
|
self.logger.info(f'Use {cookie_file} for cookies')
|
||||||
cookie_jar = await import_aiohttp_cookies(cookie_file)
|
cookie_jar = import_aiohttp_cookies(cookie_file)
|
||||||
cookie_dict = {c.key: c.value for c in cookie_jar}
|
cookie_dict = {c.key: c.value for c in cookie_jar}
|
||||||
|
|
||||||
exists_resp = requests.get(
|
exists_resp = requests.get(
|
||||||
url_exists, cookies=cookie_dict, headers=HEADERS, allow_redirects=redirects
|
url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
|
||||||
)
|
)
|
||||||
logger.debug(exists_resp.status_code)
|
self.logger.debug(url_exists)
|
||||||
logger.debug(exists_resp.text)
|
self.logger.debug(exists_resp.status_code)
|
||||||
|
self.logger.debug(exists_resp.text)
|
||||||
|
|
||||||
non_exists_resp = requests.get(
|
non_exists_resp = requests.get(
|
||||||
url_not_exists, cookies=cookie_dict, headers=HEADERS, allow_redirects=redirects
|
url_not_exists,
|
||||||
|
cookies=cookie_dict,
|
||||||
|
headers=headers,
|
||||||
|
allow_redirects=redirects,
|
||||||
)
|
)
|
||||||
logger.debug(non_exists_resp.status_code)
|
self.logger.debug(url_not_exists)
|
||||||
logger.debug(non_exists_resp.text)
|
self.logger.debug(non_exists_resp.status_code)
|
||||||
|
self.logger.debug(non_exists_resp.text)
|
||||||
|
|
||||||
a = exists_resp.text
|
a = exists_resp.text
|
||||||
b = non_exists_resp.text
|
b = non_exists_resp.text
|
||||||
|
|
||||||
tokens_a = set(a.split('"'))
|
tokens_a = set(re.split(f'[{self.SEPARATORS}]', a))
|
||||||
tokens_b = set(b.split('"'))
|
tokens_b = set(re.split(f'[{self.SEPARATORS}]', b))
|
||||||
|
|
||||||
a_minus_b = tokens_a.difference(tokens_b)
|
a_minus_b = tokens_a.difference(tokens_b)
|
||||||
b_minus_a = tokens_b.difference(tokens_a)
|
b_minus_a = tokens_b.difference(tokens_a)
|
||||||
@@ -243,11 +246,15 @@ async def check_features_manually(
|
|||||||
print("The pages for existing and non-existing account are the same!")
|
print("The pages for existing and non-existing account are the same!")
|
||||||
|
|
||||||
top_features_count = int(
|
top_features_count = int(
|
||||||
input(f"Specify count of features to extract [default {TOP_FEATURES}]: ")
|
input(
|
||||||
or TOP_FEATURES
|
f"Specify count of features to extract [default {self.TOP_FEATURES}]: "
|
||||||
|
)
|
||||||
|
or self.TOP_FEATURES
|
||||||
)
|
)
|
||||||
|
|
||||||
presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[
|
match_fun = get_match_ratio(self.settings.presence_strings)
|
||||||
|
|
||||||
|
presence_list = sorted(a_minus_b, key=match_fun, reverse=True)[
|
||||||
:top_features_count
|
:top_features_count
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -257,10 +264,12 @@ async def check_features_manually(
|
|||||||
if features:
|
if features:
|
||||||
presence_list = list(map(str.strip, features.split(",")))
|
presence_list = list(map(str.strip, features.split(",")))
|
||||||
|
|
||||||
absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[
|
absence_list = sorted(b_minus_a, key=match_fun, reverse=True)[
|
||||||
:top_features_count
|
:top_features_count
|
||||||
]
|
]
|
||||||
print("Detected text features of non-existing account: " + ", ".join(absence_list))
|
print(
|
||||||
|
"Detected text features of non-existing account: " + ", ".join(absence_list)
|
||||||
|
)
|
||||||
features = input("If features was not detected correctly, write it manually: ")
|
features = input("If features was not detected correctly, write it manually: ")
|
||||||
|
|
||||||
if features:
|
if features:
|
||||||
@@ -276,16 +285,21 @@ async def check_features_manually(
|
|||||||
"checkType": "message",
|
"checkType": "message",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if headers != self.HEADERS:
|
||||||
|
site_data['headers'] = headers
|
||||||
|
|
||||||
site = MaigretSite(url_mainpage.split("/")[-1], site_data)
|
site = MaigretSite(url_mainpage.split("/")[-1], site_data)
|
||||||
return site
|
return site
|
||||||
|
|
||||||
|
async def dialog(self, url_exists, cookie_file):
|
||||||
async def submit_dialog(db, url_exists, cookie_file, logger):
|
domain_raw = self.URL_RE.sub("", url_exists).strip().strip("/")
|
||||||
domain_raw = URL_RE.sub("", url_exists).strip().strip("/")
|
|
||||||
domain_raw = domain_raw.split("/")[0]
|
domain_raw = domain_raw.split("/")[0]
|
||||||
|
self.logger.info('Domain is %s', domain_raw)
|
||||||
|
|
||||||
# check for existence
|
# check for existence
|
||||||
matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
|
matched_sites = list(
|
||||||
|
filter(lambda x: domain_raw in x.url_main + x.url, self.db.sites)
|
||||||
|
)
|
||||||
|
|
||||||
if matched_sites:
|
if matched_sites:
|
||||||
print(
|
print(
|
||||||
@@ -305,24 +319,24 @@ async def submit_dialog(db, url_exists, cookie_file, logger):
|
|||||||
if input("Do you want to continue? [yN] ").lower() in "n":
|
if input("Do you want to continue? [yN] ").lower() in "n":
|
||||||
return False
|
return False
|
||||||
|
|
||||||
url_mainpage = extract_mainpage_url(url_exists)
|
url_mainpage = self.extract_mainpage_url(url_exists)
|
||||||
|
|
||||||
print('Detecting site engine, please wait...')
|
print('Detecting site engine, please wait...')
|
||||||
sites = []
|
sites = []
|
||||||
try:
|
try:
|
||||||
sites = await detect_known_engine(db, url_exists, url_mainpage, logger)
|
sites = await self.detect_known_engine(url_exists, url_mainpage)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print('Engine detect process is interrupted.')
|
print('Engine detect process is interrupted.')
|
||||||
|
|
||||||
if not sites:
|
if not sites:
|
||||||
print("Unable to detect site engine, lets generate checking features")
|
print("Unable to detect site engine, lets generate checking features")
|
||||||
sites = [
|
sites = [
|
||||||
await check_features_manually(
|
await self.check_features_manually(
|
||||||
db, url_exists, url_mainpage, cookie_file, logger
|
url_exists, url_mainpage, cookie_file
|
||||||
)
|
)
|
||||||
]
|
]
|
||||||
|
|
||||||
logger.debug(sites[0].__dict__)
|
self.logger.debug(sites[0].__dict__)
|
||||||
|
|
||||||
sem = asyncio.Semaphore(1)
|
sem = asyncio.Semaphore(1)
|
||||||
|
|
||||||
@@ -331,7 +345,7 @@ async def submit_dialog(db, url_exists, cookie_file, logger):
|
|||||||
chosen_site = None
|
chosen_site = None
|
||||||
for s in sites:
|
for s in sites:
|
||||||
chosen_site = s
|
chosen_site = s
|
||||||
result = await site_self_check(s, logger, sem, db)
|
result = await self.site_self_check(s, sem)
|
||||||
if not result["disabled"]:
|
if not result["disabled"]:
|
||||||
found = True
|
found = True
|
||||||
break
|
break
|
||||||
@@ -343,6 +357,7 @@ async def submit_dialog(db, url_exists, cookie_file, logger):
|
|||||||
print(
|
print(
|
||||||
"Try to run this mode again and increase features count or choose others."
|
"Try to run this mode again and increase features count or choose others."
|
||||||
)
|
)
|
||||||
|
self.logger.debug(json.dumps(chosen_site.json))
|
||||||
return False
|
return False
|
||||||
else:
|
else:
|
||||||
if (
|
if (
|
||||||
@@ -356,13 +371,13 @@ async def submit_dialog(db, url_exists, cookie_file, logger):
|
|||||||
|
|
||||||
chosen_site.name = input("Change site name if you want: ") or chosen_site.name
|
chosen_site.name = input("Change site name if you want: ") or chosen_site.name
|
||||||
chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
|
chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
|
||||||
rank = get_alexa_rank(chosen_site.url_main)
|
rank = Submitter.get_alexa_rank(chosen_site.url_main)
|
||||||
if rank:
|
if rank:
|
||||||
print(f'New alexa rank: {rank}')
|
print(f'New alexa rank: {rank}')
|
||||||
chosen_site.alexa_rank = rank
|
chosen_site.alexa_rank = rank
|
||||||
|
|
||||||
logger.debug(chosen_site.json)
|
self.logger.debug(chosen_site.json)
|
||||||
site_data = chosen_site.strip_engine_data()
|
site_data = chosen_site.strip_engine_data()
|
||||||
logger.debug(site_data.json)
|
self.logger.debug(site_data.json)
|
||||||
db.update_site(site_data)
|
self.db.update_site(site_data)
|
||||||
return True
|
return True
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import ast
|
import ast
|
||||||
|
import difflib
|
||||||
import re
|
import re
|
||||||
import random
|
import random
|
||||||
from typing import Any
|
from typing import Any
|
||||||
@@ -95,3 +96,18 @@ def get_dict_ascii_tree(items, prepend="", new_line=True):
|
|||||||
|
|
||||||
def get_random_user_agent():
|
def get_random_user_agent():
|
||||||
return random.choice(DEFAULT_USER_AGENTS)
|
return random.choice(DEFAULT_USER_AGENTS)
|
||||||
|
|
||||||
|
|
||||||
|
def get_match_ratio(base_strs: list):
    """Build a scoring function that rates a string against *base_strs*.

    The returned callable takes a string and returns the best
    ``difflib.SequenceMatcher`` ratio between it and any of the base
    strings, compared case-insensitively and rounded to two decimals.
    With an empty *base_strs* the score is ``0.0`` instead of an error.
    """
    # Lowercase the reference strings once; the returned function is
    # typically used as a sort key and called many times.
    lowered_bases = [base.lower() for base in base_strs]

    def get_match_inner(s: str):
        needle = s.lower()
        best = max(
            (
                difflib.SequenceMatcher(a=needle, b=base).ratio()
                for base in lowered_bases
            ),
            # max() raises ValueError on an empty iterable without a default.
            default=0.0,
        )
        return round(best, 2)

    return get_match_inner
|
||||||
|
|||||||
+4
-1
@@ -1,3 +1,4 @@
|
|||||||
|
aiodns==3.0.0
|
||||||
aiohttp==3.7.4
|
aiohttp==3.7.4
|
||||||
aiohttp-socks==0.5.5
|
aiohttp-socks==0.5.5
|
||||||
arabic-reshaper==2.1.1
|
arabic-reshaper==2.1.1
|
||||||
@@ -26,7 +27,7 @@ python-socks==1.1.2
|
|||||||
requests>=2.24.0
|
requests>=2.24.0
|
||||||
requests-futures==1.0.0
|
requests-futures==1.0.0
|
||||||
six==1.15.0
|
six==1.15.0
|
||||||
socid-extractor>=0.0.20
|
socid-extractor>=0.0.21
|
||||||
soupsieve==2.1
|
soupsieve==2.1
|
||||||
stem==1.8.0
|
stem==1.8.0
|
||||||
torrequest==0.1.0
|
torrequest==0.1.0
|
||||||
@@ -36,3 +37,5 @@ webencodings==0.5.1
|
|||||||
xhtml2pdf==0.2.5
|
xhtml2pdf==0.2.5
|
||||||
XMind==1.2.0
|
XMind==1.2.0
|
||||||
yarl==1.6.3
|
yarl==1.6.3
|
||||||
|
networkx==2.5.1
|
||||||
|
pyvis==0.1.9
|
||||||
|
|||||||
@@ -5,14 +5,13 @@ from setuptools import (
|
|||||||
|
|
||||||
|
|
||||||
with open('README.md') as fh:
|
with open('README.md') as fh:
|
||||||
readme = fh.read()
|
long_description = fh.read()
|
||||||
long_description = readme.replace('./', 'https://raw.githubusercontent.com/soxoj/maigret/main/')
|
|
||||||
|
|
||||||
with open('requirements.txt') as rf:
|
with open('requirements.txt') as rf:
|
||||||
requires = rf.read().splitlines()
|
requires = rf.read().splitlines()
|
||||||
|
|
||||||
setup(name='maigret',
|
setup(name='maigret',
|
||||||
version='0.2.4',
|
version='0.3.1',
|
||||||
description='Collect a dossier on a person by username from a huge number of sites',
|
description='Collect a dossier on a person by username from a huge number of sites',
|
||||||
long_description=long_description,
|
long_description=long_description,
|
||||||
long_description_content_type="text/markdown",
|
long_description_content_type="text/markdown",
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
|
|
||||||
## List of supported sites (search methods): total 2515
|
## List of supported sites (search methods): total 2560
|
||||||
|
|
||||||
Rank data fetched from Alexa by domains.
|
Rank data fetched from Alexa by domains.
|
||||||
|
|
||||||
@@ -39,8 +39,9 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [Tanks (https://tanks.mail.ru)](https://tanks.mail.ru)*: top 50, forum, gaming, ru*
|
1.  [Tanks (https://tanks.mail.ru)](https://tanks.mail.ru)*: top 50, forum, gaming, ru*
|
||||||
1.  [Warface (https://wf.mail.ru)](https://wf.mail.ru)*: top 50, forum, ru*
|
1.  [Warface (https://wf.mail.ru)](https://wf.mail.ru)*: top 50, forum, ru*
|
||||||
1.  [YandexReviews (https://yandex.ru/)](https://yandex.ru/)*: top 50, ru*
|
1.  [YandexReviews (https://yandex.ru/)](https://yandex.ru/)*: top 50, ru*
|
||||||
1.  [YandexBugbounty (https://yandex.ru/bugbounty/)](https://yandex.ru/bugbounty/)*: top 50, hacking, ru*
|
1.  [YandexBugbounty (https://yandex.ru/bugbounty/)](https://yandex.ru/bugbounty/)*: top 50, hacking, ru*, search is disabled
|
||||||
1.  [YandexCollections API (https://yandex.ru/collections/)](https://yandex.ru/collections/)*: top 50, ru, sharing*
|
1.  [YandexCollections API (https://yandex.ru/collections/)](https://yandex.ru/collections/)*: top 50, ru, sharing*
|
||||||
|
1.  [YandexCollections API (by yandex_public_id) (https://yandex.ru/collections/)](https://yandex.ru/collections/)*: top 50, ru, sharing*
|
||||||
1.  [YandexMarket (https://market.yandex.ru/)](https://market.yandex.ru/)*: top 50, ru*
|
1.  [YandexMarket (https://market.yandex.ru/)](https://market.yandex.ru/)*: top 50, ru*
|
||||||
1.  [YandexMusic (https://music.yandex.ru/)](https://music.yandex.ru/)*: top 50, music, ru*
|
1.  [YandexMusic (https://music.yandex.ru/)](https://music.yandex.ru/)*: top 50, music, ru*
|
||||||
1.  [YandexZnatoki (https://yandex.ru/q/)](https://yandex.ru/q/)*: top 50, ru*
|
1.  [YandexZnatoki (https://yandex.ru/q/)](https://yandex.ru/q/)*: top 50, ru*
|
||||||
@@ -107,6 +108,7 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [mercadolivre (https://www.mercadolivre.com.br)](https://www.mercadolivre.com.br)*: top 500, br*
|
1.  [mercadolivre (https://www.mercadolivre.com.br)](https://www.mercadolivre.com.br)*: top 500, br*
|
||||||
1.  [Crunchyroll (https://www.crunchyroll.com/)](https://www.crunchyroll.com/)*: top 500, forum, movies, us*
|
1.  [Crunchyroll (https://www.crunchyroll.com/)](https://www.crunchyroll.com/)*: top 500, forum, movies, us*
|
||||||
1.  [WordPressOrg (https://wordpress.org/)](https://wordpress.org/)*: top 500, in*
|
1.  [WordPressOrg (https://wordpress.org/)](https://wordpress.org/)*: top 500, in*
|
||||||
|
1.  [Ameblo (https://ameblo.jp)](https://ameblo.jp)*: top 500, blog, jp*
|
||||||
1.  [Unsplash (https://unsplash.com/)](https://unsplash.com/)*: top 500, art, photo*
|
1.  [Unsplash (https://unsplash.com/)](https://unsplash.com/)*: top 500, art, photo*
|
||||||
1.  [Steam (https://steamcommunity.com/)](https://steamcommunity.com/)*: top 500, gaming*
|
1.  [Steam (https://steamcommunity.com/)](https://steamcommunity.com/)*: top 500, gaming*
|
||||||
1.  [Steam (by id) (https://steamcommunity.com/)](https://steamcommunity.com/)*: top 500, gaming*
|
1.  [Steam (by id) (https://steamcommunity.com/)](https://steamcommunity.com/)*: top 500, gaming*
|
||||||
@@ -131,12 +133,12 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [Kickstarter (https://www.kickstarter.com)](https://www.kickstarter.com)*: top 1K, finance, us*
|
1.  [Kickstarter (https://www.kickstarter.com)](https://www.kickstarter.com)*: top 1K, finance, us*
|
||||||
1.  [forums.ea.com (https://forums.ea.com)](https://forums.ea.com)*: top 1K, forum, gaming, us*
|
1.  [forums.ea.com (https://forums.ea.com)](https://forums.ea.com)*: top 1K, forum, gaming, us*
|
||||||
1.  [Envato (https://forums.envato.com)](https://forums.envato.com)*: top 1K, au, forum, in*
|
1.  [Envato (https://forums.envato.com)](https://forums.envato.com)*: top 1K, au, forum, in*
|
||||||
1.  [Giphy (https://giphy.com/)](https://giphy.com/)*: top 1K, photo, us, video*
|
|
||||||
1.  [Ultimate-Guitar (https://ultimate-guitar.com/)](https://ultimate-guitar.com/)*: top 1K, us*
|
1.  [Ultimate-Guitar (https://ultimate-guitar.com/)](https://ultimate-guitar.com/)*: top 1K, us*
|
||||||
1.  [Freelancer.com (https://www.freelancer.com/)](https://www.freelancer.com/)*: top 1K, freelance, us*
|
1.  [Freelancer.com (https://www.freelancer.com/)](https://www.freelancer.com/)*: top 1K, freelance, us*
|
||||||
1.  [YouPorn (https://youporn.com)](https://youporn.com)*: top 1K, porn, us*
|
1.  [YouPorn (https://youporn.com)](https://youporn.com)*: top 1K, porn, us*
|
||||||
1.  [Dreamstime (https://www.dreamstime.com)](https://www.dreamstime.com)*: top 1K, art, photo, stock*
|
1.  [Dreamstime (https://www.dreamstime.com)](https://www.dreamstime.com)*: top 1K, art, photo, stock*
|
||||||
1.  [TheVerge (https://www.theverge.com)](https://www.theverge.com)*: top 1K, us*
|
1.  [TheVerge (https://www.theverge.com)](https://www.theverge.com)*: top 1K, us*
|
||||||
|
1.  [giphy.com (https://giphy.com)](https://giphy.com)*: top 1K, video*
|
||||||
1.  [Championat (https://www.championat.com/)](https://www.championat.com/)*: top 1K, ru*
|
1.  [Championat (https://www.championat.com/)](https://www.championat.com/)*: top 1K, ru*
|
||||||
1.  [Wattpad (https://www.wattpad.com/)](https://www.wattpad.com/)*: top 1K, reading, writing*
|
1.  [Wattpad (https://www.wattpad.com/)](https://www.wattpad.com/)*: top 1K, reading, writing*
|
||||||
1.  [Disqus (https://disqus.com/)](https://disqus.com/)*: top 1K, discussion*
|
1.  [Disqus (https://disqus.com/)](https://disqus.com/)*: top 1K, discussion*
|
||||||
@@ -246,6 +248,7 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [4pda (https://4pda.ru/)](https://4pda.ru/)*: top 5K, ru*
|
1.  [4pda (https://4pda.ru/)](https://4pda.ru/)*: top 5K, ru*
|
||||||
1.  [Weforum (https://www.weforum.org)](https://www.weforum.org)*: top 5K, forum, us*
|
1.  [Weforum (https://www.weforum.org)](https://www.weforum.org)*: top 5K, forum, us*
|
||||||
1.  [techspot.com (http://www.techspot.com/community/)](http://www.techspot.com/community/)*: top 5K, forum, us*
|
1.  [techspot.com (http://www.techspot.com/community/)](http://www.techspot.com/community/)*: top 5K, forum, us*
|
||||||
|
1.  [lyricstranslate.com (https://lyricstranslate.com)](https://lyricstranslate.com)*: top 5K, music*
|
||||||
1.  [Venmo (https://venmo.com/)](https://venmo.com/)*: top 5K, finance, us*
|
1.  [Venmo (https://venmo.com/)](https://venmo.com/)*: top 5K, finance, us*
|
||||||
1.  [Wikidot (http://www.wikidot.com/)](http://www.wikidot.com/)*: top 5K, us*
|
1.  [Wikidot (http://www.wikidot.com/)](http://www.wikidot.com/)*: top 5K, us*
|
||||||
1.  [Letterboxd (https://letterboxd.com/)](https://letterboxd.com/)*: top 5K, us*
|
1.  [Letterboxd (https://letterboxd.com/)](https://letterboxd.com/)*: top 5K, us*
|
||||||
@@ -254,6 +257,7 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [We Heart It (https://weheartit.com/)](https://weheartit.com/)*: top 5K, blog, in, photo*
|
1.  [We Heart It (https://weheartit.com/)](https://weheartit.com/)*: top 5K, blog, in, photo*
|
||||||
1.  [FilmWeb (https://www.filmweb.pl/user/adam)](https://www.filmweb.pl/user/adam)*: top 5K, movies, pl*
|
1.  [FilmWeb (https://www.filmweb.pl/user/adam)](https://www.filmweb.pl/user/adam)*: top 5K, movies, pl*
|
||||||
1.  [forums.bulbagarden.net (http://forums.bulbagarden.net)](http://forums.bulbagarden.net)*: top 5K, forum, us*
|
1.  [forums.bulbagarden.net (http://forums.bulbagarden.net)](http://forums.bulbagarden.net)*: top 5K, forum, us*
|
||||||
|
1.  [videohive.net (https://videohive.net)](https://videohive.net)*: top 5K, video*
|
||||||
1.  [BoardGameGeek (https://www.boardgamegeek.com)](https://www.boardgamegeek.com)*: top 5K, gaming, us*
|
1.  [BoardGameGeek (https://www.boardgamegeek.com)](https://www.boardgamegeek.com)*: top 5K, gaming, us*
|
||||||
1.  [osu! (https://osu.ppy.sh/)](https://osu.ppy.sh/)*: top 5K, us*
|
1.  [osu! (https://osu.ppy.sh/)](https://osu.ppy.sh/)*: top 5K, us*
|
||||||
1.  [Pluralsight (https://app.pluralsight.com)](https://app.pluralsight.com)*: top 5K, in, us*
|
1.  [Pluralsight (https://app.pluralsight.com)](https://app.pluralsight.com)*: top 5K, in, us*
|
||||||
@@ -392,8 +396,8 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [AnimeNewsNetwork (https://www.animenewsnetwork.com)](https://www.animenewsnetwork.com)*: top 100K, gb, us*
|
1.  [AnimeNewsNetwork (https://www.animenewsnetwork.com)](https://www.animenewsnetwork.com)*: top 100K, gb, us*
|
||||||
1.  [Smule (https://www.smule.com/)](https://www.smule.com/)*: top 100K, music*
|
1.  [Smule (https://www.smule.com/)](https://www.smule.com/)*: top 100K, music*
|
||||||
1.  [TVTropes (https://tvtropes.org)](https://tvtropes.org)*: top 100K, us*
|
1.  [TVTropes (https://tvtropes.org)](https://tvtropes.org)*: top 100K, us*
|
||||||
1.  [author.today (https://author.today)](https://author.today)*: top 100K, ru*
|
1.  [author.today (https://author.today)](https://author.today)*: top 100K, reading, ru*
|
||||||
1.  [TheSimsResource (https://www.thesimsresource.com/)](https://www.thesimsresource.com/)*: top 100K, de, gaming, it, us*
|
1.  [TheSimsResource (https://www.thesimsresource.com/)](https://www.thesimsresource.com/)*: top 100K, gaming*
|
||||||
1.  [N4g (https://n4g.com/)](https://n4g.com/)*: top 100K, gaming, news, us*
|
1.  [N4g (https://n4g.com/)](https://n4g.com/)*: top 100K, gaming, news, us*
|
||||||
1.  [Teletype (https://teletype.in)](https://teletype.in)*: top 100K, in, writing*
|
1.  [Teletype (https://teletype.in)](https://teletype.in)*: top 100K, in, writing*
|
||||||
1.  [Empflix (https://www.empflix.com)](https://www.empflix.com)*: top 100K, de, fr, porn*
|
1.  [Empflix (https://www.empflix.com)](https://www.empflix.com)*: top 100K, de, fr, porn*
|
||||||
@@ -453,6 +457,7 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [Jimdo (https://jimdosite.com/)](https://jimdosite.com/)*: top 100K, jp*
|
1.  [Jimdo (https://jimdosite.com/)](https://jimdosite.com/)*: top 100K, jp*
|
||||||
1.  [club.cnews.ru (https://club.cnews.ru/)](https://club.cnews.ru/)*: top 100K, blog, ru*
|
1.  [club.cnews.ru (https://club.cnews.ru/)](https://club.cnews.ru/)*: top 100K, blog, ru*
|
||||||
1.  [PSNProfiles.com (https://psnprofiles.com/)](https://psnprofiles.com/)*: top 100K, gaming*
|
1.  [PSNProfiles.com (https://psnprofiles.com/)](https://psnprofiles.com/)*: top 100K, gaming*
|
||||||
|
1.  [donorbox (https://donorbox.org)](https://donorbox.org)*: top 100K, finance*
|
||||||
1.  [Sbazar.cz (https://www.sbazar.cz/)](https://www.sbazar.cz/)*: top 100K, cz, shopping*
|
1.  [Sbazar.cz (https://www.sbazar.cz/)](https://www.sbazar.cz/)*: top 100K, cz, shopping*
|
||||||
1.  [EuroFootball (https://www.euro-football.ru)](https://www.euro-football.ru)*: top 100K, ru*
|
1.  [EuroFootball (https://www.euro-football.ru)](https://www.euro-football.ru)*: top 100K, ru*
|
||||||
1.  [Raidforums (https://raidforums.com/)](https://raidforums.com/)*: top 100K, cybercriminal, forum*
|
1.  [Raidforums (https://raidforums.com/)](https://raidforums.com/)*: top 100K, cybercriminal, forum*
|
||||||
@@ -488,9 +493,10 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [ESET (https://forum.esetnod32.ru)](https://forum.esetnod32.ru)*: top 100K, forum, ru*
|
1.  [ESET (https://forum.esetnod32.ru)](https://forum.esetnod32.ru)*: top 100K, forum, ru*
|
||||||
1.  [Dreamwidth (https://dreamwidth.org/profile)](https://dreamwidth.org/profile)*: top 100K, in, us*
|
1.  [Dreamwidth (https://dreamwidth.org/profile)](https://dreamwidth.org/profile)*: top 100K, in, us*
|
||||||
1.  [sparkpeople (https://www.sparkpeople.com)](https://www.sparkpeople.com)*: top 100K, us*
|
1.  [sparkpeople (https://www.sparkpeople.com)](https://www.sparkpeople.com)*: top 100K, us*
|
||||||
1.  [Destructoid (https://www.destructoid.com)](https://www.destructoid.com)*: top 100K, us*
|
1.  [Destructoid (https://www.destructoid.com)](https://www.destructoid.com)*: top 100K, us*, search is disabled
|
||||||
1.  [uID.me (by username) (https://uid.me/)](https://uid.me/)*: top 100K, ru*
|
1.  [uID.me (by username) (https://uid.me/)](https://uid.me/)*: top 100K, ru*
|
||||||
1.  [uID.me (by uguid) (https://uid.me/)](https://uid.me/)*: top 100K, ru*
|
1.  [uID.me (by uguid) (https://uid.me/)](https://uid.me/)*: top 100K, ru*
|
||||||
|
1.  [Observable (https://observablehq.com)](https://observablehq.com)*: top 100K, sharing*
|
||||||
1.  [Overclockers (https://overclockers.ru)](https://overclockers.ru)*: top 100K, ru*
|
1.  [Overclockers (https://overclockers.ru)](https://overclockers.ru)*: top 100K, ru*
|
||||||
1.  [HackingWithSwift (https://www.hackingwithswift.com)](https://www.hackingwithswift.com)*: top 100K, us*
|
1.  [HackingWithSwift (https://www.hackingwithswift.com)](https://www.hackingwithswift.com)*: top 100K, us*
|
||||||
1.  [YouNow (https://www.younow.com/)](https://www.younow.com/)*: top 100K, be, us*
|
1.  [YouNow (https://www.younow.com/)](https://www.younow.com/)*: top 100K, be, us*
|
||||||
@@ -526,6 +532,7 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [forums.battlefield.com (https://forums.battlefield.com)](https://forums.battlefield.com)*: top 100K, forum, gaming, gb, us*, search is disabled
|
1.  [forums.battlefield.com (https://forums.battlefield.com)](https://forums.battlefield.com)*: top 100K, forum, gaming, gb, us*, search is disabled
|
||||||
1.  [GotovimDoma (https://gotovim-doma.ru)](https://gotovim-doma.ru)*: top 100K, ru*
|
1.  [GotovimDoma (https://gotovim-doma.ru)](https://gotovim-doma.ru)*: top 100K, ru*
|
||||||
1.  [prosportsdaily (https://forums.prosportsdaily.com)](https://forums.prosportsdaily.com)*: top 100K, forum, in, us*
|
1.  [prosportsdaily (https://forums.prosportsdaily.com)](https://forums.prosportsdaily.com)*: top 100K, forum, in, us*
|
||||||
|
1.  [clarity.fm (https://clarity.fm)](https://clarity.fm)*: top 100K, business*
|
||||||
1.  [Bukkit (https://bukkit.org/)](https://bukkit.org/)*: top 100K, at, forum, us*
|
1.  [Bukkit (https://bukkit.org/)](https://bukkit.org/)*: top 100K, at, forum, us*
|
||||||
1.  [Elakiri (https://elakiri.com)](https://elakiri.com)*: top 100K, lk*
|
1.  [Elakiri (https://elakiri.com)](https://elakiri.com)*: top 100K, lk*
|
||||||
1.  [Manutd (https://manutd.one)](https://manutd.one)*: top 100K, forum, sport*
|
1.  [Manutd (https://manutd.one)](https://manutd.one)*: top 100K, forum, sport*
|
||||||
@@ -778,6 +785,7 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [Mobile-files (https://www.mobile-files.com/)](https://www.mobile-files.com/)*: top 10M, forum, ru, us*
|
1.  [Mobile-files (https://www.mobile-files.com/)](https://www.mobile-files.com/)*: top 10M, forum, ru, us*
|
||||||
1.  [Fluther (https://www.fluther.com/)](https://www.fluther.com/)*: top 10M, in, us*
|
1.  [Fluther (https://www.fluther.com/)](https://www.fluther.com/)*: top 10M, in, us*
|
||||||
1.  [Comedy (https://www.comedy.co.uk)](https://www.comedy.co.uk)*: top 10M, gb, in, movies, pk, us*
|
1.  [Comedy (https://www.comedy.co.uk)](https://www.comedy.co.uk)*: top 10M, gb, in, movies, pk, us*
|
||||||
|
1.  [sessionize.com (https://sessionize.com)](https://sessionize.com)*: top 10M, business*
|
||||||
1.  [Fireworktv (https://fireworktv.com)](https://fireworktv.com)*: top 10M, in, jp*
|
1.  [Fireworktv (https://fireworktv.com)](https://fireworktv.com)*: top 10M, in, jp*
|
||||||
1.  [funcom (https://forums.funcom.com)](https://forums.funcom.com)*: top 10M, forum, us*
|
1.  [funcom (https://forums.funcom.com)](https://forums.funcom.com)*: top 10M, forum, us*
|
||||||
1.  [RoyalCams (https://royalcams.com)](https://royalcams.com)*: top 10M, gr, in, ng, ru, us, webcam*
|
1.  [RoyalCams (https://royalcams.com)](https://royalcams.com)*: top 10M, gr, in, ng, ru, us, webcam*
|
||||||
@@ -817,6 +825,7 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [Ustream (http://www.ustream.tv)](http://www.ustream.tv)*: top 10M, eg, us*
|
1.  [Ustream (http://www.ustream.tv)](http://www.ustream.tv)*: top 10M, eg, us*
|
||||||
1.  [Geodesist (https://geodesist.ru)](https://geodesist.ru)*: top 10M, forum, ru*
|
1.  [Geodesist (https://geodesist.ru)](https://geodesist.ru)*: top 10M, forum, ru*
|
||||||
1.  [Serveradmin (https://serveradmin.ru/)](https://serveradmin.ru/)*: top 10M, ru*
|
1.  [Serveradmin (https://serveradmin.ru/)](https://serveradmin.ru/)*: top 10M, ru*
|
||||||
|
1.  [telescope.ac (https://telescope.ac)](https://telescope.ac)*: top 10M, blog*
|
||||||
1.  [Ya-uchitel (https://ya-uchitel.ru/)](https://ya-uchitel.ru/)*: top 10M, ru*
|
1.  [Ya-uchitel (https://ya-uchitel.ru/)](https://ya-uchitel.ru/)*: top 10M, ru*
|
||||||
1.  [ResidentAdvisor (https://www.residentadvisor.net)](https://www.residentadvisor.net)*: top 10M, us*
|
1.  [ResidentAdvisor (https://www.residentadvisor.net)](https://www.residentadvisor.net)*: top 10M, us*
|
||||||
1.  [Weburg (https://weburg.net)](https://weburg.net)*: top 10M, ru*
|
1.  [Weburg (https://weburg.net)](https://weburg.net)*: top 10M, ru*
|
||||||
@@ -840,6 +849,7 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [Cqham (http://www.cqham.ru)](http://www.cqham.ru)*: top 10M, ru, tech*
|
1.  [Cqham (http://www.cqham.ru)](http://www.cqham.ru)*: top 10M, ru, tech*
|
||||||
1.  [KharkovForum (https://www.kharkovforum.com/)](https://www.kharkovforum.com/)*: top 10M, forum, ua*
|
1.  [KharkovForum (https://www.kharkovforum.com/)](https://www.kharkovforum.com/)*: top 10M, forum, ua*
|
||||||
1.  [Studwork (https://studwork.org/)](https://studwork.org/)*: top 10M, ru*
|
1.  [Studwork (https://studwork.org/)](https://studwork.org/)*: top 10M, ru*
|
||||||
|
1.  [forum.freeton.org (https://forum.freeton.org)](https://forum.freeton.org)*: top 10M, finance, forum*
|
||||||
1.  [Playlists (https://playlists.net)](https://playlists.net)*: top 10M, in, us*
|
1.  [Playlists (https://playlists.net)](https://playlists.net)*: top 10M, in, us*
|
||||||
1.  [Liberapay (https://liberapay.com)](https://liberapay.com)*: top 10M, eg, finance, in, pk, us, za*
|
1.  [Liberapay (https://liberapay.com)](https://liberapay.com)*: top 10M, eg, finance, in, pk, us, za*
|
||||||
1.  [artinvestment (https://forum.artinvestment.ru/)](https://forum.artinvestment.ru/)*: top 10M, forum, ru*
|
1.  [artinvestment (https://forum.artinvestment.ru/)](https://forum.artinvestment.ru/)*: top 10M, forum, ru*
|
||||||
@@ -871,6 +881,7 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [Paypal (https://www.paypal.me)](https://www.paypal.me)*: top 10M, finance*
|
1.  [Paypal (https://www.paypal.me)](https://www.paypal.me)*: top 10M, finance*
|
||||||
1.  [Seatracker (https://seatracker.ru/)](https://seatracker.ru/)*: top 10M, ru*
|
1.  [Seatracker (https://seatracker.ru/)](https://seatracker.ru/)*: top 10M, ru*
|
||||||
1.  [Hctorpedo (http://hctorpedo.ru)](http://hctorpedo.ru)*: top 10M, ru*
|
1.  [Hctorpedo (http://hctorpedo.ru)](http://hctorpedo.ru)*: top 10M, ru*
|
||||||
|
1.  [getmakerlog.com (https://getmakerlog.com)](https://getmakerlog.com)*: top 10M, business*
|
||||||
1.  [Cmet4uk (https://cmet4uk.ru)](https://cmet4uk.ru)*: top 10M, ru*
|
1.  [Cmet4uk (https://cmet4uk.ru)](https://cmet4uk.ru)*: top 10M, ru*
|
||||||
1.  [popjustice (https://forum.popjustice.com)](https://forum.popjustice.com)*: top 10M, co, forum, in, sg, us*
|
1.  [popjustice (https://forum.popjustice.com)](https://forum.popjustice.com)*: top 10M, co, forum, in, sg, us*
|
||||||
1.  [RPGGeek (https://rpggeek.com)](https://rpggeek.com)*: top 10M, gaming, us*
|
1.  [RPGGeek (https://rpggeek.com)](https://rpggeek.com)*: top 10M, gaming, us*
|
||||||
@@ -2518,5 +2529,39 @@ Rank data fetched from Alexa by domains.
|
|||||||
1.  [discuss.hashicorp.com (https://discuss.hashicorp.com)](https://discuss.hashicorp.com)*: top 100M, tech*
|
1.  [discuss.hashicorp.com (https://discuss.hashicorp.com)](https://discuss.hashicorp.com)*: top 100M, tech*
|
||||||
1.  [Blogger (by GAIA id) (https://www.blogger.com)](https://www.blogger.com)*: top 100M, blog*
|
1.  [Blogger (by GAIA id) (https://www.blogger.com)](https://www.blogger.com)*: top 100M, blog*
|
||||||
1.  [Weebly (http://weebly.com)](http://weebly.com)*: top 100M, business*
|
1.  [Weebly (http://weebly.com)](http://weebly.com)*: top 100M, business*
|
||||||
|
1.  [HiddenAnswers (http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion)](http://answerszuvs3gg2l64e6hmnryudl5zgrmwm3vh65hzszdghblddvfiqd.onion)*: top 100M, tor*
|
||||||
|
1.  [.com ({username}.com)]({username}.com)*: top 100M*
|
||||||
|
1.  [galactictalk.org (https://galactictalk.org)](https://galactictalk.org)*: top 100M*
|
||||||
|
1.  [discuss.bootstrapped.fm (https://discuss.bootstrapped.fm)](https://discuss.bootstrapped.fm)*: top 100M*
|
||||||
|
1.  [discourse.mozilla.org (https://discourse.mozilla.org)](https://discourse.mozilla.org)*: top 100M*
|
||||||
|
1.  [ipinit.in (http://ipinit.in)](http://ipinit.in)*: top 100M*
|
||||||
|
1.  [boards.theforce.net (https://boards.theforce.net)](https://boards.theforce.net)*: top 100M*
|
||||||
|
1.  [aussiehomebrewer.com (https://aussiehomebrewer.com)](https://aussiehomebrewer.com)*: top 100M*
|
||||||
|
1.  [forum-ukraina.net (https://forum-ukraina.net)](https://forum-ukraina.net)*: top 100M*
|
||||||
|
1.  [forum-history.ru (http://forum-history.ru)](http://forum-history.ru)*: top 100M*
|
||||||
|
1.  [forum.vn.ua (http://forum.vn.ua)](http://forum.vn.ua)*: top 100M*
|
||||||
|
1.  [forum.bestflowers.ru (https://forum.bestflowers.ru)](https://forum.bestflowers.ru)*: top 100M*
|
||||||
|
1.  [forum.alconar.ru (https://forum.alconar.ru)](https://forum.alconar.ru)*: top 100M*
|
||||||
|
1.  [forum.lancerx.ru (https://forum.lancerx.ru)](https://forum.lancerx.ru)*: top 100M*
|
||||||
|
1.  [mfarmer.ru (http://www.mfarmer.ru)](http://www.mfarmer.ru)*: top 100M*
|
||||||
|
1.  [forum.league17.ru (https://forum.league17.ru)](https://forum.league17.ru)*: top 100M*
|
||||||
|
1.  [krskforum.com (https://krskforum.com)](https://krskforum.com)*: top 100M*
|
||||||
|
1.  [forum.rarib.ag (https://forum.rarib.ag)](https://forum.rarib.ag)*: top 100M*
|
||||||
|
1.  [forum.oneclickchicks.com (https://forum.oneclickchicks.com)](https://forum.oneclickchicks.com)*: top 100M*
|
||||||
|
1.  [forum.trade-print.ru (http://forum.trade-print.ru)](http://forum.trade-print.ru)*: top 100M*
|
||||||
|
1.  [forum.setcombg.com (https://forum.setcombg.com)](https://forum.setcombg.com)*: top 100M*
|
||||||
|
1.  [vw-bus.ru (https://vw-bus.ru)](https://vw-bus.ru)*: top 100M*
|
||||||
|
1.  [forum.ya1.ru (https://forum.ya1.ru)](https://forum.ya1.ru)*: top 100M*
|
||||||
|
1.  [forum.wordreference.com (https://forum.wordreference.com)](https://forum.wordreference.com)*: top 100M*
|
||||||
|
1.  [forums.zooclub.ru (https://forums.zooclub.ru)](https://forums.zooclub.ru)*: top 100M*
|
||||||
|
1.  [homebrewtalk.com (https://www.homebrewtalk.com)](https://www.homebrewtalk.com)*: top 100M*
|
||||||
|
1.  [navimba.com (https://navimba.com)](https://navimba.com)*: top 100M*
|
||||||
|
1.  [niva-club.net (https://www.niva-club.net)](https://www.niva-club.net)*: top 100M*
|
||||||
|
1.  [red-forum.com (https://red-forum.com)](https://red-forum.com)*: top 100M*
|
||||||
|
1.  [scaleforum.ru (http://www.scaleforum.ru)](http://www.scaleforum.ru)*: top 100M*
|
||||||
|
1.  [sign-forum.ru (https://sign-forum.ru)](https://sign-forum.ru)*: top 100M*
|
||||||
|
1.  [rec.poker (https://rec.poker)](https://rec.poker)*: top 100M*
|
||||||
|
1.  [uforum.uz (https://uforum.uz)](https://uforum.uz)*: top 100M*
|
||||||
|
1.  [DarkNet Trust (http://dntrustmucd4mwec.onion)](http://dntrustmucd4mwec.onion)*: top 100M, tor*
|
||||||
|
|
||||||
Alexa.com rank data fetched at (2021-05-16 14:01:29.561381 UTC)
|
Alexa.com rank data fetched at (2021-05-31 21:26:56.886650 UTC)
|
||||||
|
|||||||
@@ -1,4 +0,0 @@
|
|||||||
#!/bin/sh
|
|
||||||
coverage run --source=./maigret -m pytest tests
|
|
||||||
coverage report -m
|
|
||||||
coverage html
|
|
||||||
@@ -40,7 +40,7 @@ async def test_import_aiohttp_cookies():
|
|||||||
with open(cookies_filename, 'w') as f:
|
with open(cookies_filename, 'w') as f:
|
||||||
f.write(COOKIES_TXT)
|
f.write(COOKIES_TXT)
|
||||||
|
|
||||||
cookie_jar = await import_aiohttp_cookies(cookies_filename)
|
cookie_jar = import_aiohttp_cookies(cookies_filename)
|
||||||
assert list(cookie_jar._cookies.keys()) == ['xss.is', 'httpbin.org']
|
assert list(cookie_jar._cookies.keys()) == ['xss.is', 'httpbin.org']
|
||||||
|
|
||||||
url = 'https://httpbin.org/cookies'
|
url = 'https://httpbin.org/cookies'
|
||||||
|
|||||||
@@ -9,6 +9,7 @@ def site_result_except(server, username, **kwargs):
|
|||||||
server.expect_request('/url', query_string=query).respond_with_data(**kwargs)
|
server.expect_request('/url', query_string=query).respond_with_data(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_checking_by_status_code(httpserver, local_test_db):
|
async def test_checking_by_status_code(httpserver, local_test_db):
|
||||||
sites_dict = local_test_db.sites_dict
|
sites_dict = local_test_db.sites_dict
|
||||||
@@ -23,6 +24,7 @@ async def test_checking_by_status_code(httpserver, local_test_db):
|
|||||||
assert result['StatusCode']['status'].is_found() is False
|
assert result['StatusCode']['status'].is_found() is False
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_checking_by_message_positive_full(httpserver, local_test_db):
|
async def test_checking_by_message_positive_full(httpserver, local_test_db):
|
||||||
sites_dict = local_test_db.sites_dict
|
sites_dict = local_test_db.sites_dict
|
||||||
@@ -37,6 +39,7 @@ async def test_checking_by_message_positive_full(httpserver, local_test_db):
|
|||||||
assert result['Message']['status'].is_found() is False
|
assert result['Message']['status'].is_found() is False
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_checking_by_message_positive_part(httpserver, local_test_db):
|
async def test_checking_by_message_positive_part(httpserver, local_test_db):
|
||||||
sites_dict = local_test_db.sites_dict
|
sites_dict = local_test_db.sites_dict
|
||||||
@@ -51,6 +54,7 @@ async def test_checking_by_message_positive_part(httpserver, local_test_db):
|
|||||||
assert result['Message']['status'].is_found() is False
|
assert result['Message']['status'].is_found() is False
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.slow
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_checking_by_message_negative(httpserver, local_test_db):
|
async def test_checking_by_message_negative(httpserver, local_test_db):
|
||||||
sites_dict = local_test_db.sites_dict
|
sites_dict = local_test_db.sites_dict
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ DEFAULT_ARGS: Dict[str, Any] = {
|
|||||||
'disable_recursive_search': False,
|
'disable_recursive_search': False,
|
||||||
'folderoutput': 'reports',
|
'folderoutput': 'reports',
|
||||||
'html': False,
|
'html': False,
|
||||||
|
'graph': False,
|
||||||
'id_type': 'username',
|
'id_type': 'username',
|
||||||
'ignore_ids_list': [],
|
'ignore_ids_list': [],
|
||||||
'info': False,
|
'info': False,
|
||||||
@@ -25,17 +26,21 @@ DEFAULT_ARGS: Dict[str, Any] = {
|
|||||||
'print_check_errors': False,
|
'print_check_errors': False,
|
||||||
'print_not_found': False,
|
'print_not_found': False,
|
||||||
'proxy': None,
|
'proxy': None,
|
||||||
|
'reports_sorting': 'default',
|
||||||
'retries': 1,
|
'retries': 1,
|
||||||
'self_check': False,
|
'self_check': False,
|
||||||
'site_list': [],
|
'site_list': [],
|
||||||
'stats': False,
|
'stats': False,
|
||||||
'tags': '',
|
'tags': '',
|
||||||
'timeout': 30,
|
'timeout': 30,
|
||||||
|
'tor_proxy': 'socks5://127.0.0.1:9050',
|
||||||
|
'i2p_proxy': 'http://127.0.0.1:4444',
|
||||||
'top_sites': 500,
|
'top_sites': 500,
|
||||||
'txt': False,
|
'txt': False,
|
||||||
'use_disabled_sites': False,
|
'use_disabled_sites': False,
|
||||||
'username': [],
|
'username': [],
|
||||||
'verbose': False,
|
'verbose': False,
|
||||||
|
'with_domains': False,
|
||||||
'xmind': False,
|
'xmind': False,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
+3
-2
@@ -1,15 +1,16 @@
|
|||||||
"""Maigret data test functions"""
|
"""Maigret data test functions"""
|
||||||
|
|
||||||
from maigret.utils import is_country_tag
|
from maigret.utils import is_country_tag
|
||||||
from maigret.sites import SUPPORTED_TAGS
|
|
||||||
|
|
||||||
|
|
||||||
def test_tags_validity(default_db):
|
def test_tags_validity(default_db):
|
||||||
unknown_tags = set()
|
unknown_tags = set()
|
||||||
|
|
||||||
|
tags = default_db._tags
|
||||||
|
|
||||||
for site in default_db.sites:
|
for site in default_db.sites:
|
||||||
for tag in filter(lambda x: not is_country_tag(x), site.tags):
|
for tag in filter(lambda x: not is_country_tag(x), site.tags):
|
||||||
if tag not in SUPPORTED_TAGS:
|
if tag not in tags:
|
||||||
unknown_tags.add(tag)
|
unknown_tags.add(tag)
|
||||||
|
|
||||||
assert unknown_tags == set()
|
assert unknown_tags == set()
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ from maigret.maigret import self_check, maigret
|
|||||||
from maigret.maigret import (
|
from maigret.maigret import (
|
||||||
extract_ids_from_page,
|
extract_ids_from_page,
|
||||||
extract_ids_from_results,
|
extract_ids_from_results,
|
||||||
extract_ids_from_url,
|
|
||||||
)
|
)
|
||||||
from maigret.sites import MaigretSite
|
from maigret.sites import MaigretSite
|
||||||
from maigret.result import QueryResult, QueryStatus
|
from maigret.result import QueryResult, QueryStatus
|
||||||
@@ -138,23 +137,24 @@ def test_maigret_results(test_db):
|
|||||||
|
|
||||||
assert results['Reddit'].get('future') is None
|
assert results['Reddit'].get('future') is None
|
||||||
del results['GooglePlayStore']['future']
|
del results['GooglePlayStore']['future']
|
||||||
|
del results['GooglePlayStore']['checker']
|
||||||
|
|
||||||
assert results == RESULTS_EXAMPLE
|
assert results == RESULTS_EXAMPLE
|
||||||
|
|
||||||
|
|
||||||
def test_extract_ids_from_url(default_db):
|
def test_extract_ids_from_url(default_db):
|
||||||
assert extract_ids_from_url('https://www.reddit.com/user/test', default_db) == {
|
assert default_db.extract_ids_from_url('https://www.reddit.com/user/test') == {
|
||||||
'test': 'username'
|
'test': 'username'
|
||||||
}
|
}
|
||||||
assert extract_ids_from_url('https://vk.com/id123', default_db) == {'123': 'vk_id'}
|
assert default_db.extract_ids_from_url('https://vk.com/id123') == {'123': 'vk_id'}
|
||||||
assert extract_ids_from_url('https://vk.com/ida123', default_db) == {
|
assert default_db.extract_ids_from_url('https://vk.com/ida123') == {
|
||||||
'ida123': 'username'
|
'ida123': 'username'
|
||||||
}
|
}
|
||||||
assert extract_ids_from_url(
|
assert default_db.extract_ids_from_url(
|
||||||
'https://my.mail.ru/yandex.ru/dipres8904/', default_db
|
'https://my.mail.ru/yandex.ru/dipres8904/'
|
||||||
) == {'dipres8904': 'username'}
|
) == {'dipres8904': 'username'}
|
||||||
assert extract_ids_from_url(
|
assert default_db.extract_ids_from_url(
|
||||||
'https://reviews.yandex.ru/user/adbced123', default_db
|
'https://reviews.yandex.ru/user/adbced123'
|
||||||
) == {'adbced123': 'yandex_public_id'}
|
) == {'adbced123': 'yandex_public_id'}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
+98
-2
@@ -45,6 +45,19 @@ EXAMPLE_RESULTS = {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
BROKEN_RESULTS = {
|
||||||
|
'GitHub': {
|
||||||
|
'username': 'test',
|
||||||
|
'parsing_enabled': True,
|
||||||
|
'url_main': 'https://www.github.com/',
|
||||||
|
'url_user': 'https://www.github.com/test',
|
||||||
|
'http_status': 200,
|
||||||
|
'is_similar': False,
|
||||||
|
'rank': 78,
|
||||||
|
'site': MaigretSite('test', {}),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
|
GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
|
||||||
GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
|
GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
|
||||||
GOOD_500PX_RESULT.ids_data = {
|
GOOD_500PX_RESULT.ids_data = {
|
||||||
@@ -239,10 +252,13 @@ TEST = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""
|
SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""
|
||||||
|
SUPPOSED_BROKEN_BRIEF = """Search by username alexaimephotographycars returned 0 accounts. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 2 accounts."""
|
||||||
SUPPOSED_INTERESTS = "Interests: photo <span class=\"text-muted\">(2)</span>, news <span class=\"text-muted\">(1)</span>, social <span class=\"text-muted\">(1)</span>"
|
|
||||||
|
|
||||||
SUPPOSED_GEO = "Geo: us <span class=\"text-muted\">(3)</span>"
|
SUPPOSED_GEO = "Geo: us <span class=\"text-muted\">(3)</span>"
|
||||||
|
SUPPOSED_BROKEN_GEO = "Geo: us <span class=\"text-muted\">(2)</span>"
|
||||||
|
|
||||||
|
SUPPOSED_INTERESTS = "Interests: photo <span class=\"text-muted\">(2)</span>, news <span class=\"text-muted\">(1)</span>, social <span class=\"text-muted\">(1)</span>"
|
||||||
|
SUPPOSED_BROKEN_INTERESTS = "Interests: news <span class=\"text-muted\">(1)</span>, photo <span class=\"text-muted\">(1)</span>, social <span class=\"text-muted\">(1)</span>"
|
||||||
|
|
||||||
|
|
||||||
def test_generate_report_template():
|
def test_generate_report_template():
|
||||||
@@ -270,6 +286,19 @@ def test_generate_csv_report():
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_generate_csv_report_broken():
|
||||||
|
csvfile = StringIO()
|
||||||
|
generate_csv_report('test', BROKEN_RESULTS, csvfile)
|
||||||
|
|
||||||
|
csvfile.seek(0)
|
||||||
|
data = csvfile.readlines()
|
||||||
|
|
||||||
|
assert data == [
|
||||||
|
'username,name,url_main,url_user,exists,http_status\r\n',
|
||||||
|
'test,GitHub,https://www.github.com/,https://www.github.com/test,Unknown,200\r\n',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_generate_txt_report():
|
def test_generate_txt_report():
|
||||||
txtfile = StringIO()
|
txtfile = StringIO()
|
||||||
generate_txt_report('test', EXAMPLE_RESULTS, txtfile)
|
generate_txt_report('test', EXAMPLE_RESULTS, txtfile)
|
||||||
@@ -283,6 +312,18 @@ def test_generate_txt_report():
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def test_generate_txt_report_broken():
|
||||||
|
txtfile = StringIO()
|
||||||
|
generate_txt_report('test', BROKEN_RESULTS, txtfile)
|
||||||
|
|
||||||
|
txtfile.seek(0)
|
||||||
|
data = txtfile.readlines()
|
||||||
|
|
||||||
|
assert data == [
|
||||||
|
'Total Websites Username Detected On : 0',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def test_generate_json_simple_report():
|
def test_generate_json_simple_report():
|
||||||
jsonfile = StringIO()
|
jsonfile = StringIO()
|
||||||
MODIFIED_RESULTS = dict(EXAMPLE_RESULTS)
|
MODIFIED_RESULTS = dict(EXAMPLE_RESULTS)
|
||||||
@@ -296,6 +337,19 @@ def test_generate_json_simple_report():
|
|||||||
assert list(json.loads(data[0]).keys()) == ['GitHub', 'GitHub2']
|
assert list(json.loads(data[0]).keys()) == ['GitHub', 'GitHub2']
|
||||||
|
|
||||||
|
|
||||||
|
def test_generate_json_simple_report_broken():
|
||||||
|
jsonfile = StringIO()
|
||||||
|
MODIFIED_RESULTS = dict(BROKEN_RESULTS)
|
||||||
|
MODIFIED_RESULTS['GitHub2'] = BROKEN_RESULTS['GitHub']
|
||||||
|
generate_json_report('test', BROKEN_RESULTS, jsonfile, 'simple')
|
||||||
|
|
||||||
|
jsonfile.seek(0)
|
||||||
|
data = jsonfile.readlines()
|
||||||
|
|
||||||
|
assert len(data) == 1
|
||||||
|
assert list(json.loads(data[0]).keys()) == []
|
||||||
|
|
||||||
|
|
||||||
def test_generate_json_ndjson_report():
|
def test_generate_json_ndjson_report():
|
||||||
jsonfile = StringIO()
|
jsonfile = StringIO()
|
||||||
MODIFIED_RESULTS = dict(EXAMPLE_RESULTS)
|
MODIFIED_RESULTS = dict(EXAMPLE_RESULTS)
|
||||||
@@ -329,6 +383,20 @@ def test_save_xmind_report():
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_save_xmind_report_broken():
|
||||||
|
filename = 'report_test.xmind'
|
||||||
|
save_xmind_report(filename, 'test', BROKEN_RESULTS)
|
||||||
|
|
||||||
|
workbook = xmind.load(filename)
|
||||||
|
sheet = workbook.getPrimarySheet()
|
||||||
|
data = sheet.getData()
|
||||||
|
|
||||||
|
assert data['title'] == 'test Analysis'
|
||||||
|
assert data['topic']['title'] == 'test'
|
||||||
|
assert len(data['topic']['topics']) == 1
|
||||||
|
assert data['topic']['topics'][0]['title'] == 'Undefined'
|
||||||
|
|
||||||
|
|
||||||
def test_html_report():
|
def test_html_report():
|
||||||
report_name = 'report_test.html'
|
report_name = 'report_test.html'
|
||||||
context = generate_report_context(TEST)
|
context = generate_report_context(TEST)
|
||||||
@@ -341,6 +409,21 @@ def test_html_report():
|
|||||||
assert SUPPOSED_INTERESTS in report_text
|
assert SUPPOSED_INTERESTS in report_text
|
||||||
|
|
||||||
|
|
||||||
|
def test_html_report_broken():
|
||||||
|
report_name = 'report_test_broken.html'
|
||||||
|
BROKEN_DATA = copy.deepcopy(TEST)
|
||||||
|
BROKEN_DATA[0][2]['500px']['status'] = None
|
||||||
|
|
||||||
|
context = generate_report_context(BROKEN_DATA)
|
||||||
|
save_html_report(report_name, context)
|
||||||
|
|
||||||
|
report_text = open(report_name).read()
|
||||||
|
|
||||||
|
assert SUPPOSED_BROKEN_BRIEF in report_text
|
||||||
|
assert SUPPOSED_BROKEN_GEO in report_text
|
||||||
|
assert SUPPOSED_BROKEN_INTERESTS in report_text
|
||||||
|
|
||||||
|
|
||||||
def test_pdf_report():
|
def test_pdf_report():
|
||||||
report_name = 'report_test.pdf'
|
report_name = 'report_test.pdf'
|
||||||
context = generate_report_context(TEST)
|
context = generate_report_context(TEST)
|
||||||
@@ -357,3 +440,16 @@ def test_text_report():
|
|||||||
assert brief_part in report_text
|
assert brief_part in report_text
|
||||||
assert 'us' in report_text
|
assert 'us' in report_text
|
||||||
assert 'photo' in report_text
|
assert 'photo' in report_text
|
||||||
|
|
||||||
|
|
||||||
|
def test_text_report_broken():
|
||||||
|
BROKEN_DATA = copy.deepcopy(TEST)
|
||||||
|
BROKEN_DATA[0][2]['500px']['status'] = None
|
||||||
|
|
||||||
|
context = generate_report_context(BROKEN_DATA)
|
||||||
|
report_text = get_plaintext_report(context)
|
||||||
|
|
||||||
|
for brief_part in SUPPOSED_BROKEN_BRIEF.split():
|
||||||
|
assert brief_part in report_text
|
||||||
|
assert 'us' in report_text
|
||||||
|
assert 'photo' in report_text
|
||||||
|
|||||||
@@ -1,5 +1,6 @@
|
|||||||
"""Maigret Database test functions"""
|
"""Maigret Database test functions"""
|
||||||
from maigret.sites import MaigretDatabase, MaigretSite
|
from maigret.sites import MaigretDatabase, MaigretSite
|
||||||
|
from maigret.utils import URLMatcher
|
||||||
|
|
||||||
EXAMPLE_DB = {
|
EXAMPLE_DB = {
|
||||||
'engines': {
|
'engines': {
|
||||||
@@ -179,3 +180,26 @@ def test_ranked_sites_dict_id_type():
|
|||||||
assert len(db.ranked_sites_dict()) == 2
|
assert len(db.ranked_sites_dict()) == 2
|
||||||
assert len(db.ranked_sites_dict(id_type='username')) == 2
|
assert len(db.ranked_sites_dict(id_type='username')) == 2
|
||||||
assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
|
assert len(db.ranked_sites_dict(id_type='gaia_id')) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_url_template():
|
||||||
|
site = MaigretSite(
|
||||||
|
"test",
|
||||||
|
{
|
||||||
|
"urlMain": "https://ya.ru/",
|
||||||
|
"url": "{urlMain}{urlSubpath}/members/?username={username}",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert (
|
||||||
|
site.get_url_template()
|
||||||
|
== "{urlMain}{urlSubpath}/members/?username={username} (no engine)"
|
||||||
|
)
|
||||||
|
|
||||||
|
site = MaigretSite(
|
||||||
|
"test",
|
||||||
|
{
|
||||||
|
"urlMain": "https://ya.ru/",
|
||||||
|
"url": "https://{username}.ya.ru",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert site.get_url_template() == "SUBDOMAIN"
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ from maigret.utils import (
|
|||||||
enrich_link_str,
|
enrich_link_str,
|
||||||
URLMatcher,
|
URLMatcher,
|
||||||
get_dict_ascii_tree,
|
get_dict_ascii_tree,
|
||||||
|
get_match_ratio,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -136,3 +137,9 @@ def test_get_dict_ascii_tree():
|
|||||||
┣╸instagram_username: Street.Reality.Photography
|
┣╸instagram_username: Street.Reality.Photography
|
||||||
┗╸twitter_username: Alexaimephotogr"""
|
┗╸twitter_username: Alexaimephotogr"""
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_match_ratio():
|
||||||
|
fun = get_match_ratio(["test", "maigret", "username"])
|
||||||
|
|
||||||
|
assert fun("test") == 1
|
||||||
|
|||||||
Reference in New Issue
Block a user