Compare commits


63 Commits

Author SHA1 Message Date
soxoj bea900dda0 Merge pull request #155 from soxoj/0.2.4
Bump to 0.2.4
2021-05-18 01:20:00 +03:00
Soxoj bb1bde833d Bump to 0.2.4 2021-05-18 01:17:35 +03:00
soxoj 5b405c6abb Merge pull request #154 from soxoj/tests-improving
Improved tests
2021-05-18 00:57:31 +03:00
Soxoj 99fa58ceed Disabled Twitter activation test 2021-05-18 00:55:18 +03:00
Soxoj c71e404f63 Added test dependencies 2021-05-18 00:49:13 +03:00
Soxoj 2c04ccce57 Improved tests 2021-05-18 00:43:56 +03:00
soxoj 435db7cdc9 Merge pull request #153 from soxoj/sites-update-16-05-21
Several sites added, updated site list
2021-05-17 00:35:56 +03:00
Soxoj 413a0502a4 Several sites added, updated site list 2021-05-16 17:02:41 +03:00
soxoj 2aedcc3166 Merge pull request #152 from soxoj/cli-plaintext-report
Added text report to CLI output
2021-05-15 16:57:22 +03:00
Soxoj 28835204f5 Added text report to CLI output 2021-05-15 16:55:05 +03:00
soxoj b11a247dfd Merge pull request #151 from soxoj/tags-socid-extractor
Tags updated, added tests for tags
2021-05-15 14:55:01 +03:00
Soxoj c9219d91ec Tags updated, added tests for tags
Added several sites
Updated socid_extractor version to avoid bug #150
2021-05-15 14:51:30 +03:00
soxoj aa6cd0eca9 Merge pull request #149 from soxoj/0.2.3
Bump to 0.2.3
2021-05-12 22:40:02 +03:00
Soxoj 38e5d5c664 Bump to 0.2.3 2021-05-12 22:37:19 +03:00
soxoj 8a562d06ae Merge pull request #148 from soxoj/sites-updates-12-05
Fixed Anobii, added several new sites
2021-05-12 19:27:19 +03:00
Soxoj aa50ee9672 Fixed Anobii, added several new sites 2021-05-12 19:25:14 +03:00
soxoj 51327f9647 Merge pull request #146 from soxoj/links-sites
Added several links sites
2021-05-10 14:21:48 +03:00
Soxoj 4a368c9bb6 Added several links sites 2021-05-10 14:19:52 +03:00
soxoj 6fd5f6e33a Update build-docker-image.yml 2021-05-10 02:51:56 +03:00
soxoj fa3db9c39c Merge pull request #144 from soxoj/stackoverflow
Added fuzzy search by StackOverflow
2021-05-10 00:42:02 +03:00
Soxoj 5912ad4fbc Added fuzzy search by StackOverflow 2021-05-10 00:39:36 +03:00
soxoj ee36dc0187 Merge pull request #143 from soxoj/tags-updates-1
Tags sorting and some updates
2021-05-09 23:21:57 +03:00
Soxoj 9eb62e4e22 Tags sorting and some updates 2021-05-09 23:19:41 +03:00
soxoj ead048af93 Merge pull request #142 from soxoj/photo-sites-1
Photo sites added
2021-05-09 18:24:33 +03:00
Soxoj acc751ff98 Photo sites added 2021-05-09 16:48:46 +03:00
soxoj b7bdd71cf0 Merge pull request #141 from soxoj/tags-update-script
Tags updates, script added
2021-05-09 16:47:41 +03:00
Soxoj 43f189f774 Tags updates, script added 2021-05-09 16:25:42 +03:00
soxoj 5bda7fb339 Merge pull request #140 from soxoj/tags-updates
Tags updates
2021-05-09 00:18:53 +03:00
Soxoj 414523a8ac Tags updates 2021-05-09 00:16:58 +03:00
soxoj 6d4e268706 Merge pull request #139 from soxoj/photo-sites
Added some photo sites, improved errors detecting
2021-05-08 20:39:46 +03:00
Soxoj b696b982f4 Added some photo sites, improved errors detecting 2021-05-08 20:37:34 +03:00
soxoj d4234036c0 Merge pull request #137 from soxoj/minor-fixes
Version patch and some minor fixes
2021-05-08 16:57:30 +03:00
Soxoj b57c70091c Added __version__.py 2021-05-08 16:55:49 +03:00
Soxoj e90df3560b Version patch and some minor fixes 2021-05-08 16:46:38 +03:00
soxoj bc6ee48b8c Merge pull request #136 from soxoj/dockerhub-image
Create build-docker-image.yml
2021-05-08 15:41:57 +03:00
soxoj e70bdf3789 Readme update 2021-05-08 15:41:38 +03:00
soxoj 84f9d417cf Create build-docker-image.yml 2021-05-08 15:16:37 +03:00
soxoj 4333c40be7 Merge pull request #135 from soxoj/new-sites-08-05-21
Added Weibo, Reddit BigData search, Wigle and several other sites
2021-05-08 13:56:06 +03:00
Soxoj 9e504c0094 Added Weibo, Reddit BigData search, Wigle and several other sites 2021-05-08 13:54:25 +03:00
soxoj 2f752a0368 Merge pull request #132 from soxoj/yelp
Added Yelp and yelp_userid support
2021-05-08 03:36:23 +03:00
Soxoj 53e9dab677 Added Yelp and yelp_userid support 2021-05-08 03:34:03 +03:00
soxoj 11b70a2a48 Merge pull request #131 from soxoj/facebook-ids-fixes
Facebook parsing fixed, website field added
2021-05-08 02:38:54 +03:00
Soxoj 960708ef2e Facebook parsing fixed, website field added 2021-05-08 02:25:54 +03:00
soxoj e6f6d8735d Merge pull request #130 from soxoj/tags-stabilization
Tags markup stabilization
2021-05-08 01:04:46 +03:00
Soxoj f77d7d307a Tags markup stabilization 2021-05-08 00:59:54 +03:00
soxoj 158f739a59 Merge pull request #129 from soxoj/0.2.2
Updated sites, improved submit dialog, bump to 0.2.2
2021-05-07 12:30:55 +03:00
Soxoj b6a207d0e3 Updated sites, improved submit dialog, bump to 0.2.2 2021-05-07 12:27:24 +03:00
soxoj d59867b0d9 Merge pull request #128 from soxoj/sites-improvements
Added several sites, some improvements
2021-05-07 01:23:23 +03:00
Soxoj 2145027196 Added several sites, some improvements 2021-05-07 01:20:20 +03:00
soxoj 386e9eba4f Merge pull request #127 from soxoj/extraction-notify-tests
Improve extracting ids from URLs, tests
2021-05-06 22:38:22 +03:00
Soxoj 0e9655c46a Improve extracting ids from URLs, tests 2021-05-06 22:35:44 +03:00
soxoj 009d51c380 Merge pull request #126 from soxoj/main-refactoring
Main maigret function refactoring
2021-05-05 23:32:27 +03:00
Soxoj 78e9688ece Test data fix 2021-05-05 23:27:30 +03:00
Soxoj 3cbb9df7b3 Main maigret function refactoring 2021-05-05 18:02:13 +03:00
soxoj 2fb1f19948 Merge pull request #125 from soxoj/argparser-tests
CLI arguments improvements, tests added
2021-05-05 15:34:36 +03:00
Soxoj 3b91a9cd31 CLI arguments improvements, tests added 2021-05-05 15:27:56 +03:00
soxoj 9858e71349 Merge pull request #124 from soxoj/refactoring-complexity-decrease
Refactored to decrease cyclomatic complexity
2021-05-05 10:59:11 +03:00
Soxoj c88e194d07 Refactored to decrease cyclomatic complexity 2021-05-05 10:55:33 +03:00
soxoj ad5c7fbc7d Merge pull request #123 from soxoj/new-sites-engines
Added some new sites, engines updates
2021-05-03 03:18:40 +03:00
Soxoj 66d6c7a93c Added some new sites, engines updates 2021-05-03 03:16:02 +03:00
soxoj bdfb4911ce Merge pull request #122 from soxoj/0.2.1-bugfix
Fixed json report generation bug, bump to 0.2.1
2021-05-02 20:14:22 +03:00
Soxoj 951be44452 Fixed test fixture scope 2021-05-02 20:12:36 +03:00
Soxoj 188edc1b7f Fixed json report generation bug, bump to 0.2.1 2021-05-02 20:06:15 +03:00
36 changed files with 8142 additions and 4457 deletions
+32
@@ -0,0 +1,32 @@
name: Build docker image and push to DockerHub
on:
push:
branches: [ main ]
jobs:
docker:
runs-on: ubuntu-latest
steps:
-
name: Set up QEMU
uses: docker/setup-qemu-action@v1
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
-
name: Build and push
id: docker_build
uses: docker/build-push-action@v2
with:
push: true
tags: ${{ secrets.DOCKER_HUB_USERNAME }}/maigret:latest
-
name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
+1 -1
@@ -26,7 +26,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest pytest-rerunfailures
python -m pip install -r test-requirements.txt
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
run: |
+6 -3
@@ -22,9 +22,12 @@ src/
# Comma-Separated Values (CSV) Reports
*.csv
# Excluded sites list
tests/.excluded_sites
# MacOS Folder Metadata File
.DS_Store
/reports/
# Testing
.coverage
dist/
htmlcov/
/test_*
+17
@@ -2,6 +2,23 @@
## [Unreleased]
## [0.2.4] - 2021-05-18
* cli output report
* various improvements
## [0.2.3] - 2021-05-12
* added Yelp and yelp_userid support
* tags markup stabilization
* improved errors detection
## [0.2.2] - 2021-05-07
* improved ids extractors
* updated sites and engines
* updates CLI options
## [0.2.1] - 2021-05-02
* fixed json reports generation bug, added tests
## [0.2.0] - 2021-05-02
* added `--retries` option
* added `source` feature for sites' mirrors
+48 -41
@@ -1,40 +1,55 @@
# Maigret
![PyPI](https://img.shields.io/pypi/v/maigret?style=flat-square)
![PyPI - Downloads](https://img.shields.io/pypi/dw/maigret?style=flat-square)
[![Chat - Gitter](./static/chat_gitter.svg)](https://gitter.im/maigret-osint/community)
<p align="center">
<img src="./static/maigret.png" />
<p align="center">
<a href="https://pypi.org/project/maigret/">
<img alt="PyPI" src="https://img.shields.io/pypi/v/maigret?style=flat-square">
</a>
<a href="https://pypi.org/project/maigret/">
<img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dw/maigret?style=flat-square">
</a>
<a href="https://gitter.im/maigret-osint/community">
<img alt="Chat - Gitter" src="./static/chat_gitter.svg" />
</a>
<a href="https://twitter.com/intent/follow?screen_name=sox0j">
<img src="https://img.shields.io/twitter/follow/sox0j?label=Follow%20sox0j&style=social&color=blue" alt="Follow @sox0j" />
</a>
</p>
<p align="center">
<img src="./static/maigret.png" height="200"/>
</p>
</p>
<i>The Commissioner Jules Maigret is a fictional French police detective, created by Georges Simenon. His investigation method is based on understanding the personality of different people and their interactions.</i>
## About
Purpose of Maigret - **collect a dossier on a person by username only**, checking for accounts on a huge number of sites.
**Maigret** collect a dossier on a person **by username only**, checking for accounts on a huge number of sites and gathering all the available information from web pages. Maigret is an easy-to-use and powerful fork of [Sherlock](https://github.com/sherlock-project/sherlock).
This is a [sherlock](https://github.com/sherlock-project/) fork with cool features under heavy development.
*Don't forget to regularly update source code from repo*.
Currently supported more than 2000 sites ([full list](./sites.md)), by default search is launched against 500 popular sites in descending order of popularity.
Currently supported more than 2000 sites ([full list](./sites.md)), search is launched against 500 popular sites in descending order of popularity by default.
## Main features
* Profile pages parsing, [extracting](https://github.com/soxoj/socid_extractor) personal info, links to other profiles, etc.
* Recursive search by new usernames found
* Profile pages parsing, [extraction](https://github.com/soxoj/socid_extractor) of personal info, links to other profiles, etc.
* Recursive search by new usernames and other ids found
* Search by tags (site categories, countries)
* Censorship and captcha detection
* Very few false positives
* Failed requests' restarts
* Requests retries
See full description of Maigret features [in the Wiki](https://github.com/soxoj/maigret/wiki/Features).
## Installation
**NOTE**: Python 3.6 or higher and pip is required.
Maigret can be installed using pip, Docker, or simply can be launched from the cloned repo.
Also you can run Maigret using cloud shells (see buttons below).
**Python 3.8 is recommended.**
[![Open in Cloud Shell](https://user-images.githubusercontent.com/27065646/92304704-8d146d80-ef80-11ea-8c29-0deaabb1c702.png)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md) [![Run on Repl.it](https://user-images.githubusercontent.com/27065646/92304596-bf719b00-ef7f-11ea-987f-2c1f3c323088.png)](https://repl.it/github/soxoj/maigret)
<a href="https://colab.research.google.com/gist//soxoj/879b51bc3b2f8b695abb054090645000/maigret.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="40"></a>
### Package installing
**NOTE**: Python 3.6 or higher and pip is required, **Python 3.8 is recommended.**
```bash
# install from pypi
pip3 install maigret
@@ -42,34 +57,36 @@ pip3 install maigret
# or clone and install manually
git clone https://github.com/soxoj/maigret && cd maigret
pip3 install .
# usage
maigret username
```
### Cloning a repository
```bash
git clone https://github.com/soxoj/maigret && cd maigret
```
You can use a free virtual machine, the repo will be automatically cloned:
[![Open in Cloud Shell](https://user-images.githubusercontent.com/27065646/92304704-8d146d80-ef80-11ea-8c29-0deaabb1c702.png)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md) [![Run on Repl.it](https://user-images.githubusercontent.com/27065646/92304596-bf719b00-ef7f-11ea-987f-2c1f3c323088.png)](https://repl.it/github/soxoj/maigret)
<a href="https://colab.research.google.com/gist//soxoj/879b51bc3b2f8b695abb054090645000/maigret.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="40"></a>
```bash
pip3 install -r requirements.txt
# usage
./maigret.py username
```
## Using examples
### Docker
```bash
# for a cloned repo
./maigret.py user
# official image
docker pull soxoj/maigret
# for a package
maigret user
# usage
docker run soxoj/maigret:latest username
# manual build
docker build -t maigret .
```
Features:
## Usage examples
```bash
# make HTML and PDF reports
maigret user --html --pdf
@@ -77,22 +94,12 @@ maigret user --html --pdf
# search on sites marked with tags photo & dating
maigret user --tags photo,dating
# search for three usernames on all available sites
maigret user1 user2 user3 -a
```
Run `maigret --help` to get arguments description. Also options are documented in [the Maigret Wiki](https://github.com/soxoj/maigret/wiki/Command-line-options).
Use `maigret --help` to get full options description. Also options are documented in [the Maigret Wiki](https://github.com/soxoj/maigret/wiki/Command-line-options).
With Docker:
```
# manual build
docker build -t maigret . && docker run maigret user
# official image
docker run soxoj/maigret:latest user
```
## Demo with page parsing and recursive username search
+7
@@ -1,5 +1,12 @@
"""Maigret"""
__title__ = 'Maigret'
__package__ = 'maigret'
__author__ = 'Soxoj'
__author_email__ = 'soxoj@protonmail.com'
from .__version__ import __version__
from .checking import maigret as search
from .sites import MaigretEngine, MaigretSite, MaigretDatabase
from .notify import QueryNotifyPrint as Notifier
+3
@@ -0,0 +1,3 @@
"""Maigret version file"""
__version__ = '0.2.4'
-18
@@ -34,24 +34,6 @@ class ParsingActivator:
bearer_token = r.json()["accessToken"]
site.headers["authorization"] = f"Bearer {bearer_token}"
@staticmethod
def xssis(site, logger, cookies={}):
if not cookies:
logger.debug("You must have cookies to activate xss.is parsing!")
return
headers = dict(site.headers)
post_data = {
"_xfResponseType": "json",
"_xfToken": "1611177919,a2710362e45dad9aa1da381e21941a38",
}
headers["content-type"] = "application/x-www-form-urlencoded; charset=UTF-8"
r = requests.post(
site.activation["url"], headers=headers, cookies=cookies, data=post_data
)
csrf = r.json()["csrf"]
site.get_params["_xfToken"] = csrf
async def import_aiohttp_cookies(cookiestxt_filename):
cookies_obj = MozillaCookieJar(cookiestxt_filename)
+64 -54
@@ -6,12 +6,14 @@ import ssl
import sys
import tqdm
from typing import Tuple, Optional, Dict, List
from urllib.parse import quote
import aiohttp
import tqdm.asyncio
from aiohttp_socks import ProxyConnector
from python_socks import _errors as proxy_errors
from socid_extractor import extract
from aiohttp.client_exceptions import ServerDisconnectedError, ClientConnectorError
from .activation import ParsingActivator, import_aiohttp_cookies
from . import errors
@@ -24,10 +26,10 @@ from .executors import (
from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite
from .types import QueryOptions, QueryResultWrapper
from .utils import get_random_user_agent
from .utils import get_random_user_agent, ascii_data_display
supported_recursive_search_ids = (
SUPPORTED_IDS = (
"yandex_public_id",
"gaia_id",
"vk_id",
@@ -35,9 +37,10 @@ supported_recursive_search_ids = (
"wikimapia_uid",
"steam_id",
"uidme_uguid",
"yelp_userid",
)
unsupported_characters = "#"
BAD_CHARS = "#"
async def get_response(request_future, logger) -> Tuple[str, int, Optional[CheckError]]:
@@ -54,17 +57,18 @@ async def get_response(request_future, logger) -> Tuple[str, int, Optional[Check
decoded_content = response_content.decode(charset, "ignore")
html_text = decoded_content
error = None
if status_code == 0:
error = CheckError("Connection lost")
else:
error = None
logger.debug(html_text)
except asyncio.TimeoutError as e:
error = CheckError("Request timeout", str(e))
except aiohttp.client_exceptions.ClientConnectorError as e:
except ClientConnectorError as e:
error = CheckError("Connecting failure", str(e))
except ServerDisconnectedError as e:
error = CheckError("Server disconnected", str(e))
except aiohttp.http_exceptions.BadHttpMessage as e:
error = CheckError("HTTP", str(e))
except proxy_errors.ProxyError as e:
@@ -73,11 +77,10 @@ async def get_response(request_future, logger) -> Tuple[str, int, Optional[Check
error = CheckError("Interrupted")
except Exception as e:
# python-specific exceptions
if sys.version_info.minor > 6:
if isinstance(e, ssl.SSLCertVerificationError) or isinstance(
e, ssl.SSLError
):
error = CheckError("SSL", str(e))
if sys.version_info.minor > 6 and (
isinstance(e, ssl.SSLCertVerificationError) or isinstance(e, ssl.SSLError)
):
error = CheckError("SSL", str(e))
else:
logger.debug(e, exc_info=True)
error = CheckError("Unexpected", str(e))
@@ -109,6 +112,14 @@ def detect_error_page(
return None
def debug_response_logging(url, html_text, status_code, check_error):
with open("debug.log", "a") as f:
status = status_code or "No response"
f.write(f"url: {url}\nerror: {check_error}\nr: {status}\n")
if html_text:
f.write(f"code: {status}\nresponse: {str(html_text)}\n")
def process_site_result(
response, query_notify, logger, results_info: QueryResultWrapper, site: MaigretSite
):
@@ -121,7 +132,7 @@ def process_site_result(
username = results_info["username"]
is_parsing_enabled = results_info["parsing_enabled"]
url = results_info.get("url_user")
logger.debug(url)
logger.info(url)
status = results_info.get("status")
if status is not None:
@@ -142,40 +153,42 @@ def process_site_result(
response_time = None
if logger.level == logging.DEBUG:
with open("debug.txt", "a") as f:
status = status_code or "No response"
f.write(f"url: {url}\nerror: {check_error}\nr: {status}\n")
if html_text:
f.write(f"code: {status}\nresponse: {str(html_text)}\n")
debug_response_logging(url, html_text, status_code, check_error)
# additional check for errors
if status_code and not check_error:
check_error = detect_error_page(
html_text, status_code, site.errors, site.ignore403
html_text, status_code, site.errors_dict, site.ignore403
)
if site.activation and html_text:
is_need_activation = any(
[s for s in site.activation["marks"] if s in html_text]
)
if is_need_activation:
method = site.activation["method"]
try:
activate_fun = getattr(ParsingActivator(), method)
# TODO: async call
activate_fun(site, logger)
except AttributeError:
logger.warning(
f"Activation method {method} for site {site.name} not found!"
)
except Exception as e:
logger.warning(f"Failed activation {method} for site {site.name}: {e}")
# parsing activation
is_need_activation = any(
[s for s in site.activation.get("marks", []) if s in html_text]
)
if site.activation and html_text and is_need_activation:
method = site.activation["method"]
try:
activate_fun = getattr(ParsingActivator(), method)
# TODO: async call
activate_fun(site, logger)
except AttributeError:
logger.warning(
f"Activation method {method} for site {site.name} not found!"
)
except Exception as e:
logger.warning(
f"Failed activation {method} for site {site.name}: {str(e)}",
exc_info=True,
)
# TODO: temporary check error
site_name = site.pretty_name
# presense flags
# True by default
presense_flags = site.presense_strs
is_presense_detected = False
if html_text:
if not presense_flags:
is_presense_detected = True
@@ -200,7 +213,7 @@ def process_site_result(
)
if check_error:
logger.debug(check_error)
logger.warning(check_error)
result = QueryResult(
username,
site_name,
@@ -220,9 +233,9 @@ def process_site_result(
result = build_result(QueryStatus.CLAIMED)
else:
result = build_result(QueryStatus.AVAILABLE)
elif check_type == "status_code":
elif check_type in "status_code":
# Checks if the status code of the response is 2XX
if is_presense_detected and (not status_code >= 300 or status_code < 200):
if 200 <= status_code < 300:
result = build_result(QueryStatus.CLAIMED)
else:
result = build_result(QueryStatus.AVAILABLE)
@@ -255,16 +268,16 @@ def process_site_result(
for k, v in extracted_ids_data.items():
if "username" in k:
new_usernames[v] = "username"
if k in supported_recursive_search_ids:
if k in SUPPORTED_IDS:
new_usernames[v] = k
results_info["ids_usernames"] = new_usernames
results_info["ids_links"] = eval(extracted_ids_data.get("links", "[]"))
links = ascii_data_display(extracted_ids_data.get("links", "[]"))
if "website" in extracted_ids_data:
links.append(extracted_ids_data["website"])
results_info["ids_links"] = links
result.ids_data = extracted_ids_data
# Notify caller about results of query.
query_notify.update(result, site.similar_search)
# Save status of request
results_info["status"] = result
@@ -303,7 +316,7 @@ def make_site_result(
# URL of user on site (if it exists)
url = site.url.format(
urlMain=site.url_main, urlSubpath=site.url_subpath, username=username
urlMain=site.url_main, urlSubpath=site.url_subpath, username=quote(username)
)
# workaround to prevent slash errors
@@ -412,6 +425,8 @@ async def check_site_for_username(
response, query_notify, logger, default_result, site
)
query_notify.update(response_result['status'], site.similar_search)
return site.name, response_result
@@ -441,7 +456,7 @@ async def maigret(
logger,
query_notify=None,
proxy=None,
timeout=None,
timeout=3,
is_parsing_enabled=False,
id_type="username",
debug=False,
@@ -463,7 +478,7 @@ async def maigret(
query results.
logger -- Standard Python logger object.
timeout -- Time in seconds to wait before timing out request.
Default is no timeout.
Default is 3 seconds.
is_parsing_enabled -- Extract additional info from account pages.
id_type -- Type of username to search.
Default is 'username', see all supported here:
@@ -616,15 +631,10 @@ async def site_self_check(
"disabled": False,
}
try:
check_data = [
(site.username_claimed, QueryStatus.CLAIMED),
(site.username_unclaimed, QueryStatus.AVAILABLE),
]
except Exception as e:
logger.error(e)
logger.error(site.__dict__)
check_data = []
check_data = [
(site.username_claimed, QueryStatus.CLAIMED),
(site.username_unclaimed, QueryStatus.AVAILABLE),
]
logger.info(f"Checking {site.name}...")
+17 -2
@@ -1,6 +1,7 @@
from typing import Dict, List, Any
from .result import QueryResult
from .types import QueryResultWrapper
# error got as a result of completed search query
@@ -34,6 +35,12 @@ COMMON_ERRORS = {
'Please stand by, while we are checking your browser': CheckError(
'Bot protection', 'Cloudflare'
),
'<span data-translate="checking_browser">Checking your browser before accessing</span>': CheckError(
'Bot protection', 'Cloudflare'
),
'This website is using a security service to protect itself from online attacks.': CheckError(
'Access denied', 'Cloudflare'
),
'<title>Доступ ограничен</title>': CheckError('Censorship', 'Rostelecom'),
'document.getElementById(\'validate_form_submit\').disabled=true': CheckError(
'Captcha', 'Mail.ru'
@@ -48,6 +55,9 @@ COMMON_ERRORS = {
'Censorship', 'MGTS'
),
'Incapsula incident ID': CheckError('Bot protection', 'Incapsula'),
'Сайт заблокирован хостинг-провайдером': CheckError(
'Site-specific', 'Site is disabled (Beget)'
),
}
ERRORS_TYPES = {
@@ -57,6 +67,11 @@ ERRORS_TYPES = {
'Request timeout': 'Try to increase timeout or to switch to another internet service provider',
}
# TODO: checking for reason
ERRORS_REASONS = {
'Login required': 'Add authorization cookies through `--cookies-jar-file` (see cookies.txt)',
}
TEMPORARY_ERRORS_TYPES = [
'Request timeout',
'Unknown',
@@ -90,9 +105,9 @@ def solution_of(err_type) -> str:
return ERRORS_TYPES.get(err_type, '')
def extract_and_group(search_res: dict) -> List[Dict[str, Any]]:
def extract_and_group(search_res: QueryResultWrapper) -> List[Dict[str, Any]]:
errors_counts: Dict[str, int] = {}
for r in search_res:
for r in search_res.values():
if r and isinstance(r, dict) and r.get('status'):
if not isinstance(r['status'], QueryResult):
continue
+271 -225
@@ -8,15 +8,17 @@ import os
import sys
import platform
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from typing import List, Tuple
import requests
from socid_extractor import extract, parse, __version__ as socid_version
from .__version__ import __version__
from .checking import (
timeout_check,
supported_recursive_search_ids,
SUPPORTED_IDS,
self_check,
unsupported_characters,
BAD_CHARS,
maigret,
)
from . import errors
@@ -29,18 +31,17 @@ from .report import (
generate_report_context,
save_txt_report,
SUPPORTED_JSON_REPORT_FORMATS,
check_supported_json_format,
save_json_report,
get_plaintext_report,
)
from .sites import MaigretDatabase
from .submit import submit_dialog
from .types import QueryResultWrapper
from .utils import get_dict_ascii_tree
__version__ = '0.2.0'
def notify_about_errors(search_results, query_notify):
errs = errors.extract_and_group(search_results.values())
def notify_about_errors(search_results: QueryResultWrapper, query_notify):
errs = errors.extract_and_group(search_results)
was_errs_displayed = False
for e in errs:
if not errors.is_important(e):
@@ -48,7 +49,7 @@ def notify_about_errors(search_results, query_notify):
text = f'Too many errors of type "{e["err"]}" ({e["perc"]}%)'
solution = errors.solution_of(e['err'])
if solution:
text = '. '.join([text, solution])
text = '. '.join([text, solution.capitalize()])
query_notify.warning(text, '!')
was_errs_displayed = True
@@ -59,6 +60,67 @@ def notify_about_errors(search_results, query_notify):
)
def extract_ids_from_url(url: str, db: MaigretDatabase) -> dict:
results = {}
for s in db.sites:
result = s.extract_id_from_url(url)
if not result:
continue
_id, _type = result
results[_id] = _type
return results
def extract_ids_from_page(url, logger, timeout=5) -> dict:
results = {}
# url, headers
reqs: List[Tuple[str, set]] = [(url, set())]
try:
# temporary workaround for URL mutations MVP
from socid_extractor import mutate_url
reqs += list(mutate_url(url))
except Exception as e:
logger.warning(e)
for req in reqs:
url, headers = req
print(f'Scanning webpage by URL {url}...')
page, _ = parse(url, cookies_str='', headers=headers, timeout=timeout)
logger.debug(page)
info = extract(page)
if not info:
print('Nothing extracted')
else:
print(get_dict_ascii_tree(info.items(), new_line=False), ' ')
for k, v in info.items():
if 'username' in k:
results[v] = 'username'
if k in SUPPORTED_IDS:
results[v] = k
return results
def extract_ids_from_results(results: QueryResultWrapper, db: MaigretDatabase) -> dict:
ids_results = {}
for website_name in results:
dictionary = results[website_name]
# TODO: fix no site data issue
if not dictionary:
continue
new_usernames = dictionary.get('ids_usernames')
if new_usernames:
for u, utype in new_usernames.items():
ids_results[u] = utype
for url in dictionary.get('ids_links', []):
ids_results.update(extract_ids_from_url(url, db))
return ids_results
def setup_arguments_parser():
version_string = '\n'.join(
[
@@ -74,68 +136,18 @@ def setup_arguments_parser():
formatter_class=RawDescriptionHelpFormatter,
description=f"Maigret v{__version__}",
)
parser.add_argument(
"username",
nargs='*',
metavar="USERNAMES",
help="One or more usernames to search by.",
)
parser.add_argument(
"--version",
action="version",
version=version_string,
help="Display version information and dependencies.",
)
parser.add_argument(
"--info",
"-vv",
action="store_true",
dest="info",
default=False,
help="Display service information.",
)
parser.add_argument(
"--verbose",
"-v",
action="store_true",
dest="verbose",
default=False,
help="Display extra information and metrics.",
)
parser.add_argument(
"-d",
"--debug",
"-vvv",
action="store_true",
dest="debug",
default=False,
help="Saving debugging information and sites responses in debug.txt.",
)
parser.add_argument(
"--site",
action="append",
metavar='SITE_NAME',
dest="site_list",
default=[],
help="Limit analysis to just the listed sites (use several times to specify more than one)",
)
parser.add_argument(
"--proxy",
"-p",
metavar='PROXY_URL',
action="store",
dest="proxy",
default=None,
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080",
)
parser.add_argument(
"--db",
metavar="DB_FILE",
dest="db_file",
default=None,
help="Load Maigret database from a JSON file or an online, valid, JSON file.",
)
parser.add_argument(
"--cookies-jar-file",
metavar="COOKIE_FILE",
dest="cookie_file",
default=None,
help="File with cookies.",
)
parser.add_argument(
"--timeout",
action="store",
@@ -143,7 +155,7 @@ def setup_arguments_parser():
dest="timeout",
type=timeout_check,
default=30,
help="Time (in seconds) to wait for response to requests. "
help="Time in seconds to wait for response to requests. "
"Default timeout of 30.0s. "
"A longer timeout will be more likely to get results from slow sites. "
"On the other hand, this may cause a long delay to gather all results. ",
@@ -154,7 +166,7 @@ def setup_arguments_parser():
type=int,
metavar='RETRIES',
default=1,
help="Attempts to restart temporary failed requests.",
help="Attempts to restart temporarily failed requests.",
)
parser.add_argument(
"-n",
@@ -165,65 +177,6 @@ def setup_arguments_parser():
default=100,
help="Allowed number of concurrent connections.",
)
parser.add_argument(
"-a",
"--all-sites",
action="store_true",
dest="all_sites",
default=False,
help="Use all sites for scan.",
)
parser.add_argument(
"--top-sites",
action="store",
default=500,
type=int,
help="Count of sites for scan ranked by Alexa Top (default: 500).",
)
parser.add_argument(
"--print-not-found",
action="store_true",
dest="print_not_found",
default=False,
help="Print sites where the username was not found.",
)
parser.add_argument(
"--print-errors",
action="store_true",
dest="print_check_errors",
default=False,
help="Print errors messages: connection, captcha, site country ban, etc.",
)
parser.add_argument(
"--submit",
metavar='EXISTING_USER_URL',
type=str,
dest="new_site_to_submit",
default=False,
help="URL of existing profile in new site to submit.",
)
parser.add_argument(
"--no-color",
action="store_true",
dest="no_color",
default=False,
help="Don't color terminal output",
)
parser.add_argument(
"--no-progressbar",
action="store_true",
dest="no_progressbar",
default=False,
help="Don't show progressbar.",
)
parser.add_argument(
"--browse",
"-b",
action="store_true",
dest="browse",
default=False,
help="Browse to all results on default bowser.",
)
parser.add_argument(
"--no-recursion",
action="store_true",
@@ -238,33 +191,27 @@ def setup_arguments_parser():
default=False,
help="Disable parsing pages for additional data and other usernames.",
)
parser.add_argument(
"--self-check",
action="store_true",
default=False,
help="Do self check for sites and database and disable non-working ones.",
)
parser.add_argument(
"--stats", action="store_true", default=False, help="Show database statistics."
)
parser.add_argument(
"--use-disabled-sites",
action="store_true",
default=False,
help="Use disabled sites to search (may cause many false positives).",
)
parser.add_argument(
"--parse",
dest="parse_url",
default='',
help="Parse page by URL and extract username and IDs to use for search.",
)
parser.add_argument(
"--id-type",
dest="id_type",
default='username',
choices=SUPPORTED_IDS,
help="Specify identifier(s) type (default: username).",
)
parser.add_argument(
"--db",
metavar="DB_FILE",
dest="db_file",
default=None,
help="Load Maigret database from a JSON file or an online, valid, JSON file.",
)
parser.add_argument(
"--cookies-jar-file",
metavar="COOKIE_FILE",
dest="cookie_file",
default=None,
help="File with cookies.",
)
parser.add_argument(
"--ignore-ids",
action="append",
@@ -273,25 +220,156 @@ def setup_arguments_parser():
default=[],
help="Do not make search by the specified username or other ids.",
)
parser.add_argument(
"username",
nargs='+',
metavar='USERNAMES',
action="store",
help="One or more usernames to check with social networks.",
)
parser.add_argument(
"--tags", dest="tags", default='', help="Specify tags of sites."
)
# reports options
parser.add_argument(
"--folderoutput",
"-fo",
dest="folderoutput",
default="reports",
metavar="PATH",
help="If using multiple usernames, the output of the results will be saved to this folder.",
)
parser.add_argument(
"--proxy",
"-p",
metavar='PROXY_URL',
action="store",
dest="proxy",
default=None,
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080",
)
filter_group = parser.add_argument_group(
'Site filtering', 'Options to set site search scope'
)
filter_group.add_argument(
"-a",
"--all-sites",
action="store_true",
dest="all_sites",
default=False,
help="Use all sites for scan.",
)
filter_group.add_argument(
"--top-sites",
action="store",
default=500,
metavar="N",
type=int,
help="Count of sites for scan ranked by Alexa Top (default: 500).",
)
filter_group.add_argument(
"--tags", dest="tags", default='', help="Specify tags of sites (see `--stats`)."
)
filter_group.add_argument(
"--site",
action="append",
metavar='SITE_NAME',
dest="site_list",
default=[],
help="Limit analysis to just the specified sites (multiple option).",
)
filter_group.add_argument(
"--use-disabled-sites",
action="store_true",
default=False,
help="Use disabled sites to search (may cause many false positives).",
)
modes_group = parser.add_argument_group(
'Operating modes',
'Various functions except the default search by a username. '
'Modes are executed sequentially in the order of declaration.',
)
modes_group.add_argument(
"--parse",
dest="parse_url",
default='',
metavar='URL',
help="Parse page by URL and extract username and IDs to use for search.",
)
modes_group.add_argument(
"--submit",
metavar='URL',
type=str,
dest="new_site_to_submit",
default=False,
help="URL of existing profile in new site to submit.",
)
modes_group.add_argument(
"--self-check",
action="store_true",
default=False,
help="Do self check for sites and database and disable non-working ones.",
)
modes_group.add_argument(
"--stats",
action="store_true",
default=False,
help="Show database statistics (most frequent sites engines and tags).",
)
output_group = parser.add_argument_group(
'Output options', 'Options to change verbosity and view of the console output'
)
output_group.add_argument(
"--print-not-found",
action="store_true",
dest="print_not_found",
default=False,
help="Print sites where the username was not found.",
)
output_group.add_argument(
"--print-errors",
action="store_true",
dest="print_check_errors",
default=False,
help="Print errors messages: connection, captcha, site country ban, etc.",
)
output_group.add_argument(
"--verbose",
"-v",
action="store_true",
dest="verbose",
default=False,
help="Display extra information and metrics.",
)
output_group.add_argument(
"--info",
"-vv",
action="store_true",
dest="info",
default=False,
help="Display extra/service information and metrics.",
)
output_group.add_argument(
"--debug",
"-vvv",
"-d",
action="store_true",
dest="debug",
default=False,
help="Display extra/service/debug information and metrics, save responses in debug.log.",
)
output_group.add_argument(
"--no-color",
action="store_true",
dest="no_color",
default=False,
help="Don't color terminal output",
)
output_group.add_argument(
"--no-progressbar",
action="store_true",
dest="no_progressbar",
default=False,
help="Don't show progressbar.",
)
report_group = parser.add_argument_group(
'Report formats', 'Supported formats of report files'
)
report_group.add_argument(
"-T",
"--txt",
action="store_true",
@@ -299,7 +377,7 @@ def setup_arguments_parser():
default=False,
help="Create a TXT report (one report per username).",
)
parser.add_argument(
report_group.add_argument(
"-C",
"--csv",
action="store_true",
@@ -307,7 +385,7 @@ def setup_arguments_parser():
default=False,
help="Create a CSV report (one report per username).",
)
parser.add_argument(
report_group.add_argument(
"-H",
"--html",
action="store_true",
@@ -315,7 +393,7 @@ def setup_arguments_parser():
default=False,
help="Create an HTML report file (general report on all usernames).",
)
parser.add_argument(
report_group.add_argument(
"-X",
"--xmind",
action="store_true",
@@ -323,7 +401,7 @@ def setup_arguments_parser():
default=False,
help="Generate an XMind 8 mindmap report (one report per username).",
)
parser.add_argument(
report_group.add_argument(
"-P",
"--pdf",
action="store_true",
@@ -331,14 +409,14 @@ def setup_arguments_parser():
default=False,
help="Generate a PDF report (general report on all usernames).",
)
parser.add_argument(
report_group.add_argument(
"-J",
"--json",
action="store",
metavar='REPORT_TYPE',
metavar='TYPE',
dest="json",
default='',
type=check_supported_json_format,
choices=SUPPORTED_JSON_REPORT_FORMATS,
help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
" (one report per username).",
)
@@ -371,7 +449,7 @@ async def main():
usernames = {
u: args.id_type
for u in args.username
if u not in ['-'] and u not in args.ignore_ids_list
if u and u not in ['-'] and u not in args.ignore_ids_list
}
parsing_enabled = not args.disable_extracting
@@ -382,31 +460,10 @@ async def main():
print("Using the proxy: " + args.proxy)
if args.parse_url:
# url, headers
reqs = [(args.parse_url, set())]
try:
# temporary workaround for URL mutations MVP
from socid_extractor import mutate_url
reqs += list(mutate_url(args.parse_url))
except Exception as e:
logger.warning(e)
pass
for req in reqs:
url, headers = req
print(f'Scanning webpage by URL {url}...')
page, _ = parse(url, cookies_str='', headers=headers)
info = extract(page)
if not info:
print('Nothing extracted')
else:
print(get_dict_ascii_tree(info.items(), new_line=False), ' ')
for k, v in info.items():
if 'username' in k:
usernames[v] = 'username'
if k in supported_recursive_search_ids:
usernames[v] = k
extracted_ids = extract_ids_from_page(
args.parse_url, logger, timeout=args.timeout
)
usernames.update(extracted_ids)
if args.tags:
args.tags = list(set(str(args.tags).split(',')))
@@ -434,7 +491,7 @@ async def main():
top=args.top_sites,
tags=args.tags,
names=args.site_list,
disabled=False,
disabled=args.use_disabled_sites,
id_type=x,
)
@@ -454,13 +511,17 @@ async def main():
db, site_data, logger, max_connections=args.connections
)
if is_need_update:
if input('Do you want to save changes permanently? [Yn]\n').lower() == 'y':
if input('Do you want to save changes permanently? [Yn]\n').lower() in (
'y',
'',
):
db.save_to_file(args.db_file)
print('Database was successfully updated.')
else:
print('Updates will be applied only for current search session.')
print(db.get_scan_stats(site_data))
print('Scan sessions flags stats: ' + str(db.get_scan_stats(site_data)))
# Database statistics
if args.stats:
print(db.get_db_stats(db.sites_dict))
@@ -470,11 +531,6 @@ async def main():
# Define one report filename template
report_filepath_tpl = os.path.join(args.folderoutput, 'report_{username}{postfix}')
# Database stats
# TODO: verbose info about filtered sites
# enabled_count = len(list(filter(lambda x: not x.disabled, site_data.values())))
# print(f'Sites in database, enabled/total: {enabled_count}/{len(site_data)}')
if usernames == {}:
# magic params to exit after init
query_notify.warning('No usernames to check, exiting.')
@@ -483,14 +539,14 @@ async def main():
if not site_data:
query_notify.warning('No sites to check, exiting!')
sys.exit(2)
else:
query_notify.warning(
f'Starting a search on top {len(site_data)} sites from the Maigret database...'
)
if not args.all_sites:
query_notify.warning(
f'Starting a search on top {len(site_data)} sites from the Maigret database...'
'You can run search by full list of sites with flag `-a`', '!'
)
if not args.all_sites:
query_notify.warning(
'You can run search by full list of sites with flag `-a`', '!'
)
already_checked = set()
general_results = []
@@ -501,8 +557,8 @@ async def main():
if username.lower() in already_checked:
continue
else:
already_checked.add(username.lower())
already_checked.add(username.lower())
if username in args.ignore_ids_list:
query_notify.warning(
@@ -511,10 +567,7 @@ async def main():
continue
# check for characters do not supported by sites generally
found_unsupported_chars = set(unsupported_characters).intersection(
set(username)
)
found_unsupported_chars = set(BAD_CHARS).intersection(set(username))
if found_unsupported_chars:
pretty_chars_str = ','.join(
map(lambda s: f'"{s}"', found_unsupported_chars)
@@ -548,22 +601,9 @@ async def main():
general_results.append((username, id_type, results))
# TODO: tests
for website_name in results:
dictionary = results[website_name]
# TODO: fix no site data issue
if not dictionary or not recursive_search_enabled:
continue
new_usernames = dictionary.get('ids_usernames')
if new_usernames:
for u, utype in new_usernames.items():
usernames[u] = utype
for url in dictionary.get('ids_links', []):
for s in db.sites:
u = s.detect_username(url)
if u:
usernames[u] = 'username'
if recursive_search_enabled:
extracted_ids = extract_ids_from_results(results, db)
usernames.update(extracted_ids)
# reporting for a one username
if args.xmind:
@@ -607,6 +647,12 @@ async def main():
filename = report_filepath_tpl.format(username=username, postfix='.pdf')
save_pdf_report(filename, report_context)
query_notify.warning(f'PDF report on all usernames saved in {filename}')
text_report = get_plaintext_report(report_context)
if text_report:
query_notify.info('Short text report:')
print(text_report)
# update database
db.save_to_file(args.db_file)
+39 -36
@@ -152,6 +152,27 @@ class QueryNotifyPrint(QueryNotify):
return
def make_colored_terminal_notify(
self, status, text, status_color, text_color, appendix
):
text = [
f"{Style.BRIGHT}{Fore.WHITE}[{status_color}{status}{Fore.WHITE}]"
+ f"{text_color} {text}: {Style.RESET_ALL}"
+ f"{appendix}"
]
return "".join(text)
def make_simple_terminal_notify(
self, status, text, status_color, text_color, appendix
):
return f"[{status}] {text}: {appendix}"
def make_terminal_notify(self, *args):
if self.color:
return self.make_colored_terminal_notify(*args)
else:
return self.make_simple_terminal_notify(*args)
def start(self, message, id_type):
"""Notify Start.
@@ -184,13 +205,20 @@ class QueryNotifyPrint(QueryNotify):
else:
print(f"[*] {title} {message} on:")
def warning(self, message, symbol="-"):
msg = f"[{symbol}] {message}"
def _colored_print(self, fore_color, msg):
if self.color:
print(Style.BRIGHT + Fore.YELLOW + msg)
print(Style.BRIGHT + fore_color + msg)
else:
print(msg)
def warning(self, message, symbol="-"):
msg = f"[{symbol}] {message}"
self._colored_print(Fore.YELLOW, msg)
def info(self, message, symbol="*"):
msg = f"[{symbol}] {message}"
self._colored_print(Fore.BLUE, msg)
def update(self, result, is_similar=False):
"""Notify Update.
@@ -204,40 +232,18 @@ class QueryNotifyPrint(QueryNotify):
Return Value:
Nothing.
"""
notify = None
self.result = result
if not self.result.ids_data:
ids_data_text = ""
else:
ids_data_text = ""
if self.result.ids_data:
ids_data_text = get_dict_ascii_tree(self.result.ids_data.items(), " ")
def make_colored_terminal_notify(
status, text, status_color, text_color, appendix
):
text = [
f"{Style.BRIGHT}{Fore.WHITE}[{status_color}{status}{Fore.WHITE}]"
+ f"{text_color} {text}: {Style.RESET_ALL}"
+ f"{appendix}"
]
return "".join(text)
def make_simple_terminal_notify(status, text, appendix):
return f"[{status}] {text}: {appendix}"
def make_terminal_notify(is_colored=True, *args):
if is_colored:
return make_colored_terminal_notify(*args)
else:
return make_simple_terminal_notify(*args)
notify = None
# Output to the terminal is desired.
if result.status == QueryStatus.CLAIMED:
color = Fore.BLUE if is_similar else Fore.GREEN
status = "?" if is_similar else "+"
notify = make_terminal_notify(
self.color,
notify = self.make_terminal_notify(
status,
result.site_name,
color,
@@ -246,8 +252,7 @@ class QueryNotifyPrint(QueryNotify):
)
elif result.status == QueryStatus.AVAILABLE:
if not self.print_found_only:
notify = make_terminal_notify(
self.color,
notify = self.make_terminal_notify(
"-",
result.site_name,
Fore.RED,
@@ -256,8 +261,7 @@ class QueryNotifyPrint(QueryNotify):
)
elif result.status == QueryStatus.UNKNOWN:
if not self.skip_check_errors:
notify = make_terminal_notify(
self.color,
notify = self.make_terminal_notify(
"?",
result.site_name,
Fore.RED,
@@ -267,8 +271,7 @@ class QueryNotifyPrint(QueryNotify):
elif result.status == QueryStatus.ILLEGAL:
if not self.print_found_only:
text = "Illegal Username Format For This Site!"
notify = make_terminal_notify(
self.color,
notify = self.make_terminal_notify(
"-",
result.site_name,
Fore.RED,
@@ -286,7 +289,7 @@ class QueryNotifyPrint(QueryNotify):
sys.stdout.write("\x1b[1K\r")
print(notify)
return
return notify
def __str__(self):
"""Convert Object To String.
+55 -54
@@ -3,7 +3,6 @@ import io
import json
import logging
import os
from argparse import ArgumentTypeError
from datetime import datetime
from typing import Dict, Any
@@ -71,6 +70,17 @@ def save_json_report(filename: str, username: str, results: dict, report_type: s
generate_json_report(username, results, f, report_type=report_type)
def get_plaintext_report(context: dict) -> str:
output = (context['brief'] + " ").replace('. ', '.\n')
interests = list(map(lambda x: x[0], context.get('interests_tuple_list', [])))
countries = list(map(lambda x: x[0], context.get('countries_tuple_list', [])))
if countries:
output += f'Countries: {", ".join(countries)}\n'
if interests:
output += f'Interests (tags): {", ".join(interests)}\n'
return output.strip()
"""
REPORTS GENERATING
"""
@@ -216,6 +226,7 @@ def generate_report_context(username_results: list):
return {
"username": first_username,
# TODO: return brief list
"brief": brief,
"results": username_results,
"first_seen": first_seen,
@@ -269,6 +280,9 @@ def generate_json_report(username: str, results: dict, file, report_type):
data = dict(site_result)
data["status"] = data["status"].json()
data["site"] = data["site"].json
if "future" in data:
del data["future"]
if is_report_per_line:
data["sitename"] = sitename
@@ -290,11 +304,20 @@ def save_xmind_report(filename, username, results):
os.remove(filename)
workbook = xmind.load(filename)
sheet = workbook.getPrimarySheet()
design_sheet(sheet, username, results)
design_xmind_sheet(sheet, username, results)
xmind.save(workbook, path=filename)
def design_sheet(sheet, username, results):
def add_xmind_subtopic(userlink, k, v, supposed_data):
currentsublabel = userlink.addSubTopic()
field = "fullname" if k == "name" else k
if field not in supposed_data:
supposed_data[field] = []
supposed_data[field].append(v)
currentsublabel.setTitle("%s: %s" % (k, v))
def design_xmind_sheet(sheet, username, results):
alltags = {}
supposed_data = {}
@@ -308,64 +331,42 @@ def design_sheet(sheet, username, results):
for website_name in results:
dictionary = results[website_name]
result_status = dictionary.get("status")
if result_status.status != QueryStatus.CLAIMED:
continue
if dictionary.get("status").status == QueryStatus.CLAIMED:
# firsttime I found that entry
for tag in dictionary.get("status").tags:
if tag.strip() == "":
continue
if tag not in alltags.keys():
if not is_country_tag(tag):
tagsection = root_topic1.addSubTopic()
tagsection.setTitle(tag)
alltags[tag] = tagsection
stripped_tags = list(map(lambda x: x.strip(), result_status.tags))
normalized_tags = list(
filter(lambda x: x and not is_country_tag(x), stripped_tags)
)
category = None
for tag in dictionary.get("status").tags:
if tag.strip() == "":
continue
if not is_country_tag(tag):
category = tag
category = None
for tag in normalized_tags:
if tag in alltags.keys():
continue
tagsection = root_topic1.addSubTopic()
tagsection.setTitle(tag)
alltags[tag] = tagsection
category = tag
if category is None:
userlink = undefinedsection.addSubTopic()
userlink.addLabel(dictionary.get("status").site_url_user)
section = alltags[category] if category else undefinedsection
userlink = section.addSubTopic()
userlink.addLabel(result_status.site_url_user)
ids_data = result_status.ids_data or {}
for k, v in ids_data.items():
# suppose target data
if isinstance(v, list):
for currentval in v:
add_xmind_subtopic(userlink, k, currentval, supposed_data)
else:
userlink = alltags[category].addSubTopic()
userlink.addLabel(dictionary.get("status").site_url_user)
add_xmind_subtopic(userlink, k, v, supposed_data)
if dictionary.get("status").ids_data:
for k, v in dictionary.get("status").ids_data.items():
# suppose target data
if not isinstance(v, list):
currentsublabel = userlink.addSubTopic()
field = "fullname" if k == "name" else k
if field not in supposed_data:
supposed_data[field] = []
supposed_data[field].append(v)
currentsublabel.setTitle("%s: %s" % (k, v))
else:
for currentval in v:
currentsublabel = userlink.addSubTopic()
field = "fullname" if k == "name" else k
if field not in supposed_data:
supposed_data[field] = []
supposed_data[field].append(currentval)
currentsublabel.setTitle("%s: %s" % (k, currentval))
# add supposed data
filterede_supposed_data = filter_supposed_data(supposed_data)
if len(filterede_supposed_data) > 0:
filtered_supposed_data = filter_supposed_data(supposed_data)
if len(filtered_supposed_data) > 0:
undefinedsection = root_topic1.addSubTopic()
undefinedsection.setTitle("SUPPOSED DATA")
for k, v in filterede_supposed_data.items():
for k, v in filtered_supposed_data.items():
currentsublabel = undefinedsection.addSubTopic()
currentsublabel.setTitle("%s: %s" % (k, v))
def check_supported_json_format(value):
if value and value not in SUPPORTED_JSON_REPORT_FORMATS:
raise ArgumentTypeError(
"JSON report type must be one of the following types: "
+ ", ".join(SUPPORTED_JSON_REPORT_FORMATS)
)
return value
+5200 -2389
File diff suppressed because it is too large.
+54 -20
@@ -3,7 +3,7 @@
import copy
import json
import sys
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Tuple
import requests
@@ -53,6 +53,18 @@ SUPPORTED_TAGS = [
"medicine",
"reading",
"stock",
"messaging",
"trading",
"links",
"fashion",
"tasks",
"military",
"auto",
"gambling",
"cybercriminal",
"review",
"bookmarks",
"design",
]
@@ -146,6 +158,19 @@ class MaigretSite:
return None
def extract_id_from_url(self, url: str) -> Optional[Tuple[str, str]]:
if not self.url_regexp:
return None
match_groups = self.url_regexp.match(url)
if not match_groups:
return None
_id = match_groups.groups()[-1].rstrip("/")
_type = self.type
return _id, _type
@property
def pretty_name(self):
if self.source:
@@ -167,6 +192,25 @@ class MaigretSite:
return result
@property
def errors_dict(self) -> dict:
errors: Dict[str, str] = {}
if self.engine_obj:
errors.update(self.engine_obj.site.get('errors', {}))
errors.update(self.errors)
return errors
def get_url_type(self) -> str:
url = URLMatcher.extract_main_part(self.url)
if url.startswith("{username}"):
url = "SUBDOMAIN"
elif url == "":
url = f"{self.url} ({self.engine})"
else:
parts = url.split("/")
url = "/" + "/".join(parts[1:])
return url
def update(self, updates: "dict") -> "MaigretSite":
self.__dict__.update(updates)
self.update_detectors()
@@ -405,44 +449,34 @@ class MaigretDatabase:
if not sites_dict:
sites_dict = self.sites_dict()
urls = {}
tags = {}
output = ""
disabled_count = 0
total_count = len(sites_dict)
urls = {}
tags = {}
for _, site in sites_dict.items():
if site.disabled:
disabled_count += 1
url = URLMatcher.extract_main_part(site.url)
if url.startswith("{username}"):
url = "SUBDOMAIN"
elif url == "":
url = f"{site.url} ({site.engine})"
else:
parts = url.split("/")
url = "/" + "/".join(parts[1:])
urls[url] = urls.get(url, 0) + 1
url_type = site.get_url_type()
urls[url_type] = urls.get(url_type, 0) + 1
if not site.tags:
tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1
for tag in site.tags:
if is_country_tag(tag):
# currenty do not display country tags
continue
for tag in filter(lambda x: not is_country_tag(x), site.tags):
tags[tag] = tags.get(tag, 0) + 1
output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n"
output += "Top sites' profile URLs:\n"
output += "Top profile URLs:\n"
for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:20]:
if count == 1:
break
output += f"{count}\t{url}\n"
output += "Top sites' tags:\n"
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True):
output += "Top tags:\n"
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
mark = ""
if tag not in SUPPORTED_TAGS:
mark = " (non-standard)"
+44 -12
@@ -2,7 +2,7 @@ import asyncio
import difflib
import re
from typing import List
import xml.etree.ElementTree as ET
import requests
from .activation import import_aiohttp_cookies
@@ -46,6 +46,20 @@ def get_match_ratio(x):
)
def get_alexa_rank(site_url_main):
url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
xml_data = requests.get(url).text
root = ET.fromstring(xml_data)
alexa_rank = 0
try:
alexa_rank = int(root.find('.//REACH').attrib['RANK'])
except Exception:
pass
return alexa_rank
def extract_mainpage_url(url):
return "/".join(url.split("/", 3)[:3])
@@ -133,6 +147,7 @@ async def detect_known_engine(
) -> List[MaigretSite]:
try:
r = requests.get(url_mainpage)
logger.debug(r.text)
except Exception as e:
logger.warning(e)
print("Some error while checking main page")
@@ -199,6 +214,7 @@ async def check_features_manually(
# cookies
cookie_dict = None
if cookie_file:
logger.info(f'Use {cookie_file} for cookies')
cookie_jar = await import_aiohttp_cookies(cookie_file)
cookie_dict = {c.key: c.value for c in cookie_jar}
@@ -239,7 +255,7 @@ async def check_features_manually(
features = input("If features was not detected correctly, write it manually: ")
if features:
presence_list = features.split(",")
presence_list = list(map(str.strip, features.split(",")))
absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[
:top_features_count
@@ -248,7 +264,7 @@ async def check_features_manually(
features = input("If features was not detected correctly, write it manually: ")
if features:
absence_list = features.split(",")
absence_list = list(map(str.strip, features.split(",")))
site_data = {
"absenceStrs": absence_list,
@@ -291,7 +307,13 @@ async def submit_dialog(db, url_exists, cookie_file, logger):
url_mainpage = extract_mainpage_url(url_exists)
sites = await detect_known_engine(db, url_exists, url_mainpage, logger)
print('Detecting site engine, please wait...')
sites = []
try:
sites = await detect_known_engine(db, url_exists, url_mainpage, logger)
except KeyboardInterrupt:
print('Engine detect process is interrupted.')
if not sites:
print("Unable to detect site engine, lets generate checking features")
sites = [
@@ -304,6 +326,7 @@ async def submit_dialog(db, url_exists, cookie_file, logger):
sem = asyncio.Semaphore(1)
print("Checking, please wait...")
found = False
chosen_site = None
for s in sites:
@@ -320,17 +343,26 @@ async def submit_dialog(db, url_exists, cookie_file, logger):
print(
"Try to run this mode again and increase features count or choose others."
)
return False
else:
if (
input(
f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
).lower()
in "y"
)
.lower()
.strip("y")
):
logger.debug(chosen_site.json)
site_data = chosen_site.strip_engine_data()
logger.debug(site_data.json)
db.update_site(site_data)
return True
return False
return False
chosen_site.name = input("Change site name if you want: ") or chosen_site.name
chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
rank = get_alexa_rank(chosen_site.url_main)
if rank:
print(f'New alexa rank: {rank}')
chosen_site.alexa_rank = rank
logger.debug(chosen_site.json)
site_data = chosen_site.strip_engine_data()
logger.debug(site_data.json)
db.update_site(site_data)
return True
+12 -3
@@ -1,5 +1,7 @@
import ast
import re
import random
from typing import Any
DEFAULT_USER_AGENTS = [
@@ -55,14 +57,20 @@ class URLMatcher:
        url_main_part = self.extract_main_part(url)
        for c in self.UNSAFE_SYMBOLS:
            url_main_part = url_main_part.replace(c, f"\\{c}")
-       username_regexp = username_regexp or ".+?"
+       prepared_username_regexp = (username_regexp or ".+?").lstrip('^').rstrip('$')
-       url_regexp = url_main_part.replace("{username}", f"({username_regexp})")
+       url_regexp = url_main_part.replace(
+           "{username}", f"({prepared_username_regexp})"
+       )

        regexp_str = self._HTTP_URL_RE_STR.replace("(.+)", url_regexp)
        return re.compile(regexp_str)
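A small sketch of why the ^ and $ anchors are stripped before the username pattern is embedded into the profile-URL regexp (the per-site username rule below is hypothetical):

import re

username_regexp = "^[a-z0-9_-]+$"                    # hypothetical per-site rule
prepared = username_regexp.lstrip('^').rstrip('$')   # "[a-z0-9_-]+"

# An anchored inner group would never match inside a longer URL pattern;
# the stripped version works as a capturing group.
assert re.match(f"^https?://example.com/({prepared})$", "https://example.com/soxoj")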
def ascii_data_display(data: str) -> Any:
    return ast.literal_eval(data)


def get_dict_ascii_tree(items, prepend="", new_line=True):
    text = ""
    for num, item in enumerate(items):
@@ -73,7 +81,8 @@ def get_dict_ascii_tree(items, prepend="", new_line=True):
            if field_value.startswith("['"):
                is_last_item = num == len(items) - 1
                prepend_symbols = " " * 3 if is_last_item else ""
-               field_value = get_dict_ascii_tree(eval(field_value), prepend_symbols)
+               data = ascii_data_display(field_value)
+               field_value = get_dict_ascii_tree(data, prepend_symbols)
            text += f"\n{prepend}{box_symbol}{field_name}: {field_value}"
        else:
            text += f"\n{prepend}{box_symbol} {item}"
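Swapping eval for ast.literal_eval keeps the behaviour for stringified lists while refusing arbitrary expressions; a brief sketch (the sample value mirrors the test data further below):

import ast

field_value = "['www.instagram.com/street.reality.photography/']"
ast.literal_eval(field_value)         # ['www.instagram.com/street.reality.photography/']
ast.literal_eval("__import__('os')")  # raises ValueError instead of executing code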
+1 -1
@@ -26,7 +26,7 @@ python-socks==1.1.2
requests>=2.24.0
requests-futures==1.0.0
six==1.15.0
-socid-extractor>=0.0.16
+socid-extractor>=0.0.20
soupsieve==2.1
stem==1.8.0
torrequest==0.1.0
+1 -1
@@ -12,7 +12,7 @@ with open('requirements.txt') as rf:
requires = rf.read().splitlines()
setup(name='maigret',
-      version='0.2.0',
+      version='0.2.4',
description='Collect a dossier on a person by username from a huge number of sites',
long_description=long_description,
long_description_content_type="text/markdown",
+1688 -1500
File diff suppressed because it is too large
+6
@@ -0,0 +1,6 @@
flake8==3.8.4
pytest==6.2.4
pytest-asyncio==0.14.0
pytest-cov==2.10.1
pytest-httpserver==1.0.0
pytest-rerunfailures==9.1.1
+3 -1
@@ -1,2 +1,4 @@
#!/bin/sh
-pytest tests
+coverage run --source=./maigret -m pytest tests
+coverage report -m
+coverage html
+25 -3
@@ -6,10 +6,14 @@ import pytest
from _pytest.mark import Mark

from maigret.sites import MaigretDatabase
+from maigret.maigret import setup_arguments_parser

CUR_PATH = os.path.dirname(os.path.realpath(__file__))
JSON_FILE = os.path.join(CUR_PATH, '../maigret/resources/data.json')
-empty_mark = Mark('', [], {})
+TEST_JSON_FILE = os.path.join(CUR_PATH, 'db.json')
+LOCAL_TEST_JSON_FILE = os.path.join(CUR_PATH, 'local.json')
+
+empty_mark = Mark('', (), {})


def by_slow_marker(item):
@@ -33,9 +37,17 @@ def remove_test_reports():
@pytest.fixture(scope='session')
def default_db():
-   db = MaigretDatabase().load_from_file(JSON_FILE)
-   return db
+   return MaigretDatabase().load_from_file(JSON_FILE)
+
+
+@pytest.fixture(scope='function')
+def test_db():
+   return MaigretDatabase().load_from_file(TEST_JSON_FILE)
+
+
+@pytest.fixture(scope='function')
+def local_test_db():
+   return MaigretDatabase().load_from_file(LOCAL_TEST_JSON_FILE)


@pytest.fixture(autouse=True)
@@ -43,3 +55,13 @@ def reports_autoclean():
    remove_test_reports()
    yield
    remove_test_reports()
+
+
+@pytest.fixture(scope='session')
+def argparser():
+   return setup_arguments_parser()
+
+
+@pytest.fixture(scope="session")
+def httpserver_listen_address():
+   return ("localhost", 8989)
+26
@@ -0,0 +1,26 @@
{
"engines": {},
"sites": {
"GooglePlayStore": {
"tags": ["global", "us"],
"disabled": false,
"checkType": "status_code",
"alexaRank": 1,
"url": "https://play.google.com/store/apps/developer?id={username}",
"urlMain": "https://play.google.com/store",
"usernameClaimed": "Facebook_nosuchname",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"Reddit": {
"tags": ["news", "social", "us"],
"checkType": "status_code",
"presenseStrs": ["totalKarma"],
"disabled": true,
"alexaRank": 17,
"url": "https://www.reddit.com/user/{username}",
"urlMain": "https://www.reddit.com/",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
}
}
}
+21
@@ -0,0 +1,21 @@
{
"engines": {},
"sites": {
"StatusCode": {
"checkType": "status_code",
"url": "http://localhost:8989/url?id={username}",
"urlMain": "http://localhost:8989/",
"usernameClaimed": "claimed",
"usernameUnclaimed": "unclaimed"
},
"Message": {
"checkType": "message",
"url": "http://localhost:8989/url?id={username}",
"urlMain": "http://localhost:8989/",
"presenseStrs": ["user", "profile"],
"absenseStrs": ["not found", "404"],
"usernameClaimed": "claimed",
"usernameUnclaimed": "unclaimed"
}
}
}
+1
@@ -22,6 +22,7 @@ httpbin.org FALSE / FALSE 0 a b
"""
@pytest.mark.skip(reason="periodically fails")
@pytest.mark.slow
def test_twitter_activation(default_db):
twitter_site = default_db.sites_dict['Twitter']
+65
@@ -0,0 +1,65 @@
from mock import Mock
import pytest
from maigret import search
def site_result_except(server, username, **kwargs):
query = f'id={username}'
server.expect_request('/url', query_string=query).respond_with_data(**kwargs)
@pytest.mark.asyncio
async def test_checking_by_status_code(httpserver, local_test_db):
sites_dict = local_test_db.sites_dict
site_result_except(httpserver, 'claimed', status=200)
site_result_except(httpserver, 'unclaimed', status=404)
result = await search('claimed', site_dict=sites_dict, logger=Mock())
assert result['StatusCode']['status'].is_found() is True
result = await search('unclaimed', site_dict=sites_dict, logger=Mock())
assert result['StatusCode']['status'].is_found() is False
@pytest.mark.asyncio
async def test_checking_by_message_positive_full(httpserver, local_test_db):
sites_dict = local_test_db.sites_dict
site_result_except(httpserver, 'claimed', response_data="user profile")
site_result_except(httpserver, 'unclaimed', response_data="404 not found")
result = await search('claimed', site_dict=sites_dict, logger=Mock())
assert result['Message']['status'].is_found() is True
result = await search('unclaimed', site_dict=sites_dict, logger=Mock())
assert result['Message']['status'].is_found() is False
@pytest.mark.asyncio
async def test_checking_by_message_positive_part(httpserver, local_test_db):
sites_dict = local_test_db.sites_dict
site_result_except(httpserver, 'claimed', response_data="profile")
site_result_except(httpserver, 'unclaimed', response_data="404")
result = await search('claimed', site_dict=sites_dict, logger=Mock())
assert result['Message']['status'].is_found() is True
result = await search('unclaimed', site_dict=sites_dict, logger=Mock())
assert result['Message']['status'].is_found() is False
@pytest.mark.asyncio
async def test_checking_by_message_negative(httpserver, local_test_db):
sites_dict = local_test_db.sites_dict
site_result_except(httpserver, 'claimed', response_data="")
site_result_except(httpserver, 'unclaimed', response_data="user 404")
result = await search('claimed', site_dict=sites_dict, logger=Mock())
assert result['Message']['status'].is_found() is False
result = await search('unclaimed', site_dict=sites_dict, logger=Mock())
assert result['Message']['status'].is_found() is True
+93
@@ -0,0 +1,93 @@
"""Maigret command-line arguments parsing tests"""
from argparse import Namespace
from typing import Dict, Any
DEFAULT_ARGS: Dict[str, Any] = {
'all_sites': False,
'connections': 100,
'cookie_file': None,
'csv': False,
'db_file': None,
'debug': False,
'disable_extracting': False,
'disable_recursive_search': False,
'folderoutput': 'reports',
'html': False,
'id_type': 'username',
'ignore_ids_list': [],
'info': False,
'json': '',
'new_site_to_submit': False,
'no_color': False,
'no_progressbar': False,
'parse_url': '',
'pdf': False,
'print_check_errors': False,
'print_not_found': False,
'proxy': None,
'retries': 1,
'self_check': False,
'site_list': [],
'stats': False,
'tags': '',
'timeout': 30,
'top_sites': 500,
'txt': False,
'use_disabled_sites': False,
'username': [],
'verbose': False,
'xmind': False,
}
def test_args_search_mode(argparser):
args = argparser.parse_args('username'.split())
assert args.username == ['username']
want_args = dict(DEFAULT_ARGS)
want_args.update({'username': ['username']})
assert args == Namespace(**want_args)
def test_args_search_mode_several_usernames(argparser):
args = argparser.parse_args('username1 username2'.split())
assert args.username == ['username1', 'username2']
want_args = dict(DEFAULT_ARGS)
want_args.update({'username': ['username1', 'username2']})
assert args == Namespace(**want_args)
def test_args_self_check_mode(argparser):
args = argparser.parse_args('--self-check --site GitHub'.split())
want_args = dict(DEFAULT_ARGS)
want_args.update(
{
'self_check': True,
'site_list': ['GitHub'],
'username': [],
}
)
assert args == Namespace(**want_args)
def test_args_multiple_sites(argparser):
args = argparser.parse_args(
'--site GitHub VK --site PornHub --site Taringa,Steam'.split()
)
want_args = dict(DEFAULT_ARGS)
want_args.update(
{
'site_list': ['GitHub', 'PornHub', 'Taringa,Steam'],
'username': ['VK'],
}
)
assert args == Namespace(**want_args)
+15
@@ -0,0 +1,15 @@
"""Maigret data test functions"""
from maigret.utils import is_country_tag
from maigret.sites import SUPPORTED_TAGS
def test_tags_validity(default_db):
unknown_tags = set()
for site in default_db.sites:
for tag in filter(lambda x: not is_country_tag(x), site.tags):
if tag not in SUPPORTED_TAGS:
unknown_tags.add(tag)
assert unknown_tags == set()
+137 -56
@@ -1,96 +1,177 @@
"""Maigret main module test functions"""
import asyncio
import copy
import pytest
from mock import Mock
from maigret.maigret import self_check
from maigret.sites import MaigretDatabase
from maigret.maigret import self_check, maigret
from maigret.maigret import (
extract_ids_from_page,
extract_ids_from_results,
extract_ids_from_url,
)
from maigret.sites import MaigretSite
from maigret.result import QueryResult, QueryStatus
EXAMPLE_DB = {
'engines': {},
'sites': {
"GooglePlayStore": {
"tags": ["global", "us"],
"disabled": False,
"checkType": "status_code",
"alexaRank": 1,
"url": "https://play.google.com/store/apps/developer?id={username}",
"urlMain": "https://play.google.com/store",
"usernameClaimed": "Facebook_nosuchname",
"usernameUnclaimed": "noonewouldeverusethis7",
},
"Reddit": {
"tags": ["news", "social", "us"],
"checkType": "status_code",
"presenseStrs": ["totalKarma"],
"disabled": True,
"alexaRank": 17,
"url": "https://www.reddit.com/user/{username}",
"urlMain": "https://www.reddit.com/",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7",
},
RESULTS_EXAMPLE = {
'Reddit': {
'cookies': None,
'parsing_enabled': False,
'url_main': 'https://www.reddit.com/',
'username': 'Facebook',
},
'GooglePlayStore': {
'cookies': None,
'http_status': 200,
'is_similar': False,
'parsing_enabled': False,
'rank': 1,
'url_main': 'https://play.google.com/store',
'url_user': 'https://play.google.com/store/apps/developer?id=Facebook',
'username': 'Facebook',
},
}
@pytest.mark.slow
def test_self_check_db_positive_disable():
def test_self_check_db_positive_disable(test_db):
logger = Mock()
db = MaigretDatabase()
db.load_from_json(EXAMPLE_DB)
assert db.sites[0].disabled == False
assert test_db.sites[0].disabled is False
loop = asyncio.get_event_loop()
loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
loop.run_until_complete(
self_check(test_db, test_db.sites_dict, logger, silent=True)
)
assert db.sites[0].disabled == True
assert test_db.sites[0].disabled is True
@pytest.mark.slow
def test_self_check_db_positive_enable():
def test_self_check_db_positive_enable(test_db):
logger = Mock()
db = MaigretDatabase()
db.load_from_json(EXAMPLE_DB)
db.sites[0].disabled = True
db.sites[0].username_claimed = 'Facebook'
assert db.sites[0].disabled == True
test_db.sites[0].disabled = True
test_db.sites[0].username_claimed = 'Facebook'
assert test_db.sites[0].disabled is True
loop = asyncio.get_event_loop()
loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
loop.run_until_complete(
self_check(test_db, test_db.sites_dict, logger, silent=True)
)
assert db.sites[0].disabled == False
assert test_db.sites[0].disabled is False
@pytest.mark.slow
def test_self_check_db_negative_disabled():
def test_self_check_db_negative_disabled(test_db):
logger = Mock()
db = MaigretDatabase()
db.load_from_json(EXAMPLE_DB)
db.sites[0].disabled = True
assert db.sites[0].disabled == True
test_db.sites[0].disabled = True
assert test_db.sites[0].disabled is True
loop = asyncio.get_event_loop()
loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
loop.run_until_complete(
self_check(test_db, test_db.sites_dict, logger, silent=True)
)
assert db.sites[0].disabled == True
assert test_db.sites[0].disabled is True
@pytest.mark.slow
def test_self_check_db_negative_enabled():
def test_self_check_db_negative_enabled(test_db):
logger = Mock()
db = MaigretDatabase()
db.load_from_json(EXAMPLE_DB)
db.sites[0].disabled = False
db.sites[0].username_claimed = 'Facebook'
assert db.sites[0].disabled == False
test_db.sites[0].disabled = False
test_db.sites[0].username_claimed = 'Facebook'
assert test_db.sites[0].disabled is False
loop = asyncio.get_event_loop()
loop.run_until_complete(self_check(db, db.sites_dict, logger, silent=True))
loop.run_until_complete(
self_check(test_db, test_db.sites_dict, logger, silent=True)
)
assert db.sites[0].disabled == False
assert test_db.sites[0].disabled is False
@pytest.mark.slow
def test_maigret_results(test_db):
logger = Mock()
username = 'Facebook'
loop = asyncio.get_event_loop()
results = loop.run_until_complete(
maigret(username, site_dict=test_db.sites_dict, logger=logger, timeout=30)
)
assert isinstance(results, dict)
reddit_site = results['Reddit']['site']
assert isinstance(reddit_site, MaigretSite)
assert reddit_site.json == {
'tags': ['news', 'social', 'us'],
'checkType': 'status_code',
'presenseStrs': ['totalKarma'],
'disabled': True,
'alexaRank': 17,
'url': 'https://www.reddit.com/user/{username}',
'urlMain': 'https://www.reddit.com/',
'usernameClaimed': 'blue',
'usernameUnclaimed': 'noonewouldeverusethis7',
}
del results['Reddit']['site']
del results['GooglePlayStore']['site']
reddit_status = results['Reddit']['status']
assert isinstance(reddit_status, QueryResult)
assert reddit_status.status == QueryStatus.ILLEGAL
playstore_status = results['GooglePlayStore']['status']
assert isinstance(playstore_status, QueryResult)
assert playstore_status.status == QueryStatus.CLAIMED
del results['Reddit']['status']
del results['GooglePlayStore']['status']
assert results['Reddit'].get('future') is None
del results['GooglePlayStore']['future']
assert results == RESULTS_EXAMPLE
def test_extract_ids_from_url(default_db):
assert extract_ids_from_url('https://www.reddit.com/user/test', default_db) == {
'test': 'username'
}
assert extract_ids_from_url('https://vk.com/id123', default_db) == {'123': 'vk_id'}
assert extract_ids_from_url('https://vk.com/ida123', default_db) == {
'ida123': 'username'
}
assert extract_ids_from_url(
'https://my.mail.ru/yandex.ru/dipres8904/', default_db
) == {'dipres8904': 'username'}
assert extract_ids_from_url(
'https://reviews.yandex.ru/user/adbced123', default_db
) == {'adbced123': 'yandex_public_id'}
@pytest.mark.slow
def test_extract_ids_from_page(test_db):
    logger = Mock()

    assert extract_ids_from_page('https://www.reddit.com/user/test', logger) == {
        'test': 'username'
    }
def test_extract_ids_from_results(test_db):
    TEST_EXAMPLE = copy.deepcopy(RESULTS_EXAMPLE)
    TEST_EXAMPLE['Reddit']['ids_usernames'] = {'test1': 'yandex_public_id'}
    TEST_EXAMPLE['Reddit']['ids_links'] = ['https://www.reddit.com/user/test2']

    assert extract_ids_from_results(TEST_EXAMPLE, test_db) == {
        'test1': 'yandex_public_id',
        'test2': 'username',
    }
+64
@@ -0,0 +1,64 @@
from maigret.errors import CheckError
from maigret.notify import QueryNotifyPrint
from maigret.result import QueryStatus, QueryResult
def test_notify_illegal():
n = QueryNotifyPrint(color=False)
assert (
n.update(
QueryResult(
username="test",
status=QueryStatus.ILLEGAL,
site_name="TEST_SITE",
site_url_user="http://example.com/test",
)
)
== "[-] TEST_SITE: Illegal Username Format For This Site!"
)
def test_notify_claimed():
n = QueryNotifyPrint(color=False)
assert (
n.update(
QueryResult(
username="test",
status=QueryStatus.CLAIMED,
site_name="TEST_SITE",
site_url_user="http://example.com/test",
)
)
== "[+] TEST_SITE: http://example.com/test"
)
def test_notify_available():
n = QueryNotifyPrint(color=False)
assert (
n.update(
QueryResult(
username="test",
status=QueryStatus.AVAILABLE,
site_name="TEST_SITE",
site_url_user="http://example.com/test",
)
)
== "[-] TEST_SITE: Not found!"
)
def test_notify_unknown():
n = QueryNotifyPrint(color=False)
result = QueryResult(
username="test",
status=QueryStatus.UNKNOWN,
site_name="TEST_SITE",
site_url_user="http://example.com/test",
)
result.error = CheckError('Type', 'Reason')
assert n.update(result) == "[?] TEST_SITE: Type error: Reason"
+17 -3
@@ -16,8 +16,14 @@ from maigret.report import (
generate_report_template,
generate_report_context,
generate_json_report,
get_plaintext_report,
)
from maigret.result import QueryResult, QueryStatus
from maigret.sites import MaigretSite
GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
EXAMPLE_RESULTS = {
'GitHub': {
@@ -35,12 +41,10 @@ EXAMPLE_RESULTS = {
'http_status': 200,
'is_similar': False,
'rank': 78,
'site': MaigretSite('test', {}),
}
}
GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
GOOD_500PX_RESULT.ids_data = {
@@ -343,3 +347,13 @@ def test_pdf_report():
save_pdf_report(report_name, context)
assert os.path.exists(report_name)
def test_text_report():
context = generate_report_context(TEST)
report_text = get_plaintext_report(context)
for brief_part in SUPPOSED_BRIEF.split():
assert brief_part in report_text
assert 'us' in report_text
assert 'photo' in report_text
+1
@@ -103,6 +103,7 @@ def test_saving_site_error():
amperka = db.sites[0]
assert len(amperka.errors) == 2
assert len(amperka.errors_dict) == 2
assert amperka.strip_engine_data().errors == {'error1': 'text1'}
assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
+29 -17
@@ -40,13 +40,13 @@ def test_case_convert_camel_with_digits_to_snake():
def test_is_country_tag():
-   assert is_country_tag('ru') == True
-   assert is_country_tag('FR') == True
+   assert is_country_tag('ru') is True
+   assert is_country_tag('FR') is True

-   assert is_country_tag('a1') == False
-   assert is_country_tag('dating') == False
+   assert is_country_tag('a1') is False
+   assert is_country_tag('dating') is False

-   assert is_country_tag('global') == True
+   assert is_country_tag('global') is True
def test_enrich_link_str():
@@ -57,6 +57,11 @@ def test_enrich_link_str():
)
def test_url_extract_main_part_negative():
url_main_part = 'None'
assert URLMatcher.extract_main_part(url_main_part) == ''
def test_url_extract_main_part():
url_main_part = 'flickr.com/photos/alexaimephotography'
@@ -68,8 +73,10 @@ def test_url_extract_main_part():
]
url_regexp = re.compile('^https?://(www.)?flickr.com/photos/(.+?)$')
# combine parts variations
for url_parts in itertools.product(*parts):
url = ''.join(url_parts)
# ensure all combinations give valid main part
assert URLMatcher.extract_main_part(url) == url_main_part
assert not url_regexp.match(url) is None
@@ -84,8 +91,10 @@ def test_url_make_profile_url_regexp():
['/', ''],
]
# combine parts variations
for url_parts in itertools.product(*parts):
url = ''.join(url_parts)
# ensure all combinations match pattern
assert (
URLMatcher.make_profile_url_regexp(url).pattern
== r'^https?://(www.)?flickr\.com/photos/(.+?)$'
@@ -98,6 +107,7 @@ def test_get_dict_ascii_tree():
'legacy_id': '26403415',
'username': 'alexaimephotographycars',
'name': 'Alex Aimé',
'links': "['www.instagram.com/street.reality.photography/']",
'created_at': '2018-05-04T10:17:01.000+0000',
'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b',
'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201',
@@ -107,20 +117,22 @@ def test_get_dict_ascii_tree():
'twitter_username': 'Alexaimephotogr',
}
ascii_tree = get_dict_ascii_tree(data.items())
ascii_tree = get_dict_ascii_tree(data.items(), prepend=" ")
assert (
ascii_tree
== """
┣╸uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==
┣╸legacy_id: 26403415
┣╸username: alexaimephotographycars
┣╸name: Alex Aimé
┣╸created_at: 2018-05-04T10:17:01.000+0000
┣╸image: https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b
┣╸image_bg: https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201
┣╸website: www.instagram.com/street.reality.photography/
┣╸facebook_link: www.instagram.com/street.reality.photography/
┣╸instagram_username: Street.Reality.Photography
┗╸twitter_username: Alexaimephotogr"""
┣╸uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==
┣╸legacy_id: 26403415
┣╸username: alexaimephotographycars
┣╸name: Alex Aimé
┣╸links:
┃ ┗╸ www.instagram.com/street.reality.photography/
┣╸created_at: 2018-05-04T10:17:01.000+0000
┣╸image: https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b
┣╸image_bg: https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201
┣╸website: www.instagram.com/street.reality.photography/
┣╸facebook_link: www.instagram.com/street.reality.photography/
┣╸instagram_username: Street.Reality.Photography
┗╸twitter_username: Alexaimephotogr"""
)
+57
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
import random
from argparse import ArgumentParser, RawDescriptionHelpFormatter

from maigret.maigret import MaigretDatabase
from maigret.submit import get_alexa_rank


def update_tags(site):
    tags = []
    if not site.tags:
        print(f'Site {site.name} doesn\'t have tags')
    else:
        tags = site.tags
        print(f'Site {site.name} tags: ' + ', '.join(tags))

    print(f'URL: {site.url_main}')

    new_tags = set(input('Enter new tags: ').split(', '))
    if "disabled" in new_tags:
        new_tags.remove("disabled")
        site.disabled = True

    print(f'Old alexa rank: {site.alexa_rank}')
    rank = get_alexa_rank(site.url_main)
    if rank:
        print(f'New alexa rank: {rank}')
        site.alexa_rank = rank

    site.tags = [x for x in list(new_tags) if x]


if __name__ == '__main__':
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")

    pool = list()

    args = parser.parse_args()

    db = MaigretDatabase()
    db.load_from_file(args.base_file).sites

    while True:
        site = random.choice(db.sites)
        if site.engine == 'uCoz':
            continue
        if not 'in' in site.tags:
            continue
        update_tags(site)
        db.save_to_file(args.base_file)
+22 -17
@@ -37,15 +37,15 @@ def get_rank(domain_to_query, site, print_errors=True):
try:
#Get ranking for this site.
site.alexa_rank = int(root.find('.//REACH').attrib['RANK'])
country = root.find('.//COUNTRY')
if not country is None and country.attrib:
country_code = country.attrib['CODE']
tags = set(site.tags)
if country_code:
tags.add(country_code.lower())
site.tags = sorted(list(tags))
if site.type != 'username':
site.disabled = False
# country = root.find('.//COUNTRY')
# if not country is None and country.attrib:
# country_code = country.attrib['CODE']
# tags = set(site.tags)
# if country_code:
# tags.add(country_code.lower())
# site.tags = sorted(list(tags))
# if site.type != 'username':
# site.disabled = False
except Exception as e:
if print_errors:
logging.error(e)
@@ -74,6 +74,7 @@ if __name__ == '__main__':
dest="base_file", default="maigret/resources/data.json",
help="JSON file with sites data to update.")
parser.add_argument('--with-rank', help='update with use of local data only', action='store_true')
parser.add_argument('--empty-only', help='update only sites without rating', action='store_true')
parser.add_argument('--exclude-engine', help='do not update score with certain engine',
action="append", dest="exclude_engine_list", default=[])
@@ -87,28 +88,31 @@ if __name__ == '__main__':
with open("sites.md", "w") as site_file:
site_file.write(f"""
## List of supported sites: total {len(sites_subset)}\n
## List of supported sites (search methods): total {len(sites_subset)}\n
Rank data fetched from Alexa by domains.
""")
for site in sites_subset:
if not args.with_rank:
break
url_main = site.url_main
if site.alexa_rank < sys.maxsize and args.empty_only:
continue
if args.exclude_engine_list and site.engine in args.exclude_engine_list:
continue
site.alexa_rank = 0
th = threading.Thread(target=get_rank, args=(url_main, site))
th = threading.Thread(target=get_rank, args=(url_main, site,))
pool.append((site.name, url_main, th))
th.start()
index = 1
for site_name, url_main, th in pool:
th.join()
sys.stdout.write("\r{0}".format(f"Updated {index} out of {len(sites_subset)} entries"))
sys.stdout.flush()
index = index + 1
if args.with_rank:
index = 1
for site_name, url_main, th in pool:
th.join()
sys.stdout.write("\r{0}".format(f"Updated {index} out of {len(sites_subset)} entries"))
sys.stdout.flush()
index = index + 1
sites_full_list = [(s, s.alexa_rank) for s in sites_subset]
@@ -123,6 +127,7 @@ Rank data fetched from Alexa by domains.
url_main = site.url_main
valid_rank = get_step_rank(rank)
all_tags = site.tags
all_tags.sort()
tags = ', ' + ', '.join(all_tags) if all_tags else ''
note = ''
if site.disabled: