Compare commits

..

45 Commits

Author SHA1 Message Date
soxoj bea900dda0 Merge pull request #155 from soxoj/0.2.4
Bump to 0.2.4
2021-05-18 01:20:00 +03:00
Soxoj bb1bde833d Bump to 0.2.4 2021-05-18 01:17:35 +03:00
soxoj 5b405c6abb Merge pull request #154 from soxoj/tests-improving
Improved tests
2021-05-18 00:57:31 +03:00
Soxoj 99fa58ceed Disabled Twitter activation test 2021-05-18 00:55:18 +03:00
Soxoj c71e404f63 Added test dependencies 2021-05-18 00:49:13 +03:00
Soxoj 2c04ccce57 Improved tests 2021-05-18 00:43:56 +03:00
soxoj 435db7cdc9 Merge pull request #153 from soxoj/sites-update-16-05-21
Several sites added, updated site list
2021-05-17 00:35:56 +03:00
Soxoj 413a0502a4 Several sites added, updated site list 2021-05-16 17:02:41 +03:00
soxoj 2aedcc3166 Merge pull request #152 from soxoj/cli-plaintext-report
Added text report to CLI output
2021-05-15 16:57:22 +03:00
Soxoj 28835204f5 Added text report to CLI output 2021-05-15 16:55:05 +03:00
soxoj b11a247dfd Merge pull request #151 from soxoj/tags-socid-extractor
Tags updated, added tests for tags
2021-05-15 14:55:01 +03:00
Soxoj c9219d91ec Tags updated, added tests for tags
Added several sites
Updated socid_extractor version to avoid bug #150
2021-05-15 14:51:30 +03:00
soxoj aa6cd0eca9 Merge pull request #149 from soxoj/0.2.3
Bump to 0.2.3
2021-05-12 22:40:02 +03:00
Soxoj 38e5d5c664 Bump to 0.2.3 2021-05-12 22:37:19 +03:00
soxoj 8a562d06ae Merge pull request #148 from soxoj/sites-updates-12-05
Fixed Anobii, added several new sites
2021-05-12 19:27:19 +03:00
Soxoj aa50ee9672 Fixed Anobii, added several new sites 2021-05-12 19:25:14 +03:00
soxoj 51327f9647 Merge pull request #146 from soxoj/links-sites
Added several links sites
2021-05-10 14:21:48 +03:00
Soxoj 4a368c9bb6 Added several links sites 2021-05-10 14:19:52 +03:00
soxoj 6fd5f6e33a Update build-docker-image.yml 2021-05-10 02:51:56 +03:00
soxoj fa3db9c39c Merge pull request #144 from soxoj/stackoverflow
Added fuzzy search by StackOverflow
2021-05-10 00:42:02 +03:00
Soxoj 5912ad4fbc Added fuzzy search by StackOverflow 2021-05-10 00:39:36 +03:00
soxoj ee36dc0187 Merge pull request #143 from soxoj/tags-updates-1
Tags sorting and some updates
2021-05-09 23:21:57 +03:00
Soxoj 9eb62e4e22 Tags sorting and some updates 2021-05-09 23:19:41 +03:00
soxoj ead048af93 Merge pull request #142 from soxoj/photo-sites-1
Photo sites added
2021-05-09 18:24:33 +03:00
Soxoj acc751ff98 Photo sites added 2021-05-09 16:48:46 +03:00
soxoj b7bdd71cf0 Merge pull request #141 from soxoj/tags-update-script
Tags updates, script added
2021-05-09 16:47:41 +03:00
Soxoj 43f189f774 Tags updates, script added 2021-05-09 16:25:42 +03:00
soxoj 5bda7fb339 Merge pull request #140 from soxoj/tags-updates
Tags updates
2021-05-09 00:18:53 +03:00
Soxoj 414523a8ac Tags updates 2021-05-09 00:16:58 +03:00
soxoj 6d4e268706 Merge pull request #139 from soxoj/photo-sites
Added some photo sites, improved errors detecting
2021-05-08 20:39:46 +03:00
Soxoj b696b982f4 Added some photo sites, improved errors detecting 2021-05-08 20:37:34 +03:00
soxoj d4234036c0 Merge pull request #137 from soxoj/minor-fixes
Version patch and some minor fixes
2021-05-08 16:57:30 +03:00
Soxoj b57c70091c Added __version__.py 2021-05-08 16:55:49 +03:00
Soxoj e90df3560b Version patch and some minor fixes 2021-05-08 16:46:38 +03:00
soxoj bc6ee48b8c Merge pull request #136 from soxoj/dockerhub-image
Create build-docker-image.yml
2021-05-08 15:41:57 +03:00
soxoj e70bdf3789 Readme update 2021-05-08 15:41:38 +03:00
soxoj 84f9d417cf Create build-docker-image.yml 2021-05-08 15:16:37 +03:00
soxoj 4333c40be7 Merge pull request #135 from soxoj/new-sites-08-05-21
Added Weibo, Reddit BigData search, Wigle and several other sites
2021-05-08 13:56:06 +03:00
Soxoj 9e504c0094 Added Weibo, Reddit BigData search, Wigle and several other sites 2021-05-08 13:54:25 +03:00
soxoj 2f752a0368 Merge pull request #132 from soxoj/yelp
Added Yelp and yelp_userid support
2021-05-08 03:36:23 +03:00
Soxoj 53e9dab677 Added Yelp and yelp_userid support 2021-05-08 03:34:03 +03:00
soxoj 11b70a2a48 Merge pull request #131 from soxoj/facebook-ids-fixes
Facebook parsing fixed, website field added
2021-05-08 02:38:54 +03:00
Soxoj 960708ef2e Facebook parsing fixed, website field added 2021-05-08 02:25:54 +03:00
soxoj e6f6d8735d Merge pull request #130 from soxoj/tags-stabilization
Tags markup stabilization
2021-05-08 01:04:46 +03:00
Soxoj f77d7d307a Tags markup stabilization 2021-05-08 00:59:54 +03:00
28 changed files with 5931 additions and 3993 deletions
+32
View File
@@ -0,0 +1,32 @@
name: Build docker image and push to DockerHub
on:
push:
branches: [ main ]
jobs:
docker:
runs-on: ubuntu-latest
steps:
-
name: Set up QEMU
uses: docker/setup-qemu-action@v1
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKER_HUB_USERNAME }}
password: ${{ secrets.DOCKER_HUB_ACCESS_TOKEN }}
-
name: Build and push
id: docker_build
uses: docker/build-push-action@v2
with:
push: true
tags: ${{ secrets.DOCKER_HUB_USERNAME }}/maigret:latest
-
name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
+1 -1
View File
@@ -26,7 +26,7 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
python -m pip install flake8 pytest pytest-rerunfailures python -m pip install -r test-requirements.txt
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest - name: Test with pytest
run: | run: |
+9
View File
@@ -2,6 +2,15 @@
## [Unreleased] ## [Unreleased]
## [0.2.4] - 2021-05-18
* cli output report
* various improvements
## [0.2.3] - 2021-05-12
* added Yelp and yelp_userid support
* tags markup stabilization
* improved errors detection
## [0.2.2] - 2021-05-07 ## [0.2.2] - 2021-05-07
* improved ids extractors * improved ids extractors
* updated sites and engines * updated sites and engines
+48 -41
View File
@@ -1,40 +1,55 @@
# Maigret # Maigret
![PyPI](https://img.shields.io/pypi/v/maigret?style=flat-square)
![PyPI - Downloads](https://img.shields.io/pypi/dw/maigret?style=flat-square)
[![Chat - Gitter](./static/chat_gitter.svg)](https://gitter.im/maigret-osint/community)
<p align="center"> <p align="center">
<img src="./static/maigret.png" /> <p align="center">
<a href="https://pypi.org/project/maigret/">
<img alt="PyPI" src="https://img.shields.io/pypi/v/maigret?style=flat-square">
</a>
<a href="https://pypi.org/project/maigret/">
<img alt="PyPI - Downloads" src="https://img.shields.io/pypi/dw/maigret?style=flat-square">
</a>
<a href="https://gitter.im/maigret-osint/community">
<img alt="Chat - Gitter" src="./static/chat_gitter.svg" />
</a>
<a href="https://twitter.com/intent/follow?screen_name=sox0j">
<img src="https://img.shields.io/twitter/follow/sox0j?label=Follow%20sox0j&style=social&color=blue" alt="Follow @sox0j" />
</a>
</p>
<p align="center">
<img src="./static/maigret.png" height="200"/>
</p>
</p> </p>
<i>The Commissioner Jules Maigret is a fictional French police detective, created by Georges Simenon. His investigation method is based on understanding the personality of different people and their interactions.</i> <i>The Commissioner Jules Maigret is a fictional French police detective, created by Georges Simenon. His investigation method is based on understanding the personality of different people and their interactions.</i>
## About ## About
Purpose of Maigret - **collect a dossier on a person by username only**, checking for accounts on a huge number of sites. **Maigret** collects a dossier on a person **by username only**, checking for accounts on a huge number of sites and gathering all the available information from web pages. Maigret is an easy-to-use and powerful fork of [Sherlock](https://github.com/sherlock-project/sherlock).
This is a [sherlock](https://github.com/sherlock-project/) fork with cool features under heavy development. More than 2000 sites are currently supported ([full list](./sites.md)); by default, the search is launched against the 500 most popular sites, in descending order of popularity.
*Don't forget to regularly update source code from repo*.
Currently supported more than 2000 sites ([full list](./sites.md)), by default search is launched against 500 popular sites in descending order of popularity.
## Main features ## Main features
* Profile pages parsing, [extracting](https://github.com/soxoj/socid_extractor) personal info, links to other profiles, etc. * Profile pages parsing, [extraction](https://github.com/soxoj/socid_extractor) of personal info, links to other profiles, etc.
* Recursive search by new usernames found * Recursive search by new usernames and other ids found
* Search by tags (site categories, countries) * Search by tags (site categories, countries)
* Censorship and captcha detection * Censorship and captcha detection
* Very few false positives * Requests retries
* Failed requests' restarts
See full description of Maigret features [in the Wiki](https://github.com/soxoj/maigret/wiki/Features).
## Installation ## Installation
**NOTE**: Python 3.6 or higher and pip is required. Maigret can be installed using pip, Docker, or simply can be launched from the cloned repo.
Also you can run Maigret using cloud shells (see buttons below).
**Python 3.8 is recommended.** [![Open in Cloud Shell](https://user-images.githubusercontent.com/27065646/92304704-8d146d80-ef80-11ea-8c29-0deaabb1c702.png)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md) [![Run on Repl.it](https://user-images.githubusercontent.com/27065646/92304596-bf719b00-ef7f-11ea-987f-2c1f3c323088.png)](https://repl.it/github/soxoj/maigret)
<a href="https://colab.research.google.com/gist//soxoj/879b51bc3b2f8b695abb054090645000/maigret.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="40"></a>
### Package installing ### Package installing
**NOTE**: Python 3.6 or higher and pip are required; **Python 3.8 is recommended.**
```bash ```bash
# install from pypi # install from pypi
pip3 install maigret pip3 install maigret
@@ -42,34 +57,36 @@ pip3 install maigret
# or clone and install manually # or clone and install manually
git clone https://github.com/soxoj/maigret && cd maigret git clone https://github.com/soxoj/maigret && cd maigret
pip3 install . pip3 install .
# usage
maigret username
``` ```
### Cloning a repository ### Cloning a repository
```bash ```bash
git clone https://github.com/soxoj/maigret && cd maigret git clone https://github.com/soxoj/maigret && cd maigret
```
You can use a free virtual machine, the repo will be automatically cloned:
[![Open in Cloud Shell](https://user-images.githubusercontent.com/27065646/92304704-8d146d80-ef80-11ea-8c29-0deaabb1c702.png)](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md) [![Run on Repl.it](https://user-images.githubusercontent.com/27065646/92304596-bf719b00-ef7f-11ea-987f-2c1f3c323088.png)](https://repl.it/github/soxoj/maigret)
<a href="https://colab.research.google.com/gist//soxoj/879b51bc3b2f8b695abb054090645000/maigret.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="40"></a>
```bash
pip3 install -r requirements.txt pip3 install -r requirements.txt
# usage
./maigret.py username
``` ```
## Using examples ### Docker
```bash ```bash
# for a cloned repo # official image
./maigret.py user docker pull soxoj/maigret
# for a package # usage
maigret user docker run soxoj/maigret:latest username
# manual build
docker build -t maigret .
``` ```
Features: ## Usage examples
```bash ```bash
# make HTML and PDF reports # make HTML and PDF reports
maigret user --html --pdf maigret user --html --pdf
@@ -77,22 +94,12 @@ maigret user --html --pdf
# search on sites marked with tags photo & dating # search on sites marked with tags photo & dating
maigret user --tags photo,dating maigret user --tags photo,dating
# search for three usernames on all available sites # search for three usernames on all available sites
maigret user1 user2 user3 -a maigret user1 user2 user3 -a
``` ```
Run `maigret --help` to get arguments description. Also options are documented in [the Maigret Wiki](https://github.com/soxoj/maigret/wiki/Command-line-options). Use `maigret --help` to get full options description. Also options are documented in [the Maigret Wiki](https://github.com/soxoj/maigret/wiki/Command-line-options).
With Docker:
```
# manual build
docker build -t maigret . && docker run maigret user
# official image
docker run soxoj/maigret:latest user
```
## Demo with page parsing and recursive username search ## Demo with page parsing and recursive username search
+7
View File
@@ -1,5 +1,12 @@
"""Maigret""" """Maigret"""
__title__ = 'Maigret'
__package__ = 'maigret'
__author__ = 'Soxoj'
__author_email__ = 'soxoj@protonmail.com'
from .__version__ import __version__
from .checking import maigret as search from .checking import maigret as search
from .sites import MaigretEngine, MaigretSite, MaigretDatabase from .sites import MaigretEngine, MaigretSite, MaigretDatabase
from .notify import QueryNotifyPrint as Notifier from .notify import QueryNotifyPrint as Notifier
+3
View File
@@ -0,0 +1,3 @@
"""Maigret version file"""
__version__ = '0.2.4'
+15 -8
View File
@@ -13,6 +13,7 @@ import tqdm.asyncio
from aiohttp_socks import ProxyConnector from aiohttp_socks import ProxyConnector
from python_socks import _errors as proxy_errors from python_socks import _errors as proxy_errors
from socid_extractor import extract from socid_extractor import extract
from aiohttp.client_exceptions import ServerDisconnectedError, ClientConnectorError
from .activation import ParsingActivator, import_aiohttp_cookies from .activation import ParsingActivator, import_aiohttp_cookies
from . import errors from . import errors
@@ -25,7 +26,7 @@ from .executors import (
from .result import QueryResult, QueryStatus from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite from .sites import MaigretDatabase, MaigretSite
from .types import QueryOptions, QueryResultWrapper from .types import QueryOptions, QueryResultWrapper
from .utils import get_random_user_agent from .utils import get_random_user_agent, ascii_data_display
SUPPORTED_IDS = ( SUPPORTED_IDS = (
@@ -36,6 +37,7 @@ SUPPORTED_IDS = (
"wikimapia_uid", "wikimapia_uid",
"steam_id", "steam_id",
"uidme_uguid", "uidme_uguid",
"yelp_userid",
) )
BAD_CHARS = "#" BAD_CHARS = "#"
@@ -63,8 +65,10 @@ async def get_response(request_future, logger) -> Tuple[str, int, Optional[Check
except asyncio.TimeoutError as e: except asyncio.TimeoutError as e:
error = CheckError("Request timeout", str(e)) error = CheckError("Request timeout", str(e))
except aiohttp.client_exceptions.ClientConnectorError as e: except ClientConnectorError as e:
error = CheckError("Connecting failure", str(e)) error = CheckError("Connecting failure", str(e))
except ServerDisconnectedError as e:
error = CheckError("Server disconnected", str(e))
except aiohttp.http_exceptions.BadHttpMessage as e: except aiohttp.http_exceptions.BadHttpMessage as e:
error = CheckError("HTTP", str(e)) error = CheckError("HTTP", str(e))
except proxy_errors.ProxyError as e: except proxy_errors.ProxyError as e:
@@ -154,7 +158,7 @@ def process_site_result(
# additional check for errors # additional check for errors
if status_code and not check_error: if status_code and not check_error:
check_error = detect_error_page( check_error = detect_error_page(
html_text, status_code, site.errors, site.ignore403 html_text, status_code, site.errors_dict, site.ignore403
) )
# parsing activation # parsing activation
@@ -229,9 +233,9 @@ def process_site_result(
result = build_result(QueryStatus.CLAIMED) result = build_result(QueryStatus.CLAIMED)
else: else:
result = build_result(QueryStatus.AVAILABLE) result = build_result(QueryStatus.AVAILABLE)
elif check_type == "status_code": elif check_type in "status_code":
# Checks if the status code of the response is 2XX # Checks if the status code of the response is 2XX
if is_presense_detected and (not status_code >= 300 or status_code < 200): if 200 <= status_code < 300:
result = build_result(QueryStatus.CLAIMED) result = build_result(QueryStatus.CLAIMED)
else: else:
result = build_result(QueryStatus.AVAILABLE) result = build_result(QueryStatus.AVAILABLE)
@@ -268,7 +272,10 @@ def process_site_result(
new_usernames[v] = k new_usernames[v] = k
results_info["ids_usernames"] = new_usernames results_info["ids_usernames"] = new_usernames
results_info["ids_links"] = eval(extracted_ids_data.get("links", "[]")) links = ascii_data_display(extracted_ids_data.get("links", "[]"))
if "website" in extracted_ids_data:
links.append(extracted_ids_data["website"])
results_info["ids_links"] = links
result.ids_data = extracted_ids_data result.ids_data = extracted_ids_data
# Save status of request # Save status of request
@@ -449,7 +456,7 @@ async def maigret(
logger, logger,
query_notify=None, query_notify=None,
proxy=None, proxy=None,
timeout=None, timeout=3,
is_parsing_enabled=False, is_parsing_enabled=False,
id_type="username", id_type="username",
debug=False, debug=False,
@@ -471,7 +478,7 @@ async def maigret(
query results. query results.
logger -- Standard Python logger object. logger -- Standard Python logger object.
timeout -- Time in seconds to wait before timing out request. timeout -- Time in seconds to wait before timing out request.
Default is no timeout. Default is 3 seconds.
is_parsing_enabled -- Extract additional info from account pages. is_parsing_enabled -- Extract additional info from account pages.
id_type -- Type of username to search. id_type -- Type of username to search.
Default is 'username', see all supported here: Default is 'username', see all supported here:
+10 -4
View File
@@ -13,6 +13,7 @@ from typing import List, Tuple
import requests import requests
from socid_extractor import extract, parse, __version__ as socid_version from socid_extractor import extract, parse, __version__ as socid_version
from .__version__ import __version__
from .checking import ( from .checking import (
timeout_check, timeout_check,
SUPPORTED_IDS, SUPPORTED_IDS,
@@ -31,14 +32,13 @@ from .report import (
save_txt_report, save_txt_report,
SUPPORTED_JSON_REPORT_FORMATS, SUPPORTED_JSON_REPORT_FORMATS,
save_json_report, save_json_report,
get_plaintext_report,
) )
from .sites import MaigretDatabase from .sites import MaigretDatabase
from .submit import submit_dialog from .submit import submit_dialog
from .types import QueryResultWrapper from .types import QueryResultWrapper
from .utils import get_dict_ascii_tree from .utils import get_dict_ascii_tree
__version__ = '0.2.2'
def notify_about_errors(search_results: QueryResultWrapper, query_notify): def notify_about_errors(search_results: QueryResultWrapper, query_notify):
errs = errors.extract_and_group(search_results) errs = errors.extract_and_group(search_results)
@@ -49,7 +49,7 @@ def notify_about_errors(search_results: QueryResultWrapper, query_notify):
text = f'Too many errors of type "{e["err"]}" ({e["perc"]}%)' text = f'Too many errors of type "{e["err"]}" ({e["perc"]}%)'
solution = errors.solution_of(e['err']) solution = errors.solution_of(e['err'])
if solution: if solution:
text = '. '.join([text, solution]) text = '. '.join([text, solution.capitalize()])
query_notify.warning(text, '!') query_notify.warning(text, '!')
was_errs_displayed = True was_errs_displayed = True
@@ -166,7 +166,7 @@ def setup_arguments_parser():
type=int, type=int,
metavar='RETRIES', metavar='RETRIES',
default=1, default=1,
help="Attempts to restart temporary failed requests.", help="Attempts to restart temporarily failed requests.",
) )
parser.add_argument( parser.add_argument(
"-n", "-n",
@@ -647,6 +647,12 @@ async def main():
filename = report_filepath_tpl.format(username=username, postfix='.pdf') filename = report_filepath_tpl.format(username=username, postfix='.pdf')
save_pdf_report(filename, report_context) save_pdf_report(filename, report_context)
query_notify.warning(f'PDF report on all usernames saved in {filename}') query_notify.warning(f'PDF report on all usernames saved in {filename}')
text_report = get_plaintext_report(report_context)
if text_report:
query_notify.info('Short text report:')
print(text_report)
# update database # update database
db.save_to_file(args.db_file) db.save_to_file(args.db_file)
+10 -3
View File
@@ -205,13 +205,20 @@ class QueryNotifyPrint(QueryNotify):
else: else:
print(f"[*] {title} {message} on:") print(f"[*] {title} {message} on:")
def warning(self, message, symbol="-"): def _colored_print(self, fore_color, msg):
msg = f"[{symbol}] {message}"
if self.color: if self.color:
print(Style.BRIGHT + Fore.YELLOW + msg) print(Style.BRIGHT + fore_color + msg)
else: else:
print(msg) print(msg)
def warning(self, message, symbol="-"):
msg = f"[{symbol}] {message}"
self._colored_print(Fore.YELLOW, msg)
def info(self, message, symbol="*"):
msg = f"[{symbol}] {message}"
self._colored_print(Fore.BLUE, msg)
def update(self, result, is_similar=False): def update(self, result, is_similar=False):
"""Notify Update. """Notify Update.
+12
View File
@@ -70,6 +70,17 @@ def save_json_report(filename: str, username: str, results: dict, report_type: s
generate_json_report(username, results, f, report_type=report_type) generate_json_report(username, results, f, report_type=report_type)
def get_plaintext_report(context: dict) -> str:
    """Build a short plain-text summary of a search for CLI output.

    The brief description is split so that every sentence starts on its own
    line; optional ``Countries`` and ``Interests (tags)`` lines follow it.

    Args:
        context: Report context. Reads ``brief`` (a string) and the optional
            ``countries_tuple_list`` / ``interests_tuple_list`` sequences;
            only the first item of each of their elements is used.

    Returns:
        The report text without leading/trailing whitespace, or an empty
        string when there is nothing to report.
    """
    brief = context.get('brief') or ''
    # Put every sentence on its own line; the padding space lets the final
    # sentence match the '. ' pattern as well.
    sentences = (brief + ' ').replace('. ', '.\n').strip()
    parts = [sentences] if sentences else []

    countries = [item[0] for item in context.get('countries_tuple_list', [])]
    interests = [item[0] for item in context.get('interests_tuple_list', [])]
    if countries:
        parts.append(f'Countries: {", ".join(countries)}')
    if interests:
        parts.append(f'Interests (tags): {", ".join(interests)}')

    # Joining with newlines keeps the extra lines separated from the brief
    # even when the brief does not end with a period.
    return '\n'.join(parts)
""" """
REPORTS GENERATING REPORTS GENERATING
""" """
@@ -215,6 +226,7 @@ def generate_report_context(username_results: list):
return { return {
"username": first_username, "username": first_username,
# TODO: return brief list
"brief": brief, "brief": brief,
"results": username_results, "results": username_results,
"first_seen": first_seen, "first_seen": first_seen,
+3870 -2321
View File
File diff suppressed because it is too large Load Diff
+21 -1
View File
@@ -53,6 +53,18 @@ SUPPORTED_TAGS = [
"medicine", "medicine",
"reading", "reading",
"stock", "stock",
"messaging",
"trading",
"links",
"fashion",
"tasks",
"military",
"auto",
"gambling",
"cybercriminal",
"review",
"bookmarks",
"design",
] ]
@@ -180,6 +192,14 @@ class MaigretSite:
return result return result
# Combined error mapping for this site: starts from the `errors` entry found
# under `engine_obj.site` (only when an engine object is set) and then applies
# the site's own `errors` on top, so site-level entries win on key collisions.
@property
def errors_dict(self) -> dict:
errors: Dict[str, str] = {}
if self.engine_obj:
errors.update(self.engine_obj.site.get('errors', {}))
errors.update(self.errors)
return errors
def get_url_type(self) -> str: def get_url_type(self) -> str:
url = URLMatcher.extract_main_part(self.url) url = URLMatcher.extract_main_part(self.url)
if url.startswith("{username}"): if url.startswith("{username}"):
@@ -456,7 +476,7 @@ class MaigretDatabase:
output += f"{count}\t{url}\n" output += f"{count}\t{url}\n"
output += "Top tags:\n" output += "Top tags:\n"
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:20]: for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
mark = "" mark = ""
if tag not in SUPPORTED_TAGS: if tag not in SUPPORTED_TAGS:
mark = " (non-standard)" mark = " (non-standard)"
+36 -11
View File
@@ -2,7 +2,7 @@ import asyncio
import difflib import difflib
import re import re
from typing import List from typing import List
import xml.etree.ElementTree as ET
import requests import requests
from .activation import import_aiohttp_cookies from .activation import import_aiohttp_cookies
@@ -46,6 +46,20 @@ def get_match_ratio(x):
) )
def get_alexa_rank(site_url_main, timeout=10):
    """Return the Alexa rank of a site, or 0 when it cannot be determined.

    Args:
        site_url_main: Main page URL of the site to look up.
        timeout: Seconds to wait for the Alexa response, so an unresponsive
            server cannot hang the caller.

    Returns:
        The integer ``RANK`` attribute of the ``REACH`` element, or 0 if the
        request fails, the response is not valid XML, or no usable rank is
        present.
    """
    url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
    try:
        xml_data = requests.get(url, timeout=timeout).text
        root = ET.fromstring(xml_data)
    except (requests.RequestException, ET.ParseError):
        # The rank is best-effort metadata: network or parsing problems
        # must not abort the caller.
        return 0

    reach = root.find('.//REACH')
    if reach is None:
        return 0
    try:
        return int(reach.attrib['RANK'])
    except (KeyError, ValueError):
        return 0
def extract_mainpage_url(url): def extract_mainpage_url(url):
return "/".join(url.split("/", 3)[:3]) return "/".join(url.split("/", 3)[:3])
@@ -133,6 +147,7 @@ async def detect_known_engine(
) -> List[MaigretSite]: ) -> List[MaigretSite]:
try: try:
r = requests.get(url_mainpage) r = requests.get(url_mainpage)
logger.debug(r.text)
except Exception as e: except Exception as e:
logger.warning(e) logger.warning(e)
print("Some error while checking main page") print("Some error while checking main page")
@@ -199,6 +214,7 @@ async def check_features_manually(
# cookies # cookies
cookie_dict = None cookie_dict = None
if cookie_file: if cookie_file:
logger.info(f'Use {cookie_file} for cookies')
cookie_jar = await import_aiohttp_cookies(cookie_file) cookie_jar = await import_aiohttp_cookies(cookie_file)
cookie_dict = {c.key: c.value for c in cookie_jar} cookie_dict = {c.key: c.value for c in cookie_jar}
@@ -239,7 +255,7 @@ async def check_features_manually(
features = input("If features was not detected correctly, write it manually: ") features = input("If features was not detected correctly, write it manually: ")
if features: if features:
presence_list = features.split(",") presence_list = list(map(str.strip, features.split(",")))
absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[ absence_list = sorted(b_minus_a, key=get_match_ratio, reverse=True)[
:top_features_count :top_features_count
@@ -248,7 +264,7 @@ async def check_features_manually(
features = input("If features was not detected correctly, write it manually: ") features = input("If features was not detected correctly, write it manually: ")
if features: if features:
absence_list = features.split(",") absence_list = list(map(str.strip, features.split(",")))
site_data = { site_data = {
"absenceStrs": absence_list, "absenceStrs": absence_list,
@@ -327,17 +343,26 @@ async def submit_dialog(db, url_exists, cookie_file, logger):
print( print(
"Try to run this mode again and increase features count or choose others." "Try to run this mode again and increase features count or choose others."
) )
return False
else: else:
if ( if (
input( input(
f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] " f"Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] "
).lower() )
in "y" .lower()
.strip("y")
): ):
logger.debug(chosen_site.json) return False
site_data = chosen_site.strip_engine_data()
logger.debug(site_data.json)
db.update_site(site_data)
return True
return False chosen_site.name = input("Change site name if you want: ") or chosen_site.name
chosen_site.tags = list(map(str.strip, input("Site tags: ").split(',')))
rank = get_alexa_rank(chosen_site.url_main)
if rank:
print(f'New alexa rank: {rank}')
chosen_site.alexa_rank = rank
logger.debug(chosen_site.json)
site_data = chosen_site.strip_engine_data()
logger.debug(site_data.json)
db.update_site(site_data)
return True
+8 -1
View File
@@ -1,5 +1,7 @@
import ast
import re import re
import random import random
from typing import Any
DEFAULT_USER_AGENTS = [ DEFAULT_USER_AGENTS = [
@@ -65,6 +67,10 @@ class URLMatcher:
return re.compile(regexp_str) return re.compile(regexp_str)
# Convert the string form of a Python literal into the corresponding object.
# Parsing is delegated to ast.literal_eval.
def ascii_data_display(data: str) -> Any:
return ast.literal_eval(data)
def get_dict_ascii_tree(items, prepend="", new_line=True): def get_dict_ascii_tree(items, prepend="", new_line=True):
text = "" text = ""
for num, item in enumerate(items): for num, item in enumerate(items):
@@ -75,7 +81,8 @@ def get_dict_ascii_tree(items, prepend="", new_line=True):
if field_value.startswith("['"): if field_value.startswith("['"):
is_last_item = num == len(items) - 1 is_last_item = num == len(items) - 1
prepend_symbols = " " * 3 if is_last_item else "" prepend_symbols = " " * 3 if is_last_item else ""
field_value = get_dict_ascii_tree(eval(field_value), prepend_symbols) data = ascii_data_display(field_value)
field_value = get_dict_ascii_tree(data, prepend_symbols)
text += f"\n{prepend}{box_symbol}{field_name}: {field_value}" text += f"\n{prepend}{box_symbol}{field_name}: {field_value}"
else: else:
text += f"\n{prepend}{box_symbol} {item}" text += f"\n{prepend}{box_symbol} {item}"
+1 -1
View File
@@ -26,7 +26,7 @@ python-socks==1.1.2
requests>=2.24.0 requests>=2.24.0
requests-futures==1.0.0 requests-futures==1.0.0
six==1.15.0 six==1.15.0
socid-extractor>=0.0.16 socid-extractor>=0.0.20
soupsieve==2.1 soupsieve==2.1
stem==1.8.0 stem==1.8.0
torrequest==0.1.0 torrequest==0.1.0
+1 -1
View File
@@ -12,7 +12,7 @@ with open('requirements.txt') as rf:
requires = rf.read().splitlines() requires = rf.read().splitlines()
setup(name='maigret', setup(name='maigret',
version='0.2.2', version='0.2.4',
description='Collect a dossier on a person by username from a huge number of sites', description='Collect a dossier on a person by username from a huge number of sites',
long_description=long_description, long_description=long_description,
long_description_content_type="text/markdown", long_description_content_type="text/markdown",
+1632 -1579
View File
File diff suppressed because it is too large Load Diff
+6
View File
@@ -0,0 +1,6 @@
flake8==3.8.4
pytest==6.2.4
pytest-asyncio==0.14.0
pytest-cov==2.10.1
pytest-httpserver==1.0.0
pytest-rerunfailures==9.1.1
+12 -5
View File
@@ -12,6 +12,7 @@ from maigret.maigret import setup_arguments_parser
CUR_PATH = os.path.dirname(os.path.realpath(__file__)) CUR_PATH = os.path.dirname(os.path.realpath(__file__))
JSON_FILE = os.path.join(CUR_PATH, '../maigret/resources/data.json') JSON_FILE = os.path.join(CUR_PATH, '../maigret/resources/data.json')
TEST_JSON_FILE = os.path.join(CUR_PATH, 'db.json') TEST_JSON_FILE = os.path.join(CUR_PATH, 'db.json')
LOCAL_TEST_JSON_FILE = os.path.join(CUR_PATH, 'local.json')
empty_mark = Mark('', (), {}) empty_mark = Mark('', (), {})
@@ -36,16 +37,17 @@ def remove_test_reports():
@pytest.fixture(scope='session') @pytest.fixture(scope='session')
def default_db(): def default_db():
db = MaigretDatabase().load_from_file(JSON_FILE) return MaigretDatabase().load_from_file(JSON_FILE)
return db
@pytest.fixture(scope='function') @pytest.fixture(scope='function')
def test_db(): def test_db():
db = MaigretDatabase().load_from_file(TEST_JSON_FILE) return MaigretDatabase().load_from_file(TEST_JSON_FILE)
return db
# Function-scoped fixture: a MaigretDatabase loaded from LOCAL_TEST_JSON_FILE.
@pytest.fixture(scope='function')
def local_test_db():
return MaigretDatabase().load_from_file(LOCAL_TEST_JSON_FILE)
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
@@ -58,3 +60,8 @@ def reports_autoclean():
@pytest.fixture(scope='session') @pytest.fixture(scope='session')
def argparser(): def argparser():
return setup_arguments_parser() return setup_arguments_parser()
# Session-scoped fixture returning the (host, port) pair ("localhost", 8989).
# NOTE(review): presumably consumed by pytest-httpserver to choose its listen
# address — confirm against the plugin documentation.
@pytest.fixture(scope="session")
def httpserver_listen_address():
return ("localhost", 8989)
+21
View File
@@ -0,0 +1,21 @@
{
"engines": {},
"sites": {
"StatusCode": {
"checkType": "status_code",
"url": "http://localhost:8989/url?id={username}",
"urlMain": "http://localhost:8989/",
"usernameClaimed": "claimed",
"usernameUnclaimed": "unclaimed"
},
"Message": {
"checkType": "message",
"url": "http://localhost:8989/url?id={username}",
"urlMain": "http://localhost:8989/",
"presenseStrs": ["user", "profile"],
"absenseStrs": ["not found", "404"],
"usernameClaimed": "claimed",
"usernameUnclaimed": "unclaimed"
}
}
}
+1
View File
@@ -22,6 +22,7 @@ httpbin.org FALSE / FALSE 0 a b
""" """
@pytest.mark.skip(reason="periodically fails")
@pytest.mark.slow @pytest.mark.slow
def test_twitter_activation(default_db): def test_twitter_activation(default_db):
twitter_site = default_db.sites_dict['Twitter'] twitter_site = default_db.sites_dict['Twitter']
+65
View File
@@ -0,0 +1,65 @@
from mock import Mock
import pytest
from maigret import search
def site_result_except(server, username, **kwargs):
    """Register a canned reply on ``server`` for the profile URL of ``username``.

    The request is matched on path ``/url`` with query string
    ``id=<username>``; ``kwargs`` are forwarded unchanged to
    ``respond_with_data`` (e.g. ``status=...`` or ``response_data=...``).
    """
    handler = server.expect_request('/url', query_string=f'id={username}')
    handler.respond_with_data(**kwargs)
@pytest.mark.asyncio
async def test_checking_by_status_code(httpserver, local_test_db):
    """Check the "StatusCode" site: HTTP 200 is asserted found, HTTP 404 not found."""
    sites = local_test_db.sites_dict

    # Both canned replies are registered before the first search is issued.
    for username, status in (('claimed', 200), ('unclaimed', 404)):
        site_result_except(httpserver, username, status=status)

    for username, expected in (('claimed', True), ('unclaimed', False)):
        result = await search(username, site_dict=sites, logger=Mock())
        assert result['StatusCode']['status'].is_found() is expected
@pytest.mark.asyncio
async def test_checking_by_message_positive_full(httpserver, local_test_db):
    """Check the "Message" site with bodies containing every configured marker.

    The claimed body carries both presence strings ("user profile"); the
    unclaimed body carries both absence strings ("404 not found").
    """
    sites = local_test_db.sites_dict

    bodies = {'claimed': "user profile", 'unclaimed': "404 not found"}
    for username, body in bodies.items():
        site_result_except(httpserver, username, response_data=body)

    claimed = await search('claimed', site_dict=sites, logger=Mock())
    assert claimed['Message']['status'].is_found() is True

    unclaimed = await search('unclaimed', site_dict=sites, logger=Mock())
    assert unclaimed['Message']['status'].is_found() is False
@pytest.mark.asyncio
async def test_checking_by_message_positive_part(httpserver, local_test_db):
    """Check the "Message" site with bodies containing only one marker each.

    The claimed body is just "profile"; the unclaimed body is just "404".
    """
    sites = local_test_db.sites_dict

    bodies = {'claimed': "profile", 'unclaimed': "404"}
    for username, body in bodies.items():
        site_result_except(httpserver, username, response_data=body)

    claimed = await search('claimed', site_dict=sites, logger=Mock())
    assert claimed['Message']['status'].is_found() is True

    unclaimed = await search('unclaimed', site_dict=sites, logger=Mock())
    assert unclaimed['Message']['status'].is_found() is False
@pytest.mark.asyncio
async def test_checking_by_message_negative(httpserver, local_test_db):
    """Check the "Message" site when the bodies contradict the usernames.

    An empty body for 'claimed' is asserted not found, and a "user 404" body
    for 'unclaimed' is asserted found.
    """
    # NOTE(review): "user 404" contains "404", which local.json lists under
    # "absenseStrs", yet the site is expected to be found - confirm that this
    # precedence (or that key spelling) is intended.
    sites = local_test_db.sites_dict

    bodies = {'claimed': "", 'unclaimed': "user 404"}
    for username, body in bodies.items():
        site_result_except(httpserver, username, response_data=body)

    claimed = await search('claimed', site_dict=sites, logger=Mock())
    assert claimed['Message']['status'].is_found() is False

    unclaimed = await search('unclaimed', site_dict=sites, logger=Mock())
    assert unclaimed['Message']['status'].is_found() is True
+15
View File
@@ -0,0 +1,15 @@
"""Maigret data test functions"""
from maigret.utils import is_country_tag
from maigret.sites import SUPPORTED_TAGS
def test_tags_validity(default_db):
    """Every non-country tag used by a site must be listed in SUPPORTED_TAGS."""
    unknown_tags = {
        tag
        for site in default_db.sites
        for tag in site.tags
        if not is_country_tag(tag)
        if tag not in SUPPORTED_TAGS
    }

    # Compare against an empty set so a failure shows the offending tags.
    assert unknown_tags == set()
+11
View File
@@ -16,6 +16,7 @@ from maigret.report import (
generate_report_template, generate_report_template,
generate_report_context, generate_report_context,
generate_json_report, generate_json_report,
get_plaintext_report,
) )
from maigret.result import QueryResult, QueryStatus from maigret.result import QueryResult, QueryStatus
from maigret.sites import MaigretSite from maigret.sites import MaigretSite
@@ -346,3 +347,13 @@ def test_pdf_report():
save_pdf_report(report_name, context) save_pdf_report(report_name, context)
assert os.path.exists(report_name) assert os.path.exists(report_name)
def test_text_report():
    """The plaintext report must contain each word of SUPPOSED_BRIEF, 'us' and 'photo'."""
    report_text = get_plaintext_report(generate_report_context(TEST))

    expected_fragments = SUPPOSED_BRIEF.split() + ['us', 'photo']
    for fragment in expected_fragments:
        assert fragment in report_text
+1
View File
@@ -103,6 +103,7 @@ def test_saving_site_error():
amperka = db.sites[0] amperka = db.sites[0]
assert len(amperka.errors) == 2 assert len(amperka.errors) == 2
assert len(amperka.errors_dict) == 2
assert amperka.strip_engine_data().errors == {'error1': 'text1'} assert amperka.strip_engine_data().errors == {'error1': 'text1'}
assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'} assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
+5
View File
@@ -57,6 +57,11 @@ def test_enrich_link_str():
) )
def test_url_extract_main_part_negative():
    """The literal string 'None' has no extractable main part."""
    extracted = URLMatcher.extract_main_part('None')
    assert extracted == ''
def test_url_extract_main_part(): def test_url_extract_main_part():
url_main_part = 'flickr.com/photos/alexaimephotography' url_main_part = 'flickr.com/photos/alexaimephotography'
+57
View File
@@ -0,0 +1,57 @@
#!/usr/bin/env python3
import random
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from maigret.maigret import MaigretDatabase
from maigret.submit import get_alexa_rank
def update_tags(site):
    """Interactively replace the tags of ``site`` and refresh its Alexa rank.

    Prints the site's current tags and main URL, then reads a
    comma-separated list of new tags from stdin. The pseudo-tag
    ``disabled`` is not stored as a tag; it sets ``site.disabled = True``
    instead. The rank returned by ``get_alexa_rank`` is stored only when
    it is truthy.

    :param site: site object whose ``tags``, ``disabled`` and
        ``alexa_rank`` attributes are updated in place.
    """
    if site.tags:
        print(f'Site {site.name} tags: ' + ', '.join(site.tags))
    else:
        print(f'Site {site.name} doesn\'t have tags')

    print(f'URL: {site.url_main}')

    raw_tags = input('Enter new tags: ')
    # Split on the bare comma and strip each piece, so "a,b" and "a ,  b"
    # are parsed the same way as "a, b".
    new_tags = {tag.strip() for tag in raw_tags.split(',')}
    new_tags.discard('')
    if 'disabled' in new_tags:
        new_tags.remove('disabled')
        site.disabled = True

    print(f'Old alexa rank: {site.alexa_rank}')
    rank = get_alexa_rank(site.url_main)
    if rank:
        print(f'New alexa rank: {rank}')
        site.alexa_rank = rank

    # Sorted so the stored order is stable across runs; set iteration order
    # is not.
    site.tags = sorted(new_tags)
if __name__ == '__main__':
    # Interactive loop: pick a random eligible site, ask for new tags, and
    # save the database after every update.
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")

    args = parser.parse_args()

    db = MaigretDatabase().load_from_file(args.base_file)

    while True:
        # Recomputed on every pass because update_tags() may remove the
        # 'in' tag from the site it has just edited.
        candidates = [
            site for site in db.sites
            if site.engine != 'uCoz' and 'in' in site.tags
        ]
        if not candidates:
            # Without this guard the loop would spin forever once no site
            # matches the filter.
            print('No sites left to update')
            break

        update_tags(random.choice(candidates))

        # Saved after each site so an interrupted session keeps its progress.
        db.save_to_file(args.base_file)
+21 -16
View File
@@ -37,15 +37,15 @@ def get_rank(domain_to_query, site, print_errors=True):
try: try:
#Get ranking for this site. #Get ranking for this site.
site.alexa_rank = int(root.find('.//REACH').attrib['RANK']) site.alexa_rank = int(root.find('.//REACH').attrib['RANK'])
country = root.find('.//COUNTRY') # country = root.find('.//COUNTRY')
if not country is None and country.attrib: # if not country is None and country.attrib:
country_code = country.attrib['CODE'] # country_code = country.attrib['CODE']
tags = set(site.tags) # tags = set(site.tags)
if country_code: # if country_code:
tags.add(country_code.lower()) # tags.add(country_code.lower())
site.tags = sorted(list(tags)) # site.tags = sorted(list(tags))
if site.type != 'username': # if site.type != 'username':
site.disabled = False # site.disabled = False
except Exception as e: except Exception as e:
if print_errors: if print_errors:
logging.error(e) logging.error(e)
@@ -74,6 +74,7 @@ if __name__ == '__main__':
dest="base_file", default="maigret/resources/data.json", dest="base_file", default="maigret/resources/data.json",
help="JSON file with sites data to update.") help="JSON file with sites data to update.")
parser.add_argument('--with-rank', help='update with use of local data only', action='store_true')
parser.add_argument('--empty-only', help='update only sites without rating', action='store_true') parser.add_argument('--empty-only', help='update only sites without rating', action='store_true')
parser.add_argument('--exclude-engine', help='do not update score with certain engine', parser.add_argument('--exclude-engine', help='do not update score with certain engine',
action="append", dest="exclude_engine_list", default=[]) action="append", dest="exclude_engine_list", default=[])
@@ -93,22 +94,25 @@ Rank data fetched from Alexa by domains.
""") """)
for site in sites_subset: for site in sites_subset:
if not args.with_rank:
break
url_main = site.url_main url_main = site.url_main
if site.alexa_rank < sys.maxsize and args.empty_only: if site.alexa_rank < sys.maxsize and args.empty_only:
continue continue
if args.exclude_engine_list and site.engine in args.exclude_engine_list: if args.exclude_engine_list and site.engine in args.exclude_engine_list:
continue continue
site.alexa_rank = 0 site.alexa_rank = 0
th = threading.Thread(target=get_rank, args=(url_main, site)) th = threading.Thread(target=get_rank, args=(url_main, site,))
pool.append((site.name, url_main, th)) pool.append((site.name, url_main, th))
th.start() th.start()
index = 1 if args.with_rank:
for site_name, url_main, th in pool: index = 1
th.join() for site_name, url_main, th in pool:
sys.stdout.write("\r{0}".format(f"Updated {index} out of {len(sites_subset)} entries")) th.join()
sys.stdout.flush() sys.stdout.write("\r{0}".format(f"Updated {index} out of {len(sites_subset)} entries"))
index = index + 1 sys.stdout.flush()
index = index + 1
sites_full_list = [(s, s.alexa_rank) for s in sites_subset] sites_full_list = [(s, s.alexa_rank) for s in sites_subset]
@@ -123,6 +127,7 @@ Rank data fetched from Alexa by domains.
url_main = site.url_main url_main = site.url_main
valid_rank = get_step_rank(rank) valid_rank = get_step_rank(rank)
all_tags = site.tags all_tags = site.tags
all_tags.sort()
tags = ', ' + ', '.join(all_tags) if all_tags else '' tags = ', ' + ', '.join(all_tags) if all_tags else ''
note = '' note = ''
if site.disabled: if site.disabled: