Mirror of https://github.com/soxoj/maigret.git (synced 2026-05-07 06:24:35 +00:00)

Compare commits — 43 commits (SHA1 only; the Author and Date columns were empty in the source table):

53d698bb7b, 23fff42ca7, 51d9e6f5f6, 640c04f20b, 69f78e331b, 69c315b00e, b755628a1d, 7490a412db, 2741680d4a, e5fc221ce2, a044e3dd79, 6da4ff1e7b, eb2442401d, d23d24eeca, a2ddb15f09, e90e85d2a9, 2bb01f7019, b586a4cd06, 28733282ab, 0a7a7ad70d, c895f6b418, a6286a0286, 314eb25d1f, fbbc8b49f3, faa03b62e5, d676f7bb94, d4757aab78, 908176be85, 940f408da3, 8c700b9810, f9c9af5f41, 57a9a82102, 9bbca995e9, 39b713497d, 6a84875775, 84f7d93478, 17870ef5c8, d3cd5e45a1, 9a3f2f0aa7, 4b7d344b41, ac9cfe7885, 6058a4b70c, 3aa225bda4
@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [3.6, 3.7, 3.8, 3.9]
python-version: [3.6.9, 3.7, 3.8, 3.9]

steps:
- uses: actions/checkout@v2
@@ -26,8 +26,8 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest
python -m pip install flake8 pytest pytest-rerunfailures
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
run: |
pytest
pytest --reruns 3 --reruns-delay 5
+12 -1
@@ -2,9 +2,20 @@

## [Unreleased]

## [0.1.17] - 2021-03-30
* simplified maigret search API
* improved documentation
* fixed 403 response code ignoring bug

## [0.1.16] - 2021-03-21
* improved URL parsing mode
* improved sites submit mode
* added uID.me uguid support
* improved requests processing

## [0.1.15] - 2021-03-14
* improved HTML reports
* fixed python-3.6- specific error
* fixed python-3.6-specific error
* false positives fixes

## [0.1.14] - 2021-02-25
+6 -6
@@ -1,21 +1,21 @@
FROM python:3.7-alpine
FROM python:3.7
LABEL maintainer="Soxoj <soxoj@protonmail.com>"

WORKDIR /app

ADD requirements.txt .

RUN pip install --upgrade pip \
&& apk add --update --virtual .build-dependencies \
build-base \
RUN pip install --upgrade pip

RUN apt update -y

RUN apt install -y\
gcc \
musl-dev \
libxml2 \
libxml2-dev \
libxslt-dev \
jpeg-dev \
&& YARL_NO_EXTENSIONS=1 python3 -m pip install maigret \
&& apk del .build-dependencies \
&& rm -rf /var/cache/apk/* \
/tmp/* \
/var/tmp/*
@@ -33,20 +33,43 @@ Currently supported more than 2000 sites ([full list](./sites.md)), by default s

**Python 3.8 is recommended.**

### Package installing
```bash
# install from pypi
$ pip3 install maigret
pip3 install maigret

# or clone and install manually
$ git clone https://github.com/soxoj/maigret && cd maigret
$ pip3 install .
git clone https://github.com/soxoj/maigret && cd maigret
pip3 install .
```

### Cloning a repository

```bash
git clone https://github.com/soxoj/maigret && cd maigret
```
You can use a free virtual machine, and the repo will be automatically cloned:

[](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md) [](https://repl.it/github/soxoj/maigret)
<a href="https://colab.research.google.com/gist//soxoj/879b51bc3b2f8b695abb054090645000/maigret.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="40"></a>

```bash
pip3 install -r requirements.txt
```

## Using examples

```bash
maigret user
# for a cloned repo
./maigret.py user

# for a package
maigret user
```

Features:
```bash
# make HTML and PDF reports
maigret user --html --pdf

@@ -63,9 +86,11 @@ Run `maigret --help` to get arguments description. Also options are documented i

With Docker:
```
docker build -t maigret .
# manual build
docker build -t maigret . && docker run maigret user

docker run maigret user
# official image
docker run soxoj/maigret:latest user
```

## Demo with page parsing and recursive username search
+9 -11
@@ -1,15 +1,13 @@
# HTTP Cookie File downloaded with cookies.txt by Genuinous @genuinous
# This file can be used by wget, curl, aria2c and other standard compliant tools.
# Usage Examples:
# 1) wget -x --load-cookies cookies.txt "https://xss.is/search/"
# 2) curl --cookie cookies.txt "https://xss.is/search/"
# 3) aria2c --load-cookies cookies.txt "https://xss.is/search/"
# 1) wget -x --load-cookies cookies.txt "https://pixabay.com/users/blue-156711/"
# 2) curl --cookie cookies.txt "https://pixabay.com/users/blue-156711/"
# 3) aria2c --load-cookies cookies.txt "https://pixabay.com/users/blue-156711/"
#
xss.is FALSE / TRUE 0 xf_csrf PMnZNsr42HETwYEr
xss.is FALSE / TRUE 0 xf_from_search google
xss.is FALSE / TRUE 1642709308 xf_user 215268%2CZNKB_-64Wk-BOpsdtLYy-1UxfS5zGpxWaiEGUhmX
xss.is FALSE / TRUE 0 xf_session sGdxJtP_sKV0LCG8vUQbr6cL670_EFWM
.xss.is TRUE / FALSE 0 muchacho_cache ["00fbb0f2772c9596b0483d6864563cce"]
.xss.is TRUE / FALSE 0 muchacho_png ["00fbb0f2772c9596b0483d6864563cce"]
.xss.is TRUE / FALSE 0 muchacho_etag ["00fbb0f2772c9596b0483d6864563cce"]
.xss.is TRUE / FALSE 1924905600 2e66e4dd94a7a237d0d1b4d50f01e179_evc ["00fbb0f2772c9596b0483d6864563cce"]
.pixabay.com TRUE / TRUE 1618356838 __cfduid d56929cd50d11474f421b849df5758a881615764837
.pixabay.com TRUE / TRUE 1615766638 __cf_bm ea8f7c565b44d749f65500f0e45176cebccaeb09-1615764837-1800-AYJIXh2boDJ6HPf44JI9fnteWABHOVvkxiSccACP9EiS1E58UDTGhViXtqjFfVE0QRj1WowP4ss2DzCs+pW+qUc=
pixabay.com FALSE / FALSE 0 anonymous_user_id c1e4ee09-5674-4252-aa94-8c47b1ea80ab
pixabay.com FALSE / FALSE 1647214439 csrftoken vfetTSvIul7gBlURt6s985JNM18GCdEwN5MWMKqX4yI73xoPgEj42dbNefjGx5fr
pixabay.com FALSE / FALSE 1647300839 client_width 1680
pixabay.com FALSE / FALSE 748111764839 is_human 1
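The file above is a standard Netscape-format cookie file. Besides wget/curl/aria2c, it can also be loaded from Python; a minimal sketch using only the standard library and requests (the file name and URL are taken from the example comments above, not from maigret's own loader):

```python
from http.cookiejar import MozillaCookieJar

import requests

# load the Netscape-format cookie file shown above
jar = MozillaCookieJar('cookies.txt')
jar.load(ignore_discard=True, ignore_expires=True)

# reuse the cookies for a request to one of the example URLs
resp = requests.get('https://pixabay.com/users/blue-156711/', cookies=jar)
print(resp.status_code)
```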
@@ -1 +1,3 @@
"""Maigret"""

from .checking import maigret as search
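This new package-level alias matches the "simplified maigret search API" changelog entry: `maigret.search` points to the `maigret.checking.maigret` coroutine. A minimal usage sketch, assuming the keyword arguments shown in the checking.py diff below; the database path mirrors the one used in tests/conftest.py and the `ranked_sites_dict(disabled=False)` call mirrors the sites tests, so treat both as illustrative rather than canonical:

```python
import asyncio
import logging

import maigret
from maigret.sites import MaigretDatabase

async def main():
    logger = logging.getLogger('maigret-demo')
    # load the bundled sites database (path is an assumption for illustration)
    db = MaigretDatabase().load_from_file('maigret/resources/data.json')
    sites = db.ranked_sites_dict(disabled=False)

    # maigret.search == maigret.checking.maigret; query_notify now defaults to a Mock
    results = await maigret.search(username='soxoj',
                                   site_dict=sites,
                                   logger=logger,
                                   timeout=10)
    for sitename, data in results.items():
        print(sitename, data['status'])

asyncio.run(main())
```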
@@ -1,11 +1,9 @@
import aiohttp
from aiohttp import CookieJar
import asyncio
import json
from http.cookiejar import MozillaCookieJar
from http.cookies import Morsel

import requests
from aiohttp import CookieJar


class ParsingActivator:
@staticmethod
+150 -37
@@ -1,8 +1,12 @@
import asyncio
import logging
from mock import Mock
import re
import ssl
import sys
import tqdm
import time
from typing import Callable, Any, Iterable, Tuple

import aiohttp
import tqdm.asyncio
@@ -22,6 +26,7 @@ supported_recursive_search_ids = (
'ok_id',
'wikimapia_uid',
'steam_id',
'uidme_uguid',
)

common_errors = {
@@ -37,6 +42,103 @@ common_errors = {

unsupported_characters = '#'

QueryDraft = Tuple[Callable, Any, Any]
QueriesDraft = Iterable[QueryDraft]


class AsyncExecutor:
def __init__(self, *args, **kwargs):
self.logger = kwargs['logger']

async def run(self, tasks: QueriesDraft):
start_time = time.time()
results = await self._run(tasks)
self.execution_time = time.time() - start_time
self.logger.debug(f'Spent time: {self.execution_time}')
return results

async def _run(self, tasks: QueriesDraft):
await asyncio.sleep(0)


class AsyncioSimpleExecutor(AsyncExecutor):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

async def _run(self, tasks: QueriesDraft):
futures = [f(*args, **kwargs) for f, args, kwargs in tasks]
return await asyncio.gather(*futures)


class AsyncioProgressbarExecutor(AsyncExecutor):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

async def _run(self, tasks: QueriesDraft):
futures = [f(*args, **kwargs) for f, args, kwargs in tasks]
results = []
for f in tqdm.asyncio.tqdm.as_completed(futures):
results.append(await f)
return results


class AsyncioProgressbarSemaphoreExecutor(AsyncExecutor):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.semaphore = asyncio.Semaphore(kwargs.get('in_parallel', 1))

async def _run(self, tasks: QueriesDraft):
async def _wrap_query(q: QueryDraft):
async with self.semaphore:
f, args, kwargs = q
return await f(*args, **kwargs)

async def semaphore_gather(tasks: QueriesDraft):
coros = [_wrap_query(q) for q in tasks]
results = []
for f in tqdm.asyncio.tqdm.as_completed(coros):
results.append(await f)
return results

return await semaphore_gather(tasks)


class AsyncioProgressbarQueueExecutor(AsyncExecutor):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.workers_count = kwargs.get('in_parallel', 10)
self.progress_func = kwargs.get('progress_func', tqdm.tqdm)
self.queue = asyncio.Queue(self.workers_count)

async def worker(self):
while True:
f, args, kwargs = await self.queue.get()
result = await f(*args, **kwargs)
self.results.append(result)
self.progress.update(1)
self.queue.task_done()

async def _run(self, tasks: QueriesDraft):
self.results = []

if sys.version_info.minor > 6:
create_task = asyncio.create_task
else:
loop = asyncio.get_event_loop()
create_task = loop.create_task

workers = [create_task(self.worker())
for _ in range(self.workers_count)]
task_list = list(tasks)
self.progress = self.progress_func(total=len(task_list))
for t in task_list:
await self.queue.put(t)
await self.queue.join()
for w in workers:
w.cancel()
self.progress.close()
return self.results


async def get_response(request_future, site_name, logger):
html_text = None
@@ -87,19 +189,18 @@ async def get_response(request_future, site_name, logger):
return html_text, status_code, error_text, expection_text


async def update_site_dict_from_response(sitename, site_dict, results_info, semaphore, logger, query_notify):
async with semaphore:
site_obj = site_dict[sitename]
future = site_obj.request_future
if not future:
# ignore: search by incompatible id type
return
async def update_site_dict_from_response(sitename, site_dict, results_info, logger, query_notify):
site_obj = site_dict[sitename]
future = site_obj.request_future
if not future:
# ignore: search by incompatible id type
return

response = await get_response(request_future=future,
site_name=sitename,
logger=logger)
response = await get_response(request_future=future,
site_name=sitename,
logger=logger)

site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)
return sitename, process_site_result(response, query_notify, logger, results_info, site_obj)


# TODO: move to separate class
@@ -166,7 +267,7 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig

if status_code and not error_text:
error_text, site_error_text = detect_error_page(html_text, status_code, failure_errors,
site.ignore_403)
site.ignore403)

if site.activation and html_text:
is_need_activation = any([s for s in site.activation['marks'] if s in html_text])
@@ -294,26 +395,32 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
return results_info


async def maigret(username, site_dict, query_notify, logger,
async def maigret(username, site_dict, logger, query_notify=None,
proxy=None, timeout=None, is_parsing_enabled=False,
id_type='username', debug=False, forced=False,
max_connections=100, no_progressbar=False,
cookies=None):
"""Main search func

Checks for existence of username on various social media sites.
Checks for existence of username on certain sites.

Keyword Arguments:
username -- String indicating username that report
should be created against.
site_dict -- Dictionary containing all of the site data.
username -- Username string will be used for search.
site_dict -- Dictionary containing sites data.
query_notify -- Object with base type of QueryNotify().
This will be used to notify the caller about
query results.
proxy -- String indicating the proxy URL
logger -- Standard Python logger object.
timeout -- Time in seconds to wait before timing out request.
Default is no timeout.
is_parsing_enabled -- Search for other usernames in website pages.
is_parsing_enabled -- Extract additional info from account pages.
id_type -- Type of username to search.
Default is 'username', see all supported here:
https://github.com/soxoj/maigret/wiki/Supported-identifier-types
max_connections -- Maximum number of concurrent connections allowed.
Default is 100.
no_progressbar -- Displaying of ASCII progressbar during scanner.
cookies -- Filename of a cookie jar file to use for each request.

Return Value:
Dictionary containing results from report. Key of dictionary is the name
@@ -330,6 +437,9 @@ async def maigret(username, site_dict, query_notify, logger,
"""

# Notify caller that we are starting the query.
if not query_notify:
query_notify = Mock()

query_notify.start(username, id_type)

# TODO: connector
@@ -380,7 +490,7 @@ async def maigret(username, site_dict, query_notify, logger,

headers.update(site.headers)

if not 'url' in site.__dict__:
if 'url' not in site.__dict__:
logger.error('No URL for site %s', site.name)
# URL of user on site (if it exists)
url = site.url.format(
@@ -454,28 +564,33 @@ async def maigret(username, site_dict, query_notify, logger,
# Add this site's results into final dictionary with all of the other results.
results_total[site_name] = results_site

# TODO: move into top-level function

sem = asyncio.Semaphore(max_connections)

tasks = []
coroutines = []
for sitename, result_obj in results_total.items():
update_site_coro = update_site_dict_from_response(sitename, site_dict, result_obj, sem, logger, query_notify)
future = asyncio.ensure_future(update_site_coro)
tasks.append(future)
coroutines.append((update_site_dict_from_response, [sitename, site_dict, result_obj, logger, query_notify], {}))

if no_progressbar:
await asyncio.gather(*tasks)
executor = AsyncioSimpleExecutor(logger=logger)
else:
for f in tqdm.asyncio.tqdm.as_completed(tasks):
await f
executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=max_connections, timeout=timeout+0.5)

results = await executor.run(coroutines)

await session.close()

# Notify caller that all queries are finished.
query_notify.finish()

return results_total
data = {}
for result in results:
# TODO: still can be empty
if result:
try:
data[result[0]] = result[1]
except Exception as e:
logger.error(e, exc_info=True)
logger.info(result)

return data


def timeout_check(value):
@@ -504,7 +619,6 @@ def timeout_check(value):


async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
query_notify = Mock()
changes = {
'disabled': False,
}
@@ -524,10 +638,9 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
for username, status in check_data:
async with semaphore:
results_dict = await maigret(
username,
{site.name: site},
query_notify,
logger,
username=username,
site_dict={site.name: site},
logger=logger,
timeout=30,
id_type=site.type,
forced=True,
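The new executor classes all consume an iterable of `(function, args, kwargs)` task tuples (the `QueryDraft` type above). A minimal sketch of how they are driven, mirroring the tests added later in this compare; the `fetch` coroutine is a hypothetical stand-in for a site check:

```python
import asyncio
import logging

from maigret.checking import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor

logger = logging.getLogger('executor-demo')

async def fetch(n):
    # stand-in for a single site-check coroutine
    await asyncio.sleep(0.1)
    return n

async def demo():
    # tasks are QueryDraft tuples: (coroutine function, args list, kwargs dict)
    tasks = [(fetch, [n], {}) for n in range(10)]

    # run everything at once via asyncio.gather
    simple = AsyncioSimpleExecutor(logger=logger)
    print(await simple.run(tasks), simple.execution_time)

    # bounded concurrency with a worker queue and a progress bar
    queued = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=3)
    print(await queued.run(tasks), queued.execution_time)

asyncio.run(demo())
```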
+40 -25
@@ -1,23 +1,27 @@
"""
Maigret main module
"""

import aiohttp
import asyncio
import logging
import os
import platform
import sys
import platform
from argparse import ArgumentParser, RawDescriptionHelpFormatter

import requests
from socid_extractor import parse, __version__ as socid_version
from socid_extractor import extract, parse, __version__ as socid_version

from .checking import *
from .checking import timeout_check, supported_recursive_search_ids, self_check, unsupported_characters, maigret
from .notify import QueryNotifyPrint
from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \
generate_report_context, save_txt_report, SUPPORTED_JSON_REPORT_FORMATS, check_supported_json_format, \
save_json_report
from .sites import MaigretDatabase
from .submit import submit_dialog
from .utils import get_dict_ascii_tree

__version__ = '0.1.15'
__version__ = '0.1.17'


async def main():
@@ -176,7 +180,7 @@ async def main():
action="store", metavar='REPORT_TYPE',
dest="json", default='', type=check_supported_json_format,
help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
" (one report per username)."
" (one report per username)."
)

args = parser.parse_args()
@@ -204,7 +208,7 @@ async def main():
u: args.id_type
for u in args.username
if u not in ['-']
and u not in args.ignore_ids_list
and u not in args.ignore_ids_list
}

parsing_enabled = not args.disable_extracting
@@ -215,15 +219,29 @@ async def main():
print("Using the proxy: " + args.proxy)

if args.parse_url:
page, _ = parse(args.parse_url, cookies_str='')
info = extract(page)
text = 'Extracted ID data from webpage: ' + ', '.join([f'{a}: {b}' for a, b in info.items()])
print(text)
for k, v in info.items():
if 'username' in k:
usernames[v] = 'username'
if k in supported_recursive_search_ids:
usernames[v] = k
# url, headers
reqs = [(args.parse_url, set())]
try:
# temporary workaround for URL mutations MVP
from socid_extractor import mutate_url
reqs += list(mutate_url(args.parse_url))
except:
pass

for req in reqs:
url, headers = req
print(f'Scanning webpage by URL {url}...')
page, _ = parse(url, cookies_str='', headers=headers)
info = extract(page)
if not info:
print('Nothing extracted')
else:
print(get_dict_ascii_tree(info.items(), new_line=False), ' ')
for k, v in info.items():
if 'username' in k:
usernames[v] = 'username'
if k in supported_recursive_search_ids:
usernames[v] = k

if args.tags:
args.tags = list(set(str(args.tags).split(',')))
@@ -253,7 +271,7 @@ async def main():
site_data = get_top_sites_for_id(args.id_type)

if args.new_site_to_submit:
is_submitted = await submit_dialog(db, args.new_site_to_submit)
is_submitted = await submit_dialog(db, args.new_site_to_submit, args.cookie_file)
if is_submitted:
db.save_to_file(args.db_file)

@@ -262,7 +280,7 @@ async def main():
print('Maigret sites database self-checking...')
is_need_update = await self_check(db, site_data, logger, max_connections=args.connections)
if is_need_update:
if input('Do you want to save changes permanently? [yYnN]\n').lower() == 'y':
if input('Do you want to save changes permanently? [Yn]\n').lower() == 'y':
db.save_to_file(args.db_file)
print('Database was successfully updated.')
else:
@@ -274,7 +292,6 @@ async def main():

# Make reports folder is not exists
os.makedirs(args.folderoutput, exist_ok=True)
report_path = args.folderoutput

# Define one report filename template
report_filepath_tpl = os.path.join(args.folderoutput, 'report_{username}{postfix}')
@@ -324,9 +341,9 @@ async def main():

sites_to_check = get_top_sites_for_id(id_type)

results = await maigret(username,
dict(sites_to_check),
query_notify,
results = await maigret(username=username,
site_dict=dict(sites_to_check),
query_notify=query_notify,
proxy=args.proxy,
timeout=args.timeout,
is_parsing_enabled=parsing_enabled,
@@ -338,14 +355,13 @@ async def main():
max_connections=args.connections,
)

username_result = (username, id_type, results)
general_results.append((username, id_type, results))

# TODO: tests
for website_name in results:
dictionary = results[website_name]
# TODO: fix no site data issue
if not dictionary:
if not dictionary or not recursive_search_enabled:
continue

new_usernames = dictionary.get('ids_usernames')
@@ -380,7 +396,6 @@ async def main():
save_json_report(filename, username, results, report_type=args.json)
query_notify.warning(f'JSON {args.json} report for {username} saved in {filename}')


# reporting for all the result
if general_results:
if args.html or args.pdf:
+3 -17
@@ -4,9 +4,11 @@ This module defines the objects for notifying the caller about the
results of queries.
"""
import sys

from colorama import Fore, Style, init

from .result import QueryStatus
from .utils import get_dict_ascii_tree


class QueryNotify():
@@ -175,22 +177,6 @@ class QueryNotifyPrint(QueryNotify):
else:
print(msg)

def get_additional_data_text(self, items, prepend=''):
text = ''
for num, item in enumerate(items):
box_symbol = '┣╸' if num != len(items) - 1 else '┗╸'

if type(item) == tuple:
field_name, field_value = item
if field_value.startswith('[\''):
is_last_item = num == len(items) - 1
prepend_symbols = ' ' * 3 if is_last_item else ' ┃ '
field_value = self.get_additional_data_text(eval(field_value), prepend_symbols)
text += f'\n{prepend}{box_symbol}{field_name}: {field_value}'
else:
text += f'\n{prepend}{box_symbol} {item}'

return text

def update(self, result, is_similar=False):
"""Notify Update.
@@ -210,7 +196,7 @@ class QueryNotifyPrint(QueryNotify):
if not self.result.ids_data:
ids_data_text = ""
else:
ids_data_text = self.get_additional_data_text(self.result.ids_data.items(), ' ')
ids_data_text = get_dict_ascii_tree(self.result.ids_data.items(), ' ')

def make_colored_terminal_notify(status, text, status_color, text_color, appendix):
text = [
+25 -15
@@ -1,15 +1,16 @@
import csv
import json
import io
import json
import logging
import os
from argparse import ArgumentTypeError
from datetime import datetime

import pycountry
import xmind
from datetime import datetime
from dateutil.parser import parse as parse_datetime_str
from jinja2 import Template
from xhtml2pdf import pisa
from argparse import ArgumentTypeError
from dateutil.parser import parse as parse_datetime_str

from .result import QueryStatus
from .utils import is_country_tag, CaseConverter, enrich_link_str
@@ -19,10 +20,11 @@ SUPPORTED_JSON_REPORT_FORMATS = [
'ndjson',
]

'''
UTILS
'''

def filter_supposed_data(data):
### interesting fields
allowed_fields = ['fullname', 'gender', 'location', 'age']
@@ -35,6 +37,8 @@ def filter_supposed_data(data):
'''
REPORTS SAVING
'''

def save_csv_report(filename: str, username: str, results: dict):
with open(filename, 'w', newline='', encoding='utf-8') as f:
generate_csv_report(username, results, f)
@@ -58,6 +62,7 @@ def save_pdf_report(filename: str, context: dict):
with open(filename, 'w+b') as f:
pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css)

def save_json_report(filename: str, username: str, results: dict, report_type: str):
with open(filename, 'w', encoding='utf-8') as f:
generate_json_report(username, results, f, report_type=report_type)
@@ -66,10 +71,13 @@ def save_json_report(filename: str, username: str, results: dict, report_type: s
'''
REPORTS GENERATING
'''

def generate_report_template(is_pdf: bool):
"""
HTML/PDF template generation
"""

def get_resource_content(filename):
return open(os.path.join(maigret_path, 'resources', filename)).read()

@@ -112,6 +120,9 @@ def generate_report_context(username_results: list):
continue

status = dictionary.get('status')
if not status: # FIXME: currently in case of timeout
continue

if status.ids_data:
dictionary['ids_data'] = status.ids_data
extended_info_count += 1
@@ -166,7 +177,6 @@ def generate_report_context(username_results: list):
for t in status.tags:
tags[t] = tags.get(t, 0) + 1

brief_text.append(f'Search by {id_type} {username} returned {found_accounts} accounts.')

if new_ids:
@@ -177,8 +187,6 @@ def generate_report_context(username_results: list):

brief_text.append(f'Extended info extracted from {extended_info_count} accounts.')

brief = ' '.join(brief_text).strip()
tuple_sort = lambda d: sorted(d, key=lambda x: x[1], reverse=True)

@@ -221,7 +229,7 @@ def generate_csv_report(username: str, results: dict, csvfile):
results[site]['url_user'],
str(results[site]['status'].status),
results[site]['http_status'],
])
])


def generate_txt_report(username: str, results: dict, file):
@@ -253,16 +261,19 @@ def generate_json_report(username: str, results: dict, file, report_type):

if is_report_per_line:
data['sitename'] = sitename
file.write(json.dumps(data)+'\n')
file.write(json.dumps(data) + '\n')
else:
all_json[sitename] = data

if not is_report_per_line:
file.write(json.dumps(all_json))


'''
XMIND 8 Functions
'''

def save_xmind_report(filename, username, results):
if os.path.exists(filename):
os.remove(filename)
@@ -277,9 +288,9 @@ def design_sheet(sheet, username, results):
alltags = {}
supposed_data = {}

sheet.setTitle("%s Analysis"%(username))
sheet.setTitle("%s Analysis" % (username))
root_topic1 = sheet.getRootTopic()
root_topic1.setTitle("%s"%(username))
root_topic1.setTitle("%s" % (username))

undefinedsection = root_topic1.addSubTopic()
undefinedsection.setTitle("Undefined")
@@ -333,7 +344,7 @@ def design_sheet(sheet, username, results):
currentsublabel.setTitle("%s: %s" % (k, currentval))
### Add Supposed DATA
filterede_supposed_data = filter_supposed_data(supposed_data)
if(len(filterede_supposed_data) >0):
if (len(filterede_supposed_data) > 0):
undefinedsection = root_topic1.addSubTopic()
undefinedsection.setTitle("SUPPOSED DATA")
for k, v in filterede_supposed_data.items():
@@ -344,6 +355,5 @@ def design_sheet(sheet, username, results):
def check_supported_json_format(value):
if value and not value in SUPPORTED_JSON_REPORT_FORMATS:
raise ArgumentTypeError(f'JSON report type must be one of the following types: '
+ ', '.join(SUPPORTED_JSON_REPORT_FORMATS))
+ ', '.join(SUPPORTED_JSON_REPORT_FORMATS))
return value
+2423 -2178 (file diff suppressed because it is too large)
+7 -11
@@ -2,7 +2,6 @@
"""Maigret Sites Information"""
import copy
import json
import re
import sys

import requests
@@ -16,6 +15,7 @@ SUPPORTED_TAGS = [
'discussion', 'sharing', 'writing', 'wiki', 'business', 'shopping', 'sport',
'books', 'news', 'documents', 'travel', 'maps', 'hobby', 'apps', 'classified',
'career', 'geosocial', 'streaming', 'education', 'networking', 'torrent',
'science', 'medicine',
]


@@ -46,7 +46,7 @@ class MaigretSite:

self.disabled = False
self.similar_search = False
self.ignore_403 = False
self.ignore403 = False
self.tags = []

self.type = 'username'
@@ -87,13 +87,12 @@ class MaigretSite:
url = self.url
for group in ['urlMain', 'urlSubpath']:
if group in url:
url = url.replace('{'+group+'}', self.__dict__[CaseConverter.camel_to_snake(group)])
url = url.replace('{' + group + '}', self.__dict__[CaseConverter.camel_to_snake(group)])

self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check)

def detect_username(self, url: str) -> str:
if self.url_regexp:
import logging
match_groups = self.url_regexp.match(url)
if match_groups:
return match_groups.groups()[-1].rstrip('/')
@@ -156,7 +155,8 @@ class MaigretSite:
# remove dict keys
if isinstance(engine_data[k], dict) and is_exists:
for f in engine_data[k].keys():
del self_copy.__dict__[field][f]
if f in self_copy.__dict__[field]:
del self_copy.__dict__[field][f]
continue
# remove list items
if isinstance(engine_data[k], list) and is_exists:
@@ -238,7 +238,6 @@ class MaigretDatabase:

return self

def load_from_json(self, json_data: dict) -> MaigretDatabase:
# Add all of site information from the json file to internal site list.
site_data = json_data.get("sites", {})
@@ -263,7 +262,6 @@ class MaigretDatabase:

return self

def load_from_str(self, db_str: str) -> MaigretDatabase:
try:
data = json.loads(db_str)
@@ -274,7 +272,6 @@ class MaigretDatabase:

return self.load_from_json(data)

def load_from_url(self, url: str) -> MaigretDatabase:
is_url_valid = url.startswith('http://') or url.startswith('https://')

@@ -303,7 +300,6 @@ class MaigretDatabase:

return self.load_from_json(data)

def load_from_file(self, filename: str) -> MaigretDatabase:
try:
with open(filename, 'r', encoding='utf-8') as file:
@@ -364,7 +360,7 @@ class MaigretDatabase:
continue
tags[tag] = tags.get(tag, 0) + 1

output += f'Enabled/total sites: {total_count-disabled_count}/{total_count}\n'
output += f'Enabled/total sites: {total_count - disabled_count}/{total_count}\n'
output += 'Top sites\' profile URLs:\n'
for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:20]:
if count == 1:
@@ -377,4 +373,4 @@ class MaigretDatabase:
mark = ' (non-standard)'
output += f'{count}\t{tag}{mark}\n'

return output
return output
+104 -44
@@ -1,14 +1,15 @@
import difflib
import json

import requests
from mock import Mock

from .checking import *


DESIRED_STRINGS = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
"birthday", "репутация", "информация", "e-mail"]

SUPPOSED_USERNAMES = ['alex', 'god', 'admin', 'red', 'blue', 'john']

RATIO = 0.6
TOP_FEATURES = 5
URL_RE = re.compile(r'https?://(www\.)?')
@@ -21,12 +22,11 @@ def get_match_ratio(x):
]), 2)


def extract_domain(url):
def extract_mainpage_url(url):
return '/'.join(url.split('/', 3)[:3])


async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
query_notify = Mock()
changes = {
'disabled': False,
}
@@ -39,26 +39,24 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
logger.info(f'Checking {site.name}...')

for username, status in check_data:
async with semaphore:
results_dict = await maigret(
username,
{site.name: site},
query_notify,
logger,
timeout=30,
id_type=site.type,
forced=True,
no_progressbar=True,
)
results_dict = await maigret(
username=username,
site_dict={site.name: site},
logger=logger,
timeout=30,
id_type=site.type,
forced=True,
no_progressbar=True,
)

# don't disable entries with other ids types
# TODO: make normal checking
if site.name not in results_dict:
logger.info(results_dict)
changes['disabled'] = True
continue
# don't disable entries with other ids types
# TODO: make normal checking
if site.name not in results_dict:
logger.info(results_dict)
changes['disabled'] = True
continue

result = results_dict[site.name]['status']
result = results_dict[site.name]['status']

site_status = result.status

@@ -85,18 +83,45 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
return changes


async def submit_dialog(db, url_exists):
domain_raw = URL_RE.sub('', url_exists).strip().strip('/')
domain_raw = domain_raw.split('/')[0]
async def detect_known_engine(db, url_exists, url_mainpage):
try:
r = requests.get(url_mainpage)
except Exception as e:
print(e)
print('Some error while checking main page')
return None

matched_sites = list(filter(lambda x: domain_raw in x.url_main+x.url, db.sites))
if matched_sites:
print(f'Sites with domain "{domain_raw}" already exists in the Maigret database!')
status = lambda s: '(disabled)' if s.disabled else ''
url_block = lambda s: f'\n\t{s.url_main}\n\t{s.url}'
print('\n'.join([f'{site.name} {status(site)}{url_block(site)}' for site in matched_sites]))
return False
for e in db.engines:
strs_to_check = e.__dict__.get('presenseStrs')
if strs_to_check and r and r.text:
all_strs_in_response = True
for s in strs_to_check:
if not s in r.text:
all_strs_in_response = False
if all_strs_in_response:
engine_name = e.__dict__.get('name')
print(f'Detected engine {engine_name} for site {url_mainpage}')

sites = []
for u in SUPPOSED_USERNAMES:
site_data = {
'urlMain': url_mainpage,
'name': url_mainpage.split('//')[0],
'engine': engine_name,
'usernameClaimed': u,
'usernameUnclaimed': 'noonewouldeverusethis7',
}

maigret_site = MaigretSite(url_mainpage.split('/')[-1], site_data)
maigret_site.update_from_engine(db.engines_dict[engine_name])
sites.append(maigret_site)

return sites

return None


async def check_features_manually(db, url_exists, url_mainpage, cookie_file):
url_parts = url_exists.split('/')
supposed_username = url_parts[-1]
new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
@@ -107,8 +132,14 @@ async def submit_dialog(db, url_exists):
url_user = url_exists.replace(supposed_username, '{username}')
url_not_exists = url_exists.replace(supposed_username, non_exist_username)

a = requests.get(url_exists).text
b = requests.get(url_not_exists).text
# cookies
cookie_dict = None
if cookie_file:
cookie_jar = await import_aiohttp_cookies(cookie_file)
cookie_dict = {c.key: c.value for c in cookie_jar}

a = requests.get(url_exists, cookies=cookie_dict).text
b = requests.get(url_not_exists, cookies=cookie_dict).text

tokens_a = set(a.split('"'))
tokens_b = set(b.split('"'))
@@ -133,21 +164,40 @@ async def submit_dialog(db, url_exists):
if features:
absence_list = features.split(',')

url_main = extract_domain(url_exists)

site_data = {
'absenceStrs': absence_list,
'presenseStrs': presence_list,
'url': url_user,
'urlMain': url_main,
'urlMain': url_mainpage,
'usernameClaimed': supposed_username,
'usernameUnclaimed': non_exist_username,
'checkType': 'message',
}

site = MaigretSite(url_main.split('/')[-1], site_data)
site = MaigretSite(url_mainpage.split('/')[-1], site_data)
return site

print(site.__dict__)
async def submit_dialog(db, url_exists, cookie_file):
domain_raw = URL_RE.sub('', url_exists).strip().strip('/')
domain_raw = domain_raw.split('/')[0]

# check for existence
matched_sites = list(filter(lambda x: domain_raw in x.url_main + x.url, db.sites))
if matched_sites:
print(f'Sites with domain "{domain_raw}" already exists in the Maigret database!')
status = lambda s: '(disabled)' if s.disabled else ''
url_block = lambda s: f'\n\t{s.url_main}\n\t{s.url}'
print('\n'.join([f'{site.name} {status(site)}{url_block(site)}' for site in matched_sites]))
return False

url_mainpage = extract_mainpage_url(url_exists)

sites = await detect_known_engine(db, url_exists, url_mainpage)
if not sites:
print('Unable to detect site engine, lets generate checking features')
sites = [await check_features_manually(db, url_exists, url_mainpage, cookie_file)]

print(sites[0].__dict__)

sem = asyncio.Semaphore(1)
log_level = logging.INFO
@@ -159,14 +209,24 @@ async def submit_dialog(db, url_exists):
logger = logging.getLogger('site-submit')
logger.setLevel(log_level)

result = await site_self_check(site, logger, sem, db)
found = False
chosen_site = None
for s in sites:
chosen_site = s
result = await site_self_check(s, logger, sem, db)
if not result['disabled']:
found = True
break

if result['disabled']:
print(f'Sorry, we couldn\'t find params to detect account presence/absence in {site.name}.')
if not found:
print(f'Sorry, we couldn\'t find params to detect account presence/absence in {chosen_site.name}.')
print('Try to run this mode again and increase features count or choose others.')
else:
if input(f'Site {site.name} successfully checked. Do you want to save it in the Maigret DB? [yY] ') in 'yY':
db.update_site(site)
if input(f'Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] ').lower() in 'y':
print(chosen_site.json)
site_data = chosen_site.strip_engine_data()
print(site_data.json)
db.update_site(site_data)
return True

return False
+22 -2
@@ -1,5 +1,4 @@
import re
import sys


class CaseConverter:
@@ -55,4 +54,25 @@ class URLMatcher:
url_regexp = url_main_part.replace('{username}', f'({username_regexp})')
regexp_str = self._HTTP_URL_RE_STR.replace('(.+)', url_regexp)

return re.compile(regexp_str)
return re.compile(regexp_str)


def get_dict_ascii_tree(items, prepend='', new_line=True):
text = ''
for num, item in enumerate(items):
box_symbol = '┣╸' if num != len(items) - 1 else '┗╸'

if type(item) == tuple:
field_name, field_value = item
if field_value.startswith('[\''):
is_last_item = num == len(items) - 1
prepend_symbols = ' ' * 3 if is_last_item else ' ┃ '
field_value = print_ascii_tree(eval(field_value), prepend_symbols)
text += f'\n{prepend}{box_symbol}{field_name}: {field_value}'
else:
text += f'\n{prepend}{box_symbol} {item}'

if not new_line:
text = text[1:]

return text
+3 -5
@@ -13,22 +13,20 @@ future==0.18.2
future-annotations==1.0.0
html5lib==1.1
idna==2.10
Jinja2==2.11.2
lxml==4.6.2
Jinja2==2.11.3
lxml==4.6.3
MarkupSafe==1.1.1
mock==4.0.2
multidict==5.1.0
Pillow==8.1.0
pycountry==20.7.3
PyPDF2==1.26.0
PySocks==1.7.1
python-bidi==0.4.2
python-socks==1.1.2
reportlab==3.5.59
requests>=2.24.0
requests-futures==1.0.0
six==1.15.0
socid-extractor>=0.0.13
socid-extractor>=0.0.16
soupsieve==2.1
stem==1.8.0
torrequest==0.1.0
@@ -12,7 +12,7 @@ with open('requirements.txt') as rf:
requires = rf.read().splitlines()

setup(name='maigret',
version='0.1.15',
version='0.1.17',
description='Collect a dossier on a person by username from a huge number of sites',
long_description=long_description,
long_description_content_type="text/markdown",
+2 -2
@@ -1,11 +1,11 @@
import glob
import logging
import os

import pytest
from _pytest.mark import Mark
from mock import Mock

from maigret.sites import MaigretDatabase, MaigretSite
from maigret.sites import MaigretDatabase

CUR_PATH = os.path.dirname(os.path.realpath(__file__))
JSON_FILE = os.path.join(CUR_PATH, '../maigret/resources/data.json')
@@ -1,5 +1,6 @@
"""Maigret activation test functions"""
import json

import aiohttp
import pytest
from mock import Mock
@@ -0,0 +1,66 @@
"""Maigret checking logic test functions"""
import pytest
import asyncio
import logging
from maigret.checking import AsyncioSimpleExecutor, AsyncioProgressbarExecutor, AsyncioProgressbarSemaphoreExecutor, AsyncioProgressbarQueueExecutor

logger = logging.getLogger(__name__)

async def func(n):
await asyncio.sleep(0.1 * (n % 3))
return n


@pytest.mark.asyncio
async def test_simple_asyncio_executor():
tasks = [(func, [n], {}) for n in range(10)]
executor = AsyncioSimpleExecutor(logger=logger)
assert await executor.run(tasks) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
assert executor.execution_time > 0.2
assert executor.execution_time < 0.3

@pytest.mark.asyncio
async def test_asyncio_progressbar_executor():
tasks = [(func, [n], {}) for n in range(10)]

executor = AsyncioProgressbarExecutor(logger=logger)
# no guarantees for the results order
assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
assert executor.execution_time > 0.2
assert executor.execution_time < 0.3


@pytest.mark.asyncio
async def test_asyncio_progressbar_semaphore_executor():
tasks = [(func, [n], {}) for n in range(10)]

executor = AsyncioProgressbarSemaphoreExecutor(logger=logger, in_parallel=5)
# no guarantees for the results order
assert sorted(await executor.run(tasks)) == [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
assert executor.execution_time > 0.2
assert executor.execution_time < 0.4


@pytest.mark.asyncio
async def test_asyncio_progressbar_queue_executor():
tasks = [(func, [n], {}) for n in range(10)]

executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=2)
assert await executor.run(tasks) == [0, 1, 3, 2, 4, 6, 7, 5, 9, 8]
assert executor.execution_time > 0.5
assert executor.execution_time < 0.6

executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=3)
assert await executor.run(tasks) == [0, 3, 1, 4, 6, 2, 7, 9, 5, 8]
assert executor.execution_time > 0.4
assert executor.execution_time < 0.5

executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=5)
assert await executor.run(tasks) == [0, 3, 6, 1, 4, 7, 9, 2, 5, 8]
assert executor.execution_time > 0.3
assert executor.execution_time < 0.4

executor = AsyncioProgressbarQueueExecutor(logger=logger, in_parallel=10)
assert await executor.run(tasks) == [0, 3, 6, 9, 1, 4, 7, 2, 5, 8]
assert executor.execution_time > 0.2
assert executor.execution_time < 0.3
@@ -1,10 +1,11 @@
"""Maigret main module test functions"""
import asyncio

import pytest
from mock import Mock

from maigret.maigret import self_check
from maigret.sites import MaigretDatabase, MaigretSite
from maigret.sites import MaigretDatabase

EXAMPLE_DB = {
'engines': {
+20 -20
@@ -1,33 +1,32 @@
"""Maigret Database test functions"""
from maigret.sites import MaigretDatabase, MaigretSite


EXAMPLE_DB = {
'engines': {
"XenForo": {
"presenseStrs": ["XenForo"],
"site": {
"absenceStrs": [
"The specified member cannot be found. Please enter a member's entire name.",
],
"checkType": "message",
"errors": {
"You must be logged-in to do that.": "Login required"
},
"url": "{urlMain}{urlSubpath}/members/?username={username}"
}
"presenseStrs": ["XenForo"],
"site": {
"absenceStrs": [
"The specified member cannot be found. Please enter a member's entire name.",
],
"checkType": "message",
"errors": {
"You must be logged-in to do that.": "Login required"
},
"url": "{urlMain}{urlSubpath}/members/?username={username}"
}
},
},
'sites': {
"Amperka": {
"engine": "XenForo",
"rank": 121613,
"tags": [
"ru"
],
"urlMain": "http://forum.amperka.ru",
"usernameClaimed": "adam",
"usernameUnclaimed": "noonewouldeverusethis7"
"engine": "XenForo",
"rank": 121613,
"tags": [
"ru"
],
"urlMain": "http://forum.amperka.ru",
"usernameClaimed": "adam",
"usernameUnclaimed": "noonewouldeverusethis7"
},
}
}
@@ -167,6 +166,7 @@ def test_ranked_sites_dict_disabled():
assert len(db.ranked_sites_dict()) == 2
assert len(db.ranked_sites_dict(disabled=False)) == 1

def test_ranked_sites_dict_id_type():
db = MaigretDatabase()
db.update_site(MaigretSite('1', {}))
+73 -39
@@ -1,66 +1,100 @@
"""Maigret utils test functions"""
import itertools
import re
from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher

from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher, get_dict_ascii_tree


def test_case_convert_camel_to_snake():
a = 'SnakeCasedString'
b = CaseConverter.camel_to_snake(a)
a = 'SnakeCasedString'
b = CaseConverter.camel_to_snake(a)

assert b == 'snake_cased_string'

assert b == 'snake_cased_string'

def test_case_convert_snake_to_camel():
a = 'camel_cased_string'
b = CaseConverter.snake_to_camel(a)
a = 'camel_cased_string'
b = CaseConverter.snake_to_camel(a)

assert b == 'camelCasedString'

assert b == 'camelCasedString'

def test_case_convert_snake_to_title():
a = 'camel_cased_string'
b = CaseConverter.snake_to_title(a)
a = 'camel_cased_string'
b = CaseConverter.snake_to_title(a)

assert b == 'Camel cased string'


def test_case_convert_camel_with_digits_to_snake():
a = 'ignore403'
b = CaseConverter.camel_to_snake(a)

assert b == 'ignore403'

assert b == 'Camel cased string'

def test_is_country_tag():
assert is_country_tag('ru') == True
assert is_country_tag('FR') == True
assert is_country_tag('ru') == True
assert is_country_tag('FR') == True

assert is_country_tag('a1') == False
assert is_country_tag('dating') == False
assert is_country_tag('a1') == False
assert is_country_tag('dating') == False

assert is_country_tag('global') == True

assert is_country_tag('global') == True

def test_enrich_link_str():
assert enrich_link_str('test') == 'test'
assert enrich_link_str(' www.flickr.com/photos/alexaimephotography/') == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
assert enrich_link_str('test') == 'test'
assert enrich_link_str(
' www.flickr.com/photos/alexaimephotography/') == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'


def test_url_extract_main_part():
url_main_part = 'flickr.com/photos/alexaimephotography'
url_main_part = 'flickr.com/photos/alexaimephotography'

parts = [
['http://', 'https://'],
['www.', ''],
[url_main_part],
['/', ''],
]
parts = [
['http://', 'https://'],
['www.', ''],
[url_main_part],
['/', ''],
]

url_regexp = re.compile('^https?://(www.)?flickr.com/photos/(.+?)$')
for url_parts in itertools.product(*parts):
url = ''.join(url_parts)
assert URLMatcher.extract_main_part(url) == url_main_part
assert not url_regexp.match(url) is None

url_regexp = re.compile('^https?://(www.)?flickr.com/photos/(.+?)$')
for url_parts in itertools.product(*parts):
url = ''.join(url_parts)
assert URLMatcher.extract_main_part(url) == url_main_part
assert not url_regexp.match(url) is None

def test_url_make_profile_url_regexp():
url_main_part = 'flickr.com/photos/{username}'
url_main_part = 'flickr.com/photos/{username}'

parts = [
['http://', 'https://'],
['www.', ''],
[url_main_part],
['/', ''],
]
parts = [
['http://', 'https://'],
['www.', ''],
[url_main_part],
['/', ''],
]

for url_parts in itertools.product(*parts):
url = ''.join(url_parts)
assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
for url_parts in itertools.product(*parts):
url = ''.join(url_parts)
assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'


def test_get_dict_ascii_tree():
data = {'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==', 'legacy_id': '26403415', 'username': 'alexaimephotographycars', 'name': 'Alex Aimé', 'created_at': '2018-05-04T10:17:01.000+0000', 'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b', 'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201', 'website': 'www.instagram.com/street.reality.photography/', 'facebook_link': ' www.instagram.com/street.reality.photography/', 'instagram_username': 'Street.Reality.Photography', 'twitter_username': 'Alexaimephotogr'}

ascii_tree = get_dict_ascii_tree(data.items())

assert ascii_tree == """
┣╸uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==
┣╸legacy_id: 26403415
┣╸username: alexaimephotographycars
┣╸name: Alex Aimé
┣╸created_at: 2018-05-04T10:17:01.000+0000
┣╸image: https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b
┣╸image_bg: https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201
┣╸website: www.instagram.com/street.reality.photography/
┣╸facebook_link: www.instagram.com/street.reality.photography/
┣╸instagram_username: Street.Reality.Photography
┗╸twitter_username: Alexaimephotogr"""
@@ -20,8 +20,9 @@ RANKS.update({
'5000': '5K',
'10000': '10K',
'100000': '100K',
'10000000': '1M',
'50000000': '10M',
'10000000': '10M',
'50000000': '50M',
'100000000': '100M',
})

SEMAPHORE = threading.Semaphore(10)
@@ -58,8 +59,9 @@ def get_rank(domain_to_query, site, print_errors=True):
def get_step_rank(rank):
def get_readable_rank(r):
return RANKS[str(r)]

valid_step_ranks = sorted(map(int, RANKS.keys()))
if rank == 0:
if rank == 0 or rank == sys.maxsize:
return get_readable_rank(valid_step_ranks[-1])
else:
return get_readable_rank(list(filter(lambda x: x >= rank, valid_step_ranks))[0])
@@ -73,6 +75,8 @@ if __name__ == '__main__':
help="JSON file with sites data to update.")

parser.add_argument('--empty-only', help='update only sites without rating', action='store_true')
parser.add_argument('--exclude-engine', help='do not update score with certain engine',
action="append", dest="exclude_engine_list", default=[])

pool = list()

@@ -92,6 +96,8 @@ Rank data fetched from Alexa by domains.
url_main = site.url_main
if site.alexa_rank < sys.maxsize and args.empty_only:
continue
if args.exclude_engine_list and site.engine in args.exclude_engine_list:
continue
site.alexa_rank = 0
th = threading.Thread(target=get_rank, args=(url_main, site))
pool.append((site.name, url_main, th))
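To illustrate the rank-bucketing logic changed above, here is a small self-contained sketch of the same step mapping; the RANKS values are a partial copy from the diff, and an unknown rank (0 or sys.maxsize) falls into the largest bucket:

```python
import sys

# step labels copied from the updated RANKS mapping above (partial, for illustration)
RANKS = {'5000': '5K', '10000': '10K', '100000': '100K',
         '10000000': '10M', '50000000': '50M', '100000000': '100M'}

def get_step_rank(rank):
    valid_step_ranks = sorted(map(int, RANKS.keys()))
    # unknown ranks (0 or sys.maxsize) fall into the largest bucket
    if rank == 0 or rank == sys.maxsize:
        return RANKS[str(valid_step_ranks[-1])]
    # otherwise pick the smallest step that is >= the raw Alexa rank
    return RANKS[str(next(x for x in valid_step_ranks if x >= rank))]

assert get_step_rank(121613) == '10M'
assert get_step_rank(0) == '100M'
```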