diff --git a/Makefile b/Makefile index e41f1ae..33f4da3 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ test: coverage html rerun-tests: - pytest --lf + pytest --lf -vv lint: @echo 'syntax errors or undefined names' diff --git a/example.ipynb b/example.ipynb new file mode 100644 index 0000000..7ba621f --- /dev/null +++ b/example.ipynb @@ -0,0 +1,68 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "8v6PEfyXb0Gx" + }, + "outputs": [], + "source": [ + "# clone the repo\n", + "!git clone https://github.com/soxoj/maigret\n", + "!pip3 install -r maigret/requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cXOQUAhDchkl" + }, + "outputs": [], + "source": [ + "# help\n", + "!python3 maigret/maigret.py --help" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "SjDmpN4QGnJu" + }, + "outputs": [], + "source": [ + "# search\n", + "!python3 maigret/maigret.py user" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "include_colab_link": true, + "name": "maigret.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/maigret/checking.py b/maigret/checking.py index 11fcd73..0a682a5 100644 --- a/maigret/checking.py +++ b/maigret/checking.py @@ -36,6 +36,7 @@ from .utils import get_random_user_agent, ascii_data_display SUPPORTED_IDS = ( + "username", "yandex_public_id", "gaia_id", "vk_id", diff --git a/maigret/maigret.py b/maigret/maigret.py index 77f6122..5da87d5 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -34,6 +34,7 @@ 
from .report import ( save_json_report, get_plaintext_report, sort_report_by_data_points, + save_graph_report, ) from .sites import MaigretDatabase from .submit import Submitter @@ -62,17 +63,6 @@ def notify_about_errors(search_results: QueryResultWrapper, query_notify): ) -def extract_ids_from_url(url: str, db: MaigretDatabase) -> dict: - results = {} - for s in db.sites: - result = s.extract_id_from_url(url) - if not result: - continue - _id, _type = result - results[_id] = _type - return results - - def extract_ids_from_page(url, logger, timeout=5) -> dict: results = {} # url, headers @@ -118,7 +108,7 @@ def extract_ids_from_results(results: QueryResultWrapper, db: MaigretDatabase) - ids_results[u] = utype for url in dictionary.get('ids_links', []): - ids_results.update(extract_ids_from_url(url, db)) + ids_results.update(db.extract_ids_from_url(url)) return ids_results @@ -431,6 +421,14 @@ def setup_arguments_parser(): default=False, help="Generate a PDF report (general report on all usernames).", ) + report_group.add_argument( + "-G", + "--graph", + action="store_true", + dest="graph", + default=False, + help="Generate a graph report (general report on all usernames).", + ) report_group.add_argument( "-J", "--json", @@ -693,6 +691,11 @@ async def main(): save_pdf_report(filename, report_context) query_notify.warning(f'PDF report on all usernames saved in {filename}') + if args.graph: + filename = report_filepath_tpl.format(username=username, postfix='.html') + save_graph_report(filename, general_results, db) + query_notify.warning(f'Graph report on all usernames saved in {filename}') + text_report = get_plaintext_report(report_context) if text_report: query_notify.info('Short text report:') diff --git a/maigret/report.py b/maigret/report.py index 2aa1800..a43b110 100644 --- a/maigret/report.py +++ b/maigret/report.py @@ -1,3 +1,4 @@ +import ast import csv import io import json @@ -11,8 +12,12 @@ import xmind from dateutil.parser import parse as
parse_datetime_str from jinja2 import Template from xhtml2pdf import pisa +from pyvis.network import Network +import networkx as nx +from .checking import SUPPORTED_IDS from .result import QueryStatus +from .sites import MaigretDatabase from .utils import is_country_tag, CaseConverter, enrich_link_str SUPPORTED_JSON_REPORT_FORMATS = [ @@ -82,6 +87,121 @@ def save_json_report(filename: str, username: str, results: dict, report_type: s generate_json_report(username, results, f, report_type=report_type) +class MaigretGraph: + other_params = {'size': 10, 'group': 3} + site_params = {'size': 15, 'group': 2} + username_params = {'size': 20, 'group': 1} + + def __init__(self, graph): + self.G = graph + + def add_node(self, key, value): + node_name = f'{key}: {value}' + + params = self.other_params + if key in SUPPORTED_IDS: + params = self.username_params + elif value.startswith('http'): + params = self.site_params + + self.G.add_node(node_name, title=node_name, **params) + + if value != value.lower(): + normalized_node_name = self.add_node(key, value.lower()) + self.link(node_name, normalized_node_name) + + return node_name + + def link(self, node1_name, node2_name): + self.G.add_edge(node1_name, node2_name, weight=2) + + +def save_graph_report(filename: str, username_results: list, db: MaigretDatabase): + G = nx.Graph() + graph = MaigretGraph(G) + + for username, id_type, results in username_results: + username_node_name = graph.add_node(id_type, username) + + for website_name in results: + dictionary = results[website_name] + # TODO: fix no site data issue + if not dictionary: + continue + + if dictionary.get("is_similar"): + continue + + status = dictionary.get("status") + if not status: # FIXME: currently in case of timeout + continue + + if dictionary["status"].status != QueryStatus.CLAIMED: + continue + + site_fallback_name = dictionary.get('url_user', f'{website_name}: {username.lower()}') + # site_node_name = dictionary.get('url_user', f'{website_name}: 
{username.lower()}') + site_node_name = graph.add_node('site', site_fallback_name) + graph.link(username_node_name, site_node_name) + + def process_ids(parent_node, ids): + for k, v in ids.items(): + if k.endswith('_count') or k.startswith('is_') or k.endswith('_at'): + continue + if k == 'image': + continue + + v_data = v + if v.startswith('['): + try: + v_data = ast.literal_eval(v) + except Exception as e: + logging.error(e) + + # value is a list + if isinstance(v_data, list): + list_node_name = graph.add_node(k, site_fallback_name) + for vv in v_data: + data_node_name = graph.add_node(vv, site_fallback_name) + graph.link(list_node_name, data_node_name) + + add_ids = {a: b for b, a in db.extract_ids_from_url(vv).items()} + if add_ids: + process_ids(data_node_name, add_ids) + else: + # value is just a string + # ids_data_name = f'{k}: {v}' + # if ids_data_name == parent_node: + # continue + + ids_data_name = graph.add_node(k, v) + # G.add_node(ids_data_name, size=10, title=ids_data_name, group=3) + graph.link(parent_node, ids_data_name) + + # check for username + if 'username' in k or k in SUPPORTED_IDS: + new_username_node_name = graph.add_node('username', v) + graph.link(ids_data_name, new_username_node_name) + + add_ids = {k: v for v, k in db.extract_ids_from_url(v).items()} + if add_ids: + process_ids(ids_data_name, add_ids) + + if status.ids_data: + process_ids(site_node_name, status.ids_data) + + nodes_to_remove = [] + for node in G.nodes: + if len(str(node)) > 100: + nodes_to_remove.append(node) + + [G.remove_node(node) for node in nodes_to_remove] + + nt = Network(notebook=True, height="750px", width="100%") + nt.from_nx(G) + nt.show(filename) + + def get_plaintext_report(context: dict) -> str: output = (context['brief'] + " ").replace('. 
', '.\n') interests = list(map(lambda x: x[0], context.get('interests_tuple_list', []))) diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 9bc14eb..2b3b5f9 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -3643,6 +3643,7 @@ "errors": { "Invalid API key": "New API key needed" }, + "regexCheck": "^[^/]+$", "urlProbe": "https://disqus.com/api/3.0/users/details?user=username%3A{username}&attach=userFlaggedUser&api_key=E8Uh5l5fHZ6gD8U3KycjAIAk46f68Zw7C6eW8WSjZvCLXebZ7p0r1yrYDrLilk2F", "checkType": "status_code", "presenseStrs": [ @@ -13036,7 +13037,7 @@ "us" ], "headers": { - "authorization": "Bearer BQAkHoH1XLhjIl6oh6r9YzH3kHC1OZg3UXgLiz39FzqRFh_xQrFaVrZcU-esM-t87B6Hqdc4L1HBgukKnWE" + "authorization": "Bearer BQBbhm9gxBxIDmwZvO8mzV28G7V07L57WlKILvhXijRaTxwh9N03yHxSLADfioU3uWYDAjjq_mMWQSbQ2OA" }, "errors": { "Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn" @@ -14463,7 +14464,7 @@ "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"", "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA", "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36", - "x-guest-token": "1403829602053771266" + "x-guest-token": "1404906435025195008" }, "errors": { "Bad guest token": "x-guest-token update required" @@ -14870,7 +14871,7 @@ "video" ], "headers": { - "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM1MzQ5NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.5T8_p_q9zXOHXI2FT_XtMhsZUJMtPgCIaqwVF2u4aZI" + "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM3OTYyNjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.ZUCw6WWIPIoVy1zoj8AKA1EMfX6ao7hJI2pWxgAZlac" 
}, "activation": { "url": "https://vimeo.com/_rv/viewer", diff --git a/maigret/sites.py b/maigret/sites.py index 2df121d..03d02bb 100644 --- a/maigret/sites.py +++ b/maigret/sites.py @@ -400,6 +400,18 @@ class MaigretDatabase: return found_flags + + def extract_ids_from_url(self, url: str) -> dict: + results = {} + for s in self._sites: + result = s.extract_id_from_url(url) + if not result: + continue + _id, _type = result + results[_id] = _type + return results + + def get_db_stats(self, sites_dict): if not sites_dict: sites_dict = self.sites_dict() diff --git a/requirements.txt b/requirements.txt index aa81c18..9fbf6e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,3 +37,5 @@ webencodings==0.5.1 xhtml2pdf==0.2.5 XMind==1.2.0 yarl==1.6.3 +networkx==2.5.1 +pyvis==0.1.9 diff --git a/tests/test_cli.py b/tests/test_cli.py index eee3741..5a3f53e 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -13,6 +13,7 @@ DEFAULT_ARGS: Dict[str, Any] = { 'disable_recursive_search': False, 'folderoutput': 'reports', 'html': False, + 'graph': False, 'id_type': 'username', 'ignore_ids_list': [], 'info': False, diff --git a/tests/test_maigret.py b/tests/test_maigret.py index 8f7fc6b..0c05262 100644 --- a/tests/test_maigret.py +++ b/tests/test_maigret.py @@ -9,7 +9,6 @@ from maigret.maigret import self_check, maigret from maigret.maigret import ( extract_ids_from_page, extract_ids_from_results, - extract_ids_from_url, ) from maigret.sites import MaigretSite from maigret.result import QueryResult, QueryStatus @@ -144,18 +143,18 @@ def test_maigret_results(test_db): def test_extract_ids_from_url(default_db): - assert extract_ids_from_url('https://www.reddit.com/user/test', default_db) == { + assert default_db.extract_ids_from_url('https://www.reddit.com/user/test') == { 'test': 'username' } - assert extract_ids_from_url('https://vk.com/id123', default_db) == {'123': 'vk_id'} - assert extract_ids_from_url('https://vk.com/ida123', default_db) == { + assert 
default_db.extract_ids_from_url('https://vk.com/id123') == {'123': 'vk_id'} + assert default_db.extract_ids_from_url('https://vk.com/ida123') == { 'ida123': 'username' } - assert extract_ids_from_url( - 'https://my.mail.ru/yandex.ru/dipres8904/', default_db + assert default_db.extract_ids_from_url( + 'https://my.mail.ru/yandex.ru/dipres8904/' ) == {'dipres8904': 'username'} - assert extract_ids_from_url( - 'https://reviews.yandex.ru/user/adbced123', default_db + assert default_db.extract_ids_from_url( + 'https://reviews.yandex.ru/user/adbced123' ) == {'adbced123': 'yandex_public_id'}