Draft of graph report

This commit is contained in:
Soxoj
2021-06-21 22:14:24 +03:00
parent 49708da980
commit c7977f1cdf
10 changed files with 231 additions and 24 deletions
+1 -1
View File
@@ -6,7 +6,7 @@ test:
coverage html coverage html
rerun-tests: rerun-tests:
pytest --lf pytest --lf -vv
lint: lint:
@echo 'syntax errors or undefined names' @echo 'syntax errors or undefined names'
+68
View File
@@ -0,0 +1,68 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "8v6PEfyXb0Gx"
},
"outputs": [],
"source": [
"# clone the repo\n",
"!git clone https://github.com/soxoj/maigret\n",
"!pip3 install -r maigret/requirements.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cXOQUAhDchkl"
},
"outputs": [],
"source": [
"# help\n",
"!python3 maigret/maigret.py --help"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "SjDmpN4QGnJu"
},
"outputs": [],
"source": [
"# search\n",
"!python3 maigret/maigret.py user"
]
}
],
"metadata": {
"colab": {
"collapsed_sections": [],
"include_colab_link": true,
"name": "maigret.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
+1
View File
@@ -36,6 +36,7 @@ from .utils import get_random_user_agent, ascii_data_display
SUPPORTED_IDS = ( SUPPORTED_IDS = (
"username",
"yandex_public_id", "yandex_public_id",
"gaia_id", "gaia_id",
"vk_id", "vk_id",
+15 -12
View File
@@ -34,6 +34,7 @@ from .report import (
save_json_report, save_json_report,
get_plaintext_report, get_plaintext_report,
sort_report_by_data_points, sort_report_by_data_points,
save_graph_report,
) )
from .sites import MaigretDatabase from .sites import MaigretDatabase
from .submit import Submitter from .submit import Submitter
@@ -62,17 +63,6 @@ def notify_about_errors(search_results: QueryResultWrapper, query_notify):
) )
def extract_ids_from_url(url: str, db: MaigretDatabase) -> dict:
    """Ask every site in ``db.sites`` to parse an identifier out of ``url``.

    Returns a dict mapping each extracted identifier to its type. Sites
    whose ``extract_id_from_url`` returns a falsy value are ignored, and a
    later site overwrites an earlier one that produced the same identifier.
    """
    found = {}
    for site in db.sites:
        extracted = site.extract_id_from_url(url)
        if extracted:
            id_value, id_type = extracted
            found[id_value] = id_type
    return found
def extract_ids_from_page(url, logger, timeout=5) -> dict: def extract_ids_from_page(url, logger, timeout=5) -> dict:
results = {} results = {}
# url, headers # url, headers
@@ -118,7 +108,7 @@ def extract_ids_from_results(results: QueryResultWrapper, db: MaigretDatabase) -
ids_results[u] = utype ids_results[u] = utype
for url in dictionary.get('ids_links', []): for url in dictionary.get('ids_links', []):
ids_results.update(extract_ids_from_url(url, db)) ids_results.update(db.extract_ids_from_url(url))
return ids_results return ids_results
@@ -431,6 +421,14 @@ def setup_arguments_parser():
default=False, default=False,
help="Generate a PDF report (general report on all usernames).", help="Generate a PDF report (general report on all usernames).",
) )
report_group.add_argument(
"-G",
"--graph",
action="store_true",
dest="graph",
default=False,
help="Generate a graph report (general report on all usernames).",
)
report_group.add_argument( report_group.add_argument(
"-J", "-J",
"--json", "--json",
@@ -693,6 +691,11 @@ async def main():
save_pdf_report(filename, report_context) save_pdf_report(filename, report_context)
query_notify.warning(f'PDF report on all usernames saved in {filename}') query_notify.warning(f'PDF report on all usernames saved in {filename}')
if args.graph:
filename = report_filepath_tpl.format(username=username, postfix='.html')
save_graph_report(filename, general_results, db)
query_notify.warning(f'Graph report on all usernames saved in {filename}')
text_report = get_plaintext_report(report_context) text_report = get_plaintext_report(report_context)
if text_report: if text_report:
query_notify.info('Short text report:') query_notify.info('Short text report:')
+120
View File
@@ -1,3 +1,4 @@
import ast
import csv import csv
import io import io
import json import json
@@ -11,8 +12,12 @@ import xmind
from dateutil.parser import parse as parse_datetime_str from dateutil.parser import parse as parse_datetime_str
from jinja2 import Template from jinja2 import Template
from xhtml2pdf import pisa from xhtml2pdf import pisa
from pyvis.network import Network
import networkx as nx
from .checking import SUPPORTED_IDS
from .result import QueryStatus from .result import QueryStatus
from .sites import MaigretDatabase
from .utils import is_country_tag, CaseConverter, enrich_link_str from .utils import is_country_tag, CaseConverter, enrich_link_str
SUPPORTED_JSON_REPORT_FORMATS = [ SUPPORTED_JSON_REPORT_FORMATS = [
@@ -82,6 +87,121 @@ def save_json_report(filename: str, username: str, results: dict, report_type: s
generate_json_report(username, results, f, report_type=report_type) generate_json_report(username, results, f, report_type=report_type)
class MaigretGraph:
    """Helper around a graph object that adds styled ``"key: value"`` nodes."""

    # Keyword attributes forwarded to ``G.add_node`` for each node category.
    other_params = {'size': 10, 'group': 3}
    site_params = {'size': 15, 'group': 2}
    username_params = {'size': 20, 'group': 1}

    def __init__(self, graph):
        # The wrapped graph; it must provide ``add_node`` and ``add_edge``.
        self.G = graph

    def add_node(self, key, value):
        """Add a node labelled ``"<key>: <value>"`` and return that label.

        The node gets the username style when ``key`` is one of
        ``SUPPORTED_IDS``, the site style when ``value`` starts with
        ``'http'``, and the generic style otherwise. If ``value`` contains
        characters that change under ``lower()``, a second node is added for
        the lower-cased value and linked to this one.
        """
        label = f'{key}: {value}'

        if key in SUPPORTED_IDS:
            style = self.username_params
        elif value.startswith('http'):
            style = self.site_params
        else:
            style = self.other_params

        self.G.add_node(label, title=label, **style)

        lowered = value.lower()
        if lowered != value:
            # Connect the original spelling to its case-normalized twin.
            self.link(label, self.add_node(key, lowered))

        return label

    def link(self, node1_name, node2_name):
        """Connect two existing nodes with an edge of weight 2."""
        self.G.add_edge(node1_name, node2_name, weight=2)
def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
    """Build a graph of claimed accounts and their extracted ids, then render it.

    Args:
        filename: Path passed to pyvis ``Network.show`` for the rendered graph.
        username_results: Iterable of ``(username, id_type, results)`` triples,
            where ``results`` maps a website name to a per-site dict. Each
            per-site dict is read for the keys ``"is_similar"``, ``"status"``
            and ``"url_user"``.
        db: Site database, used to extract further identifiers from values
            via ``db.extract_ids_from_url``.

    Only sites whose status is ``QueryStatus.CLAIMED`` become nodes. Nodes
    whose label is longer than 100 characters are dropped before rendering.
    """
    G = nx.Graph()
    graph = MaigretGraph(G)

    for username, id_type, results in username_results:
        username_node_name = graph.add_node(id_type, username)

        for website_name in results:
            dictionary = results[website_name]
            # TODO: fix no site data issue
            if not dictionary:
                continue

            if dictionary.get("is_similar"):
                continue

            status = dictionary.get("status")
            if not status:  # FIXME: currently in case of timeout
                continue

            if status.status != QueryStatus.CLAIMED:
                continue

            site_fallback_name = dictionary.get(
                'url_user', f'{website_name}: {username.lower()}'
            )
            site_node_name = graph.add_node('site', site_fallback_name)
            graph.link(username_node_name, site_node_name)

            def process_ids(parent_node, ids):
                """Add nodes for the fields in ``ids`` under ``parent_node``.

                Recurses into any identifiers that ``db`` can extract from a
                field value.
                """
                for k, v in ids.items():
                    # Skip counters, boolean flags and timestamps.
                    if k.endswith('_count') or k.startswith('is_') or k.endswith('_at'):
                        continue
                    # Exact comparison: the substring test ``k in 'image'``
                    # would also drop unrelated keys such as 'age'.
                    if k == 'image':
                        continue

                    v_data = v
                    if v.startswith('['):
                        # The value looks like a list literal; try to parse
                        # it and keep the raw string if parsing fails.
                        try:
                            v_data = ast.literal_eval(v)
                        except Exception as e:
                            logging.error(e)

                    if isinstance(v_data, list):
                        # value is a list
                        list_node_name = graph.add_node(k, site_fallback_name)
                        for vv in v_data:
                            data_node_name = graph.add_node(vv, site_fallback_name)
                            graph.link(list_node_name, data_node_name)

                            # Invert {id: type} into {type: id} so the result
                            # has the same shape as ``ids``.
                            add_ids = {
                                a: b for b, a in db.extract_ids_from_url(vv).items()
                            }
                            if add_ids:
                                process_ids(data_node_name, add_ids)
                    else:
                        # value is just a string
                        ids_data_name = graph.add_node(k, v)
                        graph.link(parent_node, ids_data_name)

                        # check for username
                        if 'username' in k or k in SUPPORTED_IDS:
                            new_username_node_name = graph.add_node('username', v)
                            graph.link(ids_data_name, new_username_node_name)

                        # Invert {id: type} into {type: id}, as above.
                        add_ids = {
                            found_type: found_id
                            for found_id, found_type in db.extract_ids_from_url(v).items()
                        }
                        if add_ids:
                            process_ids(ids_data_name, add_ids)

            if status.ids_data:
                process_ids(site_node_name, status.ids_data)

    # Collect first, then remove: the graph must not change while iterating.
    nodes_to_remove = [node for node in G.nodes if len(str(node)) > 100]
    G.remove_nodes_from(nodes_to_remove)

    nt = Network(notebook=True, height="750px", width="100%")
    nt.from_nx(G)
    nt.show(filename)
def get_plaintext_report(context: dict) -> str: def get_plaintext_report(context: dict) -> str:
output = (context['brief'] + " ").replace('. ', '.\n') output = (context['brief'] + " ").replace('. ', '.\n')
interests = list(map(lambda x: x[0], context.get('interests_tuple_list', []))) interests = list(map(lambda x: x[0], context.get('interests_tuple_list', [])))
+4 -3
View File
@@ -3643,6 +3643,7 @@
"errors": { "errors": {
"Invalid API key": "New API key needed" "Invalid API key": "New API key needed"
}, },
"regexCheck": "^[^/]+$",
"urlProbe": "https://disqus.com/api/3.0/users/details?user=username%3A{username}&attach=userFlaggedUser&api_key=E8Uh5l5fHZ6gD8U3KycjAIAk46f68Zw7C6eW8WSjZvCLXebZ7p0r1yrYDrLilk2F", "urlProbe": "https://disqus.com/api/3.0/users/details?user=username%3A{username}&attach=userFlaggedUser&api_key=E8Uh5l5fHZ6gD8U3KycjAIAk46f68Zw7C6eW8WSjZvCLXebZ7p0r1yrYDrLilk2F",
"checkType": "status_code", "checkType": "status_code",
"presenseStrs": [ "presenseStrs": [
@@ -13036,7 +13037,7 @@
"us" "us"
], ],
"headers": { "headers": {
"authorization": "Bearer BQAkHoH1XLhjIl6oh6r9YzH3kHC1OZg3UXgLiz39FzqRFh_xQrFaVrZcU-esM-t87B6Hqdc4L1HBgukKnWE" "authorization": "Bearer BQBbhm9gxBxIDmwZvO8mzV28G7V07L57WlKILvhXijRaTxwh9N03yHxSLADfioU3uWYDAjjq_mMWQSbQ2OA"
}, },
"errors": { "errors": {
"Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn" "Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
@@ -14463,7 +14464,7 @@
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"", "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA", "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36", "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"x-guest-token": "1403829602053771266" "x-guest-token": "1404906435025195008"
}, },
"errors": { "errors": {
"Bad guest token": "x-guest-token update required" "Bad guest token": "x-guest-token update required"
@@ -14870,7 +14871,7 @@
"video" "video"
], ],
"headers": { "headers": {
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM1MzQ5NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.5T8_p_q9zXOHXI2FT_XtMhsZUJMtPgCIaqwVF2u4aZI" "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM3OTYyNjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.ZUCw6WWIPIoVy1zoj8AKA1EMfX6ao7hJI2pWxgAZlac"
}, },
"activation": { "activation": {
"url": "https://vimeo.com/_rv/viewer", "url": "https://vimeo.com/_rv/viewer",
+12
View File
@@ -400,6 +400,18 @@ class MaigretDatabase:
return found_flags return found_flags
def extract_ids_from_url(self, url: str) -> dict:
    """Ask every site in ``self._sites`` to parse an identifier out of ``url``.

    Returns a dict mapping each extracted identifier to its type. Sites
    whose ``extract_id_from_url`` returns a falsy value are ignored, and a
    later site overwrites an earlier one that produced the same identifier.
    """
    found = {}
    for site in self._sites:
        extracted = site.extract_id_from_url(url)
        if extracted:
            id_value, id_type = extracted
            found[id_value] = id_type
    return found
def get_db_stats(self, sites_dict): def get_db_stats(self, sites_dict):
if not sites_dict: if not sites_dict:
sites_dict = self.sites_dict() sites_dict = self.sites_dict()
+2
View File
@@ -37,3 +37,5 @@ webencodings==0.5.1
xhtml2pdf==0.2.5 xhtml2pdf==0.2.5
XMind==1.2.0 XMind==1.2.0
yarl==1.6.3 yarl==1.6.3
networkx==2.5.1
pyvis==0.1.9
+1
View File
@@ -13,6 +13,7 @@ DEFAULT_ARGS: Dict[str, Any] = {
'disable_recursive_search': False, 'disable_recursive_search': False,
'folderoutput': 'reports', 'folderoutput': 'reports',
'html': False, 'html': False,
'graph': False,
'id_type': 'username', 'id_type': 'username',
'ignore_ids_list': [], 'ignore_ids_list': [],
'info': False, 'info': False,
+7 -8
View File
@@ -9,7 +9,6 @@ from maigret.maigret import self_check, maigret
from maigret.maigret import ( from maigret.maigret import (
extract_ids_from_page, extract_ids_from_page,
extract_ids_from_results, extract_ids_from_results,
extract_ids_from_url,
) )
from maigret.sites import MaigretSite from maigret.sites import MaigretSite
from maigret.result import QueryResult, QueryStatus from maigret.result import QueryResult, QueryStatus
@@ -144,18 +143,18 @@ def test_maigret_results(test_db):
def test_extract_ids_from_url(default_db): def test_extract_ids_from_url(default_db):
assert extract_ids_from_url('https://www.reddit.com/user/test', default_db) == { assert default_db.extract_ids_from_url('https://www.reddit.com/user/test') == {
'test': 'username' 'test': 'username'
} }
assert extract_ids_from_url('https://vk.com/id123', default_db) == {'123': 'vk_id'} assert default_db.extract_ids_from_url('https://vk.com/id123') == {'123': 'vk_id'}
assert extract_ids_from_url('https://vk.com/ida123', default_db) == { assert default_db.extract_ids_from_url('https://vk.com/ida123') == {
'ida123': 'username' 'ida123': 'username'
} }
assert extract_ids_from_url( assert default_db.extract_ids_from_url(
'https://my.mail.ru/yandex.ru/dipres8904/', default_db 'https://my.mail.ru/yandex.ru/dipres8904/'
) == {'dipres8904': 'username'} ) == {'dipres8904': 'username'}
assert extract_ids_from_url( assert default_db.extract_ids_from_url(
'https://reviews.yandex.ru/user/adbced123', default_db 'https://reviews.yandex.ru/user/adbced123'
) == {'adbced123': 'yandex_public_id'} ) == {'adbced123': 'yandex_public_id'}