mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-13 18:05:39 +00:00
Draft of graph report
This commit is contained in:
@@ -36,6 +36,7 @@ from .utils import get_random_user_agent, ascii_data_display
|
||||
|
||||
|
||||
SUPPORTED_IDS = (
|
||||
"username",
|
||||
"yandex_public_id",
|
||||
"gaia_id",
|
||||
"vk_id",
|
||||
|
||||
+15
-12
@@ -34,6 +34,7 @@ from .report import (
|
||||
save_json_report,
|
||||
get_plaintext_report,
|
||||
sort_report_by_data_points,
|
||||
save_graph_report,
|
||||
)
|
||||
from .sites import MaigretDatabase
|
||||
from .submit import Submitter
|
||||
@@ -62,17 +63,6 @@ def notify_about_errors(search_results: QueryResultWrapper, query_notify):
|
||||
)
|
||||
|
||||
|
||||
def extract_ids_from_url(url: str, db: MaigretDatabase) -> dict:
|
||||
results = {}
|
||||
for s in db.sites:
|
||||
result = s.extract_id_from_url(url)
|
||||
if not result:
|
||||
continue
|
||||
_id, _type = result
|
||||
results[_id] = _type
|
||||
return results
|
||||
|
||||
|
||||
def extract_ids_from_page(url, logger, timeout=5) -> dict:
|
||||
results = {}
|
||||
# url, headers
|
||||
@@ -118,7 +108,7 @@ def extract_ids_from_results(results: QueryResultWrapper, db: MaigretDatabase) -
|
||||
ids_results[u] = utype
|
||||
|
||||
for url in dictionary.get('ids_links', []):
|
||||
ids_results.update(extract_ids_from_url(url, db))
|
||||
ids_results.update(db.extract_ids_from_url(url))
|
||||
|
||||
return ids_results
|
||||
|
||||
@@ -431,6 +421,14 @@ def setup_arguments_parser():
|
||||
default=False,
|
||||
help="Generate a PDF report (general report on all usernames).",
|
||||
)
|
||||
report_group.add_argument(
|
||||
"-G",
|
||||
"--graph",
|
||||
action="store_true",
|
||||
dest="graph",
|
||||
default=False,
|
||||
help="Generate a graph report (general report on all usernames).",
|
||||
)
|
||||
report_group.add_argument(
|
||||
"-J",
|
||||
"--json",
|
||||
@@ -693,6 +691,11 @@ async def main():
|
||||
save_pdf_report(filename, report_context)
|
||||
query_notify.warning(f'PDF report on all usernames saved in {filename}')
|
||||
|
||||
if args.graph:
|
||||
filename = report_filepath_tpl.format(username=username, postfix='.html')
|
||||
save_graph_report(filename, general_results, db)
|
||||
query_notify.warning(f'Graph report on all usernames saved in {filename}')
|
||||
|
||||
text_report = get_plaintext_report(report_context)
|
||||
if text_report:
|
||||
query_notify.info('Short text report:')
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import ast
|
||||
import csv
|
||||
import io
|
||||
import json
|
||||
@@ -11,8 +12,12 @@ import xmind
|
||||
from dateutil.parser import parse as parse_datetime_str
|
||||
from jinja2 import Template
|
||||
from xhtml2pdf import pisa
|
||||
from pyvis.network import Network
|
||||
import networkx as nx
|
||||
|
||||
from .checking import SUPPORTED_IDS
|
||||
from .result import QueryStatus
|
||||
from .sites import MaigretDatabase
|
||||
from .utils import is_country_tag, CaseConverter, enrich_link_str
|
||||
|
||||
SUPPORTED_JSON_REPORT_FORMATS = [
|
||||
@@ -82,6 +87,121 @@ def save_json_report(filename: str, username: str, results: dict, report_type: s
|
||||
generate_json_report(username, results, f, report_type=report_type)
|
||||
|
||||
|
||||
class MaigretGraph:
|
||||
other_params = {'size': 10, 'group': 3}
|
||||
site_params = {'size': 15, 'group': 2}
|
||||
username_params = {'size': 20, 'group': 1}
|
||||
|
||||
def __init__(self, graph):
|
||||
self.G = graph
|
||||
|
||||
def add_node(self, key, value):
|
||||
node_name = f'{key}: {value}'
|
||||
|
||||
params = self.other_params
|
||||
if key in SUPPORTED_IDS:
|
||||
params = self.username_params
|
||||
elif value.startswith('http'):
|
||||
params = self.site_params
|
||||
|
||||
self.G.add_node(node_name, title=node_name, **params)
|
||||
|
||||
if value != value.lower():
|
||||
normalized_node_name = self.add_node(key, value.lower())
|
||||
self.link(node_name, normalized_node_name)
|
||||
|
||||
return node_name
|
||||
|
||||
def link(self, node1_name, node2_name):
|
||||
self.G.add_edge(node1_name, node2_name, weight=2)
|
||||
|
||||
|
||||
def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
|
||||
G = nx.Graph()
|
||||
graph = MaigretGraph(G)
|
||||
|
||||
for username, id_type, results in username_results:
|
||||
username_node_name = graph.add_node(id_type, username)
|
||||
|
||||
for website_name in results:
|
||||
dictionary = results[website_name]
|
||||
# TODO: fix no site data issue
|
||||
if not dictionary:
|
||||
continue
|
||||
|
||||
if dictionary.get("is_similar"):
|
||||
continue
|
||||
|
||||
status = dictionary.get("status")
|
||||
if not status: # FIXME: currently in case of timeout
|
||||
continue
|
||||
|
||||
if dictionary["status"].status != QueryStatus.CLAIMED:
|
||||
continue
|
||||
|
||||
site_fallback_name = dictionary.get('url_user', f'{website_name}: {username.lower()}')
|
||||
# site_node_name = dictionary.get('url_user', f'{website_name}: {username.lower()}')
|
||||
site_node_name = graph.add_node('site', site_fallback_name)
|
||||
graph.link(username_node_name, site_node_name)
|
||||
|
||||
def process_ids(parent_node, ids):
|
||||
for k, v in ids.items():
|
||||
if k.endswith('_count') or k.startswith('is_') or k.endswith('_at'):
|
||||
continue
|
||||
if k in 'image':
|
||||
continue
|
||||
|
||||
v_data = v
|
||||
if v.startswith('['):
|
||||
try:
|
||||
v_data = ast.literal_eval(v)
|
||||
except Exception as e:
|
||||
logging.error(e)
|
||||
|
||||
# value is a list
|
||||
if isinstance(v_data, list):
|
||||
list_node_name = graph.add_node(k, site_fallback_name)
|
||||
for vv in v_data:
|
||||
data_node_name = graph.add_node(vv, site_fallback_name)
|
||||
graph.link(list_node_name, data_node_name)
|
||||
|
||||
add_ids = {a: b for b, a in db.extract_ids_from_url(vv).items()}
|
||||
if add_ids:
|
||||
process_ids(data_node_name, add_ids)
|
||||
else:
|
||||
# value is just a string
|
||||
# ids_data_name = f'{k}: {v}'
|
||||
# if ids_data_name == parent_node:
|
||||
# continue
|
||||
|
||||
ids_data_name = graph.add_node(k, v)
|
||||
# G.add_node(ids_data_name, size=10, title=ids_data_name, group=3)
|
||||
graph.link(parent_node, ids_data_name)
|
||||
|
||||
# check for username
|
||||
if 'username' in k or k in SUPPORTED_IDS:
|
||||
new_username_node_name = graph.add_node('username', v)
|
||||
graph.link(ids_data_name, new_username_node_name)
|
||||
|
||||
add_ids = {k: v for v, k in db.extract_ids_from_url(v).items()}
|
||||
if add_ids:
|
||||
process_ids(ids_data_name, add_ids)
|
||||
|
||||
if status.ids_data:
|
||||
process_ids(site_node_name, status.ids_data)
|
||||
|
||||
nodes_to_remove = []
|
||||
for node in G.nodes:
|
||||
if len(str(node)) > 100:
|
||||
nodes_to_remove.append(node)
|
||||
|
||||
[G.remove_node(node) for node in nodes_to_remove]
|
||||
|
||||
nt = Network(notebook=True, height="750px", width="100%")
|
||||
nt.from_nx(G)
|
||||
nt.show(filename)
|
||||
|
||||
|
||||
def get_plaintext_report(context: dict) -> str:
|
||||
output = (context['brief'] + " ").replace('. ', '.\n')
|
||||
interests = list(map(lambda x: x[0], context.get('interests_tuple_list', [])))
|
||||
|
||||
@@ -3643,6 +3643,7 @@
|
||||
"errors": {
|
||||
"Invalid API key": "New API key needed"
|
||||
},
|
||||
"regexCheck": "^[^/]+$",
|
||||
"urlProbe": "https://disqus.com/api/3.0/users/details?user=username%3A{username}&attach=userFlaggedUser&api_key=E8Uh5l5fHZ6gD8U3KycjAIAk46f68Zw7C6eW8WSjZvCLXebZ7p0r1yrYDrLilk2F",
|
||||
"checkType": "status_code",
|
||||
"presenseStrs": [
|
||||
@@ -13036,7 +13037,7 @@
|
||||
"us"
|
||||
],
|
||||
"headers": {
|
||||
"authorization": "Bearer BQAkHoH1XLhjIl6oh6r9YzH3kHC1OZg3UXgLiz39FzqRFh_xQrFaVrZcU-esM-t87B6Hqdc4L1HBgukKnWE"
|
||||
"authorization": "Bearer BQBbhm9gxBxIDmwZvO8mzV28G7V07L57WlKILvhXijRaTxwh9N03yHxSLADfioU3uWYDAjjq_mMWQSbQ2OA"
|
||||
},
|
||||
"errors": {
|
||||
"Spotify is currently not available in your country.": "Access denied in your country, use proxy/vpn"
|
||||
@@ -14463,7 +14464,7 @@
|
||||
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
|
||||
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
|
||||
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
|
||||
"x-guest-token": "1403829602053771266"
|
||||
"x-guest-token": "1404906435025195008"
|
||||
},
|
||||
"errors": {
|
||||
"Bad guest token": "x-guest-token update required"
|
||||
@@ -14870,7 +14871,7 @@
|
||||
"video"
|
||||
],
|
||||
"headers": {
|
||||
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM1MzQ5NjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.5T8_p_q9zXOHXI2FT_XtMhsZUJMtPgCIaqwVF2u4aZI"
|
||||
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE2MjM3OTYyNjAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbH0.ZUCw6WWIPIoVy1zoj8AKA1EMfX6ao7hJI2pWxgAZlac"
|
||||
},
|
||||
"activation": {
|
||||
"url": "https://vimeo.com/_rv/viewer",
|
||||
|
||||
@@ -400,6 +400,18 @@ class MaigretDatabase:
|
||||
|
||||
return found_flags
|
||||
|
||||
|
||||
def extract_ids_from_url(self, url: str) -> dict:
|
||||
results = {}
|
||||
for s in self._sites:
|
||||
result = s.extract_id_from_url(url)
|
||||
if not result:
|
||||
continue
|
||||
_id, _type = result
|
||||
results[_id] = _type
|
||||
return results
|
||||
|
||||
|
||||
def get_db_stats(self, sites_dict):
|
||||
if not sites_dict:
|
||||
sites_dict = self.sites_dict()
|
||||
|
||||
Reference in New Issue
Block a user