mirror of https://github.com/soxoj/maigret.git
synced 2026-05-06 22:19:01 +00:00
572 lines · 19 KiB · Python
import ast
|
|
import csv
|
|
import io
|
|
import json
|
|
import logging
|
|
import os
|
|
from datetime import datetime
|
|
from typing import Dict, Any
|
|
|
|
import xmind # type: ignore[import-untyped]
|
|
from dateutil.tz import gettz
|
|
from dateutil.parser import parse as parse_datetime_str
|
|
from jinja2 import Template
|
|
|
|
from .checking import SUPPORTED_IDS
|
|
from .result import MaigretCheckStatus
|
|
from .sites import MaigretDatabase
|
|
from .utils import is_country_tag, CaseConverter, enrich_link_str
|
|
|
|
|
|
# "CDT" is ambiguous to dateutil (several zones use it); pin it to US Central
# so parse_datetime_str can resolve timestamps like "... CDT" deterministically.
ADDITIONAL_TZINFO = {"CDT": gettz("America/Chicago")}

# JSON report flavors understood by save_json_report/generate_json_report:
# "simple" — one JSON object covering all sites; "ndjson" — one object per line.
SUPPORTED_JSON_REPORT_FORMATS = [
    "simple",
    "ndjson",
]


"""
UTILS
"""
|
|
|
|
|
|
def filter_supposed_data(data):
    """Reduce supposed-data to the interesting fields only.

    Keeps fullname/gender/location/age, converts each key to Title Case,
    and takes only the first collected value for each field.
    """
    wanted = ("fullname", "gender", "location", "age")
    result = {}
    for field, values in data.items():
        if field in wanted:
            result[CaseConverter.snake_to_title(field)] = values[0]
    return result
|
|
|
|
|
|
def sort_report_by_data_points(results):
    """Return *results* as a dict ordered by extracted-data richness.

    Sites whose status carries the most ids_data entries come first;
    sites with no status (or no ids_data) sort last. Stable for ties.
    """

    def _num_data_points(entry):
        _, site_result = entry
        status = site_result.get('status')
        ids_data = (status and status.ids_data) or {}
        return len(ids_data)

    ordered = sorted(results.items(), key=_num_data_points, reverse=True)
    return dict(ordered)
|
|
|
|
|
|
"""
|
|
REPORTS SAVING
|
|
"""
|
|
|
|
|
|
def save_csv_report(filename: str, username: str, results: dict):
    """Write a CSV report for *username*'s search results to *filename*."""
    with open(filename, mode="w", newline="", encoding="utf-8") as out_file:
        generate_csv_report(username, results, out_file)
|
|
|
|
|
|
def save_txt_report(filename: str, username: str, results: dict):
    """Write a plain-text report for *username*'s search results to *filename*."""
    with open(filename, mode="w", encoding="utf-8") as out_file:
        generate_txt_report(username, results, out_file)
|
|
|
|
|
|
def save_html_report(filename: str, context: dict):
    """Render the HTML report template with *context* and save it to *filename*."""
    template, _css = generate_report_template(is_pdf=False)
    rendered = template.render(**context)
    with open(filename, "w", encoding="utf-8") as out_file:
        out_file.write(rendered)
|
|
|
|
|
|
def save_pdf_report(filename: str, context: dict):
    """Render the PDF template with *context* and convert it to a PDF file."""
    template, css = generate_report_template(is_pdf=True)
    rendered = template.render(**context)

    # Imported lazily so plain CLI startup doesn't pay for xhtml2pdf.
    from xhtml2pdf import pisa  # type: ignore[import-untyped]

    with open(filename, "w+b") as out_file:
        pisa.pisaDocument(io.StringIO(rendered), dest=out_file, default_css=css)
|
|
|
|
|
|
def save_json_report(filename: str, username: str, results: dict, report_type: str):
    """Write a JSON/NDJSON report (per *report_type*) to *filename*."""
    with open(filename, mode="w", encoding="utf-8") as out_file:
        generate_json_report(username, results, out_file, report_type=report_type)
|
|
|
|
|
|
class MaigretGraph:
    """Thin wrapper over a networkx-style graph for the account graph report.

    Nodes are named '<key>: <value>' and styled by role: username nodes are
    largest (group 1), site/URL nodes medium (group 2), everything else small
    (group 3).
    """

    other_params: dict = {'size': 10, 'group': 3}
    site_params: dict = {'size': 15, 'group': 2}
    username_params: dict = {'size': 20, 'group': 1}

    def __init__(self, graph):
        # graph: an object exposing add_node(name, **attrs) / add_edge(a, b, **attrs)
        self.G = graph

    def add_node(self, key, value, color=None):
        """Add (or restyle) a node named '<key>: <value>' and return that name.

        Styling: username params when *key* is a supported ID type, site params
        when *value* looks like a URL, otherwise generic params. An explicit
        *color* overrides the default.
        """
        node_name = f'{key}: {value}'

        params = dict(self.other_params)
        if key in SUPPORTED_IDS:
            params = dict(self.username_params)
        # Fix: callers (e.g. process_ids in save_graph_report) pass raw
        # ids-data values that may not be strings; only strings can be
        # URL-prefix-checked without raising AttributeError.
        elif isinstance(value, str) and value.startswith('http'):
            params = dict(self.site_params)

        params['title'] = node_name
        if color:
            params['color'] = color

        self.G.add_node(node_name, **params)
        return node_name

    def link(self, node1_name, node2_name):
        """Connect two existing nodes with a fixed-weight edge."""
        self.G.add_edge(node1_name, node2_name, weight=2)
|
|
|
|
|
|
def save_graph_report(filename: str, username_results: list, db: MaigretDatabase):
    """Build an interactive account graph and save it as an HTML page.

    For each (username, id_type, results) triple, links
    username → account URL → site, then recursively attaches extracted
    ids_data values (and usernames/IDs mined from URLs via
    db.extract_ids_from_url) before rendering with pyvis.
    """
    # Imported lazily to keep Maigret's startup fast.
    import networkx as nx

    G: Any = nx.Graph()
    graph = MaigretGraph(G)

    base_site_nodes = {}  # site name -> node name (one node per site)
    site_account_nodes = {}  # "<site>: <account url>" -> node name
    processed_values: Dict[str, Any] = {}  # Track processed values to avoid duplicates

    for username, id_type, results in username_results:
        # Add username node, using normalized version directly if different
        norm_username = username.lower()
        username_node_name = graph.add_node(id_type, norm_username)

        for website_name, dictionary in results.items():
            # skip sites with no data or only similar (non-exact) matches
            if not dictionary or dictionary.get("is_similar"):
                continue

            status = dictionary.get("status")
            if not status or status.status != MaigretCheckStatus.CLAIMED:
                continue

            # base site node
            site_base_url = website_name
            if site_base_url not in base_site_nodes:
                base_site_nodes[site_base_url] = graph.add_node(
                    'site', site_base_url, color='#28a745'
                )  # Green color

            site_base_node_name = base_site_nodes[site_base_url]

            # account node
            account_url = dictionary.get('url_user', f'{site_base_url}/{norm_username}')
            account_node_id = f"{site_base_url}: {account_url}"
            if account_node_id not in site_account_nodes:
                site_account_nodes[account_node_id] = graph.add_node(
                    'account', account_url
                )

            account_node_name = site_account_nodes[account_node_id]

            # link username → account → site
            graph.link(username_node_name, account_node_name)
            graph.link(account_node_name, site_base_node_name)

            # Closure over site_base_url/processed_values; recurses when a
            # value contains further extractable IDs.
            def process_ids(parent_node, ids):
                for k, v in ids.items():
                    # Skip noisy/technical fields (counters, booleans,
                    # timestamps, images).
                    # NOTE(review): `k in 'image'` is a substring test — keys
                    # like 'i' or 'mage' also match; `k == 'image'` may be the
                    # intent. Confirm before changing.
                    if (
                        k.endswith('_count')
                        or k.startswith('is_')
                        or k.endswith('_at')
                        or k in 'image'
                    ):
                        continue

                    # Normalize value if string
                    norm_v = v.lower() if isinstance(v, str) else v
                    value_key = f"{k}:{norm_v}"

                    if value_key in processed_values:
                        # value already has a node — just link to it below
                        ids_data_name = processed_values[value_key]
                    else:
                        v_data = v
                        # Stringified lists (e.g. "['a', 'b']") are parsed
                        # back into Python lists; literal_eval is safe here.
                        if isinstance(v, str) and v.startswith('['):
                            try:
                                v_data = ast.literal_eval(v)
                            except Exception as e:
                                logging.error(e)
                                continue

                        if isinstance(v_data, list):
                            # One node for the list field, one per element.
                            list_node_name = graph.add_node(k, site_base_url)
                            processed_values[value_key] = list_node_name
                            for vv in v_data:
                                data_node_name = graph.add_node(vv, site_base_url)
                                graph.link(list_node_name, data_node_name)

                                # mine IDs out of element URLs and recurse
                                add_ids = {
                                    a: b for b, a in db.extract_ids_from_url(vv).items()
                                }
                                if add_ids:
                                    process_ids(data_node_name, add_ids)
                            ids_data_name = list_node_name
                        else:
                            ids_data_name = graph.add_node(k, norm_v)
                            processed_values[value_key] = ids_data_name

                            # username-like fields also get a 'username' node
                            if 'username' in k or k in SUPPORTED_IDS:
                                new_username_key = f"username:{norm_v}"
                                if new_username_key not in processed_values:
                                    new_username_node_name = graph.add_node(
                                        'username', norm_v
                                    )
                                    processed_values[new_username_key] = (
                                        new_username_node_name
                                    )
                                    graph.link(ids_data_name, new_username_node_name)

                                # mine IDs out of the value itself and recurse
                                add_ids = {
                                    k: v for v, k in db.extract_ids_from_url(v).items()
                                }
                                if add_ids:
                                    process_ids(ids_data_name, add_ids)

                    graph.link(parent_node, ids_data_name)

            if status.ids_data:
                process_ids(account_node_name, status.ids_data)

    # Remove overly long nodes
    nodes_to_remove = [node for node in G.nodes if len(str(node)) > 100]
    G.remove_nodes_from(nodes_to_remove)

    # Remove site nodes with only one connection
    single_degree_sites = [
        n for n, deg in G.degree() if n.startswith("site:") and deg <= 1
    ]
    G.remove_nodes_from(single_degree_sites)

    # Generate interactive visualization
    from pyvis.network import Network  # type: ignore[import-untyped]

    nt = Network(notebook=True, height="750px", width="100%")
    nt.from_nx(G)
    nt.show(filename)
|
|
|
|
|
|
def get_plaintext_report(context: dict) -> str:
    """Build a plain-text summary from a report context (see generate_report_context)."""
    # One sentence per line; the appended space lets the final '. ' split too.
    text = (context['brief'] + " ").replace('. ', '.\n')
    tag_names = [pair[0] for pair in context.get('interests_tuple_list', [])]
    country_names = [pair[0] for pair in context.get('countries_tuple_list', [])]
    if country_names:
        text += f'Countries: {", ".join(country_names)}\n'
    if tag_names:
        text += f'Interests (tags): {", ".join(tag_names)}\n'
    return text.strip()
|
|
|
|
|
|
"""
|
|
REPORTS GENERATING
|
|
"""
|
|
|
|
|
|
def generate_report_template(is_pdf: bool):
    """
    HTML/PDF template generation.

    Loads the bundled Jinja2 template (plus CSS for PDF output) from the
    package's resources directory and returns (template, css_content);
    css_content is None for HTML output.
    """

    def get_resource_content(filename):
        # Fix: close the handle promptly and pin the encoding so the read
        # doesn't leak the file object or depend on the platform default.
        resource_path = os.path.join(maigret_path, "resources", filename)
        with open(resource_path, encoding="utf-8") as resource_file:
            return resource_file.read()

    maigret_path = os.path.dirname(os.path.realpath(__file__))

    if is_pdf:
        template_content = get_resource_content("simple_report_pdf.tpl")
        css_content = get_resource_content("simple_report_pdf.css")
    else:
        template_content = get_resource_content("simple_report.tpl")
        css_content = None

    template = Template(template_content)
    # Helpers made available inside the templates.
    template.globals["title"] = CaseConverter.snake_to_title  # type: ignore
    template.globals["detect_link"] = enrich_link_str  # type: ignore
    return template, css_content
|
|
|
|
|
|
def generate_report_context(username_results: list):
    """Aggregate search results into a context dict for report rendering.

    For each (username, id_type, results) triple: counts claimed accounts,
    collects extracted ids_data, guesses countries from country/locale
    fields, tracks the earliest 'created_at' seen, and discovers new IDs.
    Returns the dict consumed by the HTML/PDF/plaintext report generators.
    """
    brief_text = []
    usernames = {}  # username -> {"type": id_type}; also dedupes new IDs
    extended_info_count = 0
    tags: Dict[str, int] = {}  # tag -> occurrence count (incl. country codes)
    supposed_data: Dict[str, Any] = {}  # field -> list of collected values

    first_seen = None  # earliest 'created_at' string across all accounts

    # moved here to speed up the launch of Maigret
    import pycountry

    for username, id_type, results in username_results:
        found_accounts = 0
        new_ids = []
        usernames[username] = {"type": id_type}

        for website_name in results:
            dictionary = results[website_name]
            # TODO: fix no site data issue
            if not dictionary:
                continue

            if dictionary.get("is_similar"):
                continue

            status = dictionary.get("status")
            if not status:  # FIXME: currently in case of timeout
                continue

            if status.ids_data:
                dictionary["ids_data"] = status.ids_data
                extended_info_count += 1

                # detect first seen: keep the chronologically earliest
                # 'created_at'; comparison is best-effort (raw string kept
                # when parsing fails).
                created_at = status.ids_data.get("created_at")
                if created_at:
                    if first_seen is None:
                        first_seen = created_at
                    else:
                        try:
                            known_time = parse_datetime_str(
                                first_seen, tzinfos=ADDITIONAL_TZINFO
                            )
                            new_time = parse_datetime_str(
                                created_at, tzinfos=ADDITIONAL_TZINFO
                            )
                            if new_time < known_time:
                                first_seen = created_at
                        except Exception as e:
                            logging.debug(
                                "Problems with converting datetime %s/%s: %s",
                                first_seen,
                                created_at,
                                str(e),
                                exc_info=True,
                            )

                for k, v in status.ids_data.items():
                    # suppose target data
                    field = "fullname" if k == "name" else k
                    if field not in supposed_data:
                        supposed_data[field] = []
                    supposed_data[field].append(v)
                    # suppose country: alpha-2 codes resolved directly,
                    # anything else via fuzzy country-name search
                    if k in ["country", "locale"]:
                        try:
                            if is_country_tag(k):
                                country = pycountry.countries.get(alpha_2=v)
                                tag = country.alpha_2.lower()  # type: ignore[union-attr]
                            else:
                                tag = pycountry.countries.search_fuzzy(v)[
                                    0
                                ].alpha_2.lower()  # type: ignore[attr-defined]
                            # TODO: move countries to another struct
                            tags[tag] = tags.get(tag, 0) + 1
                        except Exception as e:
                            logging.debug(
                                "Pycountry exception: %s", str(e), exc_info=True
                            )

            # usernames discovered inside this account's data
            new_usernames = dictionary.get("ids_usernames")
            if new_usernames:
                for u, utype in new_usernames.items():
                    if u not in usernames:
                        new_ids.append((u, utype))
                        usernames[u] = {"type": utype}

            if status.status == MaigretCheckStatus.CLAIMED:
                found_accounts += 1
                dictionary["found"] = True
            else:
                continue

            # ignore non-exact search results
            if status.tags:
                for t in status.tags:
                    tags[t] = tags.get(t, 0) + 1

        brief_text.append(
            f"Search by {id_type} {username} returned {found_accounts} accounts."
        )

        if new_ids:
            ids_list = []
            for u, t in new_ids:
                ids_list.append(f"{u} ({t})" if t != "username" else u)
            brief_text.append("Found target's other IDs: " + ", ".join(ids_list) + ".")

    brief_text.append(f"Extended info extracted from {extended_info_count} accounts.")

    brief = " ".join(brief_text).strip()
    # sort (tag, count) pairs by count, most frequent first
    tuple_sort = lambda d: sorted(d, key=lambda x: x[1], reverse=True)

    if "global" in tags:
        # remove tag 'global' useless for country detection
        del tags["global"]

    first_username = username_results[0][0]
    countries_lists = list(filter(lambda x: is_country_tag(x[0]), tags.items()))
    interests_list = list(filter(lambda x: not is_country_tag(x[0]), tags.items()))

    filtered_supposed_data = filter_supposed_data(supposed_data)

    return {
        "username": first_username,
        # TODO: return brief list
        "brief": brief,
        "results": username_results,
        "first_seen": first_seen,
        "interests_tuple_list": tuple_sort(interests_list),
        "countries_tuple_list": tuple_sort(countries_lists),
        "supposed_data": filtered_supposed_data,
        "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
|
|
|
|
|
|
def generate_csv_report(username: str, results: dict, csvfile):
    """Write a header plus one CSV row per checked site to *csvfile*."""
    writer = csv.writer(csvfile)
    writer.writerow(
        ["username", "name", "url_main", "url_user", "exists", "http_status"]
    )
    for site_name, site_data in results.items():
        # TODO: fix the reason
        if "status" in site_data:
            exists = str(site_data["status"].status)
        else:
            exists = 'Unknown'
        row = [
            username,
            site_name,
            site_data.get("url_main", ""),
            site_data.get("url_user", ""),
            exists,
            site_data.get("http_status", 0),
        ]
        writer.writerow(row)
|
|
|
|
|
|
def generate_txt_report(username: str, results: dict, file):
    """Write found-account URLs, one per line, plus a total count to *file*."""
    found_total = 0
    for site_name in results:
        site_data = results[site_name]
        # TODO: fix no site data issue
        if not site_data:
            continue
        status = site_data.get("status")
        if status and status.status == MaigretCheckStatus.CLAIMED:
            found_total += 1
            file.write(site_data["url_user"] + "\n")
    file.write(f"Total Websites Username Detected On : {found_total}")
|
|
|
|
|
|
def generate_json_report(username: str, results: dict, file, report_type):
    """Serialize claimed accounts to *file* as one JSON object ("simple")
    or as newline-delimited per-site objects ("ndjson")."""
    per_line = report_type.startswith("ndjson")
    combined = {}

    for sitename, site_result in results.items():
        # TODO: fix no site data issue
        if not site_result or not site_result.get("status"):
            continue

        if site_result["status"].status != MaigretCheckStatus.CLAIMED:
            continue

        # shallow copy so the caller's result dict stays untouched
        data = dict(site_result)
        data["status"] = data["status"].json()
        data["site"] = data["site"].json
        # drop non-serializable runtime fields
        for field in ["future", "checker"]:
            if field in data:
                del data[field]

        if per_line:
            data["sitename"] = sitename
            file.write(json.dumps(data) + "\n")
        else:
            combined[sitename] = data

    if not per_line:
        file.write(json.dumps(combined))
|
|
|
|
|
|
"""
|
|
XMIND 8 Functions
|
|
"""
|
|
|
|
|
|
def save_xmind_report(filename, username, results):
    """Create a fresh XMind mind-map report for *username* at *filename*."""
    # xmind.load() reopens an existing file, so drop any stale report first.
    if os.path.exists(filename):
        os.remove(filename)
    workbook = xmind.load(filename)
    primary_sheet = workbook.getPrimarySheet()
    design_xmind_sheet(primary_sheet, username, results)
    xmind.save(workbook, path=filename)
|
|
|
|
|
|
def add_xmind_subtopic(userlink, k, v, supposed_data):
    """Attach a "key: value" subtopic under *userlink* and record the value
    in *supposed_data* (the "name" key is stored under "fullname")."""
    field = k if k != "name" else "fullname"
    supposed_data.setdefault(field, []).append(v)
    subtopic = userlink.addSubTopic()
    subtopic.setTitle("%s: %s" % (k, v))
|
|
|
|
|
|
def design_xmind_sheet(sheet, username, results):
    """Lay out an XMind sheet: root = username, branches = tag categories,
    leaves = found accounts with their extracted data; a final branch lists
    the aggregated supposed data."""
    alltags: Dict[str, Any] = {}  # tag name -> its topic section
    supposed_data: Dict[str, Any] = {}  # filled by add_xmind_subtopic

    sheet.setTitle("%s Analysis" % (username))
    root_topic1 = sheet.getRootTopic()
    root_topic1.setTitle("%s" % (username))

    # fallback bucket for accounts with no usable tag
    undefinedsection = root_topic1.addSubTopic()
    undefinedsection.setTitle("Undefined")
    alltags["undefined"] = undefinedsection

    for website_name in results:
        dictionary = results[website_name]
        if not dictionary:
            continue
        result_status = dictionary.get("status")
        # TODO: fix the reason
        if not result_status or result_status.status != MaigretCheckStatus.CLAIMED:
            continue

        # keep only non-empty, non-country tags
        stripped_tags = list(map(lambda x: x.strip(), result_status.tags))
        normalized_tags = list(
            filter(lambda x: x and not is_country_tag(x), stripped_tags)
        )

        # NOTE(review): category is only set when a NEW tag section is
        # created below — an account whose tags all already have sections
        # falls into "Undefined". Looks unintended; confirm before changing.
        category = None
        for tag in normalized_tags:
            if tag in alltags.keys():
                continue
            tagsection = root_topic1.addSubTopic()
            tagsection.setTitle(tag)
            alltags[tag] = tagsection
            category = tag

        section = alltags[category] if category else undefinedsection
        userlink = section.addSubTopic()
        userlink.addLabel(result_status.site_url_user)

        # one subtopic per extracted data item (lists are expanded)
        ids_data = result_status.ids_data or {}
        for k, v in ids_data.items():
            # suppose target data
            if isinstance(v, list):
                for currentval in v:
                    add_xmind_subtopic(userlink, k, currentval, supposed_data)
            else:
                add_xmind_subtopic(userlink, k, v, supposed_data)

    # add supposed data
    filtered_supposed_data = filter_supposed_data(supposed_data)
    if len(filtered_supposed_data) > 0:
        undefinedsection = root_topic1.addSubTopic()
        undefinedsection.setTitle("SUPPOSED DATA")
        for k, v in filtered_supposed_data.items():
            currentsublabel = undefinedsection.addSubTopic()
            currentsublabel.setTitle("%s: %s" % (k, v))
|