Add Markdown reports for LLM analysis (#2463)

This commit is contained in:
Soxoj
2026-04-06 18:26:43 +02:00
committed by GitHub
parent 44a6c729e3
commit ad95302745
9 changed files with 207 additions and 6 deletions
+34
View File
@@ -106,6 +106,9 @@ username).
``-J``, ``--json`` - Generate a JSON report of specific type: simple, ``-J``, ``--json`` - Generate a JSON report of specific type: simple,
ndjson (one report per username). E.g. ``--json ndjson`` ndjson (one report per username). E.g. ``--json ndjson``
``-M``, ``--md`` - Generate a Markdown report (general report on all
usernames). See :ref:`markdown-report` below.
``-fo``, ``--folderoutput`` - Results will be saved to this folder, ``-fo``, ``--folderoutput`` - Results will be saved to this folder,
``results`` by default. Will be created if doesnt exist. ``results`` by default. Will be created if doesnt exist.
@@ -142,4 +145,35 @@ site main page URL to determine the site engine and methods to check
account presence. After checking Maigret asks if you want to add the account presence. After checking Maigret asks if you want to add the
site, answering y/Y will rewrite the local database. site, answering y/Y will rewrite the local database.
.. _markdown-report:
Markdown report (LLM-friendly)
------------------------------
The ``--md`` / ``-M`` flag generates a Markdown report designed for both human reading and analysis by AI assistants (ChatGPT, Claude, etc.).
.. code-block:: console
maigret username --md
The report includes:
- **Summary** with aggregated personal data (all fullnames, locations, bios found across accounts), country tags, website tags, first/last seen timestamps.
- **Per-account sections** with profile URL, site tags, and all extracted fields (username, bio, follower count, linked accounts, etc.).
- **Possible false positives** disclaimer explaining that accounts may belong to different people.
- **Ethical use** notice about applicable data protection laws.
**Using with AI tools:**
The Markdown format is optimized for LLM context windows. You can feed the report directly to an AI assistant for follow-up analysis:
.. code-block:: console
# Generate the report
maigret johndoe --md
# Feed it to an AI tool
cat reports/report_johndoe.md | llm "Analyze this OSINT report and summarize key findings"
The structured Markdown with per-site sections makes it easy for AI tools to extract relationships, cross-reference identities, and identify patterns across accounts.
+27 -1
View File
@@ -37,6 +37,7 @@ from .report import (
get_plaintext_report, get_plaintext_report,
sort_report_by_data_points, sort_report_by_data_points,
save_graph_report, save_graph_report,
save_markdown_report,
) )
from .sites import MaigretDatabase from .sites import MaigretDatabase
from .submit import Submitter from .submit import Submitter
@@ -465,6 +466,14 @@ def setup_arguments_parser(settings: Settings):
default=settings.pdf_report, default=settings.pdf_report,
help="Generate a PDF report (general report on all usernames).", help="Generate a PDF report (general report on all usernames).",
) )
report_group.add_argument(
"-M",
"--md",
action="store_true",
dest="md",
default=settings.md_report,
help="Generate a Markdown report (general report on all usernames).",
)
report_group.add_argument( report_group.add_argument(
"-G", "-G",
"--graph", "--graph",
@@ -803,7 +812,7 @@ async def main():
# reporting for all the result # reporting for all the result
if general_results: if general_results:
if args.html or args.pdf: if args.html or args.pdf or args.md:
query_notify.warning('Generating report info...') query_notify.warning('Generating report info...')
report_context = generate_report_context(general_results) report_context = generate_report_context(general_results)
# determine main username # determine main username
@@ -823,6 +832,23 @@ async def main():
save_pdf_report(filename, report_context) save_pdf_report(filename, report_context)
query_notify.warning(f'PDF report on all usernames saved in {filename}') query_notify.warning(f'PDF report on all usernames saved in {filename}')
if args.md:
username = username.replace('/', '_')
filename = report_filepath_tpl.format(username=username, postfix='.md')
run_flags = []
if args.tags:
run_flags.append(f"--tags {args.tags}")
if args.site_list:
run_flags.append(f"--site {','.join(args.site_list)}")
if args.all_sites:
run_flags.append("--all-sites")
run_info = {
"sites_count": sum(len(d) for _, _, d in general_results),
"flags": " ".join(run_flags) if run_flags else None,
}
save_markdown_report(filename, report_context, run_info=run_info)
query_notify.warning(f'Markdown report on all usernames saved in {filename}')
if args.graph: if args.graph:
username = username.replace('/', '_') username = username.replace('/', '_')
filename = report_filepath_tpl.format( filename = report_filepath_tpl.format(
+138
View File
@@ -257,6 +257,144 @@ def get_plaintext_report(context: dict) -> str:
return output.strip() return output.strip()
def _md_format_value(value) -> str:
"""Format a value for Markdown output, detecting links."""
if isinstance(value, list):
return ", ".join(str(v) for v in value)
s = str(value)
if s.startswith("http://") or s.startswith("https://"):
return f"[{s}]({s})"
return s
def save_markdown_report(filename: str, context: dict, run_info: dict = None):
username = context.get("username", "unknown")
generated_at = context.get("generated_at", "")
brief = context.get("brief", "")
countries = context.get("countries_tuple_list", [])
interests = context.get("interests_tuple_list", [])
first_seen = context.get("first_seen")
results = context.get("results", [])
# Collect ALL values for key fields across all accounts
all_fields: Dict[str, list] = {}
last_seen = None
for _, _, data in results:
for _, v in data.items():
if not v.get("found") or v.get("is_similar"):
continue
ids_data = v.get("ids_data", {})
# Map multiple source fields to unified output fields
field_sources = {
"fullname": ("fullname", "name"),
"location": ("location", "country", "city", "country_code", "locale", "region"),
"gender": ("gender",),
"bio": ("bio", "about", "description"),
}
for out_field, source_keys in field_sources.items():
for src in source_keys:
val = ids_data.get(src)
if val:
all_fields.setdefault(out_field, [])
val_str = str(val)
if val_str not in all_fields[out_field]:
all_fields[out_field].append(val_str)
# Track last_seen
for ts_field in ("last_online", "latest_activity_at", "updated_at"):
ts = ids_data.get(ts_field)
if ts and (last_seen is None or str(ts) > str(last_seen)):
last_seen = ts
lines = []
lines.append(f"# Report by searching on username \"{username}\"\n")
# Generated line with run info
gen_line = f"Generated at {generated_at} by [Maigret](https://github.com/soxoj/maigret)"
if run_info:
parts = []
if run_info.get("sites_count"):
parts.append(f"{run_info['sites_count']} sites checked")
if run_info.get("flags"):
parts.append(f"flags: `{run_info['flags']}`")
if parts:
gen_line += f" ({', '.join(parts)})"
lines.append(f"{gen_line}\n")
# Summary
lines.append("## Summary\n")
lines.append(f"{brief}\n")
if all_fields:
lines.append("**Information extracted from accounts:**\n")
for field, values in all_fields.items():
title = CaseConverter.snake_to_title(field)
lines.append(f"- {title}: {'; '.join(values)}")
lines.append("")
if countries:
geo = ", ".join(f"{code} (x{count})" for code, count in countries)
lines.append(f"**Country tags:** {geo}\n")
if interests:
tags = ", ".join(f"{tag} (x{count})" for tag, count in interests)
lines.append(f"**Website tags:** {tags}\n")
if first_seen:
lines.append(f"**First seen:** {first_seen}")
if last_seen:
lines.append(f"**Last seen:** {last_seen}")
if first_seen or last_seen:
lines.append("")
# Accounts found
lines.append("## Accounts found\n")
for u, id_type, data in results:
for site_name, v in data.items():
if not v.get("found") or v.get("is_similar"):
continue
lines.append(f"### {site_name}\n")
lines.append(f"- **URL:** [{v.get('url_user', '')}]({v.get('url_user', '')})")
tags = v.get("status") and v["status"].tags or []
if tags:
lines.append(f"- **Tags:** {', '.join(tags)}")
lines.append("")
ids_data = v.get("ids_data", {})
if ids_data:
for field, value in ids_data.items():
if field == "image":
continue
title = CaseConverter.snake_to_title(field)
lines.append(f"- {title}: {_md_format_value(value)}")
lines.append("")
# Possible false positives
lines.append("## Possible false positives\n")
lines.append(
f"This report was generated by searching for accounts matching the username `{username}`. "
f"Accounts listed above may belong to different people who happen to use the same "
f"or similar username. Results without extracted personal information could contain "
f"some false positive findings. Always verify findings before drawing conclusions.\n"
)
# Ethical use
lines.append("## Ethical use\n")
lines.append(
"This report is a result of a technical collection of publicly available information "
"from online accounts and does not constitute personal data processing. If you intend "
"to use this data for personal data processing or collection purposes, ensure your use "
"complies with applicable laws and regulations in your jurisdiction (such as GDPR, "
"CCPA, and similar).\n"
)
with open(filename, "w", encoding="utf-8") as f:
f.write("\n".join(lines))
""" """
REPORTS GENERATING REPORTS GENERATING
""" """
+2 -2
View File
@@ -101,7 +101,7 @@
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"", "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA", "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36", "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"x-guest-token": "2039637579922866279" "x-guest-token": "2041186137171976270"
}, },
"errors": { "errors": {
"Bad guest token": "x-guest-token update required" "Bad guest token": "x-guest-token update required"
@@ -294,7 +294,7 @@
"method": "vimeo" "method": "vimeo"
}, },
"headers": { "headers": {
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzUxMjM0MDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZDY4YjViMGMtYTE3OC00ZDdhLWIyM2QtMDg5Y2MwZjAwOGEyIn0.0bGwlqckn4J07em2-nEX10OfW1JAmi54QCrPtm8Qn6A" "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzU0OTI1ODAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZjkwOGY0MmYtMTE2Zi00MDRkLWExOTgtOGUyOTE2MTFmZTQzIn0.Wt_z9qrjHofYPtUIDkbxrPX2S-glzmEowkR8m89O_Zg"
}, },
"urlProbe": "https://api.vimeo.com/users/{username}?fields=name%2Cgender%2Cbio%2Curi%2Clink%2Cbackground_video%2Clocation_details%2Cpictures%2Cverified%2Cmetadata.public_videos.total%2Cavailable_for_hire%2Ccan_work_remotely%2Cmetadata.connections.videos.total%2Cmetadata.connections.albums.total%2Cmetadata.connections.followers.total%2Cmetadata.connections.following.total%2Cmetadata.public_videos.total%2Cmetadata.connections.vimeo_experts.is_enrolled%2Ctotal_collection_count%2Ccreated_time%2Cprofile_preferences%2Cmembership%2Cclients%2Cskills%2Cproject_types%2Crates%2Ccategories%2Cis_expert%2Cprofile_discovery%2Cwebsites%2Ccontact_emails&fetch_user_profile=1", "urlProbe": "https://api.vimeo.com/users/{username}?fields=name%2Cgender%2Cbio%2Curi%2Clink%2Cbackground_video%2Clocation_details%2Cpictures%2Cverified%2Cmetadata.public_videos.total%2Cavailable_for_hire%2Ccan_work_remotely%2Cmetadata.connections.videos.total%2Cmetadata.connections.albums.total%2Cmetadata.connections.followers.total%2Cmetadata.connections.following.total%2Cmetadata.public_videos.total%2Cmetadata.connections.vimeo_experts.is_enrolled%2Ctotal_collection_count%2Ccreated_time%2Cprofile_preferences%2Cmembership%2Cclients%2Cskills%2Cproject_types%2Crates%2Ccategories%2Cis_expert%2Cprofile_discovery%2Cwebsites%2Ccontact_emails&fetch_user_profile=1",
"checkType": "status_code", "checkType": "status_code",
+2 -2
View File
@@ -1,8 +1,8 @@
{ {
"version": 1, "version": 1,
"updated_at": "2026-04-04T17:04:45Z", "updated_at": "2026-04-06T16:20:33Z",
"sites_count": 3155, "sites_count": 3155,
"min_maigret_version": "0.5.0", "min_maigret_version": "0.5.0",
"data_sha256": "4b1c0c96e1595f6e83584a7a6e885647095cbfb7f23c938d7440f8a3408551b1", "data_sha256": "da87fd6f32bd60efc25e35aa6aa7d329e490d4aa544ddb68539d490cd2157b56",
"data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json" "data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
} }
+1
View File
@@ -54,6 +54,7 @@
"graph_report": false, "graph_report": false,
"pdf_report": false, "pdf_report": false,
"html_report": false, "html_report": false,
"md_report": false,
"web_interface_port": 5000, "web_interface_port": 5000,
"no_autoupdate": false, "no_autoupdate": false,
"db_update_meta_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json", "db_update_meta_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json",
+1
View File
@@ -42,6 +42,7 @@ class Settings:
pdf_report: bool pdf_report: bool
html_report: bool html_report: bool
graph_report: bool graph_report: bool
md_report: bool
web_interface_port: int web_interface_port: int
no_autoupdate: bool no_autoupdate: bool
db_update_meta_url: str db_update_meta_url: str
+1 -1
View File
@@ -3159,7 +3159,7 @@ Rank data fetched from Majestic Million by domains.
1. ![](https://www.google.com/s2/favicons?domain=https://tonometerbot.com) [Tonometerbot (https://tonometerbot.com)](https://tonometerbot.com)*: top 100M, crypto* 1. ![](https://www.google.com/s2/favicons?domain=https://tonometerbot.com) [Tonometerbot (https://tonometerbot.com)](https://tonometerbot.com)*: top 100M, crypto*
1. ![](https://www.google.com/s2/favicons?domain=https://www.spatial.io) [Spatial (https://www.spatial.io)](https://www.spatial.io)*: top 100M, crypto, gaming* 1. ![](https://www.google.com/s2/favicons?domain=https://www.spatial.io) [Spatial (https://www.spatial.io)](https://www.spatial.io)*: top 100M, crypto, gaming*
The list was updated at (2026-04-04) The list was updated at (2026-04-06)
## Statistics ## Statistics
Enabled/total sites: 2538/3155 = 80.44% Enabled/total sites: 2538/3155 = 80.44%
+1
View File
@@ -48,6 +48,7 @@ DEFAULT_ARGS: Dict[str, Any] = {
'web': None, 'web': None,
'with_domains': False, 'with_domains': False,
'xmind': False, 'xmind': False,
'md': False,
'no_autoupdate': False, 'no_autoupdate': False,
'force_update': False, 'force_update': False,
} }