Add Markdown reports for LLM analysis (#2463)

This commit is contained in:
Soxoj
2026-04-06 18:26:43 +02:00
committed by GitHub
parent 44a6c729e3
commit ad95302745
9 changed files with 207 additions and 6 deletions
+34
View File
@@ -106,6 +106,9 @@ username).
``-J``, ``--json`` - Generate a JSON report of specific type: simple,
ndjson (one report per username). E.g. ``--json ndjson``
``-M``, ``--md`` - Generate a Markdown report (general report on all
usernames). See :ref:`markdown-report` below.
``-fo``, ``--folderoutput`` - Results will be saved to this folder,
``results`` by default. Will be created if doesnt exist.
@@ -142,4 +145,35 @@ site main page URL to determine the site engine and methods to check
account presence. After checking Maigret asks if you want to add the
site, answering y/Y will rewrite the local database.
.. _markdown-report:
Markdown report (LLM-friendly)
------------------------------
The ``--md`` / ``-M`` flag generates a Markdown report designed for both human reading and analysis by AI assistants (ChatGPT, Claude, etc.).
.. code-block:: console
maigret username --md
The report includes:
- **Summary** with aggregated personal data (all fullnames, locations, bios found across accounts), country tags, website tags, first/last seen timestamps.
- **Per-account sections** with profile URL, site tags, and all extracted fields (username, bio, follower count, linked accounts, etc.).
- **Possible false positives** disclaimer explaining that accounts may belong to different people.
- **Ethical use** notice about applicable data protection laws.
**Using with AI tools:**
The Markdown format is optimized for LLM context windows. You can feed the report directly to an AI assistant for follow-up analysis:
.. code-block:: console
# Generate the report
maigret johndoe --md
# Feed it to an AI tool
cat reports/report_johndoe.md | llm "Analyze this OSINT report and summarize key findings"
The structured Markdown with per-site sections makes it easy for AI tools to extract relationships, cross-reference identities, and identify patterns across accounts.
+27 -1
View File
@@ -37,6 +37,7 @@ from .report import (
get_plaintext_report,
sort_report_by_data_points,
save_graph_report,
save_markdown_report,
)
from .sites import MaigretDatabase
from .submit import Submitter
@@ -465,6 +466,14 @@ def setup_arguments_parser(settings: Settings):
default=settings.pdf_report,
help="Generate a PDF report (general report on all usernames).",
)
report_group.add_argument(
"-M",
"--md",
action="store_true",
dest="md",
default=settings.md_report,
help="Generate a Markdown report (general report on all usernames).",
)
report_group.add_argument(
"-G",
"--graph",
@@ -803,7 +812,7 @@ async def main():
# reporting for all the result
if general_results:
if args.html or args.pdf:
if args.html or args.pdf or args.md:
query_notify.warning('Generating report info...')
report_context = generate_report_context(general_results)
# determine main username
@@ -823,6 +832,23 @@ async def main():
save_pdf_report(filename, report_context)
query_notify.warning(f'PDF report on all usernames saved in {filename}')
if args.md:
username = username.replace('/', '_')
filename = report_filepath_tpl.format(username=username, postfix='.md')
run_flags = []
if args.tags:
run_flags.append(f"--tags {args.tags}")
if args.site_list:
run_flags.append(f"--site {','.join(args.site_list)}")
if args.all_sites:
run_flags.append("--all-sites")
run_info = {
"sites_count": sum(len(d) for _, _, d in general_results),
"flags": " ".join(run_flags) if run_flags else None,
}
save_markdown_report(filename, report_context, run_info=run_info)
query_notify.warning(f'Markdown report on all usernames saved in {filename}')
if args.graph:
username = username.replace('/', '_')
filename = report_filepath_tpl.format(
+138
View File
@@ -257,6 +257,144 @@ def get_plaintext_report(context: dict) -> str:
return output.strip()
def _md_format_value(value) -> str:
"""Format a value for Markdown output, detecting links."""
if isinstance(value, list):
return ", ".join(str(v) for v in value)
s = str(value)
if s.startswith("http://") or s.startswith("https://"):
return f"[{s}]({s})"
return s
def save_markdown_report(filename: str, context: dict, run_info: dict = None):
username = context.get("username", "unknown")
generated_at = context.get("generated_at", "")
brief = context.get("brief", "")
countries = context.get("countries_tuple_list", [])
interests = context.get("interests_tuple_list", [])
first_seen = context.get("first_seen")
results = context.get("results", [])
# Collect ALL values for key fields across all accounts
all_fields: Dict[str, list] = {}
last_seen = None
for _, _, data in results:
for _, v in data.items():
if not v.get("found") or v.get("is_similar"):
continue
ids_data = v.get("ids_data", {})
# Map multiple source fields to unified output fields
field_sources = {
"fullname": ("fullname", "name"),
"location": ("location", "country", "city", "country_code", "locale", "region"),
"gender": ("gender",),
"bio": ("bio", "about", "description"),
}
for out_field, source_keys in field_sources.items():
for src in source_keys:
val = ids_data.get(src)
if val:
all_fields.setdefault(out_field, [])
val_str = str(val)
if val_str not in all_fields[out_field]:
all_fields[out_field].append(val_str)
# Track last_seen
for ts_field in ("last_online", "latest_activity_at", "updated_at"):
ts = ids_data.get(ts_field)
if ts and (last_seen is None or str(ts) > str(last_seen)):
last_seen = ts
lines = []
lines.append(f"# Report by searching on username \"{username}\"\n")
# Generated line with run info
gen_line = f"Generated at {generated_at} by [Maigret](https://github.com/soxoj/maigret)"
if run_info:
parts = []
if run_info.get("sites_count"):
parts.append(f"{run_info['sites_count']} sites checked")
if run_info.get("flags"):
parts.append(f"flags: `{run_info['flags']}`")
if parts:
gen_line += f" ({', '.join(parts)})"
lines.append(f"{gen_line}\n")
# Summary
lines.append("## Summary\n")
lines.append(f"{brief}\n")
if all_fields:
lines.append("**Information extracted from accounts:**\n")
for field, values in all_fields.items():
title = CaseConverter.snake_to_title(field)
lines.append(f"- {title}: {'; '.join(values)}")
lines.append("")
if countries:
geo = ", ".join(f"{code} (x{count})" for code, count in countries)
lines.append(f"**Country tags:** {geo}\n")
if interests:
tags = ", ".join(f"{tag} (x{count})" for tag, count in interests)
lines.append(f"**Website tags:** {tags}\n")
if first_seen:
lines.append(f"**First seen:** {first_seen}")
if last_seen:
lines.append(f"**Last seen:** {last_seen}")
if first_seen or last_seen:
lines.append("")
# Accounts found
lines.append("## Accounts found\n")
for u, id_type, data in results:
for site_name, v in data.items():
if not v.get("found") or v.get("is_similar"):
continue
lines.append(f"### {site_name}\n")
lines.append(f"- **URL:** [{v.get('url_user', '')}]({v.get('url_user', '')})")
tags = v.get("status") and v["status"].tags or []
if tags:
lines.append(f"- **Tags:** {', '.join(tags)}")
lines.append("")
ids_data = v.get("ids_data", {})
if ids_data:
for field, value in ids_data.items():
if field == "image":
continue
title = CaseConverter.snake_to_title(field)
lines.append(f"- {title}: {_md_format_value(value)}")
lines.append("")
# Possible false positives
lines.append("## Possible false positives\n")
lines.append(
f"This report was generated by searching for accounts matching the username `{username}`. "
f"Accounts listed above may belong to different people who happen to use the same "
f"or similar username. Results without extracted personal information could contain "
f"some false positive findings. Always verify findings before drawing conclusions.\n"
)
# Ethical use
lines.append("## Ethical use\n")
lines.append(
"This report is a result of a technical collection of publicly available information "
"from online accounts and does not constitute personal data processing. If you intend "
"to use this data for personal data processing or collection purposes, ensure your use "
"complies with applicable laws and regulations in your jurisdiction (such as GDPR, "
"CCPA, and similar).\n"
)
with open(filename, "w", encoding="utf-8") as f:
f.write("\n".join(lines))
"""
REPORTS GENERATING
"""
+2 -2
View File
@@ -101,7 +101,7 @@
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"x-guest-token": "2039637579922866279"
"x-guest-token": "2041186137171976270"
},
"errors": {
"Bad guest token": "x-guest-token update required"
@@ -294,7 +294,7 @@
"method": "vimeo"
},
"headers": {
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzUxMjM0MDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZDY4YjViMGMtYTE3OC00ZDdhLWIyM2QtMDg5Y2MwZjAwOGEyIn0.0bGwlqckn4J07em2-nEX10OfW1JAmi54QCrPtm8Qn6A"
"Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzU0OTI1ODAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZjkwOGY0MmYtMTE2Zi00MDRkLWExOTgtOGUyOTE2MTFmZTQzIn0.Wt_z9qrjHofYPtUIDkbxrPX2S-glzmEowkR8m89O_Zg"
},
"urlProbe": "https://api.vimeo.com/users/{username}?fields=name%2Cgender%2Cbio%2Curi%2Clink%2Cbackground_video%2Clocation_details%2Cpictures%2Cverified%2Cmetadata.public_videos.total%2Cavailable_for_hire%2Ccan_work_remotely%2Cmetadata.connections.videos.total%2Cmetadata.connections.albums.total%2Cmetadata.connections.followers.total%2Cmetadata.connections.following.total%2Cmetadata.public_videos.total%2Cmetadata.connections.vimeo_experts.is_enrolled%2Ctotal_collection_count%2Ccreated_time%2Cprofile_preferences%2Cmembership%2Cclients%2Cskills%2Cproject_types%2Crates%2Ccategories%2Cis_expert%2Cprofile_discovery%2Cwebsites%2Ccontact_emails&fetch_user_profile=1",
"checkType": "status_code",
+2 -2
View File
@@ -1,8 +1,8 @@
{
"version": 1,
"updated_at": "2026-04-04T17:04:45Z",
"updated_at": "2026-04-06T16:20:33Z",
"sites_count": 3155,
"min_maigret_version": "0.5.0",
"data_sha256": "4b1c0c96e1595f6e83584a7a6e885647095cbfb7f23c938d7440f8a3408551b1",
"data_sha256": "da87fd6f32bd60efc25e35aa6aa7d329e490d4aa544ddb68539d490cd2157b56",
"data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
}
+1
View File
@@ -54,6 +54,7 @@
"graph_report": false,
"pdf_report": false,
"html_report": false,
"md_report": false,
"web_interface_port": 5000,
"no_autoupdate": false,
"db_update_meta_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json",
+1
View File
@@ -42,6 +42,7 @@ class Settings:
pdf_report: bool
html_report: bool
graph_report: bool
md_report: bool
web_interface_port: int
no_autoupdate: bool
db_update_meta_url: str
+1 -1
View File
@@ -3159,7 +3159,7 @@ Rank data fetched from Majestic Million by domains.
1. ![](https://www.google.com/s2/favicons?domain=https://tonometerbot.com) [Tonometerbot (https://tonometerbot.com)](https://tonometerbot.com)*: top 100M, crypto*
1. ![](https://www.google.com/s2/favicons?domain=https://www.spatial.io) [Spatial (https://www.spatial.io)](https://www.spatial.io)*: top 100M, crypto, gaming*
The list was updated at (2026-04-04)
The list was updated at (2026-04-06)
## Statistics
Enabled/total sites: 2538/3155 = 80.44%
+1
View File
@@ -48,6 +48,7 @@ DEFAULT_ARGS: Dict[str, Any] = {
'web': None,
'with_domains': False,
'xmind': False,
'md': False,
'no_autoupdate': False,
'force_update': False,
}