Add Markdown reports for LLM analysis (#2463)

2026-05-06 14:08:59 +00:00 · 2026-04-06 18:26:43 +02:00
parent 44a6c729e3
commit ad95302745
9 changed files with 207 additions and 6 deletions
@@ -106,6 +106,9 @@ username).
 ``-J``, ``--json`` - Generate a JSON report of specific type: simple,
 ndjson (one report per username). E.g. ``--json ndjson``

+``-M``, ``--md`` - Generate a Markdown report (general report on all
+usernames). See :ref:`markdown-report` below.
+
 ``-fo``, ``--folderoutput`` - Results will be saved to this folder,
 ``results`` by default. Will be created if doesn’t exist.

@@ -142,4 +145,35 @@ site main page URL to determine the site engine and methods to check
 account presence. After checking Maigret asks if you want to add the
 site, answering y/Y will rewrite the local database.

+.. _markdown-report:
+
+Markdown report (LLM-friendly)
+------------------------------
+
+The ``--md`` / ``-M`` flag generates a Markdown report designed for both human reading and analysis by AI assistants (ChatGPT, Claude, etc.).
+
+.. code-block:: console
+
+   maigret username --md
+
+The report includes:
+
+- **Summary** with aggregated personal data (all full names, locations, genders, and bios found across accounts), country tags, website tags, first/last seen timestamps.
+- **Per-account sections** with profile URL, site tags, and all extracted fields except the profile image (username, bio, follower count, linked accounts, etc.).
+- **Possible false positives** disclaimer explaining that accounts may belong to different people.
+- **Ethical use** notice about applicable data protection laws.
+
+**Using with AI tools:**
+
+The Markdown format is optimized for LLM context windows. You can feed the report directly to an AI assistant for follow-up analysis:
+
+.. code-block:: console
+
+   # Generate the report
+   maigret johndoe --md
+
+   # Feed it to an AI tool
+   cat reports/report_johndoe.md | llm "Analyze this OSINT report and summarize key findings"
+
+The structured Markdown with per-site sections makes it easy for AI tools to extract relationships, cross-reference identities, and identify patterns across accounts.

@@ -37,6 +37,7 @@ from .report import (
    get_plaintext_report,
    sort_report_by_data_points,
    save_graph_report,
+    save_markdown_report,
 )
 from .sites import MaigretDatabase
 from .submit import Submitter
@@ -465,6 +466,14 @@ def setup_arguments_parser(settings: Settings):
        default=settings.pdf_report,
        help="Generate a PDF report (general report on all usernames).",
    )
+    report_group.add_argument(
+        "-M",
+        "--md",
+        action="store_true",
+        dest="md",
+        default=settings.md_report,
+        help="Generate a Markdown report (general report on all usernames).",
+    )
    report_group.add_argument(
        "-G",
        "--graph",
@@ -803,7 +812,7 @@ async def main():

    # reporting for all the result
    if general_results:
-        if args.html or args.pdf:
+        if args.html or args.pdf or args.md:
            query_notify.warning('Generating report info...')
        report_context = generate_report_context(general_results)
        # determine main username
@@ -823,6 +832,23 @@ async def main():
            save_pdf_report(filename, report_context)
            query_notify.warning(f'PDF report on all usernames saved in {filename}')

+        if args.md:
+            username = username.replace('/', '_')
+            filename = report_filepath_tpl.format(username=username, postfix='.md')
+            run_flags = []
+            if args.tags:
+                run_flags.append(f"--tags {args.tags}")
+            if args.site_list:
+                run_flags.append(f"--site {','.join(args.site_list)}")
+            if args.all_sites:
+                run_flags.append("--all-sites")
+            run_info = {
+                "sites_count": sum(len(d) for _, _, d in general_results),
+                "flags": " ".join(run_flags) if run_flags else None,
+            }
+            save_markdown_report(filename, report_context, run_info=run_info)
+            query_notify.warning(f'Markdown report on all usernames saved in {filename}')
+
        if args.graph:
            username = username.replace('/', '_')
            filename = report_filepath_tpl.format(
@@ -257,6 +257,144 @@ def get_plaintext_report(context: dict) -> str:
    return output.strip()


+def _md_format_value(value) -> str:
+    """Format a value for Markdown output, detecting links."""
+    if isinstance(value, list):
+        return ", ".join(str(v) for v in value)
+    s = str(value)
+    if s.startswith("http://") or s.startswith("https://"):
+        return f"[{s}]({s})"
+    return s
+
+
+def save_markdown_report(filename: str, context: dict, run_info: dict = None):
+    username = context.get("username", "unknown")
+    generated_at = context.get("generated_at", "")
+    brief = context.get("brief", "")
+    countries = context.get("countries_tuple_list", [])
+    interests = context.get("interests_tuple_list", [])
+    first_seen = context.get("first_seen")
+    results = context.get("results", [])
+
+    # Collect ALL values for key fields across all accounts
+    all_fields: Dict[str, list] = {}
+    last_seen = None
+    for _, _, data in results:
+        for _, v in data.items():
+            if not v.get("found") or v.get("is_similar"):
+                continue
+            ids_data = v.get("ids_data", {})
+            # Map multiple source fields to unified output fields
+            field_sources = {
+                "fullname": ("fullname", "name"),
+                "location": ("location", "country", "city", "country_code", "locale", "region"),
+                "gender": ("gender",),
+                "bio": ("bio", "about", "description"),
+            }
+            for out_field, source_keys in field_sources.items():
+                for src in source_keys:
+                    val = ids_data.get(src)
+                    if val:
+                        all_fields.setdefault(out_field, [])
+                        val_str = str(val)
+                        if val_str not in all_fields[out_field]:
+                            all_fields[out_field].append(val_str)
+            # Track last_seen
+            for ts_field in ("last_online", "latest_activity_at", "updated_at"):
+                ts = ids_data.get(ts_field)
+                if ts and (last_seen is None or str(ts) > str(last_seen)):
+                    last_seen = ts
+
+    lines = []
+    lines.append(f"# Report by searching on username \"{username}\"\n")
+
+    # Generated line with run info
+    gen_line = f"Generated at {generated_at} by [Maigret](https://github.com/soxoj/maigret)"
+    if run_info:
+        parts = []
+        if run_info.get("sites_count"):
+            parts.append(f"{run_info['sites_count']} sites checked")
+        if run_info.get("flags"):
+            parts.append(f"flags: `{run_info['flags']}`")
+        if parts:
+            gen_line += f" ({', '.join(parts)})"
+    lines.append(f"{gen_line}\n")
+
+    # Summary
+    lines.append("## Summary\n")
+    lines.append(f"{brief}\n")
+
+    if all_fields:
+        lines.append("**Information extracted from accounts:**\n")
+        for field, values in all_fields.items():
+            title = CaseConverter.snake_to_title(field)
+            lines.append(f"- {title}: {'; '.join(values)}")
+        lines.append("")
+
+    if countries:
+        geo = ", ".join(f"{code} (x{count})" for code, count in countries)
+        lines.append(f"**Country tags:** {geo}\n")
+
+    if interests:
+        tags = ", ".join(f"{tag} (x{count})" for tag, count in interests)
+        lines.append(f"**Website tags:** {tags}\n")
+
+    if first_seen:
+        lines.append(f"**First seen:** {first_seen}")
+    if last_seen:
+        lines.append(f"**Last seen:** {last_seen}")
+    if first_seen or last_seen:
+        lines.append("")
+
+    # Accounts found
+    lines.append("## Accounts found\n")
+
+    for u, id_type, data in results:
+        for site_name, v in data.items():
+            if not v.get("found") or v.get("is_similar"):
+                continue
+
+            lines.append(f"### {site_name}\n")
+            lines.append(f"- **URL:** [{v.get('url_user', '')}]({v.get('url_user', '')})")
+
+            tags = v.get("status") and v["status"].tags or []
+            if tags:
+                lines.append(f"- **Tags:** {', '.join(tags)}")
+                lines.append("")
+
+            ids_data = v.get("ids_data", {})
+            if ids_data:
+                for field, value in ids_data.items():
+                    if field == "image":
+                        continue
+                    title = CaseConverter.snake_to_title(field)
+                    lines.append(f"- {title}: {_md_format_value(value)}")
+
+            lines.append("")
+
+    # Possible false positives
+    lines.append("## Possible false positives\n")
+    lines.append(
+        f"This report was generated by searching for accounts matching the username `{username}`. "
+        f"Accounts listed above may belong to different people who happen to use the same "
+        f"or similar username. Results without extracted personal information could contain "
+        f"some false positive findings. Always verify findings before drawing conclusions.\n"
+    )
+
+    # Ethical use
+    lines.append("## Ethical use\n")
+    lines.append(
+        "This report is a result of a technical collection of publicly available information "
+        "from online accounts and does not constitute personal data processing. If you intend "
+        "to use this data for personal data processing or collection purposes, ensure your use "
+        "complies with applicable laws and regulations in your jurisdiction (such as GDPR, "
+        "CCPA, and similar).\n"
+    )
+
+    with open(filename, "w", encoding="utf-8") as f:
+        f.write("\n".join(lines))
+
+
 """
 REPORTS GENERATING
 """
@@ -101,7 +101,7 @@
                "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
                "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
-                "x-guest-token": "2039637579922866279"
+                "x-guest-token": "2041186137171976270"
            },
            "errors": {
                "Bad guest token": "x-guest-token update required"
@@ -294,7 +294,7 @@
                "method": "vimeo"
            },
            "headers": {
-                "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzUxMjM0MDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZDY4YjViMGMtYTE3OC00ZDdhLWIyM2QtMDg5Y2MwZjAwOGEyIn0.0bGwlqckn4J07em2-nEX10OfW1JAmi54QCrPtm8Qn6A"
+                "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzU0OTI1ODAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZjkwOGY0MmYtMTE2Zi00MDRkLWExOTgtOGUyOTE2MTFmZTQzIn0.Wt_z9qrjHofYPtUIDkbxrPX2S-glzmEowkR8m89O_Zg"
            },
            "urlProbe": "https://api.vimeo.com/users/{username}?fields=name%2Cgender%2Cbio%2Curi%2Clink%2Cbackground_video%2Clocation_details%2Cpictures%2Cverified%2Cmetadata.public_videos.total%2Cavailable_for_hire%2Ccan_work_remotely%2Cmetadata.connections.videos.total%2Cmetadata.connections.albums.total%2Cmetadata.connections.followers.total%2Cmetadata.connections.following.total%2Cmetadata.public_videos.total%2Cmetadata.connections.vimeo_experts.is_enrolled%2Ctotal_collection_count%2Ccreated_time%2Cprofile_preferences%2Cmembership%2Cclients%2Cskills%2Cproject_types%2Crates%2Ccategories%2Cis_expert%2Cprofile_discovery%2Cwebsites%2Ccontact_emails&fetch_user_profile=1",
            "checkType": "status_code",
@@ -1,8 +1,8 @@
 {
    "version": 1,
-    "updated_at": "2026-04-04T17:04:45Z",
+    "updated_at": "2026-04-06T16:20:33Z",
    "sites_count": 3155,
    "min_maigret_version": "0.5.0",
-    "data_sha256": "4b1c0c96e1595f6e83584a7a6e885647095cbfb7f23c938d7440f8a3408551b1",
+    "data_sha256": "da87fd6f32bd60efc25e35aa6aa7d329e490d4aa544ddb68539d490cd2157b56",
    "data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
 }
@@ -54,6 +54,7 @@
    "graph_report": false,
    "pdf_report": false,
    "html_report": false,
+    "md_report": false,
    "web_interface_port": 5000,
    "no_autoupdate": false,
    "db_update_meta_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json",
@@ -42,6 +42,7 @@ class Settings:
    pdf_report: bool
    html_report: bool
    graph_report: bool
+    md_report: bool
    web_interface_port: int
    no_autoupdate: bool
    db_update_meta_url: str
@@ -3159,7 +3159,7 @@ Rank data fetched from Majestic Million by domains.
 1. ![](https://www.google.com/s2/favicons?domain=https://tonometerbot.com) [Tonometerbot (https://tonometerbot.com)](https://tonometerbot.com)*: top 100M, crypto*
 1. ![](https://www.google.com/s2/favicons?domain=https://www.spatial.io) [Spatial (https://www.spatial.io)](https://www.spatial.io)*: top 100M, crypto, gaming*

-The list was updated at (2026-04-04)
+The list was updated at (2026-04-06)
 ## Statistics

 Enabled/total sites: 2538/3155 = 80.44%
@@ -48,6 +48,7 @@ DEFAULT_ARGS: Dict[str, Any] = {
    'web': None,
    'with_domains': False,
    'xmind': False,
+    'md': False,
    'no_autoupdate': False,
    'force_update': False,
 }