Add Markdown reports for LLM analysis (#2463)

2026-05-07 06:24:35 +00:00 · 2026-04-06 18:26:43 +02:00
parent 44a6c729e3
commit ad95302745
9 changed files with 207 additions and 6 deletions
@@ -106,6 +106,9 @@ username).
 ``-J``, ``--json`` - Generate a JSON report of a specific type: simple,
 ndjson (one report per username). E.g. ``--json ndjson``
 ``-M``, ``--md`` - Generate a Markdown report (general report on all
 usernames). See :ref:`markdown-report` below.
 ``-fo``, ``--folderoutput`` - Results will be saved to this folder,
 ``results`` by default. It will be created if it doesn’t exist.
@@ -142,4 +145,35 @@ site main page URL to determine the site engine and methods to check
 account presence. After checking Maigret asks if you want to add the
 site, answering y/Y will rewrite the local database.
 .. _markdown-report:
 Markdown report (LLM-friendly)
 ------------------------------
 The ``--md`` / ``-M`` flag generates a Markdown report designed for both human reading and analysis by AI assistants (ChatGPT, Claude, etc.).
 .. code-block:: console
   maigret username --md
 The report includes:
 - **Summary** with aggregated personal data (all fullnames, locations, bios found across accounts), country tags, website tags, first/last seen timestamps.
 - **Per-account sections** with profile URL, site tags, and all extracted fields (username, bio, follower count, linked accounts, etc.).
 - **Possible false positives** disclaimer explaining that accounts may belong to different people.
 - **Ethical use** notice about applicable data protection laws.
 **Using with AI tools:**
 The Markdown format is optimized for LLM context windows. You can feed the report directly to an AI assistant for follow-up analysis:
 .. code-block:: console
   # Generate the report
   maigret johndoe --md
   # Feed it to an AI tool
   cat reports/report_johndoe.md | llm "Analyze this OSINT report and summarize key findings"
 The structured Markdown with per-site sections makes it easy for AI tools to extract relationships, cross-reference identities, and identify patterns across accounts.
@@ -37,6 +37,7 @@ from .report import (
    get_plaintext_report,
    sort_report_by_data_points,
    save_graph_report,
    save_markdown_report,
 )
 from .sites import MaigretDatabase
 from .submit import Submitter
@@ -465,6 +466,14 @@ def setup_arguments_parser(settings: Settings):
        default=settings.pdf_report,
        help="Generate a PDF report (general report on all usernames).",
    )
    report_group.add_argument(
        "-M",
        "--md",
        action="store_true",
        dest="md",
        default=settings.md_report,
        help="Generate a Markdown report (general report on all usernames).",
    )
    report_group.add_argument(
        "-G",
        "--graph",
@@ -803,7 +812,7 @@ async def main():
    # reporting for all the result
    if general_results:
-        if args.html or args.pdf:
+        if args.html or args.pdf or args.md:
            query_notify.warning('Generating report info...')
        report_context = generate_report_context(general_results)
        # determine main username
@@ -823,6 +832,23 @@ async def main():
            save_pdf_report(filename, report_context)
            query_notify.warning(f'PDF report on all usernames saved in {filename}')
        if args.md:
            username = username.replace('/', '_')
            filename = report_filepath_tpl.format(username=username, postfix='.md')
            run_flags = []
            if args.tags:
                run_flags.append(f"--tags {args.tags}")
            if args.site_list:
                run_flags.append(f"--site {','.join(args.site_list)}")
            if args.all_sites:
                run_flags.append("--all-sites")
            run_info = {
                "sites_count": sum(len(d) for _, _, d in general_results),
                "flags": " ".join(run_flags) if run_flags else None,
            }
            save_markdown_report(filename, report_context, run_info=run_info)
            query_notify.warning(f'Markdown report on all usernames saved in {filename}')
        if args.graph:
            username = username.replace('/', '_')
            filename = report_filepath_tpl.format(
@@ -257,6 +257,144 @@ def get_plaintext_report(context: dict) -> str:
    return output.strip()
 def _md_format_value(value) -> str:
    """Format a value for Markdown output, detecting links."""
    if isinstance(value, list):
        return ", ".join(str(v) for v in value)
    s = str(value)
    if s.startswith("http://") or s.startswith("https://"):
        return f"[{s}]({s})"
    return s
 def _md_found_accounts(results):
    """Yield ``(site_name, site_result)`` for every confirmed account.

    *results* is iterated as 3-tuples whose last item is a mapping of site
    name to per-site result dict. Entries that are not marked ``found`` or
    that are flagged ``is_similar`` are skipped.
    """
    for _, _, data in results:
        for site_name, site_result in data.items():
            if site_result.get("found") and not site_result.get("is_similar"):
                yield site_name, site_result

 def save_markdown_report(filename: str, context: dict, run_info: dict = None):
    """Write a Markdown report built from *context* to *filename*.

    The report consists of a title, a "Generated at" line, a summary
    (brief text, personal data aggregated from all found accounts, country
    and website tags, first/last seen), one section per found account, and
    two fixed disclaimers (possible false positives, ethical use).

    Args:
        filename: Path of the file to write (UTF-8, overwritten if present).
        context: Report context. Keys read here: ``username``,
            ``generated_at``, ``brief``, ``countries_tuple_list``,
            ``interests_tuple_list``, ``first_seen`` and ``results``.
        run_info: Optional dict describing the run. ``sites_count`` and
            ``flags`` are appended to the "Generated at" line when truthy.
    """
    username = context.get("username", "unknown")
    generated_at = context.get("generated_at", "")
    brief = context.get("brief", "")
    countries = context.get("countries_tuple_list", [])
    interests = context.get("interests_tuple_list", [])
    first_seen = context.get("first_seen")
    results = context.get("results", [])
    # Map multiple source fields to unified output fields. Built once here
    # instead of once per account, since it never changes.
    field_sources = {
        "fullname": ("fullname", "name"),
        "location": ("location", "country", "city", "country_code", "locale", "region"),
        "gender": ("gender",),
        "bio": ("bio", "about", "description"),
    }
    timestamp_fields = ("last_online", "latest_activity_at", "updated_at")
    # Collect ALL distinct values for key fields across all found accounts
    all_fields: Dict[str, list] = {}
    last_seen = None
    for _, account in _md_found_accounts(results):
        # ``or {}`` also covers an explicit ``ids_data: None`` entry, which
        # would otherwise fail on the ``.get`` calls below.
        ids_data = account.get("ids_data") or {}
        for out_field, source_keys in field_sources.items():
            for src in source_keys:
                val = ids_data.get(src)
                if not val:
                    continue
                collected = all_fields.setdefault(out_field, [])
                val_str = str(val)
                if val_str not in collected:
                    collected.append(val_str)
        # Track last_seen. NOTE: timestamps are compared as strings, so the
        # ordering is lexicographic rather than chronological.
        for ts_field in timestamp_fields:
            ts = ids_data.get(ts_field)
            if ts and (last_seen is None or str(ts) > str(last_seen)):
                last_seen = ts
    lines = []
    lines.append(f"# Report by searching on username \"{username}\"\n")
    # Generated line with run info
    gen_line = f"Generated at {generated_at} by [Maigret](https://github.com/soxoj/maigret)"
    if run_info:
        parts = []
        if run_info.get("sites_count"):
            parts.append(f"{run_info['sites_count']} sites checked")
        if run_info.get("flags"):
            parts.append(f"flags: `{run_info['flags']}`")
        if parts:
            gen_line += f" ({', '.join(parts)})"
    lines.append(f"{gen_line}\n")
    # Summary
    lines.append("## Summary\n")
    lines.append(f"{brief}\n")
    if all_fields:
        lines.append("**Information extracted from accounts:**\n")
        for field, values in all_fields.items():
            title = CaseConverter.snake_to_title(field)
            lines.append(f"- {title}: {'; '.join(values)}")
        lines.append("")
    if countries:
        geo = ", ".join(f"{code} (x{count})" for code, count in countries)
        lines.append(f"**Country tags:** {geo}\n")
    if interests:
        interest_tags = ", ".join(f"{tag} (x{count})" for tag, count in interests)
        lines.append(f"**Website tags:** {interest_tags}\n")
    if first_seen:
        lines.append(f"**First seen:** {first_seen}")
    if last_seen:
        lines.append(f"**Last seen:** {last_seen}")
    if first_seen or last_seen:
        lines.append("")
    # Accounts found
    lines.append("## Accounts found\n")
    for site_name, account in _md_found_accounts(results):
        url_user = account.get("url_user", "")
        lines.append(f"### {site_name}\n")
        lines.append(f"- **URL:** [{url_user}]({url_user})")
        status = account.get("status")
        site_tags = (status.tags if status else None) or []
        if site_tags:
            lines.append(f"- **Tags:** {', '.join(site_tags)}")
            lines.append("")
        for field, value in (account.get("ids_data") or {}).items():
            # Image fields are left out of the text report.
            if field == "image":
                continue
            title = CaseConverter.snake_to_title(field)
            lines.append(f"- {title}: {_md_format_value(value)}")
        lines.append("")
    # Possible false positives
    lines.append("## Possible false positives\n")
    lines.append(
        f"This report was generated by searching for accounts matching the username `{username}`. "
        f"Accounts listed above may belong to different people who happen to use the same "
        f"or similar username. Results without extracted personal information could contain "
        f"some false positive findings. Always verify findings before drawing conclusions.\n"
    )
    # Ethical use
    lines.append("## Ethical use\n")
    lines.append(
        "This report is a result of a technical collection of publicly available information "
        "from online accounts and does not constitute personal data processing. If you intend "
        "to use this data for personal data processing or collection purposes, ensure your use "
        "complies with applicable laws and regulations in your jurisdiction (such as GDPR, "
        "CCPA, and similar).\n"
    )
    with open(filename, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
 """
 REPORTS GENERATING
 """
@@ -101,7 +101,7 @@
                "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
                "authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
-                "x-guest-token": "2039637579922866279"
+                "x-guest-token": "2041186137171976270"
            },
            "errors": {
                "Bad guest token": "x-guest-token update required"
@@ -294,7 +294,7 @@
                "method": "vimeo"
            },
            "headers": {
-                "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzUxMjM0MDAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZDY4YjViMGMtYTE3OC00ZDdhLWIyM2QtMDg5Y2MwZjAwOGEyIn0.0bGwlqckn4J07em2-nEX10OfW1JAmi54QCrPtm8Qn6A"
+                "Authorization": "jwt eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJleHAiOjE3NzU0OTI1ODAsInVzZXJfaWQiOm51bGwsImFwcF9pZCI6NTg0NzksInNjb3BlcyI6InB1YmxpYyIsInRlYW1fdXNlcl9pZCI6bnVsbCwianRpIjoiZjkwOGY0MmYtMTE2Zi00MDRkLWExOTgtOGUyOTE2MTFmZTQzIn0.Wt_z9qrjHofYPtUIDkbxrPX2S-glzmEowkR8m89O_Zg"
            },
            "urlProbe": "https://api.vimeo.com/users/{username}?fields=name%2Cgender%2Cbio%2Curi%2Clink%2Cbackground_video%2Clocation_details%2Cpictures%2Cverified%2Cmetadata.public_videos.total%2Cavailable_for_hire%2Ccan_work_remotely%2Cmetadata.connections.videos.total%2Cmetadata.connections.albums.total%2Cmetadata.connections.followers.total%2Cmetadata.connections.following.total%2Cmetadata.public_videos.total%2Cmetadata.connections.vimeo_experts.is_enrolled%2Ctotal_collection_count%2Ccreated_time%2Cprofile_preferences%2Cmembership%2Cclients%2Cskills%2Cproject_types%2Crates%2Ccategories%2Cis_expert%2Cprofile_discovery%2Cwebsites%2Ccontact_emails&fetch_user_profile=1",
            "checkType": "status_code",
@@ -1,8 +1,8 @@
 {
    "version": 1,
-    "updated_at": "2026-04-04T17:04:45Z",
+    "updated_at": "2026-04-06T16:20:33Z",
    "sites_count": 3155,
    "min_maigret_version": "0.5.0",
-    "data_sha256": "4b1c0c96e1595f6e83584a7a6e885647095cbfb7f23c938d7440f8a3408551b1",
+    "data_sha256": "da87fd6f32bd60efc25e35aa6aa7d329e490d4aa544ddb68539d490cd2157b56",
    "data_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/data.json"
 }
@@ -54,6 +54,7 @@
    "graph_report": false,
    "pdf_report": false,
    "html_report": false,
    "md_report": false,
    "web_interface_port": 5000,
    "no_autoupdate": false,
    "db_update_meta_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json",
@@ -42,6 +42,7 @@ class Settings:
    pdf_report: bool
    html_report: bool
    graph_report: bool
    md_report: bool
    web_interface_port: int
    no_autoupdate: bool
    db_update_meta_url: str
@@ -3159,7 +3159,7 @@ Rank data fetched from Majestic Million by domains.
 1. ![](https://www.google.com/s2/favicons?domain=https://tonometerbot.com) [Tonometerbot (https://tonometerbot.com)](https://tonometerbot.com)*: top 100M, crypto*
 1. ![](https://www.google.com/s2/favicons?domain=https://www.spatial.io) [Spatial (https://www.spatial.io)](https://www.spatial.io)*: top 100M, crypto, gaming*
-The list was updated at (2026-04-04)
+The list was updated at (2026-04-06)
 ## Statistics
 Enabled/total sites: 2538/3155 = 80.44%
@@ -48,6 +48,7 @@ DEFAULT_ARGS: Dict[str, Any] = {
    'web': None,
    'with_domains': False,
    'xmind': False,
    'md': False,
    'no_autoupdate': False,
    'force_update': False,
 }