mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-06 14:08:59 +00:00
+158
@@ -0,0 +1,158 @@
|
||||
"""Maigret AI Analysis Module
|
||||
|
||||
Provides AI-powered analysis of search results using OpenAI-compatible APIs.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import threading
|
||||
|
||||
import aiohttp
|
||||
|
||||
|
||||
def load_ai_prompt() -> str:
|
||||
"""Load the AI system prompt from the resources directory."""
|
||||
maigret_path = os.path.dirname(os.path.realpath(__file__))
|
||||
prompt_path = os.path.join(maigret_path, "resources", "ai_prompt.txt")
|
||||
with open(prompt_path, "r", encoding="utf-8") as f:
|
||||
return f.read()
|
||||
|
||||
|
||||
def resolve_api_key(settings) -> str | None:
|
||||
"""Resolve OpenAI API key from settings or environment variable.
|
||||
|
||||
Priority: settings.openai_api_key > OPENAI_API_KEY env var.
|
||||
"""
|
||||
key = getattr(settings, "openai_api_key", None)
|
||||
if key:
|
||||
return key
|
||||
return os.environ.get("OPENAI_API_KEY")
|
||||
|
||||
|
||||
class _Spinner:
|
||||
"""Simple animated spinner for terminal output."""
|
||||
|
||||
FRAMES = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
|
||||
|
||||
def __init__(self, text=""):
|
||||
self.text = text
|
||||
self._stop = threading.Event()
|
||||
self._thread = None
|
||||
|
||||
def start(self):
|
||||
self._thread = threading.Thread(target=self._spin, daemon=True)
|
||||
self._thread.start()
|
||||
|
||||
def _spin(self):
|
||||
i = 0
|
||||
while not self._stop.is_set():
|
||||
frame = self.FRAMES[i % len(self.FRAMES)]
|
||||
sys.stderr.write(f"\r{frame} {self.text}")
|
||||
sys.stderr.flush()
|
||||
i += 1
|
||||
self._stop.wait(0.08)
|
||||
|
||||
def stop(self):
|
||||
self._stop.set()
|
||||
if self._thread:
|
||||
self._thread.join()
|
||||
sys.stderr.write("\r\033[2K")
|
||||
sys.stderr.flush()
|
||||
|
||||
|
||||
async def print_streaming(text: str, delay: float = 0.04):
|
||||
"""Print text word by word with a delay, simulating streaming LLM output."""
|
||||
words = text.split(" ")
|
||||
for i, word in enumerate(words):
|
||||
if i > 0:
|
||||
sys.stdout.write(" ")
|
||||
sys.stdout.write(word)
|
||||
sys.stdout.flush()
|
||||
await asyncio.sleep(delay)
|
||||
sys.stdout.write("\n")
|
||||
sys.stdout.flush()
|
||||
|
||||
|
||||
async def get_ai_analysis(
|
||||
api_key: str,
|
||||
markdown_report: str,
|
||||
model: str = "gpt-4o",
|
||||
api_base_url: str = "https://api.openai.com/v1",
|
||||
) -> str:
|
||||
"""Send the markdown report to an OpenAI-compatible API and return the analysis.
|
||||
|
||||
Uses streaming to display tokens as they arrive.
|
||||
Raises on HTTP errors with descriptive messages.
|
||||
"""
|
||||
system_prompt = load_ai_prompt()
|
||||
|
||||
url = f"{api_base_url.rstrip('/')}/chat/completions"
|
||||
headers = {
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
payload = {
|
||||
"model": model,
|
||||
"stream": True,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": markdown_report},
|
||||
],
|
||||
}
|
||||
|
||||
spinner = _Spinner("Analysing the data with AI...")
|
||||
spinner.start()
|
||||
first_token = True
|
||||
full_response = []
|
||||
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.post(url, json=payload, headers=headers) as resp:
|
||||
if resp.status == 401:
|
||||
raise RuntimeError("Invalid OpenAI API key (HTTP 401)")
|
||||
if resp.status == 429:
|
||||
raise RuntimeError("OpenAI API rate limit exceeded (HTTP 429)")
|
||||
if resp.status != 200:
|
||||
body = await resp.text()
|
||||
raise RuntimeError(
|
||||
f"OpenAI API error (HTTP {resp.status}): {body[:500]}"
|
||||
)
|
||||
|
||||
async for line in resp.content:
|
||||
decoded = line.decode("utf-8").strip()
|
||||
if not decoded or not decoded.startswith("data: "):
|
||||
continue
|
||||
|
||||
data_str = decoded[len("data: "):]
|
||||
if data_str == "[DONE]":
|
||||
break
|
||||
|
||||
try:
|
||||
chunk = json.loads(data_str)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
delta = chunk.get("choices", [{}])[0].get("delta", {})
|
||||
content = delta.get("content", "")
|
||||
if not content:
|
||||
continue
|
||||
|
||||
if first_token:
|
||||
spinner.stop()
|
||||
print()
|
||||
first_token = False
|
||||
|
||||
sys.stdout.write(content)
|
||||
sys.stdout.flush()
|
||||
except Exception:
|
||||
spinner.stop()
|
||||
raise
|
||||
|
||||
if first_token:
|
||||
# No tokens received — stop spinner anyway
|
||||
spinner.stop()
|
||||
|
||||
print()
|
||||
return "".join(full_response)
|
||||
+80
-14
@@ -494,6 +494,21 @@ def setup_arguments_parser(settings: Settings):
|
||||
" (one report per username).",
|
||||
)
|
||||
|
||||
report_group.add_argument(
|
||||
"--ai",
|
||||
action="store_true",
|
||||
dest="ai",
|
||||
default=False,
|
||||
help="Generate an AI-powered analysis of the search results using OpenAI API. "
|
||||
"Requires OPENAI_API_KEY env var or openai_api_key in settings.",
|
||||
)
|
||||
report_group.add_argument(
|
||||
"--ai-model",
|
||||
dest="ai_model",
|
||||
default=settings.openai_model,
|
||||
help="OpenAI model to use for AI analysis (default: gpt-4o).",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--reports-sorting",
|
||||
default=settings.report_sorting,
|
||||
@@ -596,6 +611,7 @@ async def main():
|
||||
print_found_only=not args.print_not_found,
|
||||
skip_check_errors=not args.print_check_errors,
|
||||
color=not args.no_color,
|
||||
silent=args.ai,
|
||||
)
|
||||
|
||||
# Create object with all information about sites we are aware of.
|
||||
@@ -711,17 +727,33 @@ async def main():
|
||||
+ get_dict_ascii_tree(usernames, prepend="\t")
|
||||
)
|
||||
|
||||
if args.ai:
|
||||
from .ai import resolve_api_key
|
||||
|
||||
if not resolve_api_key(settings):
|
||||
query_notify.warning(
|
||||
'AI analysis requires an OpenAI API key. '
|
||||
'Set OPENAI_API_KEY environment variable or add '
|
||||
'openai_api_key to settings.json.'
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
if not site_data:
|
||||
query_notify.warning('No sites to check, exiting!')
|
||||
sys.exit(2)
|
||||
|
||||
query_notify.warning(
|
||||
f'Starting a search on top {len(site_data)} sites from the Maigret database...'
|
||||
)
|
||||
if not args.all_sites:
|
||||
if args.ai:
|
||||
query_notify.warning(
|
||||
'You can run search by full list of sites with flag `-a`', '!'
|
||||
f'Starting AI-assisted search on top {len(site_data)} sites from the Maigret database...'
|
||||
)
|
||||
else:
|
||||
query_notify.warning(
|
||||
f'Starting a search on top {len(site_data)} sites from the Maigret database...'
|
||||
)
|
||||
if not args.all_sites:
|
||||
query_notify.warning(
|
||||
'You can run search by full list of sites with flag `-a`', '!'
|
||||
)
|
||||
|
||||
already_checked = set()
|
||||
general_results = []
|
||||
@@ -774,11 +806,12 @@ async def main():
|
||||
check_domains=args.with_domains,
|
||||
)
|
||||
|
||||
errs = errors.notify_about_errors(
|
||||
results, query_notify, show_statistics=args.verbose
|
||||
)
|
||||
for e in errs:
|
||||
query_notify.warning(*e)
|
||||
if not args.ai:
|
||||
errs = errors.notify_about_errors(
|
||||
results, query_notify, show_statistics=args.verbose
|
||||
)
|
||||
for e in errs:
|
||||
query_notify.warning(*e)
|
||||
|
||||
if args.reports_sorting == "data":
|
||||
results = sort_report_by_data_points(results)
|
||||
@@ -867,10 +900,43 @@ async def main():
|
||||
save_graph_report(filename, general_results, db)
|
||||
query_notify.warning(f'Graph report on all usernames saved in {filename}')
|
||||
|
||||
text_report = get_plaintext_report(report_context)
|
||||
if text_report:
|
||||
query_notify.info('Short text report:')
|
||||
print(text_report)
|
||||
if not args.ai:
|
||||
text_report = get_plaintext_report(report_context)
|
||||
if text_report:
|
||||
query_notify.info('Short text report:')
|
||||
print(text_report)
|
||||
|
||||
if args.ai:
|
||||
from .ai import get_ai_analysis, resolve_api_key
|
||||
from .report import generate_markdown_report
|
||||
|
||||
api_key = resolve_api_key(settings)
|
||||
|
||||
run_flags = []
|
||||
if args.tags:
|
||||
run_flags.append(f"--tags {args.tags}")
|
||||
if args.site_list:
|
||||
run_flags.append(f"--site {','.join(args.site_list)}")
|
||||
if args.all_sites:
|
||||
run_flags.append("--all-sites")
|
||||
run_info = {
|
||||
"sites_count": sum(len(d) for _, _, d in general_results),
|
||||
"flags": " ".join(run_flags) if run_flags else None,
|
||||
}
|
||||
|
||||
md_report = generate_markdown_report(report_context, run_info=run_info)
|
||||
|
||||
try:
|
||||
await get_ai_analysis(
|
||||
api_key=api_key,
|
||||
markdown_report=md_report,
|
||||
model=args.ai_model,
|
||||
api_base_url=getattr(
|
||||
settings, 'openai_api_base_url', 'https://api.openai.com/v1'
|
||||
),
|
||||
)
|
||||
except Exception as e:
|
||||
query_notify.warning(f'AI analysis failed: {e}')
|
||||
|
||||
# update database
|
||||
db.save_to_file(db_file)
|
||||
|
||||
@@ -123,6 +123,7 @@ class QueryNotifyPrint(QueryNotify):
|
||||
print_found_only=False,
|
||||
skip_check_errors=False,
|
||||
color=True,
|
||||
silent=False,
|
||||
):
|
||||
"""Create Query Notify Print Object.
|
||||
|
||||
@@ -149,6 +150,7 @@ class QueryNotifyPrint(QueryNotify):
|
||||
self.print_found_only = print_found_only
|
||||
self.skip_check_errors = skip_check_errors
|
||||
self.color = color
|
||||
self.silent = silent
|
||||
|
||||
return
|
||||
|
||||
@@ -187,6 +189,9 @@ class QueryNotifyPrint(QueryNotify):
|
||||
Nothing.
|
||||
"""
|
||||
|
||||
if self.silent:
|
||||
return
|
||||
|
||||
title = f"Checking {id_type}"
|
||||
if self.color:
|
||||
print(
|
||||
@@ -236,6 +241,9 @@ class QueryNotifyPrint(QueryNotify):
|
||||
Return Value:
|
||||
Nothing.
|
||||
"""
|
||||
if self.silent:
|
||||
return
|
||||
|
||||
notify = None
|
||||
self.result = result
|
||||
|
||||
|
||||
+7
-2
@@ -267,7 +267,7 @@ def _md_format_value(value) -> str:
|
||||
return s
|
||||
|
||||
|
||||
def save_markdown_report(filename: str, context: dict, run_info: dict = None):
|
||||
def generate_markdown_report(context: dict, run_info: dict = None) -> str:
|
||||
username = context.get("username", "unknown")
|
||||
generated_at = context.get("generated_at", "")
|
||||
brief = context.get("brief", "")
|
||||
@@ -391,8 +391,13 @@ def save_markdown_report(filename: str, context: dict, run_info: dict = None):
|
||||
"CCPA, and similar).\n"
|
||||
)
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def save_markdown_report(filename: str, context: dict, run_info: dict = None):
|
||||
content = generate_markdown_report(context, run_info)
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
f.write("\n".join(lines))
|
||||
f.write(content)
|
||||
|
||||
|
||||
"""
|
||||
|
||||
@@ -0,0 +1,62 @@
|
||||
You are an OSINT analyst that converts raw username-investigation reports into a short, clean human-readable summary.
|
||||
|
||||
Your task:
|
||||
Read the attached account-discovery report and produce a concise report in exactly this style:
|
||||
|
||||
# Investigation Summary
|
||||
|
||||
Name: <most likely real full name>
|
||||
Location: <most likely current location>
|
||||
Occupation: <short combined description based only on strong signals>
|
||||
Interests: <3–6 broad interests inferred from platform types, bios, and activity>
|
||||
Languages: <languages supported by strong evidence only>
|
||||
Website: <main personal website if clearly present>
|
||||
Username: <main username> (variant: <variant usernames if any>)
|
||||
Platforms: <number> profiles, active from <first year> to <last year>
|
||||
Confidence: <High / Medium / Low> — <one short explanation why>
|
||||
|
||||
# Other leads
|
||||
|
||||
- <lead 1>
|
||||
- <lead 2>
|
||||
- <lead 3 if needed>
|
||||
|
||||
Rules:
|
||||
1. Use only information supported by the report.
|
||||
2. Resolve identity using consistency of username, full name, bio, links, company, and location.
|
||||
3. Prefer strong repeated signals over one-off weak signals.
|
||||
4. If one profile clearly conflicts with the rest, mention it in "Other leads" as a likely false positive instead of mixing it into the main identity.
|
||||
5. Keep the tone analytical and neutral.
|
||||
6. Do not mention every platform individually.
|
||||
7. Do not include raw URLs except for the main website.
|
||||
8. Do not mention NSFW/adult platforms in the main summary unless they are the only source for a critical lead; if such a profile looks inconsistent, mention it only as a likely false positive.
|
||||
9. "Occupation" should be a compact merged description, for example: "Chief Product Officer (CPO) at ..., entrepreneur, OSINT community founder".
|
||||
10. "Interests" should be broad categories, not noisy tags. Convert raw platform/tag evidence into natural categories like OSINT, software development, blogging, gaming, streaming, etc.
|
||||
11. "Languages" should only include languages clearly supported by bios, texts, country tags, or profile content.
|
||||
12. For "Platforms", count the profiles reported as found by the report summary, not manually deduplicated.
|
||||
13. For active years, use the earliest and latest reliable dates from the consistent identity cluster. Ignore obvious outlier dates if they belong to likely false positives or weak profiles.
|
||||
14. For confidence:
|
||||
- High = strong consistency across username, name, bio, links, location, and/or company
|
||||
- Medium = partial consistency with some gaps
|
||||
- Low = mostly username-only matches
|
||||
15. If some field is not reliably known, omit speculation and use the best cautious wording possible.
|
||||
16. For "Name", output only the most likely real personal name in clean canonical form.
|
||||
- Remove nicknames, handles, aliases, or bracketed parts such as "(Soxoj)".
|
||||
- Example: "Dmitriy (Soxoj) Danilov" -> "Dmitriy Danilov".
|
||||
17. For "Website", output only the plain domain or URL as text, not a markdown hyperlink.
|
||||
18. In "Other leads", do not label conflicting profiles as "false positive", "likely unrelated", or "potentially a false positive".
|
||||
- Instead, use neutral intelligence wording such as:
|
||||
"Accounts were found that are most likely unrelated to the main identity, but may indicate possible cross-border activity and should be verified."
|
||||
19. When describing anomalies in "Other leads", prefer cautious investigative phrasing:
|
||||
- "may be unrelated"
|
||||
- "requires verification"
|
||||
- "could indicate separate activity"
|
||||
- "should be checked manually"
|
||||
20. Do not include nicknames or aliases inside the Name field unless they are clearly part of the legal or real-world name.
|
||||
|
||||
Output requirements:
|
||||
- Return only the final formatted text.
|
||||
- Keep it short.
|
||||
- No preamble, no explanations.
|
||||
|
||||
Now analyze the following report
|
||||
@@ -55,6 +55,9 @@
|
||||
"pdf_report": false,
|
||||
"html_report": false,
|
||||
"md_report": false,
|
||||
"openai_api_key": "",
|
||||
"openai_model": "gpt-4o",
|
||||
"openai_api_base_url": "https://api.openai.com/v1",
|
||||
"web_interface_port": 5000,
|
||||
"no_autoupdate": false,
|
||||
"db_update_meta_url": "https://raw.githubusercontent.com/soxoj/maigret/main/maigret/resources/db_meta.json",
|
||||
|
||||
@@ -49,6 +49,8 @@ DEFAULT_ARGS: Dict[str, Any] = {
|
||||
'with_domains': False,
|
||||
'xmind': False,
|
||||
'md': False,
|
||||
'ai': False,
|
||||
'ai_model': 'gpt-4o',
|
||||
'no_autoupdate': False,
|
||||
'force_update': False,
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user