diff --git a/maigret/maigret.py b/maigret/maigret.py index 4b13804..ac6bc8b 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -13,7 +13,8 @@ from socid_extractor import parse, __version__ as socid_version from .checking import * from .notify import QueryNotifyPrint from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \ - generate_report_context, save_txt_report + generate_report_context, save_txt_report, SUPPORTED_JSON_REPORT_FORMATS, check_supported_json_format, \ + save_json_report from .submit import submit_dialog __version__ = '0.1.13' @@ -56,9 +57,9 @@ async def main(): action="store", dest="proxy", default=None, help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080" ) - parser.add_argument("--json", "-j", metavar="JSON_FILE", - dest="json_file", default=None, - help="Load data from a JSON file or an online, valid, JSON file.") + parser.add_argument("--db", metavar="DB_FILE", + dest="db_file", default=None, + help="Load Maigret database from a JSON file or an online, valid, JSON file.") parser.add_argument("--cookies-jar-file", metavar="COOKIE_FILE", dest="cookie_file", default=None, help="File with cookies.") @@ -91,7 +92,7 @@ async def main(): action="store_true", dest="print_check_errors", default=False, help="Print errors messages: connection, captcha, site country ban, etc." ) - parser.add_argument("--submit", + parser.add_argument("--submit", metavar='EXISTING_USER_URL', type=str, dest="new_site_to_submit", default=False, help="URL of existing profile in new site to submit." ) @@ -158,6 +159,12 @@ async def main(): dest="pdf", default=False, help="Generate a PDF report (general report on all usernames)." ) + parser.add_argument("-J", "--json", + action="store", metavar='REPORT_TYPE', + dest="json", default='', type=check_supported_json_format, + help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}" + " (one report per username)." + ) args = parser.parse_args() @@ -206,8 +213,8 @@ async def main(): if args.tags: args.tags = list(set(str(args.tags).split(','))) - if args.json_file is None: - args.json_file = \ + if args.db_file is None: + args.db_file = \ os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources/data.json" ) @@ -223,7 +230,7 @@ async def main(): color=not args.no_color) # Create object with all information about sites we are aware of. - db = MaigretDatabase().load_from_file(args.json_file) + db = MaigretDatabase().load_from_file(args.db_file) get_top_sites_for_id = lambda x: db.ranked_sites_dict(top=args.top_sites, tags=args.tags, names=args.site_list, disabled=False, id_type=x) @@ -233,7 +240,7 @@ async def main(): if args.new_site_to_submit: is_submitted = await submit_dialog(db, args.new_site_to_submit) if is_submitted: - db.save_to_file(args.json_file) + db.save_to_file(args.db_file) # Database self-checking if args.self_check: @@ -241,7 +248,7 @@ async def main(): is_need_update = await self_check(db, site_data, logger, max_connections=args.connections) if is_need_update: if input('Do you want to save changes permanently? [yYnN]\n').lower() == 'y': - db.save_to_file(args.json_file) + db.save_to_file(args.db_file) print('Database was successfully updated.') else: print('Updates will be applied only for current search session.') @@ -339,6 +346,12 @@ async def main(): save_txt_report(filename, username, results) query_notify.warning(f'TXT report for {username} saved in {filename}') + if args.json: + filename = report_filepath_tpl.format(username=username, postfix=f'_{args.json}.json') + save_json_report(filename, username, results, report_type=args.json) + query_notify.warning(f'JSON {args.json} report for {username} saved in {filename}') + + # reporting for all the result if general_results: if args.html or args.pdf: @@ -357,7 +370,7 @@ async def main(): save_pdf_report(filename, report_context) query_notify.warning(f'PDF report on all usernames saved in {filename}') # update database - db.save_to_file(args.json_file) + db.save_to_file(args.db_file) def run(): diff --git a/maigret/report.py b/maigret/report.py index c847143..b30417b 100644 --- a/maigret/report.py +++ b/maigret/report.py @@ -1,4 +1,5 @@ import csv +import json import io import logging import os @@ -7,11 +8,17 @@ import xmind from datetime import datetime from jinja2 import Template from xhtml2pdf import pisa +from argparse import ArgumentTypeError from dateutil.parser import parse as parse_datetime_str from .result import QueryStatus from .utils import is_country_tag, CaseConverter, enrich_link_str +SUPPORTED_JSON_REPORT_FORMATS = [ + 'simple', + 'ndjson', +] + ''' UTILS @@ -51,6 +58,10 @@ def save_pdf_report(filename: str, context: dict): with open(filename, 'w+b') as f: pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css) +def save_json_report(filename: str, username: str, results: dict, report_type: str): + with open(filename, 'w', encoding='utf-8') as f: + generate_json_report(username, results, f, report_type=report_type) + ''' REPORTS GENERATING @@ -225,6 +236,30 @@ def generate_txt_report(username: str, results: dict, file): file.write(dictionary["url_user"] + "\n") file.write(f'Total Websites Username Detected On : {exists_counter}') + +def generate_json_report(username: str, results: dict, file, report_type): + exists_counter = 0 + is_report_per_line = report_type.startswith('ndjson') + all_json = {} + + for sitename in results: + site_result = results[sitename] + # TODO: fix no site data issue + if not site_result or site_result.get("status").status != QueryStatus.CLAIMED: + continue + + data = dict(site_result) + data['status'] = data['status'].json() + + if is_report_per_line: + data['sitename'] = sitename + file.write(json.dumps(data)+'\n') + else: + all_json[sitename] = data + + if not is_report_per_line: + file.write(json.dumps(all_json)) + ''' XMIND 8 Functions ''' @@ -306,3 +341,9 @@ def design_sheet(sheet, username, results): currentsublabel.setTitle("%s: %s" % (k, v)) +def check_supported_json_format(value): + if value and not value in SUPPORTED_JSON_REPORT_FORMATS: + raise ArgumentTypeError(f'JSON report type must be one of the following types: ' + + ', '.join(SUPPORTED_JSON_REPORT_FORMATS)) + return value + diff --git a/maigret/resources/data.json b/maigret/resources/data.json index b926cab..cb1cd10 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -23096,6 +23096,62 @@ "urlMain": "https://www.are.na", "usernameClaimed": "nate-cassel", "usernameUnclaimed": "noonewouldeverusethis7" + }, + "mywishboard.com": { + "checkType": "message", + "presenseStrs": [ + "profile-header", + " profile-header__col" + ], + "absenceStrs": [ + "This page could not be found" + ], + "url": "https://mywishboard.com/@{username}", + "urlMain": "https://mywishboard.com", + "usernameClaimed": "alex", + "usernameUnclaimed": "noonewouldeverusethis7" + }, + "crafta.ua": { + "checkType": "message", + "presenseStrs": [ + "cft-profile-about" + ], + "absenceStrs": [ + "Page not found" + ], + "url": "https://{username}.crafta.ua/", + "urlMain": "https://crafta.ua", + "usernameClaimed": "test", + "usernameUnclaimed": "noonewouldeverusethis7" + }, + "m.smutty.com": { + "tags": [ + "erotic" + ], + "checkType": "message", + "presenseStrs": [ + "profile_stats_n" + ], + "absenceStrs": [ + "Not Found" + ], + "url": "https://m.smutty.com/user/{username}/", + "urlMain": "https://m.smutty.com", + "usernameClaimed": "alex", + "usernameUnclaimed": "noonewouldeverusethis7" + }, + "www.marykay.ru": { + "checkType": "message", + "presenseStrs": [ + "email" + ], + "absenceStrs": [ + "errorPage" + ], + "url": "https://www.marykay.ru/{username}", + "urlMain": "https://www.marykay.ru", + "usernameClaimed": "anna", + "usernameUnclaimed": "noonewouldeverusethis7" } }, "engines": { diff --git a/maigret/result.py b/maigret/result.py index bdfade7..e8c3a10 100644 --- a/maigret/result.py +++ b/maigret/result.py @@ -1,4 +1,4 @@ -"""Sherlock Result Module +"""Maigret Result Module This module defines various objects for recording the results of queries. """ @@ -74,6 +74,15 @@ class QueryResult(): self.ids_data = ids_data self.tags = tags + def json(self): + return { + 'username': self.username, + 'site_name': self.site_name, + 'url': self.site_url_user, + 'status': str(self.status), + 'ids': self.ids_data or {}, + 'tags': self.tags, + } def __str__(self): """Convert Object To String. diff --git a/maigret/submit.py b/maigret/submit.py index d45c263..1b07aa9 100644 --- a/maigret/submit.py +++ b/maigret/submit.py @@ -1,4 +1,5 @@ import difflib +import json import requests from mock import Mock @@ -10,6 +11,7 @@ DESIRED_STRINGS = ["username", "not found", "пользователь", "profile RATIO = 0.6 TOP_FEATURES = 5 +URL_RE = re.compile(r'https?://(www\.)?') def get_match_ratio(x): @@ -84,6 +86,17 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F async def submit_dialog(db, url_exists): + domain_raw = URL_RE.sub('', url_exists).strip().strip('/') + domain_raw = domain_raw.split('/')[0] + + matched_sites = list(filter(lambda x: domain_raw in x.url_main+x.url, db.sites)) + if matched_sites: + print(f'Sites with domain "{domain_raw}" already exists in the Maigret database!') + status = lambda s: '(disabled)' if s.disabled else '' + url_block = lambda s: f'\n\t{s.url_main}\n\t{s.url}' + print('\n'.join([f'{site.name} {status(site)}{url_block(site)}' for site in matched_sites])) + return False + url_parts = url_exists.split('/') supposed_username = url_parts[-1] new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ') @@ -103,9 +116,7 @@ async def submit_dialog(db, url_exists): a_minus_b = tokens_a.difference(tokens_b) b_minus_a = tokens_b.difference(tokens_a) - top_features_count = int(input(f'Specify count of features to extract [default {TOP_FEATURES}]: ')) - if not top_features_count: - top_features_count = TOP_FEATURES + top_features_count = int(input(f'Specify count of features to extract [default {TOP_FEATURES}]: ') or TOP_FEATURES) presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[:top_features_count] diff --git a/tests/test_report.py b/tests/test_report.py index b619eeb..ff8e268 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -1,5 +1,6 @@ """Maigret reports test functions""" import copy +import json import os from io import StringIO @@ -7,7 +8,7 @@ import xmind from jinja2 import Template from maigret.report import generate_csv_report, generate_txt_report, save_xmind_report, save_html_report, \ - save_pdf_report, generate_report_template, generate_report_context + save_pdf_report, generate_report_template, generate_report_context, generate_json_report from maigret.result import QueryResult, QueryStatus EXAMPLE_RESULTS = { @@ -146,6 +147,32 @@ def test_generate_txt_report(): ] +def test_generate_json_simple_report(): + jsonfile = StringIO() + MODIFIED_RESULTS = dict(EXAMPLE_RESULTS) + MODIFIED_RESULTS['GitHub2'] = EXAMPLE_RESULTS['GitHub'] + generate_json_report('test', MODIFIED_RESULTS, jsonfile, 'simple') + + jsonfile.seek(0) + data = jsonfile.readlines() + + assert len(data) == 1 + assert list(json.loads(data[0]).keys()) == ['GitHub', 'GitHub2'] + + +def test_generate_json_ndjson_report(): + jsonfile = StringIO() + MODIFIED_RESULTS = dict(EXAMPLE_RESULTS) + MODIFIED_RESULTS['GitHub2'] = EXAMPLE_RESULTS['GitHub'] + generate_json_report('test', MODIFIED_RESULTS, jsonfile, 'ndjson') + + jsonfile.seek(0) + data = jsonfile.readlines() + + assert len(data) == 2 + assert json.loads(data[0])['sitename'] == 'GitHub' + + def test_save_xmind_report(): filename = 'report_test.xmind' save_xmind_report(filename, 'test', EXAMPLE_RESULTS)