Merge pull request #53 from soxoj/json-reports-submit-improvements

Added JSON reports
2026-05-07 06:24:35 +00:00 · 2021-02-13 01:10:55 +03:00
parent 7676c053f9 631de7b346
commit 53f72edaff
6 changed files with 173 additions and 16 deletions
@@ -13,7 +13,8 @@ from socid_extractor import parse, __version__ as socid_version
 from .checking import *
 from .notify import QueryNotifyPrint
 from .report import save_csv_report, save_xmind_report, save_html_report, save_pdf_report, \
-    generate_report_context, save_txt_report
+    generate_report_context, save_txt_report, SUPPORTED_JSON_REPORT_FORMATS, check_supported_json_format, \
+    save_json_report
 from .submit import submit_dialog

 __version__ = '0.1.13'
@@ -56,9 +57,9 @@ async def main():
                        action="store", dest="proxy", default=None,
                        help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080"
                        )
-    parser.add_argument("--json", "-j", metavar="JSON_FILE",
-                        dest="json_file", default=None,
-                        help="Load data from a JSON file or an online, valid, JSON file.")
+    parser.add_argument("--db", metavar="DB_FILE",
+                        dest="db_file", default=None,
+                        help="Load Maigret database from a JSON file or an online, valid, JSON file.")
    parser.add_argument("--cookies-jar-file", metavar="COOKIE_FILE",
                        dest="cookie_file", default=None,
                        help="File with cookies.")
@@ -91,7 +92,7 @@ async def main():
                        action="store_true", dest="print_check_errors", default=False,
                        help="Print errors messages: connection, captcha, site country ban, etc."
                        )
-    parser.add_argument("--submit",
+    parser.add_argument("--submit", metavar='EXISTING_USER_URL',
                        type=str, dest="new_site_to_submit", default=False,
                        help="URL of existing profile in new site to submit."
                        )
@@ -158,6 +159,12 @@ async def main():
                        dest="pdf", default=False,
                        help="Generate a PDF report (general report on all usernames)."
                        )
+    parser.add_argument("-J", "--json",
+                        action="store", metavar='REPORT_TYPE',
+                        dest="json", default='', type=check_supported_json_format,
+                        help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
+                        " (one report per username)."
+                        )

    args = parser.parse_args()

@@ -206,8 +213,8 @@ async def main():
    if args.tags:
        args.tags = list(set(str(args.tags).split(',')))

-    if args.json_file is None:
-        args.json_file = \
+    if args.db_file is None:
+        args.db_file = \
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "resources/data.json"
                         )
@@ -223,7 +230,7 @@ async def main():
                                    color=not args.no_color)

    # Create object with all information about sites we are aware of.
-    db = MaigretDatabase().load_from_file(args.json_file)
+    db = MaigretDatabase().load_from_file(args.db_file)
    get_top_sites_for_id = lambda x: db.ranked_sites_dict(top=args.top_sites, tags=args.tags,
                                                          names=args.site_list,
                                                          disabled=False, id_type=x)
@@ -233,7 +240,7 @@ async def main():
    if args.new_site_to_submit:
        is_submitted = await submit_dialog(db, args.new_site_to_submit)
        if is_submitted:
-            db.save_to_file(args.json_file)
+            db.save_to_file(args.db_file)

    # Database self-checking
    if args.self_check:
@@ -241,7 +248,7 @@ async def main():
        is_need_update = await self_check(db, site_data, logger, max_connections=args.connections)
        if is_need_update:
            if input('Do you want to save changes permanently? [yYnN]\n').lower() == 'y':
-                db.save_to_file(args.json_file)
+                db.save_to_file(args.db_file)
                print('Database was successfully updated.')
            else:
                print('Updates will be applied only for current search session.')
@@ -339,6 +346,12 @@ async def main():
            save_txt_report(filename, username, results)
            query_notify.warning(f'TXT report for {username} saved in {filename}')

+        if args.json:
+            filename = report_filepath_tpl.format(username=username, postfix=f'_{args.json}.json')
+            save_json_report(filename, username, results, report_type=args.json)
+            query_notify.warning(f'JSON {args.json} report for {username} saved in {filename}')
+
+
    # reporting for all the result
    if general_results:
        if args.html or args.pdf:
@@ -357,7 +370,7 @@ async def main():
            save_pdf_report(filename, report_context)
            query_notify.warning(f'PDF report on all usernames saved in {filename}')
    # update database
-    db.save_to_file(args.json_file)
+    db.save_to_file(args.db_file)


 def run():
@@ -1,4 +1,5 @@
 import csv
+import json
 import io
 import logging
 import os
@@ -7,11 +8,17 @@ import xmind
 from datetime import datetime
 from jinja2 import Template
 from xhtml2pdf import pisa
+from argparse import ArgumentTypeError
 from dateutil.parser import parse as parse_datetime_str

 from .result import QueryStatus
 from .utils import is_country_tag, CaseConverter, enrich_link_str

+SUPPORTED_JSON_REPORT_FORMATS = [
+    'simple',
+    'ndjson',
+]
+

 '''
 UTILS
@@ -51,6 +58,10 @@ def save_pdf_report(filename: str, context: dict):
    with open(filename, 'w+b') as f:
        pisa.pisaDocument(io.StringIO(filled_template), dest=f, default_css=css)

+def save_json_report(filename: str, username: str, results: dict, report_type: str):
+    """Create (or overwrite) `filename` as UTF-8 text and write the JSON report into it.
+
+    The actual serialization is delegated to `generate_json_report`.
+    """
+    with open(filename, mode='w', encoding='utf-8') as report_file:
+        generate_json_report(username, results, report_file, report_type=report_type)
+

 '''
 REPORTS GENERATING
@@ -225,6 +236,30 @@ def generate_txt_report(username: str, results: dict, file):
            file.write(dictionary["url_user"] + "\n")
    file.write(f'Total Websites Username Detected On : {exists_counter}')

+
+def generate_json_report(username: str, results: dict, file, report_type):
+    """Write the results with CLAIMED status to `file` as JSON.
+
+    :param username: not used by this function; accepted so the signature
+        lines up with the call made from `save_json_report`.
+    :param results: mapping of site name -> result dict; each dict is expected
+        to carry a "status" entry exposing `.status` and `.json()`.
+    :param file: writable text stream.
+    :param report_type: a value starting with 'ndjson' emits one JSON object
+        per line (with an added "sitename" key); any other value emits a
+        single JSON object keyed by site name.
+    """
+    is_report_per_line = report_type.startswith('ndjson')
+    all_json = {}
+
+    for sitename, site_result in results.items():
+        # TODO: fix no site data issue
+        # Skip empty results and results without a status instead of
+        # failing with AttributeError on `None.status`.
+        status = site_result.get('status') if site_result else None
+        if status is None or status.status != QueryStatus.CLAIMED:
+            continue
+
+        # Shallow copy so the caller's result dict is left untouched.
+        data = dict(site_result)
+        data['status'] = status.json()
+
+        if is_report_per_line:
+            data['sitename'] = sitename
+            file.write(json.dumps(data) + '\n')
+        else:
+            all_json[sitename] = data
+
+    if not is_report_per_line:
+        file.write(json.dumps(all_json))
+
 '''
 XMIND 8 Functions
 '''
@@ -306,3 +341,9 @@ def design_sheet(sheet, username, results):
            currentsublabel.setTitle("%s: %s" % (k, v))


+def check_supported_json_format(value):
+    """Validate a JSON report type given on the command line.
+
+    Intended for use as an argparse ``type=`` callable. A falsy value (such
+    as the option's empty-string default) is returned unchanged; any other
+    value must be listed in SUPPORTED_JSON_REPORT_FORMATS.
+
+    :raises ArgumentTypeError: if `value` is non-empty and not supported.
+    """
+    if value and value not in SUPPORTED_JSON_REPORT_FORMATS:
+        raise ArgumentTypeError('JSON report type must be one of the following types: '
+                                + ', '.join(SUPPORTED_JSON_REPORT_FORMATS))
+    return value
+
@@ -23096,6 +23096,62 @@
            "urlMain": "https://www.are.na",
            "usernameClaimed": "nate-cassel",
            "usernameUnclaimed": "noonewouldeverusethis7"
+        },
+        "mywishboard.com": {
+            "checkType": "message",
+            "presenseStrs": [
+                "profile-header",
+                " profile-header__col"
+            ],
+            "absenceStrs": [
+                "This page could not be found"
+            ],
+            "url": "https://mywishboard.com/@{username}",
+            "urlMain": "https://mywishboard.com",
+            "usernameClaimed": "alex",
+            "usernameUnclaimed": "noonewouldeverusethis7"
+        },
+        "crafta.ua": {
+            "checkType": "message",
+            "presenseStrs": [
+                "cft-profile-about"
+            ],
+            "absenceStrs": [
+                "Page not found"
+            ],
+            "url": "https://{username}.crafta.ua/",
+            "urlMain": "https://crafta.ua",
+            "usernameClaimed": "test",
+            "usernameUnclaimed": "noonewouldeverusethis7"
+        },
+        "m.smutty.com": {
+            "tags": [
+                "erotic"
+            ],
+            "checkType": "message",
+            "presenseStrs": [
+                "profile_stats_n"
+            ],
+            "absenceStrs": [
+                "Not Found</span>"
+            ],
+            "url": "https://m.smutty.com/user/{username}/",
+            "urlMain": "https://m.smutty.com",
+            "usernameClaimed": "alex",
+            "usernameUnclaimed": "noonewouldeverusethis7"
+        },
+        "www.marykay.ru": {
+            "checkType": "message",
+            "presenseStrs": [
+                "email"
+            ],
+            "absenceStrs": [
+                "errorPage"
+            ],
+            "url": "https://www.marykay.ru/{username}",
+            "urlMain": "https://www.marykay.ru",
+            "usernameClaimed": "anna",
+            "usernameUnclaimed": "noonewouldeverusethis7"
        }
    },
    "engines": {
@@ -1,4 +1,4 @@
-"""Sherlock Result Module
+"""Maigret Result Module

 This module defines various objects for recording the results of queries.
 """
@@ -74,6 +74,15 @@ class QueryResult():
        self.ids_data = ids_data
        self.tags = tags

+    def json(self):
+        """Return this result's fields as a plain dict.
+
+        `status` is converted with `str()`, and a falsy `ids_data` is
+        replaced by an empty dict.
+        """
+        ids = self.ids_data if self.ids_data else {}
+        return dict(
+            username=self.username,
+            site_name=self.site_name,
+            url=self.site_url_user,
+            status=str(self.status),
+            ids=ids,
+            tags=self.tags,
+        )

    def __str__(self):
        """Convert Object To String.
@@ -1,4 +1,5 @@
 import difflib
+import json

 import requests
 from mock import Mock
@@ -10,6 +11,7 @@ DESIRED_STRINGS = ["username", "not found", "пользователь", "profile

 RATIO = 0.6
 TOP_FEATURES = 5
+URL_RE = re.compile(r'https?://(www\.)?')


 def get_match_ratio(x):
@@ -84,6 +86,17 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F


 async def submit_dialog(db, url_exists):
+    domain_raw = URL_RE.sub('', url_exists).strip().strip('/')
+    domain_raw = domain_raw.split('/')[0]
+
+    matched_sites = list(filter(lambda x: domain_raw in x.url_main+x.url, db.sites))
+    if matched_sites:
+        print(f'Sites with domain "{domain_raw}" already exists in the Maigret database!')
+        status = lambda s: '(disabled)' if s.disabled else ''
+        url_block = lambda s: f'\n\t{s.url_main}\n\t{s.url}'
+        print('\n'.join([f'{site.name} {status(site)}{url_block(site)}' for site in matched_sites]))
+        return False
+
    url_parts = url_exists.split('/')
    supposed_username = url_parts[-1]
    new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
@@ -103,9 +116,7 @@ async def submit_dialog(db, url_exists):
    a_minus_b = tokens_a.difference(tokens_b)
    b_minus_a = tokens_b.difference(tokens_a)

-    top_features_count = int(input(f'Specify count of features to extract [default {TOP_FEATURES}]: '))
-    if not top_features_count:
-        top_features_count = TOP_FEATURES
+    top_features_count = int(input(f'Specify count of features to extract [default {TOP_FEATURES}]: ') or TOP_FEATURES)

    presence_list = sorted(a_minus_b, key=get_match_ratio, reverse=True)[:top_features_count]

@@ -1,5 +1,6 @@
 """Maigret reports test functions"""
 import copy
+import json
 import os
 from io import StringIO

@@ -7,7 +8,7 @@ import xmind
 from jinja2 import Template

 from maigret.report import generate_csv_report, generate_txt_report, save_xmind_report, save_html_report, \
-    save_pdf_report, generate_report_template, generate_report_context
+    save_pdf_report, generate_report_template, generate_report_context, generate_json_report
 from maigret.result import QueryResult, QueryStatus

 EXAMPLE_RESULTS = {
@@ -146,6 +147,32 @@ def test_generate_txt_report():
    ]


+def test_generate_json_simple_report():
+    """The 'simple' report is a single line holding one object keyed by site name."""
+    results = {**EXAMPLE_RESULTS, 'GitHub2': EXAMPLE_RESULTS['GitHub']}
+    buffer = StringIO()
+    generate_json_report('test', results, buffer, 'simple')
+
+    buffer.seek(0)
+    lines = buffer.readlines()
+
+    assert len(lines) == 1
+    report = json.loads(lines[0])
+    assert list(report) == ['GitHub', 'GitHub2']
+
+
+def test_generate_json_ndjson_report():
+    """The 'ndjson' report has one line per site, each tagged with its sitename."""
+    results = {**EXAMPLE_RESULTS, 'GitHub2': EXAMPLE_RESULTS['GitHub']}
+    buffer = StringIO()
+    generate_json_report('test', results, buffer, 'ndjson')
+
+    buffer.seek(0)
+    lines = buffer.readlines()
+
+    assert len(lines) == 2
+    first_record = json.loads(lines[0])
+    assert first_record['sitename'] == 'GitHub'
+
+
 def test_save_xmind_report():
    filename = 'report_test.xmind'
    save_xmind_report(filename, 'test', EXAMPLE_RESULTS)