HTML reports draft, 500 sites scanning by default

This commit is contained in:
Soxoj
2021-01-07 23:52:29 +03:00
parent 5c8b65d033
commit e4765d1ed9
11 changed files with 544 additions and 65 deletions
+17 -2
View File
@@ -26,7 +26,7 @@ from socid_extractor import parse, extract
from .notify import QueryNotifyPrint from .notify import QueryNotifyPrint
from .result import QueryResult, QueryStatus from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite from .sites import MaigretDatabase, MaigretSite
from .report import save_csv_report, genxmindfile from .report import save_csv_report, genxmindfile, save_html_report
import xmind import xmind
@@ -629,6 +629,10 @@ async def main():
action="store_true", dest="csv", default=False, action="store_true", dest="csv", default=False,
help="Create Comma-Separated Values (CSV) File." help="Create Comma-Separated Values (CSV) File."
) )
parser.add_argument("--html",
action="store_true", dest="html", default=False,
help="Create HTML report file."
)
parser.add_argument("--site", parser.add_argument("--site",
action="append", metavar='SITE_NAME', action="append", metavar='SITE_NAME',
dest="site_list", default=None, dest="site_list", default=None,
@@ -649,6 +653,10 @@ async def main():
"A longer timeout will be more likely to get results from slow sites." "A longer timeout will be more likely to get results from slow sites."
"On the other hand, this may cause a long delay to gather all results." "On the other hand, this may cause a long delay to gather all results."
) )
parser.add_argument("--top-sites",
action="store", default=500,
help="Count of sites for checking ranked by Alexa Top (default: 500)."
)
parser.add_argument("--print-not-found", parser.add_argument("--print-not-found",
action="store_true", dest="print_not_found", default=False, action="store_true", dest="print_not_found", default=False,
help="Print sites where the username was not found." help="Print sites where the username was not found."
@@ -757,7 +765,8 @@ async def main():
# Create object with all information about sites we are aware of. # Create object with all information about sites we are aware of.
try: try:
site_data_all = MaigretDatabase().load_from_file(args.json_file).sites_dict db = MaigretDatabase().load_from_file(args.json_file)
site_data_all = db.ranked_sites_dict(top=args.top_sites)
except Exception as error: except Exception as error:
print(f"ERROR: {error}") print(f"ERROR: {error}")
sys.exit(1) sys.exit(1)
@@ -805,6 +814,8 @@ async def main():
already_checked = set() already_checked = set()
general_results = []
while usernames: while usernames:
username, id_type = list(usernames.items())[0] username, id_type = list(usernames.items())[0]
del usernames[username] del usernames[username]
@@ -834,6 +845,7 @@ async def main():
logger=logger, logger=logger,
forced=args.use_disabled_sites, forced=args.use_disabled_sites,
) )
general_results.append((username, id_type, results))
if args.folderoutput: if args.folderoutput:
# The usernames results should be stored in a targeted folder. # The usernames results should be stored in a targeted folder.
@@ -870,6 +882,9 @@ async def main():
if args.csv: if args.csv:
save_csv_report(username, results) save_csv_report(username, results)
if args.html:
save_html_report(general_results)
def run(): def run():
try: try:
+215
View File
@@ -0,0 +1,215 @@
import csv
from datetime import datetime
import logging
import os
import xmind
from jinja2 import Template
import pycountry
from .result import QueryStatus
from .utils import is_country_tag, CaseConverter, enrich_link_str
def save_csv_report(username: str, results: dict):
    """Write a CSV report for `username` into `<username>.csv` in the CWD."""
    report_path = username + '.csv'
    with open(report_path, 'w', newline='', encoding='utf-8') as out:
        save_csv_report_to_file(username, results, out)
def save_html_report(username_results: list):
    """Render an HTML report for one or more username searches.

    Args:
        username_results: list of (username, id_type, results) tuples, where
            `results` maps site name -> result dict produced by the checker.

    Side effects:
        Writes 'report_<first username>.html' to the current directory.
        Raises IndexError if `username_results` is empty (caller must pass
        at least one search result).
    """
    brief_text = []
    usernames = {}
    extended_info_count = 0
    tags = {}
    supposed_data = {}
    # only these extracted fields are shown in the "supposed personal data" card
    allowed_fields = ['fullname', 'gender']
    first_seen = None
    first_seen_format = '%Y-%m-%d %H:%M:%S'

    for username, id_type, results in username_results:
        found_accounts = 0
        new_ids = []
        usernames[username] = {'type': id_type}

        for website_name in results:
            dictionary = results[website_name]
            # TODO: fix no site data issue
            if not dictionary:
                continue

            status = dictionary.get('status')
            if status.ids_data:
                dictionary['ids_data'] = status.ids_data
                extended_info_count += 1

                # detect first seen: keep the earliest 'created_at' timestamp
                created_at = status.ids_data.get('created_at')
                if created_at:
                    if first_seen is None:
                        first_seen = created_at
                    else:
                        known_time = datetime.strptime(first_seen, first_seen_format)
                        new_time = datetime.strptime(created_at, first_seen_format)
                        if new_time < known_time:
                            first_seen = created_at

                for k, v in status.ids_data.items():
                    # suppose target data
                    field = 'fullname' if k == 'name' else k
                    if field not in supposed_data:
                        supposed_data[field] = []
                    supposed_data[field].append(v)
                    # suppose country
                    if k in ['country', 'locale']:
                        try:
                            if is_country_tag(k):
                                tag = pycountry.countries.get(alpha_2=v).alpha_2.lower()
                            else:
                                tag = pycountry.countries.search_fuzzy(v)[0].alpha_2.lower()
                            # TODO: move countries to another struct
                            tags[tag] = tags.get(tag, 0) + 1
                        except Exception:
                            # fuzzy country lookup is best-effort only
                            logging.debug('pycountry exception', exc_info=True)

            new_usernames = dictionary.get('ids_usernames')
            if new_usernames:
                for u, utype in new_usernames.items():
                    if u not in usernames:
                        new_ids.append((u, utype))
                        usernames[u] = {'type': utype}

            if status.status == QueryStatus.CLAIMED:
                found_accounts += 1
                dictionary['found'] = True
            else:
                continue

            if not dictionary.get('is_similar'):
                # ignore non-exact search results when counting tags
                if status.tags:
                    for t in status.tags:
                        tags[t] = tags.get(t, 0) + 1

        brief_text.append(f'Search by {id_type} {username} returned {found_accounts} accounts.')

        if new_ids:
            ids_list = []
            for u, t in new_ids:
                ids_list.append(f'{u} ({t})' if t != 'username' else u)
            brief_text.append('Found target\'s other IDs: ' + ', '.join(ids_list) + '.')

    brief_text.append(f'Extended info extracted from {extended_info_count} accounts.')

    # template generation: the template is shipped next to this module
    template_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 "resources/simple_report.tpl")
    with open(template_path) as template_file:
        template_text = template_file.read()
    template = Template(template_text)
    template.globals['title'] = CaseConverter.snake_to_title
    template.globals['detect_link'] = enrich_link_str

    brief = ' '.join(brief_text).strip()
    tuple_sort = lambda d: sorted(d, key=lambda x: x[1], reverse=True)

    if 'global' in tags:
        # remove tag 'global' useless for country detection
        del tags['global']

    first_username = username_results[0][0]
    countries_lists = list(filter(lambda x: is_country_tag(x[0]), tags.items()))
    interests_list = list(filter(lambda x: not is_country_tag(x[0]), tags.items()))

    # keep only the first supposed value per allowed field, titled for display
    filtered_supposed_data = {CaseConverter.snake_to_title(k): v[0]
                              for k, v in supposed_data.items()
                              if k in allowed_fields}

    filled_template = template.render(username=first_username,
                                      brief=brief,
                                      results=username_results,
                                      first_seen=first_seen,
                                      interests_tuple_list=tuple_sort(interests_list),
                                      countries_tuple_list=tuple_sort(countries_lists),
                                      supposed_data=filtered_supposed_data,
                                      generated_at=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                      )

    # save report
    html_filename = f'report_{first_username}.html'
    with open(html_filename, 'w') as f:
        f.write(filled_template)
def save_csv_report_to_file(username: str, results: dict, csvfile):
    """Write search results as CSV rows into an already-open file object.

    Args:
        username: the username the search was run for (repeated in each row).
        results: maps site name -> result dict with 'url_main', 'url_user',
            'status' (object with a `.status` attribute) and 'http_status'.
        csvfile: writable text file object (opened with newline='').
    """
    # NOTE: removed stray debug `print(results)` left over from development
    writer = csv.writer(csvfile)
    writer.writerow(['username',
                     'name',
                     'url_main',
                     'url_user',
                     'exists',
                     'http_status'
                     ]
                    )
    for site in results:
        writer.writerow([username,
                         site,
                         results[site]['url_main'],
                         results[site]['url_user'],
                         str(results[site]['status'].status),
                         results[site]['http_status'],
                         ])
def genxmindfile(filename, username, results):
    """Generate an XMIND8 mind-map file for the given username's results."""
    print(f'Generating XMIND8 file for username {username}')
    # xmind.load() opens an existing workbook if present, so start fresh
    if os.path.exists(filename):
        os.remove(filename)
    workbook = xmind.load(filename)
    design_sheet1(workbook.getPrimarySheet(), username, results)
    xmind.save(workbook, path=filename)
def design_sheet1(sheet, username, results):
    """Fill the primary XMIND sheet with the username's claimed accounts.

    The root topic is the username. One sub-topic is created per non-country
    tag seen on claimed accounts, plus a fallback "Undefined" section; each
    claimed account's user URL is attached as a label under its category.
    """
    ##all tag list
    # maps tag name -> its sub-topic on the sheet
    alltags = {}
    sheet.setTitle("%s Analysis"%(username))
    root_topic1 = sheet.getRootTopic()
    root_topic1.setTitle("%s"%(username))
    # fallback section for accounts that have no non-country tag
    undefinedsection = root_topic1.addSubTopic()
    undefinedsection.setTitle("Undefined")
    alltags["undefined"] = undefinedsection
    for website_name in results:
        dictionary = results[website_name]
        if dictionary.get("status").status == QueryStatus.CLAIMED:
            ## first time this entry is found: create a sub-topic for every
            ## new non-country tag (country tags never get their own section)
            for tag in dictionary.get("status").tags:
                if tag.strip() == "":
                    continue
                if tag not in alltags.keys():
                    if not is_country_tag(tag):
                        tagsection = root_topic1.addSubTopic()
                        tagsection.setTitle(tag)
                        alltags[tag] = tagsection
            category = None
            userlink= None
            # the LAST non-country tag wins as the account's category
            for tag in dictionary.get("status").tags:
                if tag.strip() == "":
                    continue
                if not is_country_tag(tag):
                    category = tag
            if category is None:
                category = "undefined"
                userlink = undefinedsection.addSubTopic()
            else:
                userlink = alltags[category].addSubTopic()
            userlink.addLabel(dictionary.get("status").site_url_user)
            #for tag in dictionary.get("status").tags:
            #    if( tag != category ):
            #        sheet.createRelationship(userlink.getID(), alltags[tag].getID(),"other tag")
+29 -14
View File
@@ -307,8 +307,9 @@
}, },
"500px": { "500px": {
"tags": [ "tags": [
"images", "photos",
"in" "in",
"global"
], ],
"errors": { "errors": {
"INTERNAL_SERVER_ERROR": "Site error", "INTERNAL_SERVER_ERROR": "Site error",
@@ -3221,6 +3222,7 @@
"tags": [ "tags": [
"global", "global",
"images", "images",
"photos",
"us" "us"
], ],
"regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$", "regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$",
@@ -3979,8 +3981,11 @@
}, },
"EyeEm": { "EyeEm": {
"tags": [ "tags": [
"de",
"in", "in",
"sd" "sd",
"global",
"photos"
], ],
"checkType": "message", "checkType": "message",
"absenceStrs": "Not Found (404) | EyeEm", "absenceStrs": "Not Found (404) | EyeEm",
@@ -6551,8 +6556,8 @@
}, },
"Instagram": { "Instagram": {
"tags": [ "tags": [
"social", "photos",
"us" "global"
], ],
"errors": { "errors": {
"Login \u2022 Instagram": "Login required" "Login \u2022 Instagram": "Login required"
@@ -8018,7 +8023,9 @@
"news", "news",
"us" "us"
], ],
"checkType": "status_code", "checkType": "message",
"absenceStrs": [":{\"__typename\":\"NotFound\"},\"viewer\""],
"presenseStrs": ["userPostCounts"],
"alexaRank": 76, "alexaRank": 76,
"url": "https://medium.com/@{username}", "url": "https://medium.com/@{username}",
"urlMain": "https://medium.com/", "urlMain": "https://medium.com/",
@@ -9835,9 +9842,9 @@
}, },
"Picuki": { "Picuki": {
"tags": [ "tags": [
"photos",
"global", "global",
"jp", "instagram"
"us"
], ],
"checkType": "message", "checkType": "message",
"absenceStrs": [ "absenceStrs": [
@@ -9899,7 +9906,8 @@
}, },
"Pinterest": { "Pinterest": {
"tags": [ "tags": [
"social", "images",
"photos",
"us" "us"
], ],
"checkType": "status_code", "checkType": "status_code",
@@ -10858,6 +10866,7 @@
}, },
"Reddit": { "Reddit": {
"tags": [ "tags": [
"social",
"news", "news",
"us" "us"
], ],
@@ -13392,6 +13401,7 @@
}, },
"Tumblr": { "Tumblr": {
"tags": [ "tags": [
"blogs",
"global", "global",
"us" "us"
], ],
@@ -13433,11 +13443,14 @@
"us" "us"
], ],
"headers": { "headers": {
"User-Agent": "Mozilla" "sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"x-guest-token": "1347256342462009351"
}, },
"urlProbe": "https://mobile.twitter.com/{username}", "urlProbe": "https://twitter.com/i/api/graphql/ZRnOhhXPwue_JGILb9TNug/UserByScreenName?variables=%7B%22screen_name%22%3A%22{username}%22%2C%22withHighlightedLabel%22%3Atrue%7D",
"checkType": "message", "checkType": "message",
"absenceStrs": "Sorry, that page doesn't exist", "absenceStrs": "Not found",
"alexaRank": 55, "alexaRank": 55,
"url": "https://twitter.com/{username}", "url": "https://twitter.com/{username}",
"urlMain": "https://www.twitter.com/", "urlMain": "https://www.twitter.com/",
@@ -13604,9 +13617,9 @@
}, },
"VK": { "VK": {
"tags": [ "tags": [
"global",
"ru", "ru",
"social" "social",
"global"
], ],
"checkType": "response_url", "checkType": "response_url",
"alexaRank": 23, "alexaRank": 23,
@@ -14107,6 +14120,8 @@
}, },
"We Heart It": { "We Heart It": {
"tags": [ "tags": [
"photos",
"us",
"in" "in"
], ],
"checkType": "message", "checkType": "message",
+109
View File
@@ -0,0 +1,109 @@
<!-- Maigret HTML report template (Jinja2). Expected context: username,
     generated_at, supposed_data, countries_tuple_list, interests_tuple_list,
     first_seen, brief, results = list of (username, id_type, site_results).
     Fixes: head now encloses all metadata (viewport/title/link/style were
     after </head>), stray </p> after the ids_data table removed, body is
     closed before </html>. -->
<html>
<head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0, shrink-to-fit=no" />
    <title>{{ username }} -- Maigret username search report</title>
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
    <style>
        .table td, .table th {
            padding: .4rem;
        }
        @media print {
            .pagebreak { page-break-before: always; }
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="row-mb">
            <div class="col-12 card-body" style="padding-bottom: 0.5rem;">
                <h4 class="mb-0">
                    <a class="blog-header-logo text-dark" href="#">Username search report for {{ username }}</a>
                </h4>
                <small class="text-muted">Generated at {{ generated_at }}</small>
            </div>
        </div>
        <div class="row-mb">
            <div class="col-md">
                <div class="card flex-md-row mb-4 box-shadow h-md-250">
                    <div class="card-body d-flex flex-column align-items-start">
                        <h5>Supposed personal data</h5>
                        {% for k, v in supposed_data.items() %}
                        <span>
                            {{ k }}: {{ v }}
                        </span>
                        {% endfor %}
                        {% if countries_tuple_list %}
                        <span>
                            Geo: {% for k, v in countries_tuple_list %}{{ k }} <span class="text-muted">({{ v }})</span>{{ ", " if not loop.last }}{% endfor %}
                        </span>
                        {% endif %}{% if interests_tuple_list %}
                        <span>
                            Interests: {% for k, v in interests_tuple_list %}{{ k }} <span class="text-muted">({{ v }})</span>{{ ", " if not loop.last }}{% endfor %}
                        </span>
                        {% endif %}{% if first_seen %}
                        <span>
                            First seen: {{ first_seen }}
                        </span>
                        {% endif %}
                    </div>
                </div>
            </div>
        </div>
        <div class="row-mb">
            <div class="col-md">
                <div class="card flex-md-row mb-4 box-shadow h-md-250">
                    <div class="card-body d-flex flex-column align-items-start">
                        <h5>Brief</h5>
                        <span>
                            {{ brief }}
                        </span>
                    </div>
                </div>
            </div>
        </div>
        {# one card per claimed, exactly-matching account #}
        {% for u, t, data in results %}
        {% for k, v in data.items() %}
        {% if v.found and not v.is_similar %}
        <div class="row-mb">
            <div class="col-md">
                <div class="card flex-md-row mb-4 box-shadow h-md-250">
                    <img class="card-img-right flex-auto d-none d-md-block" alt="Photo" style="width: 200px; height: 200px; object-fit: scale-down;" src="{{ v.status.ids_data.image or 'https://i.imgur.com/040fmbw.png' }}" data-holder-rendered="true">
                    <div class="card-body d-flex flex-column align-items-start" style="padding-top: 0;">
                        <h3 class="mb-0" style="padding-top: 1rem;">
                            <a class="text-dark" href="{{ v.url_main }}" target="_blank">{{ k }}</a>
                        </h3>
                        {% if v.status.tags %}
                        <div class="mb-1 text-muted">Tags: {{ v.status.tags | join(', ') }}</div>
                        {% endif %}
                        <p class="card-text">
                            <a href="{{ v.url_user }}" target="_blank">{{ v.url_user }}</a>
                        </p>
                        {% if v.ids_data %}
                        <table class="table table-striped">
                            <tbody>
                                {% for k1, v1 in v.ids_data.items() %}
                                {% if k1 != 'image' %}
                                <tr>
                                    <th>{{ title(k1) }}</th>
                                    <td>{% if v1 is iterable and (v1 is not string and v1 is not mapping) %}{{ v1 | join(', ') }}{% else %}{{ detect_link(v1) }}{% endif %}
                                    </td>
                                </tr>
                                {% endif %}
                                {% endfor %}
                            </tbody>
                        </table>
                        {% endif %}
                    </div>
                </div>
            </div>
        </div>
        {% endif %}
        {% endfor %}
        {% endfor %}
    </div>
    <script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
    <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
</body>
</html>
+2 -8
View File
@@ -34,7 +34,7 @@ class QueryResult():
""" """
def __init__(self, username, site_name, site_url_user, status, ids_data=None, def __init__(self, username, site_name, site_url_user, status, ids_data=None,
query_time=None, context=None, tags=None): query_time=None, context=None, tags=[]):
"""Create Query Result Object. """Create Query Result Object.
Contains information about a specific method of detecting usernames on Contains information about a specific method of detecting usernames on
@@ -72,14 +72,8 @@ class QueryResult():
self.query_time = query_time self.query_time = query_time
self.context = context self.context = context
self.ids_data = ids_data self.ids_data = ids_data
self.tags = tags
self.tags = ""
if (tags is not None):
TAGstring = "".join(['%s,' % tags for tags in tags])
TAGstring = TAGstring[:-1]
self.tags = TAGstring
return
def __str__(self): def __str__(self):
"""Convert Object To String. """Convert Object To String.
+15 -35
View File
@@ -13,6 +13,7 @@ from .utils import CaseConverter
class MaigretEngine: class MaigretEngine:
def __init__(self, name, data): def __init__(self, name, data):
self.name = name self.name = name
self.site = {}
self.__dict__.update(data) self.__dict__.update(data)
@property @property
@@ -127,6 +128,15 @@ class MaigretDatabase:
def sites_dict(self): def sites_dict(self):
return {site.name: site for site in self._sites} return {site.name: site for site in self._sites}
def ranked_sites_dict(self, reverse=False, top=sys.maxsize, tags=[]):
if not tags:
filtered_list = self.sites
else:
filtered_list = [s for s in self.sites if set(s.tags).intersection(set(tags)) or s.engine in tags]
sorted_list = sorted(filtered_list, key=lambda x: x.alexa_rank, reverse=reverse)[:top]
return {site.name: site for site in sorted_list}
@property @property
def engines(self): def engines(self):
return self._engines return self._engines
@@ -145,12 +155,12 @@ class MaigretDatabase:
return self return self
def save_to_file(self, filename: str) -> MaigretDatabase: def save_to_file(self, filename: str) -> MaigretDatabase:
json_data = { db_data = {
'sites': {site.name: site.strip_engine_data().json for site in self._sites}, 'sites': {site.name: site.strip_engine_data().json for site in self._sites},
'engines': {engine.name: engine.json for engine in self._engines}, 'engines': {engine.name: engine.json for engine in self._engines},
} }
json_data = json.dumps(json_data, indent=4) json_data = json.dumps(db_data, indent=4)
with open(filename, 'w') as f: with open(filename, 'w') as f:
f.write(json_data) f.write(json_data)
@@ -160,8 +170,8 @@ class MaigretDatabase:
def load_from_json(self, json_data: dict) -> MaigretDatabase: def load_from_json(self, json_data: dict) -> MaigretDatabase:
# Add all of site information from the json file to internal site list. # Add all of site information from the json file to internal site list.
site_data = json_data.get("sites") site_data = json_data.get("sites", {})
engines_data = json_data.get("engines") engines_data = json_data.get("engines", {})
for engine_name in engines_data: for engine_name in engines_data:
self._engines.append(MaigretEngine(engine_name, engines_data[engine_name])) self._engines.append(MaigretEngine(engine_name, engines_data[engine_name]))
@@ -198,7 +208,7 @@ class MaigretDatabase:
is_url_valid = url.startswith('http://') or url.startswith('https://') is_url_valid = url.startswith('http://') or url.startswith('https://')
if not is_url_valid: if not is_url_valid:
return False raise FileNotFoundError(f"Invalid data file URL '{url}'.")
try: try:
response = requests.get(url=url) response = requests.get(url=url)
@@ -238,33 +248,3 @@ class MaigretDatabase:
) )
return self.load_from_json(data) return self.load_from_json(data)
def site_name_list(self, popularity_rank=False):
"""Get Site Name List.
Keyword Arguments:
self -- This object.
popularity_rank -- Boolean indicating if list should be sorted
by popularity rank.
Default value is False.
NOTE: List is sorted in ascending
alphabetical order is popularity rank
is not requested.
Return Value:
List of strings containing names of sites.
"""
if popularity_rank:
# Sort in ascending popularity rank order.
site_rank_name = \
sorted([(site.popularity_rank, site.name) for site in self],
key=operator.itemgetter(0)
)
site_names = [name for _, name in site_rank_name]
else:
# Sort in ascending alphabetical order.
site_names = sorted([site.name for site in self], key=str.lower)
return site_names
+17 -4
View File
@@ -3,16 +3,29 @@ import re
class CaseConverter: class CaseConverter:
@staticmethod @staticmethod
def camel_to_snake(camelcased_string: str): def camel_to_snake(camelcased_string: str) -> str:
return re.sub(r'(?<!^)(?=[A-Z])', '_', camelcased_string).lower() return re.sub(r'(?<!^)(?=[A-Z])', '_', camelcased_string).lower()
@staticmethod @staticmethod
def snake_to_camel(snakecased_string: str): def snake_to_camel(snakecased_string: str) -> str:
formatted = ''.join(word.title() for word in snakecased_string.split('_')) formatted = ''.join(word.title() for word in snakecased_string.split('_'))
result = formatted[0].lower() + formatted[1:] result = formatted[0].lower() + formatted[1:]
return result return result
@staticmethod
def snake_to_title(snakecased_string: str) -> str:
words = snakecased_string.split('_')
words[0] = words[0].title()
return ' '.join(words)
def is_country_tag(tag):
def is_country_tag(tag: str) -> bool:
"""detect if tag represent a country""" """detect if tag represent a country"""
return bool(re.match("^([a-z]){2}$", tag)) return bool(re.match("^([a-zA-Z]){2}$", tag)) or tag == 'global'
def enrich_link_str(link: str) -> str:
link = link.strip()
if link.startswith('www.') or (link.startswith('http') and '//' in link):
return f'<a class="auto-link" href="{link}">{link}</a>'
return link
+2
View File
@@ -8,9 +8,11 @@ certifi==2020.12.5
chardet==3.0.4 chardet==3.0.4
colorama==0.4.4 colorama==0.4.4
idna==2.10 idna==2.10
Jinja2==2.11.2
lxml==4.6.2 lxml==4.6.2
mock==4.0.2 mock==4.0.2
multidict==5.1.0 multidict==5.1.0
pycountry==20.7.3
PySocks==1.7.1 PySocks==1.7.1
python-socks==1.1.2 python-socks==1.1.2
requests==2.25.1 requests==2.25.1
+104
View File
@@ -0,0 +1,104 @@
"""Maigret reports test functions"""
from io import StringIO
import copy
import os
import xmind
from maigret.report import save_csv_report_to_file, genxmindfile, save_html_report
from maigret.result import QueryResult, QueryStatus
EXAMPLE_RESULTS = {
'GitHub': {
'username': 'test',
'parsing_enabled': True,
'url_main': 'https://www.github.com/',
'url_user': 'https://www.github.com/test',
'status': QueryResult('test',
'GitHub',
'https://www.github.com/test',
QueryStatus.CLAIMED,
tags=['test_tag']),
'http_status': 200,
'is_similar': False,
'rank': 78
}
}
GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
GOOD_500PX_RESULT.ids_data = {"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==", "legacy_id": "26403415", "username": "alexaimephotographycars", "name": "Alex Aim\u00e9", "website": "www.flickr.com/photos/alexaimephotography/", "facebook_link": " www.instagram.com/street.reality.photography/", "instagram_username": "alexaimephotography", "twitter_username": "Alexaimephotogr"}
GOOD_REDDIT_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_REDDIT_RESULT.tags = ['news', 'us']
GOOD_REDDIT_RESULT.ids_data = {"reddit_id": "t5_1nytpy", "reddit_username": "alexaimephotography", "fullname": "alexaimephotography", "image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e", "is_employee": "False", "is_nsfw": "False", "is_mod": "True", "is_following": "True", "has_user_profile": "True", "hide_from_robots": "False", "created_at": "2019-07-10 12:20:03", "total_karma": "53959", "post_karma": "52738"}
GOOD_IG_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_IG_RESULT.tags = ['photo', 'global']
GOOD_IG_RESULT.ids_data = {"instagram_username": "alexaimephotography", "fullname": "Alexaimephotography", "id": "6828488620", "image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F", "bio": "Photographer \nChild of fine street arts", "external_url": "https://www.flickr.com/photos/alexaimephotography2020/"}
GOOD_TWITTER_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_TWITTER_RESULT.tags = ['social', 'us']
TEST = [('alexaimephotographycars', 'username', {'500px': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/alexaimephotographycars', 'ids_usernames': {'alexaimephotographycars': 'username', 'alexaimephotography': 'username', 'Alexaimephotogr': 'username'}, 'status': GOOD_500PX_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 400, 'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 29}}), ('alexaimephotography', 'username', {'500px': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/alexaimephotography', 'status': BAD_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/alexaimephotography', 'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_REDDIT_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/alexaimephotography', 'status': BAD_RESULT, 'http_status': 400, 
'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/alexaimephotography', 'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_IG_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 29}}), ('Alexaimephotogr', 'username', {'500px': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/Alexaimephotogr', 'status': GOOD_TWITTER_RESULT, 'http_status': 400, 'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/Alexaimephotogr', 'status':BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 29}})]
SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""
SUPPOSED_INTERESTS = "Interests: photo <span class=\"text-muted\">(2)</span>, news <span class=\"text-muted\">(1)</span>, social <span class=\"text-muted\">(1)</span>"
SUPPOSED_GEO = "Geo: us <span class=\"text-muted\">(3)</span>"
def test_save_csv_report_to_file():
    """CSV report should contain the header row plus one row per site."""
    output = StringIO()
    save_csv_report_to_file('test', EXAMPLE_RESULTS, output)
    output.seek(0)
    expected = [
        'username,name,url_main,url_user,exists,http_status\r\n',
        'test,GitHub,https://www.github.com/,https://www.github.com/test,Claimed,200\r\n',
    ]
    assert output.readlines() == expected
def test_save_xmind_report():
    """XMIND report should group claimed accounts under their tag topics."""
    path = 'test_report.xmind'
    genxmindfile(path, 'test', EXAMPLE_RESULTS)

    # load the generated workbook back and inspect the primary sheet
    sheet_data = xmind.load(path).getPrimarySheet().getData()
    assert sheet_data['title'] == 'test Analysis'

    root = sheet_data['topic']
    assert root['title'] == 'test'
    topics = root['topics']
    assert len(topics) == 2
    assert topics[0]['title'] == 'Undefined'
    assert topics[1]['title'] == 'test_tag'
    assert len(topics[1]['topics']) == 1
    assert topics[1]['topics'][0]['label'] == 'https://www.github.com/test'
def test_html_report():
    """HTML report should be created and contain brief, geo and interests."""
    report_name = 'report_alexaimephotographycars.html'
    # remove a leftover report from a previous run, if any
    # (explicit existence check instead of a bare `except: pass`)
    if os.path.exists(report_name):
        os.remove(report_name)

    save_html_report(TEST)

    assert os.path.exists(report_name)
    with open(report_name) as report_file:
        report_text = report_file.read()
    assert SUPPOSED_BRIEF in report_text
    assert SUPPOSED_GEO in report_text
    assert SUPPOSED_INTERESTS in report_text
+20 -1
View File
@@ -1,5 +1,5 @@
"""Maigret Database test functions""" """Maigret Database test functions"""
from maigret.sites import MaigretDatabase from maigret.sites import MaigretDatabase, MaigretSite
EXAMPLE_DB = { EXAMPLE_DB = {
@@ -99,3 +99,22 @@ def test_saving_site_error():
assert amperka.strip_engine_data().errors == {'error1': 'text1'} assert amperka.strip_engine_data().errors == {'error1': 'text1'}
assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'} assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
def test_ranked_sites_dict():
db = MaigretDatabase()
db.update_site(MaigretSite('3', {'alexaRank': 1000, 'engine': 'ucoz'}))
db.update_site(MaigretSite('1', {'alexaRank': 2, 'tags': ['forum']}))
db.update_site(MaigretSite('2', {'alexaRank': 10, 'tags': ['ru', 'forum']}))
# sorting
assert list(db.ranked_sites_dict().keys()) == ['1', '2', '3']
assert list(db.ranked_sites_dict(top=2).keys()) == ['1', '2']
assert list(db.ranked_sites_dict(reverse=True, top=2).keys()) == ['3', '2']
# filtering by tags
assert list(db.ranked_sites_dict(tags=['ru'], top=2).keys()) == ['2']
assert list(db.ranked_sites_dict(tags=['forum']).keys()) == ['1', '2']
# filtering by engine
assert list(db.ranked_sites_dict(tags=['ucoz']).keys()) == ['3']
+14 -1
View File
@@ -1,5 +1,5 @@
"""Maigret utils test functions""" """Maigret utils test functions"""
from maigret.utils import CaseConverter, is_country_tag from maigret.utils import CaseConverter, is_country_tag, enrich_link_str
def test_case_convert_camel_to_snake(): def test_case_convert_camel_to_snake():
@@ -14,8 +14,21 @@ def test_case_convert_snake_to_camel():
assert b == 'camelCasedString' assert b == 'camelCasedString'
def test_case_convert_snake_to_title():
a = 'camel_cased_string'
b = CaseConverter.snake_to_title(a)
assert b == 'Camel cased string'
def test_is_country_tag(): def test_is_country_tag():
assert is_country_tag('ru') == True assert is_country_tag('ru') == True
assert is_country_tag('FR') == True
assert is_country_tag('a1') == False assert is_country_tag('a1') == False
assert is_country_tag('dating') == False assert is_country_tag('dating') == False
assert is_country_tag('global') == True
def test_enrich_link_str():
assert enrich_link_str('test') == 'test'
assert enrich_link_str(' www.flickr.com/photos/alexaimephotography/') == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'