mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-06 14:08:59 +00:00
HTML reports draft, 500 sites scanned by default
This commit is contained in:
+17
-2
@@ -26,7 +26,7 @@ from socid_extractor import parse, extract
|
||||
from .notify import QueryNotifyPrint
|
||||
from .result import QueryResult, QueryStatus
|
||||
from .sites import MaigretDatabase, MaigretSite
|
||||
from .report import save_csv_report, genxmindfile
|
||||
from .report import save_csv_report, genxmindfile, save_html_report
|
||||
|
||||
import xmind
|
||||
|
||||
@@ -629,6 +629,10 @@ async def main():
|
||||
action="store_true", dest="csv", default=False,
|
||||
help="Create Comma-Separated Values (CSV) File."
|
||||
)
|
||||
parser.add_argument("--html",
|
||||
action="store_true", dest="html", default=False,
|
||||
help="Create HTML report file."
|
||||
)
|
||||
parser.add_argument("--site",
|
||||
action="append", metavar='SITE_NAME',
|
||||
dest="site_list", default=None,
|
||||
@@ -649,6 +653,10 @@ async def main():
|
||||
"A longer timeout will be more likely to get results from slow sites."
|
||||
"On the other hand, this may cause a long delay to gather all results."
|
||||
)
|
||||
parser.add_argument("--top-sites",
|
||||
action="store", default=500,
|
||||
help="Count of sites for checking ranked by Alexa Top (default: 500)."
|
||||
)
|
||||
parser.add_argument("--print-not-found",
|
||||
action="store_true", dest="print_not_found", default=False,
|
||||
help="Print sites where the username was not found."
|
||||
@@ -757,7 +765,8 @@ async def main():
|
||||
|
||||
# Create object with all information about sites we are aware of.
|
||||
try:
|
||||
site_data_all = MaigretDatabase().load_from_file(args.json_file).sites_dict
|
||||
db = MaigretDatabase().load_from_file(args.json_file)
|
||||
site_data_all = db.ranked_sites_dict(top=args.top_sites)
|
||||
except Exception as error:
|
||||
print(f"ERROR: {error}")
|
||||
sys.exit(1)
|
||||
@@ -805,6 +814,8 @@ async def main():
|
||||
|
||||
already_checked = set()
|
||||
|
||||
general_results = []
|
||||
|
||||
while usernames:
|
||||
username, id_type = list(usernames.items())[0]
|
||||
del usernames[username]
|
||||
@@ -834,6 +845,7 @@ async def main():
|
||||
logger=logger,
|
||||
forced=args.use_disabled_sites,
|
||||
)
|
||||
general_results.append((username, id_type, results))
|
||||
|
||||
if args.folderoutput:
|
||||
# The usernames results should be stored in a targeted folder.
|
||||
@@ -870,6 +882,9 @@ async def main():
|
||||
if args.csv:
|
||||
save_csv_report(username, results)
|
||||
|
||||
if args.html:
|
||||
save_html_report(general_results)
|
||||
|
||||
|
||||
def run():
|
||||
try:
|
||||
|
||||
@@ -0,0 +1,215 @@
|
||||
import csv
|
||||
from datetime import datetime
|
||||
import logging
|
||||
import os
|
||||
import xmind
|
||||
|
||||
from jinja2 import Template
|
||||
import pycountry
|
||||
|
||||
from .result import QueryStatus
|
||||
from .utils import is_country_tag, CaseConverter, enrich_link_str
|
||||
|
||||
|
||||
def save_csv_report(username: str, results: dict):
    """Write the CSV report for ``username`` into ``<username>.csv``.

    The file is created relative to the current working directory and the
    rows themselves are produced by ``save_csv_report_to_file``.
    """
    report_path = username + '.csv'
    # newline='' lets the csv module control line endings itself
    with open(report_path, 'w', newline='', encoding='utf-8') as report_file:
        save_csv_report_to_file(username, results, report_file)
|
||||
|
||||
|
||||
def _earlier_timestamp(known, candidate, time_format):
    """Return whichever of two timestamp strings is the earlier one.

    Both strings are parsed with ``time_format``. If either of them cannot be
    parsed, ``known`` is kept, so a single odd profile field cannot abort the
    whole report.
    """
    try:
        known_time = datetime.strptime(known, time_format)
        candidate_time = datetime.strptime(candidate, time_format)
    except (TypeError, ValueError):
        logging.debug('Unexpected created_at value', exc_info=True)
        return known
    return candidate if candidate_time < known_time else known


def _guess_country_tag(value):
    """Map a ``country``/``locale`` profile value to a lowercase alpha-2 code.

    Raises when nothing matches (a pycountry lookup error, or AttributeError
    when the exact lookup finds no country); the caller logs and ignores it.
    """
    if is_country_tag(value):
        # The value itself already looks like a two-letter code: exact lookup.
        country = pycountry.countries.get(alpha_2=value.upper())
    else:
        country = pycountry.countries.search_fuzzy(value)[0]
    return country.alpha_2.lower()


def save_html_report(username_results: list):
    """Render the combined HTML report for a series of username searches.

    ``username_results`` is a list of ``(username, id_type, results)`` tuples
    where ``results`` maps a site name to that site's result dictionary.

    Side effects:
      * site dictionaries are updated in place: ``ids_data`` is copied from the
        status object and ``found`` is set for claimed accounts, because the
        template reads both keys from the dictionary;
      * the report is written to ``report_<first username>.html`` in the
        current working directory.

    Nothing is written when ``username_results`` is empty.
    """
    if not username_results:
        # There is no username to name the report after.
        logging.warning('No search results, HTML report was not generated')
        return

    brief_text = []
    usernames = {}
    extended_info_count = 0
    tags = {}
    supposed_data = {}
    allowed_fields = ['fullname', 'gender']
    first_seen = None
    first_seen_format = '%Y-%m-%d %H:%M:%S'

    for username, id_type, results in username_results:
        found_accounts = 0
        new_ids = []
        usernames[username] = {'type': id_type}

        for dictionary in results.values():
            # TODO: fix no site data issue
            if not dictionary:
                continue

            status = dictionary.get('status')
            if status.ids_data:
                dictionary['ids_data'] = status.ids_data
                extended_info_count += 1

                # detect first seen
                created_at = status.ids_data.get('created_at')
                if created_at:
                    if first_seen is None:
                        first_seen = created_at
                    else:
                        first_seen = _earlier_timestamp(first_seen, created_at,
                                                        first_seen_format)

                for k, v in status.ids_data.items():
                    # suppose target data
                    field = 'fullname' if k == 'name' else k
                    supposed_data.setdefault(field, []).append(v)
                    # suppose country
                    if k in ['country', 'locale']:
                        try:
                            # The check is made on the value: the key is always
                            # 'country' or 'locale' here and never a country tag.
                            tag = _guess_country_tag(v)
                            # TODO: move countries to another struct
                            tags[tag] = tags.get(tag, 0) + 1
                        except Exception:
                            # best effort: an unknown country must not break the report
                            logging.debug('pycountry exception', exc_info=True)

            new_usernames = dictionary.get('ids_usernames')
            if new_usernames:
                for u, utype in new_usernames.items():
                    if u not in usernames:
                        new_ids.append((u, utype))
                        usernames[u] = {'type': utype}

            if status.status != QueryStatus.CLAIMED:
                continue

            found_accounts += 1
            dictionary['found'] = True

            # ignore non-exact search results
            if not dictionary.get('is_similar') and status.tags:
                for t in status.tags:
                    tags[t] = tags.get(t, 0) + 1

        brief_text.append(f'Search by {id_type} {username} returned {found_accounts} accounts.')

        if new_ids:
            ids_list = [f'{u} ({t})' if t != 'username' else u for u, t in new_ids]
            brief_text.append('Found target\'s other IDs: ' + ', '.join(ids_list) + '.')

    brief_text.append(f'Extended info extracted from {extended_info_count} accounts.')

    # template generation
    template_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 "resources/simple_report.tpl")
    with open(template_path, encoding='utf-8') as template_file:
        template = Template(template_file.read())

    template.globals['title'] = CaseConverter.snake_to_title
    template.globals['detect_link'] = enrich_link_str

    brief = ' '.join(brief_text).strip()

    def by_count_desc(pairs):
        # (tag, count) pairs, the most frequent first
        return sorted(pairs, key=lambda pair: pair[1], reverse=True)

    # remove tag 'global' useless for country detection
    tags.pop('global', None)

    first_username = username_results[0][0]
    countries_lists = [item for item in tags.items() if is_country_tag(item[0])]
    interests_list = [item for item in tags.items() if not is_country_tag(item[0])]

    # only the first collected value of every allowed field is shown
    filtered_supposed_data = {CaseConverter.snake_to_title(k): v[0]
                              for k, v in supposed_data.items()
                              if k in allowed_fields}

    filled_template = template.render(username=first_username,
                                      brief=brief,
                                      results=username_results,
                                      first_seen=first_seen,
                                      interests_tuple_list=by_count_desc(interests_list),
                                      countries_tuple_list=by_count_desc(countries_lists),
                                      supposed_data=filtered_supposed_data,
                                      generated_at=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                      )
    # save report; the template declares charset utf-8, so write it as such
    html_filename = f'report_{first_username}.html'
    with open(html_filename, 'w', encoding='utf-8') as f:
        f.write(filled_template)
|
||||
|
||||
def save_csv_report_to_file(username: str, results: dict, csvfile):
    """Write one CSV row per checked site into the open text file ``csvfile``.

    ``results`` maps a site name to its result dictionary; the keys
    ``url_main``, ``url_user``, ``status`` (an object with a ``status``
    attribute) and ``http_status`` are read from it. Sites without any
    result data are skipped.
    """
    writer = csv.writer(csvfile)
    writer.writerow(['username',
                     'name',
                     'url_main',
                     'url_user',
                     'exists',
                     'http_status',
                     ])
    for site, site_data in results.items():
        if not site_data:
            # a site may come back without data; there is nothing to report
            continue
        writer.writerow([username,
                         site,
                         site_data['url_main'],
                         site_data['url_user'],
                         str(site_data['status'].status),
                         site_data['http_status'],
                         ])
|
||||
|
||||
|
||||
def genxmindfile(filename, username, results):
    """Create the XMind report ``filename`` for ``username``.

    An existing file with the same name is removed first, then the primary
    sheet of the workbook is filled by ``design_sheet1`` and the workbook is
    saved to ``filename``.
    """
    print(f'Generating XMIND8 file for username {username}')

    # drop the previous report so the workbook is loaded from a fresh path
    if os.path.exists(filename):
        os.remove(filename)

    book = xmind.load(filename)
    design_sheet1(book.getPrimarySheet(), username, results)
    xmind.save(book, path=filename)
|
||||
|
||||
|
||||
def design_sheet1(sheet, username, results):
    """Fill an XMind sheet with the accounts found for ``username``.

    The root topic is named after the username. Every non-country tag gets
    its own section below the root, and each claimed account is attached,
    labelled with its profile URL, to the section of its last non-country
    tag, or to the "Undefined" section when it has none.
    """
    # section topic by tag name
    alltags = {}

    sheet.setTitle("%s Analysis" % (username))
    root_topic1 = sheet.getRootTopic()
    root_topic1.setTitle("%s" % (username))

    undefinedsection = root_topic1.addSubTopic()
    undefinedsection.setTitle("Undefined")
    alltags["undefined"] = undefinedsection

    for dictionary in results.values():
        # a site may come back without any data (save_html_report skips these too)
        if not dictionary:
            continue

        status = dictionary.get("status")
        if status.status != QueryStatus.CLAIMED:
            continue

        category = None
        for tag in status.tags or []:
            if tag.strip() == "" or is_country_tag(tag):
                continue
            if tag not in alltags:
                # first time this tag is seen: give it a section of its own
                tagsection = root_topic1.addSubTopic()
                tagsection.setTitle(tag)
                alltags[tag] = tagsection
            # the last suitable tag decides where the account is placed
            category = tag

        if category is None:
            userlink = undefinedsection.addSubTopic()
        else:
            userlink = alltags[category].addSubTopic()
        userlink.addLabel(status.site_url_user)

        # TODO: relationships to the other tags of the account are not drawn yet:
        # for tag in status.tags:
        #     if tag != category:
        #         sheet.createRelationship(userlink.getID(), alltags[tag].getID(), "other tag")
|
||||
+29
-14
@@ -307,8 +307,9 @@
|
||||
},
|
||||
"500px": {
|
||||
"tags": [
|
||||
"images",
|
||||
"in"
|
||||
"photos",
|
||||
"in",
|
||||
"global"
|
||||
],
|
||||
"errors": {
|
||||
"INTERNAL_SERVER_ERROR": "Site error",
|
||||
@@ -3221,6 +3222,7 @@
|
||||
"tags": [
|
||||
"global",
|
||||
"images",
|
||||
"photos",
|
||||
"us"
|
||||
],
|
||||
"regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$",
|
||||
@@ -3979,8 +3981,11 @@
|
||||
},
|
||||
"EyeEm": {
|
||||
"tags": [
|
||||
"de",
|
||||
"in",
|
||||
"sd"
|
||||
"sd",
|
||||
"global",
|
||||
"photos"
|
||||
],
|
||||
"checkType": "message",
|
||||
"absenceStrs": "Not Found (404) | EyeEm",
|
||||
@@ -6551,8 +6556,8 @@
|
||||
},
|
||||
"Instagram": {
|
||||
"tags": [
|
||||
"social",
|
||||
"us"
|
||||
"photos",
|
||||
"global"
|
||||
],
|
||||
"errors": {
|
||||
"Login \u2022 Instagram": "Login required"
|
||||
@@ -8018,7 +8023,9 @@
|
||||
"news",
|
||||
"us"
|
||||
],
|
||||
"checkType": "status_code",
|
||||
"checkType": "message",
|
||||
"absenceStrs": [":{\"__typename\":\"NotFound\"},\"viewer\""],
|
||||
"presenseStrs": ["userPostCounts"],
|
||||
"alexaRank": 76,
|
||||
"url": "https://medium.com/@{username}",
|
||||
"urlMain": "https://medium.com/",
|
||||
@@ -9835,9 +9842,9 @@
|
||||
},
|
||||
"Picuki": {
|
||||
"tags": [
|
||||
"photos",
|
||||
"global",
|
||||
"jp",
|
||||
"us"
|
||||
"instagram"
|
||||
],
|
||||
"checkType": "message",
|
||||
"absenceStrs": [
|
||||
@@ -9899,7 +9906,8 @@
|
||||
},
|
||||
"Pinterest": {
|
||||
"tags": [
|
||||
"social",
|
||||
"images",
|
||||
"photos",
|
||||
"us"
|
||||
],
|
||||
"checkType": "status_code",
|
||||
@@ -10858,6 +10866,7 @@
|
||||
},
|
||||
"Reddit": {
|
||||
"tags": [
|
||||
"social",
|
||||
"news",
|
||||
"us"
|
||||
],
|
||||
@@ -13392,6 +13401,7 @@
|
||||
},
|
||||
"Tumblr": {
|
||||
"tags": [
|
||||
"blogs",
|
||||
"global",
|
||||
"us"
|
||||
],
|
||||
@@ -13433,11 +13443,14 @@
|
||||
"us"
|
||||
],
|
||||
"headers": {
|
||||
"User-Agent": "Mozilla"
|
||||
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
|
||||
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
|
||||
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
|
||||
"x-guest-token": "1347256342462009351"
|
||||
},
|
||||
"urlProbe": "https://mobile.twitter.com/{username}",
|
||||
"urlProbe": "https://twitter.com/i/api/graphql/ZRnOhhXPwue_JGILb9TNug/UserByScreenName?variables=%7B%22screen_name%22%3A%22{username}%22%2C%22withHighlightedLabel%22%3Atrue%7D",
|
||||
"checkType": "message",
|
||||
"absenceStrs": "Sorry, that page doesn't exist",
|
||||
"absenceStrs": "Not found",
|
||||
"alexaRank": 55,
|
||||
"url": "https://twitter.com/{username}",
|
||||
"urlMain": "https://www.twitter.com/",
|
||||
@@ -13604,9 +13617,9 @@
|
||||
},
|
||||
"VK": {
|
||||
"tags": [
|
||||
"global",
|
||||
"ru",
|
||||
"social"
|
||||
"social",
|
||||
"global"
|
||||
],
|
||||
"checkType": "response_url",
|
||||
"alexaRank": 23,
|
||||
@@ -14107,6 +14120,8 @@
|
||||
},
|
||||
"We Heart It": {
|
||||
"tags": [
|
||||
"photos",
|
||||
"us",
|
||||
"in"
|
||||
],
|
||||
"checkType": "message",
|
||||
|
||||
@@ -0,0 +1,109 @@
|
||||
<!DOCTYPE html>
<html>
<head>
    {# Values extracted from third-party profiles are passed through the `e`
       filter. `brief` and `detect_link(...)` are emitted as-is: the former is
       plain text built by the report code, the latter returns ready HTML. #}
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0, shrink-to-fit=no" />
    <title>{{ username | e }} -- Maigret username search report</title>
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
    <style>
    .table td, .table th {
        padding: .4rem;
    }
    @media print {
        .pagebreak { page-break-before: always; }
    }
    </style>
</head>
<body>
    <div class="container">
        <div class="row-mb">
            <div class="col-12 card-body" style="padding-bottom: 0.5rem;">
                <h4 class="mb-0">
                    <a class="blog-header-logo text-dark" href="#">Username search report for {{ username | e }}</a>
                </h4>
                <small class="text-muted">Generated at {{ generated_at }}</small>
            </div>
        </div>
        <div class="row-mb">
            <div class="col-md">
                <div class="card flex-md-row mb-4 box-shadow h-md-250">
                    <div class="card-body d-flex flex-column align-items-start">
                        <h5>Supposed personal data</h5>
                        {% for k, v in supposed_data.items() %}
                        <span>
                            {{ k }}: {{ v | e }}
                        </span>
                        {% endfor %}
                        {% if countries_tuple_list %}
                        <span>
                            Geo: {% for k, v in countries_tuple_list %}{{ k }} <span class="text-muted">({{ v }})</span>{{ ", " if not loop.last }}{% endfor %}
                        </span>
                        {% endif %}{% if interests_tuple_list %}
                        <span>
                            Interests: {% for k, v in interests_tuple_list %}{{ k }} <span class="text-muted">({{ v }})</span>{{ ", " if not loop.last }}{% endfor %}
                        </span>
                        {% endif %}{% if first_seen %}
                        <span>
                            First seen: {{ first_seen | e }}
                        </span>
                        {% endif %}
                    </div>
                </div>
            </div>
        </div>
        <div class="row-mb">
            <div class="col-md">
                <div class="card flex-md-row mb-4 box-shadow h-md-250">
                    <div class="card-body d-flex flex-column align-items-start">
                        <h5>Brief</h5>
                        <span>
                            {{ brief }}
                        </span>
                    </div>
                </div>
            </div>
        </div>
        {% for u, t, data in results %}
        {% for k, v in data.items() %}
        {% if v.found and not v.is_similar %}
        <div class="row-mb">
            <div class="col-md">
                <div class="card flex-md-row mb-4 box-shadow h-md-250">
                    <img class="card-img-right flex-auto d-none d-md-block" alt="Photo" style="width: 200px; height: 200px; object-fit: scale-down;" src="{{ (v.status.ids_data.image or 'https://i.imgur.com/040fmbw.png') | e }}" data-holder-rendered="true">
                    <div class="card-body d-flex flex-column align-items-start" style="padding-top: 0;">
                        <h3 class="mb-0" style="padding-top: 1rem;">
                            <a class="text-dark" href="{{ v.url_main | e }}" target="_blank" rel="noopener noreferrer">{{ k | e }}</a>
                        </h3>
                        {% if v.status.tags %}
                        <div class="mb-1 text-muted">Tags: {{ v.status.tags | join(', ') }}</div>
                        {% endif %}
                        <p class="card-text">
                            <a href="{{ v.url_user | e }}" target="_blank" rel="noopener noreferrer">{{ v.url_user | e }}</a>
                        </p>
                        {% if v.ids_data %}
                        <table class="table table-striped">
                            <tbody>
                                {% for k1, v1 in v.ids_data.items() %}
                                {% if k1 != 'image' %}
                                <tr>
                                    <th>{{ title(k1) | e }}</th>
                                    <td>{% if v1 is iterable and (v1 is not string and v1 is not mapping) %}{{ v1 | join(', ') | e }}{% else %}{{ detect_link(v1) }}{% endif %}
                                    </td>
                                </tr>
                                {% endif %}
                                {% endfor %}
                            </tbody>
                        </table>
                        {% endif %}
                    </div>
                </div>
            </div>
        </div>
        {% endif %}
        {% endfor %}
        {% endfor %}
    </div>
    <script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
    <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
</body>
</html>
|
||||
+2
-8
@@ -34,7 +34,7 @@ class QueryResult():
|
||||
"""
|
||||
|
||||
def __init__(self, username, site_name, site_url_user, status, ids_data=None,
|
||||
query_time=None, context=None, tags=None):
|
||||
query_time=None, context=None, tags=[]):
|
||||
"""Create Query Result Object.
|
||||
|
||||
Contains information about a specific method of detecting usernames on
|
||||
@@ -72,14 +72,8 @@ class QueryResult():
|
||||
self.query_time = query_time
|
||||
self.context = context
|
||||
self.ids_data = ids_data
|
||||
self.tags = tags
|
||||
|
||||
self.tags = ""
|
||||
if (tags is not None):
|
||||
TAGstring = "".join(['%s,' % tags for tags in tags])
|
||||
TAGstring = TAGstring[:-1]
|
||||
self.tags = TAGstring
|
||||
|
||||
return
|
||||
|
||||
def __str__(self):
|
||||
"""Convert Object To String.
|
||||
|
||||
+15
-35
@@ -13,6 +13,7 @@ from .utils import CaseConverter
|
||||
class MaigretEngine:
|
||||
def __init__(self, name, data):
|
||||
self.name = name
|
||||
self.site = {}
|
||||
self.__dict__.update(data)
|
||||
|
||||
@property
|
||||
@@ -127,6 +128,15 @@ class MaigretDatabase:
|
||||
def sites_dict(self):
|
||||
return {site.name: site for site in self._sites}
|
||||
|
||||
def ranked_sites_dict(self, reverse=False, top=sys.maxsize, tags=None):
    """Return ``{site name: site}`` ordered by Alexa rank.

    reverse -- sort by descending rank value instead of ascending.
    top     -- keep at most this many sites after sorting. Numeric strings
               are accepted as well, since the value may come straight from a
               command line option; ``None`` keeps everything.
    tags    -- when given and non-empty, keep only the sites that have at
               least one of these tags or whose engine name is listed.
    """
    wanted = list(tags) if tags else []
    if wanted:
        wanted_tags = set(wanted)  # built once instead of once per site
        candidates = [s for s in self.sites
                      if wanted_tags.intersection(s.tags) or s.engine in wanted]
    else:
        candidates = self.sites

    # a slice bound must be an integer, so coerce textual values first
    limit = top if top is None else int(top)
    ranked = sorted(candidates, key=lambda site: site.alexa_rank, reverse=reverse)
    return {site.name: site for site in ranked[:limit]}
|
||||
|
||||
@property
|
||||
def engines(self):
|
||||
return self._engines
|
||||
@@ -145,12 +155,12 @@ class MaigretDatabase:
|
||||
return self
|
||||
|
||||
def save_to_file(self, filename: str) -> MaigretDatabase:
|
||||
json_data = {
|
||||
db_data = {
|
||||
'sites': {site.name: site.strip_engine_data().json for site in self._sites},
|
||||
'engines': {engine.name: engine.json for engine in self._engines},
|
||||
}
|
||||
|
||||
json_data = json.dumps(json_data, indent=4)
|
||||
json_data = json.dumps(db_data, indent=4)
|
||||
|
||||
with open(filename, 'w') as f:
|
||||
f.write(json_data)
|
||||
@@ -160,8 +170,8 @@ class MaigretDatabase:
|
||||
|
||||
def load_from_json(self, json_data: dict) -> MaigretDatabase:
|
||||
# Add all of site information from the json file to internal site list.
|
||||
site_data = json_data.get("sites")
|
||||
engines_data = json_data.get("engines")
|
||||
site_data = json_data.get("sites", {})
|
||||
engines_data = json_data.get("engines", {})
|
||||
|
||||
for engine_name in engines_data:
|
||||
self._engines.append(MaigretEngine(engine_name, engines_data[engine_name]))
|
||||
@@ -198,7 +208,7 @@ class MaigretDatabase:
|
||||
is_url_valid = url.startswith('http://') or url.startswith('https://')
|
||||
|
||||
if not is_url_valid:
|
||||
return False
|
||||
raise FileNotFoundError(f"Invalid data file URL '{url}'.")
|
||||
|
||||
try:
|
||||
response = requests.get(url=url)
|
||||
@@ -238,33 +248,3 @@ class MaigretDatabase:
|
||||
)
|
||||
|
||||
return self.load_from_json(data)
|
||||
|
||||
|
||||
def site_name_list(self, popularity_rank=False):
|
||||
"""Get Site Name List.
|
||||
|
||||
Keyword Arguments:
|
||||
self -- This object.
|
||||
popularity_rank -- Boolean indicating if list should be sorted
|
||||
by popularity rank.
|
||||
Default value is False.
|
||||
NOTE: List is sorted in ascending
|
||||
alphabetical order is popularity rank
|
||||
is not requested.
|
||||
|
||||
Return Value:
|
||||
List of strings containing names of sites.
|
||||
"""
|
||||
|
||||
if popularity_rank:
|
||||
# Sort in ascending popularity rank order.
|
||||
site_rank_name = \
|
||||
sorted([(site.popularity_rank, site.name) for site in self],
|
||||
key=operator.itemgetter(0)
|
||||
)
|
||||
site_names = [name for _, name in site_rank_name]
|
||||
else:
|
||||
# Sort in ascending alphabetical order.
|
||||
site_names = sorted([site.name for site in self], key=str.lower)
|
||||
|
||||
return site_names
|
||||
|
||||
+17
-4
@@ -3,16 +3,29 @@ import re
|
||||
|
||||
class CaseConverter:
    """Helpers converting identifiers between naming conventions."""

    @staticmethod
    def camel_to_snake(camelcased_string: str) -> str:
        """Convert ``camelCase``/``PascalCase`` to ``snake_case``."""
        # put an underscore before every capital letter except a leading one
        return re.sub(r'(?<!^)(?=[A-Z])', '_', camelcased_string).lower()

    @staticmethod
    def snake_to_camel(snakecased_string: str) -> str:
        """Convert ``snake_case`` to ``camelCase``; an empty string stays empty."""
        formatted = ''.join(word.title() for word in snakecased_string.split('_'))
        # slicing instead of indexing keeps empty input from raising IndexError
        return formatted[:1].lower() + formatted[1:]

    @staticmethod
    def snake_to_title(snakecased_string: str) -> str:
        """Convert ``snake_case`` to space-separated words, title-casing the first."""
        words = snakecased_string.split('_')
        words[0] = words[0].title()
        return ' '.join(words)
|
||||
|
||||
def is_country_tag(tag: str) -> bool:
    """Tell whether ``tag`` denotes a location rather than a topic.

    A tag counts as a country tag when it consists of exactly two ASCII
    letters (in either case) or is the special value ``'global'``.
    """
    # fullmatch: unlike a ``$`` anchor it does not accept a trailing newline
    return bool(re.fullmatch("[a-zA-Z]{2}", tag)) or tag == 'global'
|
||||
|
||||
|
||||
def enrich_link_str(link: str) -> str:
    """Wrap a value that looks like a URL into an HTML anchor.

    Strings starting with ``www.`` or with ``http`` and containing ``//``
    become ``<a class="auto-link" ...>`` markup; the link text is HTML-escaped
    because it ends up unescaped in the report. Other strings are returned
    stripped, and values that are not strings are returned unchanged.
    """
    import html  # local import: used only here

    if not isinstance(link, str):
        # the report template also passes numbers and mappings through here
        return link

    link = link.strip()
    if link.startswith('www.') or (link.startswith('http') and '//' in link):
        safe_link = html.escape(link, quote=True)
        return f'<a class="auto-link" href="{safe_link}">{safe_link}</a>'
    return link
|
||||
@@ -8,9 +8,11 @@ certifi==2020.12.5
|
||||
chardet==3.0.4
|
||||
colorama==0.4.4
|
||||
idna==2.10
|
||||
Jinja2==2.11.2
|
||||
lxml==4.6.2
|
||||
mock==4.0.2
|
||||
multidict==5.1.0
|
||||
pycountry==20.7.3
|
||||
PySocks==1.7.1
|
||||
python-socks==1.1.2
|
||||
requests==2.25.1
|
||||
|
||||
@@ -0,0 +1,104 @@
|
||||
"""Maigret reports test functions"""
|
||||
from io import StringIO
|
||||
import copy
|
||||
import os
|
||||
|
||||
import xmind
|
||||
|
||||
from maigret.report import save_csv_report_to_file, genxmindfile, save_html_report
|
||||
from maigret.result import QueryResult, QueryStatus
|
||||
|
||||
|
||||
EXAMPLE_RESULTS = {
|
||||
'GitHub': {
|
||||
'username': 'test',
|
||||
'parsing_enabled': True,
|
||||
'url_main': 'https://www.github.com/',
|
||||
'url_user': 'https://www.github.com/test',
|
||||
'status': QueryResult('test',
|
||||
'GitHub',
|
||||
'https://www.github.com/test',
|
||||
QueryStatus.CLAIMED,
|
||||
tags=['test_tag']),
|
||||
'http_status': 200,
|
||||
'is_similar': False,
|
||||
'rank': 78
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
|
||||
BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
|
||||
|
||||
GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
|
||||
GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
|
||||
GOOD_500PX_RESULT.ids_data = {"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==", "legacy_id": "26403415", "username": "alexaimephotographycars", "name": "Alex Aim\u00e9", "website": "www.flickr.com/photos/alexaimephotography/", "facebook_link": " www.instagram.com/street.reality.photography/", "instagram_username": "alexaimephotography", "twitter_username": "Alexaimephotogr"}
|
||||
|
||||
GOOD_REDDIT_RESULT = copy.deepcopy(GOOD_RESULT)
|
||||
GOOD_REDDIT_RESULT.tags = ['news', 'us']
|
||||
GOOD_REDDIT_RESULT.ids_data = {"reddit_id": "t5_1nytpy", "reddit_username": "alexaimephotography", "fullname": "alexaimephotography", "image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e", "is_employee": "False", "is_nsfw": "False", "is_mod": "True", "is_following": "True", "has_user_profile": "True", "hide_from_robots": "False", "created_at": "2019-07-10 12:20:03", "total_karma": "53959", "post_karma": "52738"}
|
||||
|
||||
GOOD_IG_RESULT = copy.deepcopy(GOOD_RESULT)
|
||||
GOOD_IG_RESULT.tags = ['photo', 'global']
|
||||
GOOD_IG_RESULT.ids_data = {"instagram_username": "alexaimephotography", "fullname": "Alexaimephotography", "id": "6828488620", "image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F", "bio": "Photographer \nChild of fine street arts", "external_url": "https://www.flickr.com/photos/alexaimephotography2020/"}
|
||||
|
||||
GOOD_TWITTER_RESULT = copy.deepcopy(GOOD_RESULT)
|
||||
GOOD_TWITTER_RESULT.tags = ['social', 'us']
|
||||
|
||||
|
||||
TEST = [('alexaimephotographycars', 'username', {'500px': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/alexaimephotographycars', 'ids_usernames': {'alexaimephotographycars': 'username', 'alexaimephotography': 'username', 'Alexaimephotogr': 'username'}, 'status': GOOD_500PX_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 400, 'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 29}}), ('alexaimephotography', 'username', {'500px': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/alexaimephotography', 'status': BAD_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/alexaimephotography', 'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_REDDIT_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/alexaimephotography', 'status': BAD_RESULT, 'http_status': 400, 
'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/alexaimephotography', 'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_IG_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 29}}), ('Alexaimephotogr', 'username', {'500px': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/Alexaimephotogr', 'status': GOOD_TWITTER_RESULT, 'http_status': 400, 'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/Alexaimephotogr', 'status':BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 29}})]
|
||||
|
||||
|
||||
SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""
|
||||
|
||||
SUPPOSED_INTERESTS = "Interests: photo <span class=\"text-muted\">(2)</span>, news <span class=\"text-muted\">(1)</span>, social <span class=\"text-muted\">(1)</span>"
|
||||
|
||||
SUPPOSED_GEO = "Geo: us <span class=\"text-muted\">(3)</span>"
|
||||
|
||||
|
||||
def test_save_csv_report_to_file():
    """The CSV report consists of the header row and one row per site."""
    buffer = StringIO()
    save_csv_report_to_file('test', EXAMPLE_RESULTS, buffer)

    buffer.seek(0)
    rows = buffer.readlines()

    expected_rows = [
        'username,name,url_main,url_user,exists,http_status\r\n',
        'test,GitHub,https://www.github.com/,https://www.github.com/test,Claimed,200\r\n',
    ]
    assert rows == expected_rows
|
||||
|
||||
|
||||
def test_save_xmind_report():
    """The XMind report has the expected title, root topic and tag sections."""
    filename = 'test_report.xmind'
    genxmindfile(filename, 'test', EXAMPLE_RESULTS)

    data = xmind.load(filename).getPrimarySheet().getData()

    assert data['title'] == 'test Analysis'

    root = data['topic']
    assert root['title'] == 'test'

    sections = root['topics']
    assert len(sections) == 2
    assert sections[0]['title'] == 'Undefined'

    tag_section = sections[1]
    assert tag_section['title'] == 'test_tag'
    assert len(tag_section['topics']) == 1
    assert tag_section['topics'][0]['label'] == 'https://www.github.com/test'
|
||||
|
||||
|
||||
def test_html_report():
    """The HTML report is created and contains the brief, geo and interests lines."""
    report_name = 'report_alexaimephotographycars.html'
    # start from a clean state so a stale report cannot satisfy the test
    if os.path.exists(report_name):
        os.remove(report_name)

    save_html_report(TEST)

    assert os.path.exists(report_name)

    with open(report_name) as report_file:
        report_text = report_file.read()

    assert SUPPOSED_BRIEF in report_text
    assert SUPPOSED_GEO in report_text
    assert SUPPOSED_INTERESTS in report_text
|
||||
+20
-1
@@ -1,5 +1,5 @@
|
||||
"""Maigret Database test functions"""
|
||||
from maigret.sites import MaigretDatabase
|
||||
from maigret.sites import MaigretDatabase, MaigretSite
|
||||
|
||||
|
||||
EXAMPLE_DB = {
|
||||
@@ -99,3 +99,22 @@ def test_saving_site_error():
|
||||
|
||||
assert amperka.strip_engine_data().errors == {'error1': 'text1'}
|
||||
assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
|
||||
|
||||
|
||||
def test_ranked_sites_dict():
    """ranked_sites_dict sorts by Alexa rank and filters by tag or engine."""
    db = MaigretDatabase()
    site_definitions = (
        ('3', {'alexaRank': 1000, 'engine': 'ucoz'}),
        ('1', {'alexaRank': 2, 'tags': ['forum']}),
        ('2', {'alexaRank': 10, 'tags': ['ru', 'forum']}),
    )
    for site_name, site_data in site_definitions:
        db.update_site(MaigretSite(site_name, site_data))

    def ranked_names(**options):
        return list(db.ranked_sites_dict(**options).keys())

    # sorting
    assert ranked_names() == ['1', '2', '3']
    assert ranked_names(top=2) == ['1', '2']
    assert ranked_names(reverse=True, top=2) == ['3', '2']

    # filtering by tags
    assert ranked_names(tags=['ru'], top=2) == ['2']
    assert ranked_names(tags=['forum']) == ['1', '2']

    # filtering by engine
    assert ranked_names(tags=['ucoz']) == ['3']
|
||||
|
||||
+14
-1
@@ -1,5 +1,5 @@
|
||||
"""Maigret utils test functions"""
|
||||
from maigret.utils import CaseConverter, is_country_tag
|
||||
from maigret.utils import CaseConverter, is_country_tag, enrich_link_str
|
||||
|
||||
|
||||
def test_case_convert_camel_to_snake():
|
||||
@@ -14,8 +14,21 @@ def test_case_convert_snake_to_camel():
|
||||
|
||||
assert b == 'camelCasedString'
|
||||
|
||||
def test_case_convert_snake_to_title():
    """Only the first word of a snake_case name is capitalised."""
    converted = CaseConverter.snake_to_title('camel_cased_string')

    assert converted == 'Camel cased string'
|
||||
|
||||
def test_is_country_tag():
    """Two-letter tags and 'global' are country tags, anything else is not."""
    expectations = (
        ('ru', True),
        ('FR', True),
        ('a1', False),
        ('dating', False),
        ('global', True),
    )
    for tag, expected in expectations:
        assert is_country_tag(tag) == expected
|
||||
|
||||
def test_enrich_link_str():
    """Plain text is left alone, a www. address is wrapped into an anchor."""
    plain_text = 'test'
    assert enrich_link_str(plain_text) == plain_text

    raw_link = ' www.flickr.com/photos/alexaimephotography/'
    expected_markup = '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
    assert enrich_link_str(raw_link) == expected_markup
|
||||
|
||||
Reference in New Issue
Block a user