HTML reports draft, 500 sites scanned by default

This commit is contained in:
Soxoj
2021-01-07 23:52:29 +03:00
parent 5c8b65d033
commit e4765d1ed9
11 changed files with 544 additions and 65 deletions
+17 -2
View File
@@ -26,7 +26,7 @@ from socid_extractor import parse, extract
from .notify import QueryNotifyPrint
from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite
from .report import save_csv_report, genxmindfile
from .report import save_csv_report, genxmindfile, save_html_report
import xmind
@@ -629,6 +629,10 @@ async def main():
action="store_true", dest="csv", default=False,
help="Create Comma-Separated Values (CSV) File."
)
parser.add_argument("--html",
action="store_true", dest="html", default=False,
help="Create HTML report file."
)
parser.add_argument("--site",
action="append", metavar='SITE_NAME',
dest="site_list", default=None,
@@ -649,6 +653,10 @@ async def main():
"A longer timeout will be more likely to get results from slow sites."
"On the other hand, this may cause a long delay to gather all results."
)
parser.add_argument("--top-sites",
action="store", default=500,
help="Count of sites for checking ranked by Alexa Top (default: 500)."
)
parser.add_argument("--print-not-found",
action="store_true", dest="print_not_found", default=False,
help="Print sites where the username was not found."
@@ -757,7 +765,8 @@ async def main():
# Create object with all information about sites we are aware of.
try:
site_data_all = MaigretDatabase().load_from_file(args.json_file).sites_dict
db = MaigretDatabase().load_from_file(args.json_file)
site_data_all = db.ranked_sites_dict(top=args.top_sites)
except Exception as error:
print(f"ERROR: {error}")
sys.exit(1)
@@ -805,6 +814,8 @@ async def main():
already_checked = set()
general_results = []
while usernames:
username, id_type = list(usernames.items())[0]
del usernames[username]
@@ -834,6 +845,7 @@ async def main():
logger=logger,
forced=args.use_disabled_sites,
)
general_results.append((username, id_type, results))
if args.folderoutput:
# The usernames results should be stored in a targeted folder.
@@ -870,6 +882,9 @@ async def main():
if args.csv:
save_csv_report(username, results)
if args.html:
save_html_report(general_results)
def run():
try:
+215
View File
@@ -0,0 +1,215 @@
import csv
from datetime import datetime
import logging
import os
import xmind
from jinja2 import Template
import pycountry
from .result import QueryStatus
from .utils import is_country_tag, CaseConverter, enrich_link_str
def save_csv_report(username: str, results: dict):
    """Write the search results for ``username`` to ``<username>.csv``.

    The path is built by appending ``.csv`` to the username, so it is
    resolved relative to the current working directory.
    """
    csv_path = username + '.csv'
    # newline='' leaves line-ending handling to the csv module.
    with open(csv_path, mode='w', newline='', encoding='utf-8') as report_file:
        save_csv_report_to_file(username, results, report_file)
def save_html_report(username_results: list):
    """Render all search rounds into ``report_<first username>.html``.

    Args:
        username_results: list of ``(username, id_type, results)`` tuples, one
            per search round. ``results`` maps a site name to that site's
            result dict, which holds a ``status`` object and may hold
            ``ids_usernames`` and ``is_similar`` entries.

    Side effects:
        Adds ``ids_data`` and ``found`` keys to the per-site dicts (the
        template reads them) and writes the report, UTF-8 encoded, to a path
        relative to the current working directory. Nothing is written when
        ``username_results`` is empty.
    """
    if not username_results:
        logging.warning('No search results, HTML report was not generated')
        return

    brief_text = []
    usernames = {}
    extended_info_count = 0
    tags = {}
    supposed_data = {}
    allowed_fields = ['fullname', 'gender']
    first_seen = None
    first_seen_format = '%Y-%m-%d %H:%M:%S'

    for username, id_type, results in username_results:
        found_accounts = 0
        new_ids = []
        usernames[username] = {'type': id_type}

        for website_name in results:
            dictionary = results[website_name]
            # TODO: fix no site data issue
            if not dictionary:
                continue

            status = dictionary.get('status')
            if status.ids_data:
                dictionary['ids_data'] = status.ids_data
                extended_info_count += 1

                # detect first seen: keep the earliest account creation time
                created_at = status.ids_data.get('created_at')
                if created_at:
                    if first_seen is None:
                        first_seen = created_at
                    else:
                        try:
                            known_time = datetime.strptime(first_seen, first_seen_format)
                            new_time = datetime.strptime(created_at, first_seen_format)
                        except (TypeError, ValueError):
                            # A timestamp in an unexpected format must not
                            # abort the whole report; keep the current value.
                            logging.debug('unparsable created_at value', exc_info=True)
                        else:
                            if new_time < known_time:
                                first_seen = created_at

                for k, v in status.ids_data.items():
                    # suppose target data
                    field = 'fullname' if k == 'name' else k
                    supposed_data.setdefault(field, []).append(v)

                    # suppose country
                    if k in ('country', 'locale'):
                        try:
                            # Test the VALUE: a two-letter value is looked up
                            # as an ISO alpha-2 code, anything else goes
                            # through the fuzzy name search.
                            if is_country_tag(v):
                                tag = pycountry.countries.get(alpha_2=v.upper()).alpha_2.lower()
                            else:
                                tag = pycountry.countries.search_fuzzy(v)[0].alpha_2.lower()
                            # TODO: move countries to another struct
                            tags[tag] = tags.get(tag, 0) + 1
                        except Exception:
                            # Best effort: an unknown country only loses a geo hint.
                            logging.debug('pycountry exception', exc_info=True)

            new_usernames = dictionary.get('ids_usernames')
            if new_usernames:
                for u, utype in new_usernames.items():
                    if u not in usernames:
                        new_ids.append((u, utype))
                        usernames[u] = {'type': utype}

            if status.status != QueryStatus.CLAIMED:
                continue
            found_accounts += 1
            dictionary['found'] = True

            # ignore non-exact search results when counting tags
            if not dictionary.get('is_similar') and status.tags:
                for t in status.tags:
                    tags[t] = tags.get(t, 0) + 1

        brief_text.append(f'Search by {id_type} {username} returned {found_accounts} accounts.')

        if new_ids:
            ids_list = [u if t == 'username' else f'{u} ({t})' for u, t in new_ids]
            brief_text.append("Found target's other IDs: " + ', '.join(ids_list) + '.')

    brief_text.append(f'Extended info extracted from {extended_info_count} accounts.')

    # template generation
    template_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 "resources/simple_report.tpl")
    with open(template_path, encoding='utf-8') as template_file:
        template = Template(template_file.read())
    template.globals['title'] = CaseConverter.snake_to_title
    template.globals['detect_link'] = enrich_link_str

    brief = ' '.join(brief_text).strip()

    def tuple_sort(pairs):
        # most frequent first; sorted() is stable, so ties keep insertion order
        return sorted(pairs, key=lambda pair: pair[1], reverse=True)

    # remove tag 'global' useless for country detection
    tags.pop('global', None)

    first_username = username_results[0][0]
    countries_lists = [item for item in tags.items() if is_country_tag(item[0])]
    interests_list = [item for item in tags.items() if not is_country_tag(item[0])]

    # only the first collected value of each allowed field is shown
    filtered_supposed_data = {CaseConverter.snake_to_title(k): v[0]
                              for k, v in supposed_data.items()
                              if k in allowed_fields}

    filled_template = template.render(username=first_username,
                                      brief=brief,
                                      results=username_results,
                                      first_seen=first_seen,
                                      interests_tuple_list=tuple_sort(interests_list),
                                      countries_tuple_list=tuple_sort(countries_lists),
                                      supposed_data=filtered_supposed_data,
                                      generated_at=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                                      )

    # save report; the template declares charset utf-8, so write it as such
    html_filename = f'report_{first_username}.html'
    with open(html_filename, 'w', encoding='utf-8') as f:
        f.write(filled_template)
def save_csv_report_to_file(username: str, results: dict, csvfile):
    """Write one CSV row per checked site into an already opened file.

    Args:
        username: value written into the first column of every row.
        results: maps a site name to a dict with the keys ``url_main``,
            ``url_user``, ``http_status`` and ``status``; the ``status``
            object's own ``status`` attribute is stringified into the
            ``exists`` column.
        csvfile: writable text file-like object handed to ``csv.writer``.
    """
    writer = csv.writer(csvfile)
    writer.writerow(['username',
                     'name',
                     'url_main',
                     'url_user',
                     'exists',
                     'http_status',
                     ])
    for site, site_result in results.items():
        writer.writerow([username,
                         site,
                         site_result['url_main'],
                         site_result['url_user'],
                         str(site_result['status'].status),
                         site_result['http_status'],
                         ])
def genxmindfile(filename, username, results):
    """Build an XMind workbook for ``username`` and save it to ``filename``.

    Any existing file at ``filename`` is deleted before the workbook is
    loaded (presumably so that ``xmind.load`` starts from an empty workbook
    -- confirm against the xmind library).
    """
    print(f'Generating XMIND8 file for username {username}')

    already_there = os.path.exists(filename)
    if already_there:
        os.remove(filename)

    workbook = xmind.load(filename)
    design_sheet1(workbook.getPrimarySheet(), username, results)
    xmind.save(workbook, path=filename)
def design_sheet1(sheet, username, results):
    """Fill an XMind sheet with the claimed accounts, grouped by tag.

    The root topic is named after ``username``. Every non-country tag met on
    a claimed account gets its own sub-topic under the root, and each claimed
    account is attached (labelled with its profile URL) to the sub-topic of
    its last non-country tag, or to "Undefined" when it has none.

    Args:
        sheet: XMind sheet object to fill.
        username: searched username, used for the sheet and root titles.
        results: maps a site name to a result dict holding a ``status``
            object with ``status``, ``tags`` and ``site_url_user``.
    """
    sheet.setTitle("%s Analysis" % (username))
    root_topic1 = sheet.getRootTopic()
    root_topic1.setTitle("%s" % (username))

    undefinedsection = root_topic1.addSubTopic()
    undefinedsection.setTitle("Undefined")
    # tag -> topic lookup; "undefined" collects accounts without a category
    alltags = {"undefined": undefinedsection}

    for website_name in results:
        status = results[website_name].get("status")
        if status.status != QueryStatus.CLAIMED:
            continue

        category = None
        # tags may be None or empty, treat both as "no tags"
        for tag in status.tags or []:
            if tag.strip() == "" or is_country_tag(tag):
                continue
            if tag not in alltags:
                # first time this tag is met: create its section
                tagsection = root_topic1.addSubTopic()
                tagsection.setTitle(tag)
                alltags[tag] = tagsection
            # the last non-country tag wins as the account's category
            category = tag

        if category is None:
            category = "undefined"
        userlink = alltags[category].addSubTopic()
        userlink.addLabel(status.site_url_user)
+29 -14
View File
@@ -307,8 +307,9 @@
},
"500px": {
"tags": [
"images",
"in"
"photos",
"in",
"global"
],
"errors": {
"INTERNAL_SERVER_ERROR": "Site error",
@@ -3221,6 +3222,7 @@
"tags": [
"global",
"images",
"photos",
"us"
],
"regexCheck": "^[a-zA-Z][a-zA-Z0-9_-]*$",
@@ -3979,8 +3981,11 @@
},
"EyeEm": {
"tags": [
"de",
"in",
"sd"
"sd",
"global",
"photos"
],
"checkType": "message",
"absenceStrs": "Not Found (404) | EyeEm",
@@ -6551,8 +6556,8 @@
},
"Instagram": {
"tags": [
"social",
"us"
"photos",
"global"
],
"errors": {
"Login \u2022 Instagram": "Login required"
@@ -8018,7 +8023,9 @@
"news",
"us"
],
"checkType": "status_code",
"checkType": "message",
"absenceStrs": [":{\"__typename\":\"NotFound\"},\"viewer\""],
"presenseStrs": ["userPostCounts"],
"alexaRank": 76,
"url": "https://medium.com/@{username}",
"urlMain": "https://medium.com/",
@@ -9835,9 +9842,9 @@
},
"Picuki": {
"tags": [
"photos",
"global",
"jp",
"us"
"instagram"
],
"checkType": "message",
"absenceStrs": [
@@ -9899,7 +9906,8 @@
},
"Pinterest": {
"tags": [
"social",
"images",
"photos",
"us"
],
"checkType": "status_code",
@@ -10858,6 +10866,7 @@
},
"Reddit": {
"tags": [
"social",
"news",
"us"
],
@@ -13392,6 +13401,7 @@
},
"Tumblr": {
"tags": [
"blogs",
"global",
"us"
],
@@ -13433,11 +13443,14 @@
"us"
],
"headers": {
"User-Agent": "Mozilla"
"sec-ch-ua": "Google Chrome\";v=\"87\", \" Not;A Brand\";v=\"99\", \"Chromium\";v=\"87\"",
"authorization": "Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA",
"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
"x-guest-token": "1347256342462009351"
},
"urlProbe": "https://mobile.twitter.com/{username}",
"urlProbe": "https://twitter.com/i/api/graphql/ZRnOhhXPwue_JGILb9TNug/UserByScreenName?variables=%7B%22screen_name%22%3A%22{username}%22%2C%22withHighlightedLabel%22%3Atrue%7D",
"checkType": "message",
"absenceStrs": "Sorry, that page doesn't exist",
"absenceStrs": "Not found",
"alexaRank": 55,
"url": "https://twitter.com/{username}",
"urlMain": "https://www.twitter.com/",
@@ -13604,9 +13617,9 @@
},
"VK": {
"tags": [
"global",
"ru",
"social"
"social",
"global"
],
"checkType": "response_url",
"alexaRank": 23,
@@ -14107,6 +14120,8 @@
},
"We Heart It": {
"tags": [
"photos",
"us",
"in"
],
"checkType": "message",
+109
View File
@@ -0,0 +1,109 @@
<!DOCTYPE html>
{#
  Maigret HTML report template.
  Context: username, generated_at, supposed_data, countries_tuple_list,
  interests_tuple_list, first_seen, brief, results.
  Globals: title(), detect_link().
  NOTE(review): values are rendered without escaping unless the caller
  enables Jinja2 autoescape -- confirm before feeding untrusted data.
#}
<html lang="en">
<head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0, shrink-to-fit=no" />
    <title>{{ username }} -- Maigret username search report</title>
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
    <style>
        .table td, .table th {
            padding: .4rem;
        }
        @media print {
            .pagebreak { page-break-before: always; }
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="row-mb">
            <div class="col-12 card-body" style="padding-bottom: 0.5rem;">
                <h4 class="mb-0">
                    <a class="blog-header-logo text-dark" href="#">Username search report for {{ username }}</a>
                </h4>
                <small class="text-muted">Generated at {{ generated_at }}</small>
            </div>
        </div>
        {# Summary card: guessed personal data, countries, interests, first seen #}
        <div class="row-mb">
            <div class="col-md">
                <div class="card flex-md-row mb-4 box-shadow h-md-250">
                    <div class="card-body d-flex flex-column align-items-start">
                        <h5>Supposed personal data</h5>
                        {% for k, v in supposed_data.items() %}
                        <span>
                            {{ k }}: {{ v }}
                        </span>
                        {% endfor %}
                        {% if countries_tuple_list %}
                        <span>
                            Geo: {% for k, v in countries_tuple_list %}{{ k }} <span class="text-muted">({{ v }})</span>{{ ", " if not loop.last }}{% endfor %}
                        </span>
                        {% endif %}{% if interests_tuple_list %}
                        <span>
                            Interests: {% for k, v in interests_tuple_list %}{{ k }} <span class="text-muted">({{ v }})</span>{{ ", " if not loop.last }}{% endfor %}
                        </span>
                        {% endif %}{% if first_seen %}
                        <span>
                            First seen: {{ first_seen }}
                        </span>
                        {% endif %}
                    </div>
                </div>
            </div>
        </div>
        {# Brief card: one-paragraph textual summary of all search rounds #}
        <div class="row-mb">
            <div class="col-md">
                <div class="card flex-md-row mb-4 box-shadow h-md-250">
                    <div class="card-body d-flex flex-column align-items-start">
                        <h5>Brief</h5>
                        <span>
                            {{ brief }}
                        </span>
                    </div>
                </div>
            </div>
        </div>
        {# One card per found, exactly matching account #}
        {% for u, t, data in results %}
        {% for k, v in data.items() %}
        {% if v.found and not v.is_similar %}
        <div class="row-mb">
            <div class="col-md">
                <div class="card flex-md-row mb-4 box-shadow h-md-250">
                    <img class="card-img-right flex-auto d-none d-md-block" alt="Photo" style="width: 200px; height: 200px; object-fit: scale-down;" src="{{ v.status.ids_data.image or 'https://i.imgur.com/040fmbw.png' }}" data-holder-rendered="true">
                    <div class="card-body d-flex flex-column align-items-start" style="padding-top: 0;">
                        <h3 class="mb-0" style="padding-top: 1rem;">
                            <a class="text-dark" href="{{ v.url_main }}" target="_blank">{{ k }}</a>
                        </h3>
                        {% if v.status.tags %}
                        <div class="mb-1 text-muted">Tags: {{ v.status.tags | join(', ') }}</div>
                        {% endif %}
                        <p class="card-text">
                            <a href="{{ v.url_user }}" target="_blank">{{ v.url_user }}</a>
                        </p>
                        {% if v.ids_data %}
                        <table class="table table-striped">
                            <tbody>
                                {% for k1, v1 in v.ids_data.items() %}
                                {% if k1 != 'image' %}
                                <tr>
                                    <th>{{ title(k1) }}</th>
                                    <td>{% if v1 is iterable and (v1 is not string and v1 is not mapping) %}{{ v1 | join(', ') }}{% else %}{{ detect_link(v1) }}{% endif %}
                                    </td>
                                </tr>
                                {% endif %}
                                {% endfor %}
                            </tbody>
                        </table>
                        {% endif %}
                    </div>
                </div>
            </div>
        </div>
        {% endif %}
        {% endfor %}
        {% endfor %}
    </div>
    <script src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
    <script src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>
</body>
</html>
+2 -8
View File
@@ -34,7 +34,7 @@ class QueryResult():
"""
def __init__(self, username, site_name, site_url_user, status, ids_data=None,
query_time=None, context=None, tags=None):
query_time=None, context=None, tags=[]):
"""Create Query Result Object.
Contains information about a specific method of detecting usernames on
@@ -72,14 +72,8 @@ class QueryResult():
self.query_time = query_time
self.context = context
self.ids_data = ids_data
self.tags = tags
self.tags = ""
if (tags is not None):
TAGstring = "".join(['%s,' % tags for tags in tags])
TAGstring = TAGstring[:-1]
self.tags = TAGstring
return
def __str__(self):
"""Convert Object To String.
+15 -35
View File
@@ -13,6 +13,7 @@ from .utils import CaseConverter
class MaigretEngine:
def __init__(self, name, data):
self.name = name
self.site = {}
self.__dict__.update(data)
@property
@@ -127,6 +128,15 @@ class MaigretDatabase:
def sites_dict(self):
return {site.name: site for site in self._sites}
def ranked_sites_dict(self, reverse=False, top=sys.maxsize, tags=None):
    """Return up to ``top`` sites ordered by Alexa rank, keyed by site name.

    Args:
        reverse: sort by descending rank instead of ascending.
        top: maximum number of sites to keep after sorting; coerced with
            ``int()`` because the ``--top-sites`` CLI option delivers a
            string when given explicitly.
        tags: optional iterable of tags and/or engine names; when non-empty
            only sites sharing at least one tag, or whose engine is listed,
            are kept. ``None`` or empty means no filtering.

    Returns:
        dict mapping site name to site object, in ranked order.
    """
    wanted = set(tags or ())
    if not wanted:
        filtered_list = self.sites
    else:
        filtered_list = [s for s in self.sites
                         if wanted.intersection(s.tags) or s.engine in wanted]

    sorted_list = sorted(filtered_list, key=lambda x: x.alexa_rank, reverse=reverse)[:int(top)]
    return {site.name: site for site in sorted_list}
@property
def engines(self):
return self._engines
@@ -145,12 +155,12 @@ class MaigretDatabase:
return self
def save_to_file(self, filename: str) -> MaigretDatabase:
json_data = {
db_data = {
'sites': {site.name: site.strip_engine_data().json for site in self._sites},
'engines': {engine.name: engine.json for engine in self._engines},
}
json_data = json.dumps(json_data, indent=4)
json_data = json.dumps(db_data, indent=4)
with open(filename, 'w') as f:
f.write(json_data)
@@ -160,8 +170,8 @@ class MaigretDatabase:
def load_from_json(self, json_data: dict) -> MaigretDatabase:
# Add all of site information from the json file to internal site list.
site_data = json_data.get("sites")
engines_data = json_data.get("engines")
site_data = json_data.get("sites", {})
engines_data = json_data.get("engines", {})
for engine_name in engines_data:
self._engines.append(MaigretEngine(engine_name, engines_data[engine_name]))
@@ -198,7 +208,7 @@ class MaigretDatabase:
is_url_valid = url.startswith('http://') or url.startswith('https://')
if not is_url_valid:
return False
raise FileNotFoundError(f"Invalid data file URL '{url}'.")
try:
response = requests.get(url=url)
@@ -238,33 +248,3 @@ class MaigretDatabase:
)
return self.load_from_json(data)
def site_name_list(self, popularity_rank=False):
"""Get Site Name List.
Keyword Arguments:
self -- This object.
popularity_rank -- Boolean indicating if list should be sorted
by popularity rank.
Default value is False.
NOTE: List is sorted in ascending
alphabetical order is popularity rank
is not requested.
Return Value:
List of strings containing names of sites.
"""
if popularity_rank:
# Sort in ascending popularity rank order.
site_rank_name = \
sorted([(site.popularity_rank, site.name) for site in self],
key=operator.itemgetter(0)
)
site_names = [name for _, name in site_rank_name]
else:
# Sort in ascending alphabetical order.
site_names = sorted([site.name for site in self], key=str.lower)
return site_names
+17 -4
View File
@@ -3,16 +3,29 @@ import re
class CaseConverter:
    """String helpers converting identifiers between naming conventions."""

    @staticmethod
    def camel_to_snake(camelcased_string: str) -> str:
        """Convert ``camelCase`` to ``snake_case``.

        An underscore is inserted before every uppercase letter except one
        at the very start, then the result is lowercased.
        """
        return re.sub(r'(?<!^)(?=[A-Z])', '_', camelcased_string).lower()

    @staticmethod
    def snake_to_camel(snakecased_string: str) -> str:
        """Convert ``snake_case`` to ``camelCase``; an empty string stays empty."""
        formatted = ''.join(word.title() for word in snakecased_string.split('_'))
        # Slicing instead of indexing keeps an empty input from raising IndexError.
        return formatted[:1].lower() + formatted[1:]

    @staticmethod
    def snake_to_title(snakecased_string: str) -> str:
        """Convert ``snake_case`` to a space-separated phrase, first word title-cased."""
        words = snakecased_string.split('_')
        words[0] = words[0].title()
        return ' '.join(words)
def is_country_tag(tag: str) -> bool:
    """Detect whether a tag represents a country.

    A tag counts as a country when it is exactly two ASCII letters (any
    case) or the special value ``'global'``.
    """
    # fullmatch: unlike match() with "$", a trailing newline is rejected.
    return bool(re.fullmatch(r"[a-zA-Z]{2}", tag)) or tag == 'global'
def enrich_link_str(link: str) -> str:
    """Return an HTML fragment for a value, turned into a link if it looks like a URL.

    Values starting with ``www.``, or starting with ``http`` and containing
    ``//``, are wrapped in an ``<a class="auto-link">`` element; anything
    else is returned as plain text. The value is stripped and HTML-escaped
    either way, because the result is placed into the report markup as-is.
    Non-string values are converted with ``str()`` first.
    """
    from html import escape

    if not isinstance(link, str):
        # the report template also passes numbers/mappings through here
        link = str(link)
    link = link.strip()
    safe_link = escape(link)
    if link.startswith('www.') or (link.startswith('http') and '//' in link):
        return f'<a class="auto-link" href="{safe_link}">{safe_link}</a>'
    return safe_link
+2
View File
@@ -8,9 +8,11 @@ certifi==2020.12.5
chardet==3.0.4
colorama==0.4.4
idna==2.10
Jinja2==2.11.2
lxml==4.6.2
mock==4.0.2
multidict==5.1.0
pycountry==20.7.3
PySocks==1.7.1
python-socks==1.1.2
requests==2.25.1
+104
View File
@@ -0,0 +1,104 @@
"""Maigret reports test functions"""
from io import StringIO
import copy
import os
import xmind
from maigret.report import save_csv_report_to_file, genxmindfile, save_html_report
from maigret.result import QueryResult, QueryStatus
# Minimal single-site result set: one claimed GitHub account carrying one
# non-country tag. Used by the CSV and XMind report tests below.
EXAMPLE_RESULTS = {
    'GitHub': {
        'username': 'test',
        'parsing_enabled': True,
        'url_main': 'https://www.github.com/',
        'url_user': 'https://www.github.com/test',
        'status': QueryResult('test',
                              'GitHub',
                              'https://www.github.com/test',
                              QueryStatus.CLAIMED,
                              tags=['test_tag']),
        'http_status': 200,
        'is_similar': False,
        'rank': 78
    }
}
# Base status objects for the HTML report fixtures: one claimed result and
# one with status AVAILABLE (not counted as a found account by the report).
GOOD_RESULT = QueryResult('', '', '', QueryStatus.CLAIMED)
BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)
# Per-site claimed results; each is a deep copy so that setting tags and
# ids_data does not leak into the shared GOOD_RESULT object.
GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
GOOD_500PX_RESULT.ids_data = {"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==", "legacy_id": "26403415", "username": "alexaimephotographycars", "name": "Alex Aim\u00e9", "website": "www.flickr.com/photos/alexaimephotography/", "facebook_link": " www.instagram.com/street.reality.photography/", "instagram_username": "alexaimephotography", "twitter_username": "Alexaimephotogr"}
GOOD_REDDIT_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_REDDIT_RESULT.tags = ['news', 'us']
GOOD_REDDIT_RESULT.ids_data = {"reddit_id": "t5_1nytpy", "reddit_username": "alexaimephotography", "fullname": "alexaimephotography", "image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e", "is_employee": "False", "is_nsfw": "False", "is_mod": "True", "is_following": "True", "has_user_profile": "True", "hide_from_robots": "False", "created_at": "2019-07-10 12:20:03", "total_karma": "53959", "post_karma": "52738"}
GOOD_IG_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_IG_RESULT.tags = ['photo', 'global']
GOOD_IG_RESULT.ids_data = {"instagram_username": "alexaimephotography", "fullname": "Alexaimephotography", "id": "6828488620", "image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F", "bio": "Photographer \nChild of fine street arts", "external_url": "https://www.flickr.com/photos/alexaimephotography2020/"}
# Twitter fixture: claimed, tagged, but without any extracted ids_data.
GOOD_TWITTER_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_TWITTER_RESULT.tags = ['social', 'us']
# Input for save_html_report: three (username, id_type, results) search
# rounds over the same four sites (500px, Reddit, Twitter, Instagram).
TEST = [('alexaimephotographycars', 'username', {'500px': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/alexaimephotographycars', 'ids_usernames': {'alexaimephotographycars': 'username', 'alexaimephotography': 'username', 'Alexaimephotogr': 'username'}, 'status': GOOD_500PX_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 400, 'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 29}}), ('alexaimephotography', 'username', {'500px': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/alexaimephotography', 'status': BAD_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/alexaimephotography', 'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_REDDIT_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/alexaimephotography', 'status': BAD_RESULT, 'http_status': 400, 
'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/alexaimephotography', 'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_IG_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 29}}), ('Alexaimephotogr', 'username', {'500px': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://500px.com/', 'url_user': 'https://500px.com/p/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 200, 'is_similar': False, 'rank': 2981}, 'Reddit': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/', 'url_user': 'https://www.reddit.com/user/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 17}, 'Twitter': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/', 'url_user': 'https://twitter.com/Alexaimephotogr', 'status': GOOD_TWITTER_RESULT, 'http_status': 400, 'is_similar': False, 'rank': 55}, 'Instagram': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/', 'url_user': 'https://www.instagram.com/Alexaimephotogr', 'status':BAD_RESULT, 'http_status': 404, 'is_similar': False, 'rank': 29}})]
# Exact fragments the HTML report generated from TEST is expected to contain:
# the brief paragraph, the interests line and the geo line.
SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""
SUPPOSED_INTERESTS = "Interests: photo <span class=\"text-muted\">(2)</span>, news <span class=\"text-muted\">(1)</span>, social <span class=\"text-muted\">(1)</span>"
SUPPOSED_GEO = "Geo: us <span class=\"text-muted\">(3)</span>"
def test_save_csv_report_to_file():
    """The CSV report holds a header row plus one row per checked site."""
    expected_lines = [
        'username,name,url_main,url_user,exists,http_status\r\n',
        'test,GitHub,https://www.github.com/,https://www.github.com/test,Claimed,200\r\n',
    ]

    buffer = StringIO()
    save_csv_report_to_file('test', EXAMPLE_RESULTS, buffer)

    # rewind before reading back what the writer produced
    buffer.seek(0)
    written_lines = buffer.readlines()

    assert written_lines == expected_lines
def test_save_xmind_report():
    """The XMind report has an "Undefined" section and one section per tag."""
    filename = 'test_report.xmind'
    genxmindfile(filename, 'test', EXAMPLE_RESULTS)

    # read the generated workbook back and inspect its primary sheet
    data = xmind.load(filename).getPrimarySheet().getData()
    root = data['topic']
    sections = root['topics']

    assert data['title'] == 'test Analysis'
    assert root['title'] == 'test'
    assert len(sections) == 2
    assert sections[0]['title'] == 'Undefined'

    tag_section = sections[1]
    assert tag_section['title'] == 'test_tag'
    assert len(tag_section['topics']) == 1
    assert tag_section['topics'][0]['label'] == 'https://www.github.com/test'
def test_html_report():
    """The HTML report is written and contains the brief, geo and interests lines."""
    report_name = 'report_alexaimephotographycars.html'
    # Start without a stale report; only a missing file is an expected failure.
    try:
        os.remove(report_name)
    except FileNotFoundError:
        pass

    save_html_report(TEST)

    assert os.path.exists(report_name)

    # close the handle deterministically instead of leaking it
    with open(report_name) as report_file:
        report_text = report_file.read()

    assert SUPPOSED_BRIEF in report_text
    assert SUPPOSED_GEO in report_text
    assert SUPPOSED_INTERESTS in report_text
+20 -1
View File
@@ -1,5 +1,5 @@
"""Maigret Database test functions"""
from maigret.sites import MaigretDatabase
from maigret.sites import MaigretDatabase, MaigretSite
EXAMPLE_DB = {
@@ -99,3 +99,22 @@ def test_saving_site_error():
assert amperka.strip_engine_data().errors == {'error1': 'text1'}
assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
def test_ranked_sites_dict():
    """ranked_sites_dict sorts by Alexa rank and filters by tag or engine."""
    db = MaigretDatabase()
    sites = [
        MaigretSite('3', {'alexaRank': 1000, 'engine': 'ucoz'}),
        MaigretSite('1', {'alexaRank': 2, 'tags': ['forum']}),
        MaigretSite('2', {'alexaRank': 10, 'tags': ['ru', 'forum']}),
    ]
    for site in sites:
        db.update_site(site)

    def ranked_names(**kwargs):
        return list(db.ranked_sites_dict(**kwargs).keys())

    # sorting
    assert ranked_names() == ['1', '2', '3']
    assert ranked_names(top=2) == ['1', '2']
    assert ranked_names(reverse=True, top=2) == ['3', '2']

    # filtering by tags
    assert ranked_names(tags=['ru'], top=2) == ['2']
    assert ranked_names(tags=['forum']) == ['1', '2']

    # filtering by engine
    assert ranked_names(tags=['ucoz']) == ['3']
+14 -1
View File
@@ -1,5 +1,5 @@
"""Maigret utils test functions"""
from maigret.utils import CaseConverter, is_country_tag
from maigret.utils import CaseConverter, is_country_tag, enrich_link_str
def test_case_convert_camel_to_snake():
@@ -14,8 +14,21 @@ def test_case_convert_snake_to_camel():
assert b == 'camelCasedString'
def test_case_convert_snake_to_title():
    """Only the first word is capitalised; underscores become spaces."""
    converted = CaseConverter.snake_to_title('camel_cased_string')

    assert converted == 'Camel cased string'
def test_is_country_tag():
    """Two-letter tags and 'global' count as country tags, others do not."""
    expectations = [
        ('ru', True),
        ('FR', True),
        ('a1', False),
        ('dating', False),
        ('global', True),
    ]
    for tag, expected in expectations:
        assert is_country_tag(tag) == expected
def test_enrich_link_str():
    """Plain text is returned untouched, URL-like text is wrapped in a link."""
    plain = enrich_link_str('test')
    assert plain == 'test'

    # leading whitespace is stripped before the anchor is built
    linked = enrich_link_str(' www.flickr.com/photos/alexaimephotography/')
    assert linked == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'