Mirror of https://github.com/soxoj/maigret.git (synced 2026-05-07 06:24:35 +00:00)

Commit: Self-checking mode fixed, tags/names site filtering & ranking

+82 -92
@@ -368,7 +368,6 @@ async def maigret(username, site_dict, query_notify, logger,
         results_site['parsing_enabled'] = recursive_search
         results_site['url_main'] = site.url_main
-
 
         headers = {
             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0',
         }
@@ -506,95 +505,102 @@ def timeout_check(value):
     return timeout
 
 
-async def site_self_check(site_name, site_data, logger, no_progressbar=False):
+async def site_self_check(site, logger, semaphore, db: MaigretDatabase, no_progressbar=False):
     query_notify = Mock()
     changes = {
         'disabled': False,
     }
 
-    check_data = [
-        (site_data.username_claimed, QueryStatus.CLAIMED),
-        (site_data.username_unclaimed, QueryStatus.AVAILABLE),
-    ]
+    try:
+        check_data = [
+            (site.username_claimed, QueryStatus.CLAIMED),
+            (site.username_unclaimed, QueryStatus.AVAILABLE),
+        ]
+    except:
+        print(site.__dict__)
 
-    logger.info(f'Checking {site_name}...')
+    logger.info(f'Checking {site.name}...')
 
     for username, status in check_data:
-        results = await maigret(
-            username,
-            {site_name: site_data},
-            query_notify,
-            logger,
-            timeout=30,
-            forced=True,
-            no_progressbar=no_progressbar,
-        )
+        async with semaphore:
+            results_dict = await maigret(
+                username,
+                {site.name: site},
+                query_notify,
+                logger,
+                timeout=30,
+                forced=True,
+                no_progressbar=True,
+            )
 
         # don't disable entries with other ids types
-        if site_name not in results:
-            logger.info(results)
+        # TODO: make normal checking
+        if site.name not in results_dict:
+            logger.info(results_dict)
             changes['disabled'] = True
             continue
 
-        site_status = results[site_name]['status'].status
+        result = results_dict[site.name]['status']
+
+        site_status = result.status
 
         if site_status != status:
             if site_status == QueryStatus.UNKNOWN:
-                msgs = site_data.absence_strs
-                etype = site_data.check_type
-                logger.info(f'Error while searching {username} in {site_name}: {msgs}, type {etype}')
+                msgs = site.absence_strs
+                etype = site.check_type
+                logger.warning(f'Error while searching {username} in {site.name}: {result.context}, {msgs}, type {etype}')
                 # don't disable in case of available username
                 if status == QueryStatus.CLAIMED:
                     changes['disabled'] = True
             elif status == QueryStatus.CLAIMED:
-                logger.info(f'Not found `{username}` in {site_name}, must be claimed')
-                logger.info(results[site_name])
+                logger.warning(f'Not found `{username}` in {site.name}, must be claimed')
+                logger.info(results_dict[site.name])
                 changes['disabled'] = True
             else:
-                logger.info(f'Found `{username}` in {site_name}, must be available')
-                logger.info(results[site_name])
+                logger.warning(f'Found `{username}` in {site.name}, must be available')
+                logger.info(results_dict[site.name])
                 changes['disabled'] = True
 
-    logger.info(f'Site {site_name} checking is finished')
+    logger.info(f'Site {site.name} checking is finished')
 
+    if changes['disabled'] != site.disabled:
+        site.disabled = changes['disabled']
+        db.update_site(site)
+        action = 'Disabled' if not site.disabled else 'Enabled'
+        print(f'{action} site {site.name}...')
+
     return changes
 
 
-async def self_check(json_file, logger):
-    db = MaigretDatabase()
-    db.load_from_file(json_file)
-    sites = db.sites
-    all_sites = {}
+async def self_check(db: MaigretDatabase, site_data: dict, logger):
+    sem = asyncio.Semaphore(10)
+    tasks = []
+    all_sites = site_data
 
-    def disabled_count(data):
-        return len(list(filter(lambda x: x.get('disabled', False), data)))
+    def disabled_count(lst):
+        return len(list(filter(lambda x: x.disabled, lst)))
 
-    async def update_site_data(site_name, site_data, all_sites, logger):
-        updates = await site_self_check(site_name, dict(site_data), logger)
-        all_sites[site_name].update(updates)
-
-    for site in sites:
-        all_sites[site.name] = site.information
-
     disabled_old_count = disabled_count(all_sites.values())
 
-    tasks = []
-    for site_name, site_data in all_sites.items():
-        future = asyncio.ensure_future(update_site_data(site_name, site_data, all_sites, logger))
+    for _, site in all_sites.items():
+        check_coro = site_self_check(site, logger, sem, db)
+        future = asyncio.ensure_future(check_coro)
         tasks.append(future)
 
-    await asyncio.gather(*tasks)
+    for f in tqdm.asyncio.tqdm.as_completed(tasks):
+        await f
 
     disabled_new_count = disabled_count(all_sites.values())
     total_disabled = disabled_new_count - disabled_old_count
 
-    if total_disabled > 0:
+    if total_disabled >= 0:
         message = 'Disabled'
     else:
         message = 'Enabled'
         total_disabled *= -1
 
     print(f'{message} {total_disabled} checked sites. Run with `--info` flag to get more information')
-
-    with open(json_file, 'w') as f:
-        data['sites'] = all_sites
-        json.dump(data, f, indent=4)
 
 
 async def main():
@@ -621,9 +627,6 @@ async def main():
                         action="store_true", dest="debug", default=False,
                         help="Saving debugging information and sites responses in debug.txt."
                         )
-    parser.add_argument("--rank", "-r",
-                        action="store_true", dest="rank", default=False,
-                        help="Present websites ordered by their Alexa.com global rank in popularity.")
     parser.add_argument("--folderoutput", "-fo", dest="folderoutput", default="reports",
                         help="If using multiple usernames, the output of the results will be saved to this folder."
                         )
@@ -637,7 +640,7 @@ async def main():
                         )
     parser.add_argument("--site",
                         action="append", metavar='SITE_NAME',
-                        dest="site_list", default=None,
+                        dest="site_list", default=[],
                         help="Limit analysis to just the listed sites (use several times to specify more than one)"
                         )
     parser.add_argument("--proxy", "-p", metavar='PROXY_URL',
@@ -758,7 +761,7 @@ async def main():
            usernames[v] = k
 
     if args.tags:
-        args.tags = set(str(args.tags).split(','))
+        args.tags = list(set(str(args.tags).split(',')))
 
     if args.json_file is None:
         args.json_file = \
@@ -766,53 +769,40 @@ async def main():
                 "resources/data.json"
             )
 
-    # Database self-checking
-    if args.self_check:
-        print('Maigret sites database self-checking...')
-        await self_check(args.json_file, logger)
+    if args.top_sites == 0:
+        args.top_sites = sys.maxsize
 
     # Create object with all information about sites we are aware of.
     try:
         db = MaigretDatabase().load_from_file(args.json_file)
-        site_data_all = db.ranked_sites_dict(top=args.top_sites)
+        site_data = db.ranked_sites_dict(top=args.top_sites, tags=args.tags, names=args.site_list)
     except Exception as error:
         print(f"ERROR: {error}")
         sys.exit(1)
 
-    if args.site_list is None:
-        # Not desired to look at a sub-set of sites
-        site_data = site_data_all
-    else:
-        # User desires to selectively run queries on a sub-set of the site list.
-
-        # Make sure that the sites are supported & build up pruned site database.
-        site_data = {}
-        site_missing = []
-        for site in args.site_list:
-            for existing_site in site_data_all:
-                if site.lower() == existing_site.lower():
-                    site_data[existing_site] = site_data_all[existing_site]
-            if not site_data:
-                # Build up list of sites not supported for future error message.
-                site_missing.append(f"'{site}'")
-
-        if site_missing:
-            print(
-                f"Error: Desired sites not found: {', '.join(site_missing)}.")
-            sys.exit(1)
-
-    if args.rank:
-        # Sort data by rank
-        site_dataCpy = dict(site_data)
-        ranked_sites = sorted(site_data, key=lambda k: ("rank" not in k, site_data[k].get("rank", sys.maxsize)))
-        site_data = {}
-        for site in ranked_sites:
-            site_data[site] = site_dataCpy.get(site)
+    # Database self-checking
+    if args.self_check:
+        print('Maigret sites database self-checking...')
+        await self_check(db, site_data, logger)
+        if input('Do you want to save changes permanently? [yYnN]\n').lower() == 'y':
+            db.save_to_file(args.json_file)
+            print('Database was successfully updated.')
+        else:
+            print('Updates will be applied only for current search session.')
 
     # Database consistency
     enabled_count = len(list(filter(lambda x: not x.disabled, site_data.values())))
     print(f'Sites in database, enabled/total: {enabled_count}/{len(site_data)}')
 
+    if not enabled_count:
+        print('No sites to check, exiting!')
+        sys.exit(2)
+
+    if usernames == ['-']:
+        # magic params to exit after init
+        print('No usernames to check, exiting.')
+        sys.exit(0)
 
     # Create notify object for query results.
     query_notify = QueryNotifyPrint(result=None,
                                     verbose=args.verbose,
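For reference, the reworked self-check flow can be driven from code; here is a minimal sketch assuming a stdlib logger. Every call used below (load_from_file, ranked_sites_dict, self_check, save_to_file) appears in this commit, while the concrete path and filter values are illustrative:

import asyncio
import logging

from maigret.sites import MaigretDatabase
from maigret.maigret import self_check

logger = logging.getLogger('maigret')  # assumption: any logging.Logger works here

db = MaigretDatabase().load_from_file('maigret/resources/data.json')
# empty tags/names lists mean "no filtering"; values here are illustrative
site_data = db.ranked_sites_dict(top=100, tags=['us'], names=[])

asyncio.get_event_loop().run_until_complete(self_check(db, site_data, logger))
db.save_to_file('maigret/resources/data.json')  # persist flipped `disabled` flags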
+1661 -1468
File diff suppressed because it is too large
+10 -3
@@ -128,11 +128,18 @@ class MaigretDatabase:
     def sites_dict(self):
         return {site.name: site for site in self._sites}
 
-    def ranked_sites_dict(self, reverse=False, top=sys.maxsize, tags=[]):
-        if not tags:
+    def ranked_sites_dict(self, reverse=False, top=sys.maxsize, tags=[], names=[]):
+        normalized_names = list(map(str.lower, names))
+        normalized_tags = list(map(str.lower, tags))
+
+        is_tags_ok = lambda x: set(x.tags).intersection(set(normalized_tags))
+        is_name_ok = lambda x: x.name.lower() in normalized_names
+        is_engine_ok = lambda x: isinstance(x.engine, str) and x.engine.lower() in normalized_tags
+
+        if not tags and not names:
             filtered_list = self.sites
         else:
-            filtered_list = [s for s in self.sites if set(s.tags).intersection(set(tags)) or s.engine in tags]
+            filtered_list = [s for s in self.sites if is_tags_ok(s) or is_name_ok(s) or is_engine_ok(s)]
 
         sorted_list = sorted(filtered_list, key=lambda x: x.alexa_rank, reverse=reverse)[:top]
         return {site.name: site for site in sorted_list}
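The new filter is a disjunction with case-insensitive matching: a site passes if its name, one of its tags, or its engine matches the request. A short sketch of the intended behavior, using assertions lifted from the test additions later in this commit (they rely on that test file's small fixture database with sites named '1'-'3'):

# name OR tag OR engine may match
assert list(db.ranked_sites_dict(names=['2'], tags=['forum']).keys()) == ['1', '2']
assert list(db.ranked_sites_dict(names=['2'], tags=['ucoz']).keys()) == ['2', '3']  # 'ucoz' matches an engine
assert list(db.ranked_sites_dict(names=['4'], tags=['nosuchtag']).keys()) == []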
@@ -0,0 +1,5 @@
+# pytest.ini
+[pytest]
+filterwarnings =
+    error
+    ignore::UserWarning
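With this configuration pytest escalates every warning raised during the test run into an error, while UserWarning stays ignored (later filterwarnings entries take precedence over earlier ones).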
@@ -0,0 +1,12 @@
+from _pytest.mark import Mark
+
+
+empty_mark = Mark('', [], {})
+
+
+def by_slow_marker(item):
+    return item.get_closest_marker('slow', default=empty_mark)
+
+
+def pytest_collection_modifyitems(items):
+    items.sort(key=by_slow_marker, reverse=False)
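This hook sorts collected tests by their `slow` marker, so unmarked tests run before the marked, network-heavy ones. A test opts in the same way the new test module below does; a minimal sketch (the test name is hypothetical):

import pytest

@pytest.mark.slow
def test_heavy_network_roundtrip():  # hypothetical example
    ...  # collected after unmarked tests via pytest_collection_modifyitems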
@@ -0,0 +1,106 @@
+"""Maigret main module test functions"""
+import asyncio
+from mock import Mock
+import pytest
+
+from maigret.sites import MaigretDatabase, MaigretSite
+from maigret.maigret import self_check
+
+
+EXAMPLE_DB = {
+    'engines': {
+    },
+    'sites': {
+        "GooglePlayStore": {
+            "tags": [
+                "global",
+                "us"
+            ],
+            "disabled": False,
+            "checkType": "status_code",
+            "alexaRank": 1,
+            "url": "https://play.google.com/store/apps/developer?id={username}",
+            "urlMain": "https://play.google.com/store",
+            "usernameClaimed": "Facebook_nosuchname",
+            "usernameUnclaimed": "noonewouldeverusethis7"
+        },
+        "Reddit": {
+            "tags": [
+                "news",
+                "social",
+                "us"
+            ],
+            "checkType": "status_code",
+            "presenseStrs": [
+                "totalKarma"
+            ],
+            "disabled": True,
+            "alexaRank": 17,
+            "url": "https://www.reddit.com/user/{username}",
+            "urlMain": "https://www.reddit.com/",
+            "usernameClaimed": "blue",
+            "usernameUnclaimed": "noonewouldeverusethis7"
+        },
+    }
+}
+
+
+@pytest.mark.slow
+def test_self_check_db_positive_disable():
+    logger = Mock()
+    db = MaigretDatabase()
+    db.load_from_json(EXAMPLE_DB)
+
+    assert db.sites[0].disabled == False
+
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(self_check(db, db.sites_dict, logger))
+
+    assert db.sites[0].disabled == True
+
+
+@pytest.mark.slow
+def test_self_check_db_positive_enable():
+    logger = Mock()
+    db = MaigretDatabase()
+    db.load_from_json(EXAMPLE_DB)
+
+    db.sites[0].disabled = True
+    db.sites[0].username_claimed = 'Facebook'
+    assert db.sites[0].disabled == True
+
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(self_check(db, db.sites_dict, logger))
+
+    assert db.sites[0].disabled == False
+
+
+@pytest.mark.slow
+def test_self_check_db_negative_disabled():
+    logger = Mock()
+    db = MaigretDatabase()
+    db.load_from_json(EXAMPLE_DB)
+
+    db.sites[0].disabled = True
+    assert db.sites[0].disabled == True
+
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(self_check(db, db.sites_dict, logger))
+
+    assert db.sites[0].disabled == True
+
+
+@pytest.mark.slow
+def test_self_check_db_negative_enabled():
+    logger = Mock()
+    db = MaigretDatabase()
+    db.load_from_json(EXAMPLE_DB)
+
+    db.sites[0].disabled = False
+    db.sites[0].username_claimed = 'Facebook'
+    assert db.sites[0].disabled == False
+
+    loop = asyncio.get_event_loop()
+    loop.run_until_complete(self_check(db, db.sites_dict, logger))
+
+    assert db.sites[0].disabled == False
@@ -118,3 +118,14 @@ def test_ranked_sites_dict():
 
     # filtering by engine
     assert list(db.ranked_sites_dict(tags=['ucoz']).keys()) == ['3']
+
+    # filtering by names
+    assert list(db.ranked_sites_dict(names=['1', '2']).keys()) == ['1', '2']
+    assert list(db.ranked_sites_dict(names=['2', '3']).keys()) == ['2', '3']
+
+    # disjunction
+    assert list(db.ranked_sites_dict(names=['2'], tags=['forum']).keys()) == ['1', '2']
+    assert list(db.ranked_sites_dict(names=['2'], tags=['forum'], reverse=True).keys()) == ['2', '1']
+    assert list(db.ranked_sites_dict(names=['2'], tags=['ucoz']).keys()) == ['2', '3']
+    assert list(db.ranked_sites_dict(names=['4'], tags=['ru']).keys()) == ['2']
+    assert list(db.ranked_sites_dict(names=['4'], tags=['nosuchtag']).keys()) == []
+29 -28
@@ -12,6 +12,8 @@ import xml.etree.ElementTree as ET
 from datetime import datetime
 from argparse import ArgumentParser, RawDescriptionHelpFormatter
 
+from maigret.maigret import MaigretDatabase
+
 RANKS = {str(i):str(i) for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 50, 100, 500]}
 RANKS.update({
     '1000': '1K',
@@ -22,7 +24,7 @@ RANKS.update({
     '50000000': '10M',
 })
 
-def get_rank(domain_to_query, dest, print_errors=True):
+def get_rank(domain_to_query, site, print_errors=True):
     #Retrieve ranking data via alexa API
     url = f"http://data.alexa.com/data?cli=10&url={domain_to_query}"
     xml_data = requests.get(url).text
@@ -30,16 +32,16 @@ def get_rank(domain_to_query, dest, print_errors=True):
 
     try:
         #Get ranking for this site.
-        dest['rank'] = int(root.find('.//REACH').attrib['RANK'])
+        site.alexa_rank = int(root.find('.//REACH').attrib['RANK'])
         country = root.find('.//COUNTRY')
         if not country is None and country.attrib:
             country_code = country.attrib['CODE']
-            tags = set(dest.get('tags', []))
+            tags = set(site.tags)
             if country_code:
                 tags.add(country_code.lower())
-            dest['tags'] = sorted(list(tags))
-        if 'type' in dest and dest['type'] != 'username':
-            dest['disabled'] = False
+            site.tags = sorted(list(tags))
+        if site.type != 'username':
+            site.disabled = False
     except Exception as e:
         if print_errors:
             logging.error(e)
@@ -67,38 +69,40 @@ if __name__ == '__main__':
                         dest="base_file", default="maigret/resources/data.json",
                         help="JSON file with sites data to update.")
 
+    parser.add_argument('--empty-only', help='update only sites without rating', action='store_true')
+
     pool = list()
 
     args = parser.parse_args()
 
-    with open(args.base_file, "r", encoding="utf-8") as data_file:
-        sites_info = json.load(data_file)
-        data = sites_info['sites']
-        engines = sites_info['engines']
+    db = MaigretDatabase()
+    sites_subset = db.load_from_file(args.base_file).sites
 
     with open("sites.md", "w") as site_file:
-        data_length = len(data)
         site_file.write(f"""
-## List of supported sites: total {data_length}\n
+## List of supported sites: total {len(sites_subset)}\n
 Rank data fetched from Alexa by domains.
 
 """)
 
-        for social_network in data:
-            url_main = data.get(social_network).get("urlMain")
-            data.get(social_network)["rank"] = 0
-            th = threading.Thread(target=get_rank, args=(url_main, data.get(social_network)))
-            pool.append((social_network, url_main, th))
+        for site in sites_subset:
+            url_main = site.url_main
+            if site.alexa_rank < sys.maxsize and args.empty_only:
+                continue
+            site.alexa_rank = 0
+            th = threading.Thread(target=get_rank, args=(url_main, site))
+            pool.append((site.name, url_main, th))
             th.start()
 
         index = 1
-        for social_network, url_main, th in pool:
+        for site_name, url_main, th in pool:
             th.join()
-            sys.stdout.write("\r{0}".format(f"Updated {index} out of {data_length} entries"))
+            sys.stdout.write("\r{0}".format(f"Updated {index} out of {len(sites_subset)} entries"))
             sys.stdout.flush()
             index = index + 1
 
-        sites_full_list = [(site, site_data['rank']) for site, site_data in data.items()]
+        sites_full_list = [(s, s.alexa_rank) for s in sites_subset]
 
         sites_full_list.sort(reverse=False, key=lambda x: x[1])
 
         while sites_full_list[0][1] == 0:
@@ -107,20 +111,17 @@ Rank data fetched from Alexa by domains.
 
         for num, site_tuple in enumerate(sites_full_list):
             site, rank = site_tuple
-            url_main = data[site]['urlMain']
+            url_main = site.url_main
             valid_rank = get_step_rank(rank)
-            all_tags = data[site].get('tags', [])
+            all_tags = site.tags
             tags = ', ' + ', '.join(all_tags) if all_tags else ''
             note = ''
-            if data[site].get('disabled'):
+            if site.disabled:
                 note = ', search is disabled'
             site_file.write(f'1. [{site}]({url_main})*: top {valid_rank}{tags}*{note}\n')
+            db.update_site(site)
 
         site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')
 
-    sorted_json_data = json.dumps({'sites': data, 'engines': engines}, indent=2, sort_keys=True)
-
-    with open(args.base_file, "w") as data_file:
-        data_file.write(sorted_json_data)
+    db.save_to_file(args.base_file)
 
     print("\nFinished updating supported site listing!")
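Note on the new --empty-only flag: a site is skipped when it already carries an Alexa rank lower than sys.maxsize, so only sites still lacking rating data are re-queried.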