Tags updates, script added

This commit is contained in:
Soxoj
2021-05-09 16:20:39 +03:00
parent 5bda7fb339
commit 43f189f774
4 changed files with 1864 additions and 965 deletions
+1791 -963
View File
File diff suppressed because it is too large Load Diff
+4 -1
View File
@@ -61,6 +61,9 @@ SUPPORTED_TAGS = [
"military", "military",
"auto", "auto",
"gambling", "gambling",
"business",
"cybercriminal",
"review",
] ]
@@ -472,7 +475,7 @@ class MaigretDatabase:
output += f"{count}\t{url}\n" output += f"{count}\t{url}\n"
output += "Top tags:\n" output += "Top tags:\n"
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:20]: for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
mark = "" mark = ""
if tag not in SUPPORTED_TAGS: if tag not in SUPPORTED_TAGS:
mark = " (non-standard)" mark = " (non-standard)"
+15 -1
View File
@@ -2,7 +2,7 @@ import asyncio
import difflib import difflib
import re import re
from typing import List from typing import List
import xml.etree.ElementTree as ET
import requests import requests
from .activation import import_aiohttp_cookies from .activation import import_aiohttp_cookies
@@ -46,6 +46,20 @@ def get_match_ratio(x):
) )
def _parse_alexa_rank(xml_data):
    """Extract the integer rank from an Alexa XML payload, or 0 if absent."""
    try:
        # NOTE(review): the payload comes from a remote server and the stdlib
        # parser is not hardened against maliciously constructed XML.
        root = ET.fromstring(xml_data)
    except ET.ParseError:
        # Empty or malformed response body.
        return 0

    reach = root.find('.//REACH')
    if reach is None:
        return 0

    try:
        return int(reach.attrib['RANK'])
    except (KeyError, ValueError):
        # REACH element without a RANK attribute, or a non-numeric value.
        return 0


def get_alexa_rank(site_url_main, timeout=10):
    """Fetch the Alexa rank for a site.

    Args:
        site_url_main: URL of the site, interpolated as-is into the ``url``
            query parameter of the request.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The rank as an ``int``, or ``0`` when the request fails, the response
        is not well-formed XML, or it carries no usable ``REACH/@RANK`` value.
    """
    url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
    try:
        # Without a timeout a stalled connection would block the caller forever.
        xml_data = requests.get(url, timeout=timeout).text
    except requests.RequestException:
        return 0

    return _parse_alexa_rank(xml_data)
def extract_mainpage_url(url): def extract_mainpage_url(url):
return "/".join(url.split("/", 3)[:3]) return "/".join(url.split("/", 3)[:3])
+54
View File
@@ -0,0 +1,54 @@
#!/usr/bin/env python3
import random
from argparse import ArgumentParser, RawDescriptionHelpFormatter
from maigret.maigret import MaigretDatabase
from maigret.submit import get_alexa_rank
def _parse_tags(raw):
    """Split a comma-separated tag string into a clean, ordered list.

    Whitespace around each tag is stripped, empty entries are dropped and
    duplicates are removed while keeping the order of first appearance.
    """
    stripped = (tag.strip() for tag in raw.split(','))
    # dict.fromkeys de-duplicates but, unlike set(), keeps a stable order,
    # so the stored tag list does not change from one run to the next.
    return list(dict.fromkeys(tag for tag in stripped if tag))


def update_tags(site):
    """Interactively replace the tags of *site* and refresh its Alexa rank.

    Prints the current tags and main URL, then reads a comma-separated list
    of new tags from stdin. The special tag ``disabled`` is not stored;
    instead it sets ``site.disabled = True``. The Alexa rank is overwritten
    only when ``get_alexa_rank`` returns a truthy value.

    Args:
        site: Object exposing ``name``, ``url_main``, ``tags``,
            ``alexa_rank`` and ``disabled`` attributes; modified in place.
    """
    if site.tags:
        print(f'Site {site.name} tags: ' + ', '.join(site.tags))
    else:
        print(f'Site {site.name} doesn\'t have tags')

    print(f'URL: {site.url_main}')

    new_tags = _parse_tags(input('Enter new tags: '))
    if "disabled" in new_tags:
        new_tags.remove("disabled")
        site.disabled = True

    print(f'Old alexa rank: {site.alexa_rank}')
    rank = get_alexa_rank(site.url_main)
    if rank:
        print(f'New alexa rank: {rank}')
        site.alexa_rank = rank

    site.tags = new_tags
def main():
    """Prompt for tags of randomly chosen untagged sites until none remain.

    Loads the sites database from the file given by ``--base``, then
    repeatedly picks a random site that has no tags and whose engine is not
    ``uCoz``, passes it to ``update_tags`` and writes the database back.
    Ctrl+C or end-of-input at the prompt ends the session.
    """
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument(
        "--base", "-b",
        metavar="BASE_FILE",
        dest="base_file",
        default="maigret/resources/data.json",
        help="JSON file with sites data to update.",
    )
    args = parser.parse_args()

    db = MaigretDatabase()
    db.load_from_file(args.base_file)

    try:
        while True:
            # Re-filter on every pass: a site tagged in the previous round
            # must drop out, and an empty pool has to end the loop instead
            # of spinning forever.
            candidates = [
                site for site in db.sites
                if site.engine != 'uCoz' and not site.tags
            ]
            if not candidates:
                print('No untagged sites left.')
                break

            update_tags(random.choice(candidates))
            # Save after each site so an interrupted session keeps the
            # sites that were already processed.
            db.save_to_file(args.base_file)
    except (KeyboardInterrupt, EOFError):
        # Leave the prompt line cleanly instead of dumping a traceback.
        print()


if __name__ == '__main__':
    main()