mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-06 14:08:59 +00:00
Tags updates, script added
This commit is contained in:
+1791
-963
File diff suppressed because it is too large
Load Diff
+4
-1
@@ -61,6 +61,9 @@ SUPPORTED_TAGS = [
|
|||||||
"military",
|
"military",
|
||||||
"auto",
|
"auto",
|
||||||
"gambling",
|
"gambling",
|
||||||
|
"business",
|
||||||
|
"cybercriminal",
|
||||||
|
"review",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
@@ -472,7 +475,7 @@ class MaigretDatabase:
|
|||||||
output += f"{count}\t{url}\n"
|
output += f"{count}\t{url}\n"
|
||||||
|
|
||||||
output += "Top tags:\n"
|
output += "Top tags:\n"
|
||||||
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:20]:
|
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
|
||||||
mark = ""
|
mark = ""
|
||||||
if tag not in SUPPORTED_TAGS:
|
if tag not in SUPPORTED_TAGS:
|
||||||
mark = " (non-standard)"
|
mark = " (non-standard)"
|
||||||
|
|||||||
+15
-1
@@ -2,7 +2,7 @@ import asyncio
|
|||||||
import difflib
|
import difflib
|
||||||
import re
|
import re
|
||||||
from typing import List
|
from typing import List
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from .activation import import_aiohttp_cookies
|
from .activation import import_aiohttp_cookies
|
||||||
@@ -46,6 +46,20 @@ def get_match_ratio(x):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_alexa_rank(site_url_main):
    """Fetch the Alexa traffic rank for a site via the Alexa data API.

    Args:
        site_url_main: Main-page URL of the site (scheme://host).

    Returns:
        The integer rank, or 0 when the site is unranked or the lookup
        fails for any reason (best-effort semantics).
    """
    url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
    alexa_rank = 0

    try:
        # Timeout prevents the interactive tooling from hanging forever
        # on a stalled connection (requests has no default timeout).
        xml_data = requests.get(url, timeout=10).text
        root = ET.fromstring(xml_data)
        # The <REACH RANK="..."/> element is absent for unranked sites;
        # any lookup/parse failure falls through to rank 0.
        alexa_rank = int(root.find('.//REACH').attrib['RANK'])
    except Exception:
        # Best-effort: network errors, malformed XML, or a missing RANK
        # attribute all yield 0 rather than aborting the caller.
        pass

    return alexa_rank
||||||
|
|
||||||
|
|
||||||
def extract_mainpage_url(url):
    """Return the main-page URL (scheme://host) of *url*.

    Keeps only the scheme and network-location parts, i.e. everything
    before the first path segment.
    """
    prefix_parts = url.split("/", 3)
    return "/".join(prefix_parts[:3])
|
||||||
|
|
||||||
|
|||||||
Executable
+54
@@ -0,0 +1,54 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import random
|
||||||
|
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||||
|
|
||||||
|
from maigret.maigret import MaigretDatabase
|
||||||
|
from maigret.submit import get_alexa_rank
|
||||||
|
|
||||||
|
|
||||||
|
def update_tags(site):
    """Interactively update the tags and Alexa rank of one site entry.

    Prints the site's current tags and URL, prompts the operator for a
    replacement comma-separated tag list, and mutates ``site`` in place:
    ``site.tags`` is replaced wholesale, ``site.disabled`` is set when
    the special keyword "disabled" is entered, and ``site.alexa_rank``
    is refreshed when a lookup succeeds.
    """
    if not site.tags:
        print(f'Site {site.name} doesn\'t have tags')
    else:
        print(f'Site {site.name} tags: ' + ', '.join(site.tags))

    print(f'URL: {site.url_main}')

    # Fix: split on ',' and strip whitespace so both "a,b" and "a , b"
    # parse as two tags (the old ', ' separator silently produced one
    # fused "a,b" tag when the operator omitted the space).
    raw = input('Enter new tags: ')
    new_tags = {tag.strip() for tag in raw.split(',')}

    # "disabled" is a control keyword, not a tag: mark the site disabled
    # instead of storing it.
    if "disabled" in new_tags:
        new_tags.remove("disabled")
        site.disabled = True

    print(f'Old alexa rank: {site.alexa_rank}')
    rank = get_alexa_rank(site.url_main)
    if rank:  # keep the previous rank when the lookup fails (rank == 0)
        print(f'New alexa rank: {rank}')
        site.alexa_rank = rank

    # Replace the tag list wholesale, dropping empty entries.
    site.tags = [tag for tag in new_tags if tag]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")
    args = parser.parse_args()

    # Load the sites database once; it is written back after each update.
    # (Previously the result's ``.sites`` attribute was accessed and
    # discarded, and an unused ``pool`` list was created — both removed.)
    db = MaigretDatabase()
    db.load_from_file(args.base_file)

    # Endless review loop: pick random sites until interrupted (Ctrl+C).
    # uCoz-engine sites and already-tagged sites are skipped.
    while True:
        site = random.choice(db.sites)
        if site.engine == 'uCoz' or site.tags:
            continue

        update_tags(site)

        # Persist immediately so progress survives an interrupt.
        db.save_to_file(args.base_file)
||||||
Reference in New Issue
Block a user