mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-13 18:05:39 +00:00
Tags updates, script added
This commit is contained in:
+1791
-963
File diff suppressed because it is too large
Load Diff
+4
-1
@@ -61,6 +61,9 @@ SUPPORTED_TAGS = [
|
||||
"military",
|
||||
"auto",
|
||||
"gambling",
|
||||
"business",
|
||||
"cybercriminal",
|
||||
"review",
|
||||
]
|
||||
|
||||
|
||||
@@ -472,7 +475,7 @@ class MaigretDatabase:
|
||||
output += f"{count}\t{url}\n"
|
||||
|
||||
output += "Top tags:\n"
|
||||
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:20]:
|
||||
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
|
||||
mark = ""
|
||||
if tag not in SUPPORTED_TAGS:
|
||||
mark = " (non-standard)"
|
||||
|
||||
+15
-1
@@ -2,7 +2,7 @@ import asyncio
|
||||
import difflib
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
import xml.etree.ElementTree as ET
|
||||
import requests
|
||||
|
||||
from .activation import import_aiohttp_cookies
|
||||
@@ -46,6 +46,20 @@ def get_match_ratio(x):
|
||||
)
|
||||
|
||||
|
||||
def get_alexa_rank(site_url_main, timeout=10):
    """Fetch the Alexa reach rank for a site's main page URL.

    Queries the ``data.alexa.com`` XML endpoint and reads the ``RANK``
    attribute of the first ``REACH`` element in the response.

    Args:
        site_url_main: URL of the site's main page, sent as the ``url``
            query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The rank as an ``int``, or ``0`` when the response is not valid XML,
        contains no ``REACH`` element, or carries a missing or non-numeric
        ``RANK`` attribute.

    Raises:
        requests.RequestException: If the HTTP request itself fails
            (connection error, timeout, ...).
    """
    # NOTE(review): the Alexa ranking service is reported as discontinued;
    # confirm this endpoint still responds before relying on the result.

    # Pass the query via ``params`` so the site URL is percent-encoded;
    # raw interpolation breaks on inputs containing '&', '#' or '?'.
    # An explicit timeout keeps a stalled server from hanging the caller.
    response = requests.get(
        "http://data.alexa.com/data",
        params={"cli": 10, "url": site_url_main},
        timeout=timeout,
    )

    # The lookup is best-effort: an unparsable body means "rank unknown".
    try:
        root = ET.fromstring(response.text)
    except ET.ParseError:
        return 0

    reach = root.find('.//REACH')
    if reach is None:
        return 0

    try:
        return int(reach.attrib['RANK'])
    except (KeyError, ValueError):
        return 0
|
||||
|
||||
|
||||
def extract_mainpage_url(url):
    """Return the first three '/'-separated components of *url*, rejoined.

    For a URL shaped like ``scheme://host/path`` this yields
    ``scheme://host``; inputs with fewer than three components are
    returned unchanged.
    """
    # Split at most three times: everything after the third slash is
    # collected in a fourth piece, which is then discarded.
    pieces = url.split("/", 3)
    leading = pieces[:3]
    return "/".join(leading)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user