Merge pull request #39 from soxoj/import-404-specific

Default engines for specific cases, sites list updated
This commit is contained in:
soxoj
2021-01-21 22:20:38 +03:00
committed by GitHub
4 changed files with 3156 additions and 2807 deletions
+1887 -1584
View File
File diff suppressed because it is too large Load Diff
+2 -1
View File
@@ -117,7 +117,8 @@ class MaigretSite:
# remove list items # remove list items
if isinstance(engine_data[k], list) and is_exists: if isinstance(engine_data[k], list) and is_exists:
for f in engine_data[k]: for f in engine_data[k]:
self_copy.__dict__[field].remove(f) if f in self_copy.__dict__[field]:
self_copy.__dict__[field].remove(f)
continue continue
if is_exists: if is_exists:
del self_copy.__dict__[field] del self_copy.__dict__[field]
+1241 -1199
View File
File diff suppressed because it is too large Load Diff
+26 -23
View File
@@ -24,32 +24,35 @@ RANKS.update({
'50000000': '10M', '50000000': '10M',
}) })
SEMAPHORE = threading.Semaphore(10)
def get_rank(domain_to_query, site, print_errors=True): def get_rank(domain_to_query, site, print_errors=True):
#Retrieve ranking data via alexa API with SEMAPHORE:
url = f"http://data.alexa.com/data?cli=10&url={domain_to_query}" #Retrieve ranking data via alexa API
xml_data = requests.get(url).text url = f"http://data.alexa.com/data?cli=10&url={domain_to_query}"
root = ET.fromstring(xml_data) xml_data = requests.get(url).text
root = ET.fromstring(xml_data)
try: try:
#Get ranking for this site. #Get ranking for this site.
site.alexa_rank = int(root.find('.//REACH').attrib['RANK']) site.alexa_rank = int(root.find('.//REACH').attrib['RANK'])
country = root.find('.//COUNTRY') country = root.find('.//COUNTRY')
if not country is None and country.attrib: if not country is None and country.attrib:
country_code = country.attrib['CODE'] country_code = country.attrib['CODE']
tags = set(site.tags) tags = set(site.tags)
if country_code: if country_code:
tags.add(country_code.lower()) tags.add(country_code.lower())
site.tags = sorted(list(tags)) site.tags = sorted(list(tags))
if site.type != 'username': if site.type != 'username':
site.disabled = False site.disabled = False
except Exception as e: except Exception as e:
if print_errors: if print_errors:
logging.error(e) logging.error(e)
# We did not find the rank for some reason. # We did not find the rank for some reason.
print(f"Error retrieving rank information for '{domain_to_query}'") print(f"Error retrieving rank information for '{domain_to_query}'")
print(f" Returned XML is |{xml_data}|") print(f" Returned XML is |{xml_data}|")
return return
def get_step_rank(rank): def get_step_rank(rank):