Added new forums, updated ranks, some utils improvements (#481)

* Added new forums, updated ranks, some utils improvements

* Updated requirements
This commit is contained in:
Soxoj
2022-05-14 13:29:48 +03:00
committed by GitHub
parent 246c770d5c
commit cbe1f09536
5 changed files with 2356 additions and 920 deletions
+1777 -459
View File
File diff suppressed because it is too large Load Diff
+36 -7
View File
@@ -1,10 +1,11 @@
import asyncio import asyncio
import json import json
import re import re
from typing import List from typing import List, Tuple
import xml.etree.ElementTree as ET import xml.etree.ElementTree as ET
from aiohttp import TCPConnector, ClientSession from aiohttp import TCPConnector, ClientSession
import requests import requests
import cloudscraper
from .activation import import_aiohttp_cookies from .activation import import_aiohttp_cookies
from .checking import maigret from .checking import maigret
@@ -14,6 +15,27 @@ from .sites import MaigretDatabase, MaigretSite, MaigretEngine
from .utils import get_random_user_agent, get_match_ratio from .utils import get_random_user_agent, get_match_ratio
class CloudflareSession:
    """Minimal aiohttp-like async wrapper around a cloudscraper session.

    Mimics the subset of the aiohttp ClientSession / response interface the
    submitter code uses (``get``, ``text``, ``close``) so it can be swapped
    in when a target site is behind Cloudflare protection.
    """

    def __init__(self):
        # Synchronous scraper that solves Cloudflare's JS challenges.
        self.scraper = cloudscraper.create_scraper()
        # Initialize response state so text()/status_code() are safe to call
        # before the first get() — avoids AttributeError on the unset attrs.
        self.last_text = ''
        self.status = None

    async def get(self, *args, **kwargs):
        # cloudscraper is synchronous; run the request in the default
        # executor so the event loop is not blocked for its full duration
        # (the original `await asyncio.sleep(0)` did not prevent blocking).
        loop = asyncio.get_event_loop()
        res = await loop.run_in_executor(
            None, lambda: self.scraper.get(*args, **kwargs)
        )
        self.last_text = res.text
        self.status = res.status_code
        # Return self so callers can treat the session object like an
        # aiohttp response: `r = await s.get(url); await r.text()`.
        return self

    def status_code(self):
        # NOTE(review): aiohttp exposes `status` as an attribute; here it is
        # a method — callers must use `.status_code()` or read `.status`.
        return self.status

    async def text(self):
        # Body of the most recent response ('' if no request was made yet).
        return self.last_text

    async def close(self):
        # Kept as a no-op coroutine for aiohttp interface compatibility;
        # requests.Session sockets are reclaimed by GC.
        pass
class Submitter: class Submitter:
HEADERS = { HEADERS = {
"User-Agent": get_random_user_agent(), "User-Agent": get_random_user_agent(),
@@ -141,16 +163,18 @@ class Submitter:
fields['urlSubpath'] = f'/{subpath}' fields['urlSubpath'] = f'/{subpath}'
return fields return fields
async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]: async def detect_known_engine(self, url_exists, url_mainpage) -> [List[MaigretSite], str]:
resp_text = '' resp_text = ''
try: try:
r = await self.session.get(url_mainpage) r = await self.session.get(url_mainpage)
resp_text = await r.text() content = await r.content.read()
charset = r.charset or "utf-8"
resp_text = content.decode(charset, "ignore")
self.logger.debug(resp_text) self.logger.debug(resp_text)
except Exception as e: except Exception as e:
self.logger.warning(e) self.logger.warning(e)
print("Some error while checking main page") print("Some error while checking main page")
return [] return [], resp_text
for engine in self.db.engines: for engine in self.db.engines:
strs_to_check = engine.__dict__.get("presenseStrs") strs_to_check = engine.__dict__.get("presenseStrs")
@@ -193,9 +217,9 @@ class Submitter:
) )
sites.append(maigret_site) sites.append(maigret_site)
return sites return sites, resp_text
return [] return [], resp_text
def extract_username_dialog(self, url): def extract_username_dialog(self, url):
url_parts = url.rstrip("/").split("/") url_parts = url.rstrip("/").split("/")
@@ -338,10 +362,15 @@ class Submitter:
print('Detecting site engine, please wait...') print('Detecting site engine, please wait...')
sites = [] sites = []
try: try:
sites = await self.detect_known_engine(url_exists, url_mainpage) sites, text = await self.detect_known_engine(url_exists, url_exists)
except KeyboardInterrupt: except KeyboardInterrupt:
print('Engine detect process is interrupted.') print('Engine detect process is interrupted.')
if 'cloudflare' in text.lower():
print('Cloudflare protection detected. I will use cloudscraper for further work')
# self.session = CloudflareSession()
if not sites: if not sites:
print("Unable to detect site engine, lets generate checking features") print("Unable to detect site engine, lets generate checking features")
+1
View File
@@ -36,3 +36,4 @@ yarl==1.7.2
networkx==2.5.1 networkx==2.5.1
pyvis==0.2.1 pyvis==0.2.1
reportlab==3.6.9 reportlab==3.6.9
cloudscraper==1.2.60
+531 -449
View File
File diff suppressed because it is too large Load Diff
+10 -4
View File
@@ -3,7 +3,7 @@ import random
from argparse import ArgumentParser, RawDescriptionHelpFormatter from argparse import ArgumentParser, RawDescriptionHelpFormatter
from maigret.maigret import MaigretDatabase from maigret.maigret import MaigretDatabase
from maigret.submit import get_alexa_rank from maigret.submit import Submitter
def update_tags(site): def update_tags(site):
@@ -22,7 +22,7 @@ def update_tags(site):
site.disabled = True site.disabled = True
print(f'Old alexa rank: {site.alexa_rank}') print(f'Old alexa rank: {site.alexa_rank}')
rank = get_alexa_rank(site.url_main) rank = Submitter.get_alexa_rank(site.url_main)
if rank: if rank:
print(f'New alexa rank: {rank}') print(f'New alexa rank: {rank}')
site.alexa_rank = rank site.alexa_rank = rank
@@ -36,6 +36,7 @@ if __name__ == '__main__':
parser.add_argument("--base","-b", metavar="BASE_FILE", parser.add_argument("--base","-b", metavar="BASE_FILE",
dest="base_file", default="maigret/resources/data.json", dest="base_file", default="maigret/resources/data.json",
help="JSON file with sites data to update.") help="JSON file with sites data to update.")
parser.add_argument("--name", help="Name of site to check")
pool = list() pool = list()
@@ -45,12 +46,17 @@ if __name__ == '__main__':
db.load_from_file(args.base_file).sites db.load_from_file(args.base_file).sites
while True: while True:
if args.name:
sites = list(db.ranked_sites_dict(names=[args.name]).values())
site = random.choice(sites)
else:
site = random.choice(db.sites) site = random.choice(db.sites)
if site.engine == 'uCoz': if site.engine == 'uCoz':
continue continue
if not 'in' in site.tags: # if not 'in' in site.tags:
continue # continue
update_tags(site) update_tags(site)