mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-06 14:08:59 +00:00
Added new forums, updated ranks, some utils improvements (#481)
* Added new forums, updated ranks, some utils improvements * Updated requirements
This commit is contained in:
+1777
-459
File diff suppressed because it is too large
Load Diff
+36
-7
@@ -1,10 +1,11 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from typing import List
|
from typing import List, Tuple
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
from aiohttp import TCPConnector, ClientSession
|
from aiohttp import TCPConnector, ClientSession
|
||||||
import requests
|
import requests
|
||||||
|
import cloudscraper
|
||||||
|
|
||||||
from .activation import import_aiohttp_cookies
|
from .activation import import_aiohttp_cookies
|
||||||
from .checking import maigret
|
from .checking import maigret
|
||||||
@@ -14,6 +15,27 @@ from .sites import MaigretDatabase, MaigretSite, MaigretEngine
|
|||||||
from .utils import get_random_user_agent, get_match_ratio
|
from .utils import get_random_user_agent, get_match_ratio
|
||||||
|
|
||||||
|
|
||||||
|
class CloudflareSession:
    """Session-like wrapper that fetches pages through ``cloudscraper``.

    Exposes coroutine ``get``/``text``/``close`` methods so it can be awaited
    like the async session used elsewhere in this module (presumably as a
    stand-in for the aiohttp session on Cloudflare-protected sites -- confirm
    against the caller before enabling it).

    The object doubles as its own response: ``get`` stores the body and status
    of the most recent request on the instance and returns ``self``, so a
    later ``get`` overwrites what an earlier caller would read.
    """

    # Class-level defaults keep ``status_code()`` and ``text()`` usable before
    # the first ``get()`` instead of raising AttributeError.
    last_text = ''
    status = None

    def __init__(self):
        self.scraper = cloudscraper.create_scraper()

    async def get(self, *args, **kwargs):
        """Perform a GET via the scraper and cache its body and status code.

        All positional and keyword arguments are forwarded unchanged to
        ``self.scraper.get``. Returns ``self``.
        """
        # The scraper call is synchronous; run it in the default executor so
        # the event loop is not blocked for the duration of the request.
        loop = asyncio.get_event_loop()
        res = await loop.run_in_executor(
            None, lambda: self.scraper.get(*args, **kwargs)
        )
        self.last_text = res.text
        self.status = res.status_code
        return self

    def status_code(self):
        """Return the status code cached by the last ``get`` (None if none)."""
        return self.status

    async def text(self):
        """Return the response body cached by the last ``get``."""
        # Yield to the event loop once before handing back the cached body.
        await asyncio.sleep(0)
        return self.last_text

    async def close(self):
        """No-op; present so the object can be closed like a real session."""
        pass
|
||||||
|
|
||||||
class Submitter:
|
class Submitter:
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
"User-Agent": get_random_user_agent(),
|
"User-Agent": get_random_user_agent(),
|
||||||
@@ -141,16 +163,18 @@ class Submitter:
|
|||||||
fields['urlSubpath'] = f'/{subpath}'
|
fields['urlSubpath'] = f'/{subpath}'
|
||||||
return fields
|
return fields
|
||||||
|
|
||||||
async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]:
|
async def detect_known_engine(self, url_exists, url_mainpage) -> [List[MaigretSite], str]:
|
||||||
resp_text = ''
|
resp_text = ''
|
||||||
try:
|
try:
|
||||||
r = await self.session.get(url_mainpage)
|
r = await self.session.get(url_mainpage)
|
||||||
resp_text = await r.text()
|
content = await r.content.read()
|
||||||
|
charset = r.charset or "utf-8"
|
||||||
|
resp_text = content.decode(charset, "ignore")
|
||||||
self.logger.debug(resp_text)
|
self.logger.debug(resp_text)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.warning(e)
|
self.logger.warning(e)
|
||||||
print("Some error while checking main page")
|
print("Some error while checking main page")
|
||||||
return []
|
return [], resp_text
|
||||||
|
|
||||||
for engine in self.db.engines:
|
for engine in self.db.engines:
|
||||||
strs_to_check = engine.__dict__.get("presenseStrs")
|
strs_to_check = engine.__dict__.get("presenseStrs")
|
||||||
@@ -193,9 +217,9 @@ class Submitter:
|
|||||||
)
|
)
|
||||||
sites.append(maigret_site)
|
sites.append(maigret_site)
|
||||||
|
|
||||||
return sites
|
return sites, resp_text
|
||||||
|
|
||||||
return []
|
return [], resp_text
|
||||||
|
|
||||||
def extract_username_dialog(self, url):
|
def extract_username_dialog(self, url):
|
||||||
url_parts = url.rstrip("/").split("/")
|
url_parts = url.rstrip("/").split("/")
|
||||||
@@ -338,10 +362,15 @@ class Submitter:
|
|||||||
print('Detecting site engine, please wait...')
|
print('Detecting site engine, please wait...')
|
||||||
sites = []
|
sites = []
|
||||||
try:
|
try:
|
||||||
sites = await self.detect_known_engine(url_exists, url_mainpage)
|
sites, text = await self.detect_known_engine(url_exists, url_exists)
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
print('Engine detect process is interrupted.')
|
print('Engine detect process is interrupted.')
|
||||||
|
|
||||||
|
|
||||||
|
if 'cloudflare' in text.lower():
|
||||||
|
print('Cloudflare protection detected. I will use cloudscraper for futher work')
|
||||||
|
# self.session = CloudflareSession()
|
||||||
|
|
||||||
if not sites:
|
if not sites:
|
||||||
print("Unable to detect site engine, lets generate checking features")
|
print("Unable to detect site engine, lets generate checking features")
|
||||||
|
|
||||||
|
|||||||
@@ -36,3 +36,4 @@ yarl==1.7.2
|
|||||||
networkx==2.5.1
|
networkx==2.5.1
|
||||||
pyvis==0.2.1
|
pyvis==0.2.1
|
||||||
reportlab==3.6.9
|
reportlab==3.6.9
|
||||||
|
cloudscraper==1.2.60
|
||||||
+11
-5
@@ -3,7 +3,7 @@ import random
|
|||||||
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||||
|
|
||||||
from maigret.maigret import MaigretDatabase
|
from maigret.maigret import MaigretDatabase
|
||||||
from maigret.submit import get_alexa_rank
|
from maigret.submit import Submitter
|
||||||
|
|
||||||
|
|
||||||
def update_tags(site):
|
def update_tags(site):
|
||||||
@@ -22,7 +22,7 @@ def update_tags(site):
|
|||||||
site.disabled = True
|
site.disabled = True
|
||||||
|
|
||||||
print(f'Old alexa rank: {site.alexa_rank}')
|
print(f'Old alexa rank: {site.alexa_rank}')
|
||||||
rank = get_alexa_rank(site.url_main)
|
rank = Submitter.get_alexa_rank(site.url_main)
|
||||||
if rank:
|
if rank:
|
||||||
print(f'New alexa rank: {rank}')
|
print(f'New alexa rank: {rank}')
|
||||||
site.alexa_rank = rank
|
site.alexa_rank = rank
|
||||||
@@ -36,6 +36,7 @@ if __name__ == '__main__':
|
|||||||
parser.add_argument("--base","-b", metavar="BASE_FILE",
|
parser.add_argument("--base","-b", metavar="BASE_FILE",
|
||||||
dest="base_file", default="maigret/resources/data.json",
|
dest="base_file", default="maigret/resources/data.json",
|
||||||
help="JSON file with sites data to update.")
|
help="JSON file with sites data to update.")
|
||||||
|
parser.add_argument("--name", help="Name of site to check")
|
||||||
|
|
||||||
pool = list()
|
pool = list()
|
||||||
|
|
||||||
@@ -45,12 +46,17 @@ if __name__ == '__main__':
|
|||||||
db.load_from_file(args.base_file).sites
|
db.load_from_file(args.base_file).sites
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
site = random.choice(db.sites)
|
if args.name:
|
||||||
|
sites = list(db.ranked_sites_dict(names=[args.name]).values())
|
||||||
|
site = random.choice(sites)
|
||||||
|
else:
|
||||||
|
site = random.choice(db.sites)
|
||||||
|
|
||||||
if site.engine == 'uCoz':
|
if site.engine == 'uCoz':
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not 'in' in site.tags:
|
# if not 'in' in site.tags:
|
||||||
continue
|
# continue
|
||||||
|
|
||||||
update_tags(site)
|
update_tags(site)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user