diff --git a/maigret/maigret.py b/maigret/maigret.py index a722740..75b0c7f 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -536,7 +536,7 @@ async def main(): site_data = get_top_sites_for_id(args.id_type) if args.new_site_to_submit: - submitter = Submitter(db=db, logger=logger, settings=settings) + submitter = Submitter(db=db, logger=logger, settings=settings, args=args) is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file) if is_submitted: db.save_to_file(db_file) diff --git a/maigret/resources/data.json b/maigret/resources/data.json index 884b205..e212bda 100644 --- a/maigret/resources/data.json +++ b/maigret/resources/data.json @@ -1833,6 +1833,7 @@ "usernameUnclaimed": "noonewouldeverusethis7" }, "Bestfantasybooks": { + "disabled": true, "tags": [ "us" ], @@ -4432,6 +4433,7 @@ ] }, "Facenama": { + "disabled": true, "tags": [ "ir" ], @@ -28440,6 +28442,156 @@ "usernameUnclaimed": "noonewouldeverusethis7", "checkType": "message", "alexaRank": 6859 + }, + "Worldis.me": { + "absenceStrs": [ + "user_password", + "send_email" + ], + "presenseStrs": [ + "my_profile", + "profile_upi", + "UserInfo" + ], + "url": "http://en.worldis.me/{username}", + "urlMain": "http://en.worldis.me", + "usernameClaimed": "admin", + "usernameUnclaimed": "noonewouldeverusethis7", + "checkType": "message", + "alexaRank": 3233509, + "tags": [ + "ru" + ] + }, + "photoshop-kopona.com": { + "absenceStrs": [ + "noonewouldeverusethis7 » \u0420\u0435\u0441\u0443\u0440\u0441\u044b \u0434\u043b\u044f \u0424\u043e\u0442\u043e\u0448\u043e\u043f\u0430" + ], + "presenseStrs": [ + "offline", + "uspusertitle" + ], + "url": "https://photoshop-kopona.com/ru/user/{username}/", + "urlMain": "https://photoshop-kopona.com", + "usernameClaimed": "test", + "usernameUnclaimed": "noonewouldeverusethis7", + "checkType": "message", + "alexaRank": 44106, + "tags": [ + "ru" + ] + }, + "dumskaya.net": { + "absenceStrs": [ + ">
+            ],
+            
+            ],
+            Username:" + ], + "presenseStrs": [ + "email", + "usernamereg", + "username-top", + "\u041e\u043f\u044b\u0442 \u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044f", + "check-username" + ], + "url": "https://xgm.guru/user/{username}", + "urlMain": "https://xgm.guru", + "usernameClaimed": "test", + "usernameUnclaimed": "noonewouldeverusethis7", + "checkType": "message", + "alexaRank": 692341, + "tags": [ + "forum", + "gaming" + ] } }, "engines": { diff --git a/maigret/submit.py b/maigret/submit.py index 7c4216b..352946f 100644 --- a/maigret/submit.py +++ b/maigret/submit.py @@ -3,6 +3,7 @@ import json import re from typing import List import xml.etree.ElementTree as ET +from aiohttp import TCPConnector, ClientSession import requests from .activation import import_aiohttp_cookies @@ -24,11 +25,24 @@ class Submitter: TOP_FEATURES = 5 URL_RE = re.compile(r"https?://(www\.)?") - def __init__(self, db: MaigretDatabase, settings: Settings, logger): + def __init__(self, db: MaigretDatabase, settings: Settings, logger, args): self.settings = settings + self.args = args self.db = db self.logger = logger + from aiohttp_socks import ProxyConnector + proxy = self.args.proxy + cookie_jar = None + if args.cookie_file: + cookie_jar = import_aiohttp_cookies(args.cookie_file) + + connector = ProxyConnector.from_url(proxy) if proxy else TCPConnector(ssl=False) + connector.verify_ssl = False + self.session = ClientSession( + connector=connector, trust_env=True, cookie_jar=cookie_jar + ) + @staticmethod def get_alexa_rank(site_url_main): url = f"http://data.alexa.com/data?cli=10&url={site_url_main}" @@ -63,6 +77,7 @@ class Submitter: results_dict = await maigret( username=username, site_dict={site.name: site}, + proxy=self.args.proxy, logger=self.logger, timeout=30, id_type=site.type, @@ -126,9 +141,11 @@ class Submitter: return fields async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]: + resp_text = '' try: - r = requests.get(url_mainpage) - self.logger.debug(r.text) + r = await self.session.get(url_mainpage) + resp_text = await r.text() + self.logger.debug(resp_text) except Exception as e: self.logger.warning(e) print("Some error while checking main page") @@ -136,10 +153,10 @@ class Submitter: for engine in self.db.engines: strs_to_check = engine.__dict__.get("presenseStrs") - if strs_to_check and r and r.text: + if strs_to_check and resp_text: all_strs_in_response = True for s in strs_to_check: - if s not in r.text: + if s not in resp_text: all_strs_in_response = False sites = [] if all_strs_in_response: @@ -209,32 +226,28 @@ class Submitter: headers = dict(self.HEADERS) headers.update(custom_headers) - # cookies - cookie_dict = None - if cookie_file: - self.logger.info(f'Use {cookie_file} for cookies') - cookie_jar = import_aiohttp_cookies(cookie_file) - cookie_dict = {c.key: c.value for c in cookie_jar} - - exists_resp = requests.get( - url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects - ) - self.logger.debug(url_exists) - self.logger.debug(exists_resp.status_code) - self.logger.debug(exists_resp.text) - - non_exists_resp = requests.get( - url_not_exists, - cookies=cookie_dict, + exists_resp = await self.session.get( + url_exists, headers=headers, allow_redirects=redirects, ) - self.logger.debug(url_not_exists) - self.logger.debug(non_exists_resp.status_code) - self.logger.debug(non_exists_resp.text) + exists_resp_text = await exists_resp.text() + self.logger.debug(url_exists) + self.logger.debug(exists_resp.status) + self.logger.debug(exists_resp_text) - a = exists_resp.text - b = non_exists_resp.text + non_exists_resp = await self.session.get( + url_not_exists, + headers=headers, + allow_redirects=redirects, + ) + non_exists_resp_text = await non_exists_resp.text() + self.logger.debug(url_not_exists) + self.logger.debug(non_exists_resp.status) + self.logger.debug(non_exists_resp_text) + + a = exists_resp_text + b = non_exists_resp_text tokens_a = set(re.split(f'[{self.SEPARATORS}]', a)) tokens_b = set(re.split(f'[{self.SEPARATORS}]', b)) diff --git a/utils/update_site_data.py b/utils/update_site_data.py index 8683255..12180fb 100755 --- a/utils/update_site_data.py +++ b/utils/update_site_data.py @@ -25,7 +25,7 @@ RANKS.update({ '100000000': '100M', }) -SEMAPHORE = threading.Semaphore(10) +SEMAPHORE = threading.Semaphore(20) def get_rank(domain_to_query, site, print_errors=True): with SEMAPHORE: