From ecabf88c3a10054e38fa25ba7dc0d63310ffd3ed Mon Sep 17 00:00:00 2001
From: Soxoj <31013580+soxoj@users.noreply.github.com>
Date: Mon, 3 Jan 2022 01:35:53 +0300
Subject: [PATCH] Added a couple of sites, fixed false positives (#286)
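
Disables the Bestfantasybooks and Facenama checks that were producing
false positives, adds Worldis.me, photoshop-kopona.com and xgm.guru,
moves the Submitter onto a shared aiohttp session with proxy and
cookie support, and doubles the rank-update concurrency.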
---
maigret/maigret.py | 2 +-
maigret/resources/data.json | 152 ++++++++++++++++++++++++++++++++++++
maigret/submit.py | 67 +++++++++-------
utils/update_site_data.py | 2 +-
4 files changed, 194 insertions(+), 29 deletions(-)
diff --git a/maigret/maigret.py b/maigret/maigret.py
index a722740..75b0c7f 100755
--- a/maigret/maigret.py
+++ b/maigret/maigret.py
@@ -536,7 +536,7 @@ async def main():
site_data = get_top_sites_for_id(args.id_type)
if args.new_site_to_submit:
- submitter = Submitter(db=db, logger=logger, settings=settings)
+ submitter = Submitter(db=db, logger=logger, settings=settings, args=args)
is_submitted = await submitter.dialog(args.new_site_to_submit, args.cookie_file)
if is_submitted:
db.save_to_file(db_file)
diff --git a/maigret/resources/data.json b/maigret/resources/data.json
index 884b205..e212bda 100644
--- a/maigret/resources/data.json
+++ b/maigret/resources/data.json
@@ -1833,6 +1833,7 @@
"usernameUnclaimed": "noonewouldeverusethis7"
},
"Bestfantasybooks": {
+ "disabled": true,
"tags": [
"us"
],
@@ -4432,6 +4433,7 @@
]
},
"Facenama": {
+ "disabled": true,
"tags": [
"ir"
],
@@ -28440,6 +28442,156 @@
"usernameUnclaimed": "noonewouldeverusethis7",
"checkType": "message",
"alexaRank": 6859
+ },
+ "Worldis.me": {
+ "absenceStrs": [
+ "user_password",
+ "send_email"
+ ],
+ "presenseStrs": [
+ "my_profile",
+ "profile_upi",
+ "UserInfo"
+ ],
+ "url": "http://en.worldis.me/{username}",
+ "urlMain": "http://en.worldis.me",
+ "usernameClaimed": "admin",
+ "usernameUnclaimed": "noonewouldeverusethis7",
+ "checkType": "message",
+ "alexaRank": 3233509,
+ "tags": [
+ "ru"
+ ]
+ },
+ "photoshop-kopona.com": {
+ "absenceStrs": [
+ "
noonewouldeverusethis7 » \u0420\u0435\u0441\u0443\u0440\u0441\u044b \u0434\u043b\u044f \u0424\u043e\u0442\u043e\u0448\u043e\u043f\u0430"
+ ],
+ "presenseStrs": [
+ "offline",
+ "uspusertitle"
+ ],
+ "url": "https://photoshop-kopona.com/ru/user/{username}/",
+ "urlMain": "https://photoshop-kopona.com",
+ "usernameClaimed": "test",
+ "usernameUnclaimed": "noonewouldeverusethis7",
+ "checkType": "message",
+ "alexaRank": 44106,
+ "tags": [
+ "ru"
+ ]
+ },
+ "dumskaya.net": {
+ "absenceStrs": [
+ ">![+ ],
+](/banner/ps2_/)
Username:"
+ ],
+ "presenseStrs": [
+ "email",
+ "usernamereg",
+ "username-top",
+ "\u041e\u043f\u044b\u0442 \u043f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044f",
+ "check-username"
+ ],
+ "url": "https://xgm.guru/user/{username}",
+ "urlMain": "https://xgm.guru",
+ "usernameClaimed": "test",
+ "usernameUnclaimed": "noonewouldeverusethis7",
+ "checkType": "message",
+ "alexaRank": 692341,
+ "tags": [
+ "forum",
+ "gaming"
+ ]
}
},
"engines": {
diff --git a/maigret/submit.py b/maigret/submit.py
index 7c4216b..352946f 100644
--- a/maigret/submit.py
+++ b/maigret/submit.py
@@ -3,6 +3,7 @@ import json
import re
from typing import List
import xml.etree.ElementTree as ET
+from aiohttp import TCPConnector, ClientSession
import requests
from .activation import import_aiohttp_cookies
@@ -24,11 +25,24 @@ class Submitter:
TOP_FEATURES = 5
URL_RE = re.compile(r"https?://(www\.)?")
- def __init__(self, db: MaigretDatabase, settings: Settings, logger):
+ def __init__(self, db: MaigretDatabase, settings: Settings, logger, args):
self.settings = settings
+ self.args = args
self.db = db
self.logger = logger
+ from aiohttp_socks import ProxyConnector
+
+ proxy = self.args.proxy
+ cookie_jar = None
+ if args.cookie_file:
+ cookie_jar = import_aiohttp_cookies(args.cookie_file)
+
+ # ssl=False disables certificate verification on both the proxied
+ # and the direct connector
+ connector = (
+ ProxyConnector.from_url(proxy, ssl=False) if proxy else TCPConnector(ssl=False)
+ )
+ self.session = ClientSession(
+ connector=connector, trust_env=True, cookie_jar=cookie_jar
+ )
+
@staticmethod
def get_alexa_rank(site_url_main):
url = f"http://data.alexa.com/data?cli=10&url={site_url_main}"
@@ -63,6 +77,7 @@ class Submitter:
results_dict = await maigret(
username=username,
site_dict={site.name: site},
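+ # run the verification checks through the user-supplied proxy as well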
+ proxy=self.args.proxy,
logger=self.logger,
timeout=30,
id_type=site.type,
@@ -126,9 +141,11 @@ class Submitter:
return fields
async def detect_known_engine(self, url_exists, url_mainpage) -> List[MaigretSite]:
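+ # an empty body makes the engine checks below fall through safely on fetch errors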
+ resp_text = ''
try:
- r = requests.get(url_mainpage)
- self.logger.debug(r.text)
+ r = await self.session.get(url_mainpage)
+ resp_text = await r.text()
+ self.logger.debug(resp_text)
except Exception as e:
self.logger.warning(e)
print("Some error while checking main page")
@@ -136,10 +153,10 @@ class Submitter:
for engine in self.db.engines:
strs_to_check = engine.__dict__.get("presenseStrs")
- if strs_to_check and r and r.text:
+ if strs_to_check and resp_text:
all_strs_in_response = True
for s in strs_to_check:
- if s not in r.text:
+ if s not in resp_text:
all_strs_in_response = False
sites = []
if all_strs_in_response:
@@ -209,32 +226,28 @@ class Submitter:
headers = dict(self.HEADERS)
headers.update(custom_headers)
- # cookies
- cookie_dict = None
- if cookie_file:
- self.logger.info(f'Use {cookie_file} for cookies')
- cookie_jar = import_aiohttp_cookies(cookie_file)
- cookie_dict = {c.key: c.value for c in cookie_jar}
-
- exists_resp = requests.get(
- url_exists, cookies=cookie_dict, headers=headers, allow_redirects=redirects
- )
- self.logger.debug(url_exists)
- self.logger.debug(exists_resp.status_code)
- self.logger.debug(exists_resp.text)
-
- non_exists_resp = requests.get(
- url_not_exists,
- cookies=cookie_dict,
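+ # per-request cookie handling is gone: the shared session's cookie_jar covers it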
+ exists_resp = await self.session.get(
+ url_exists,
headers=headers,
allow_redirects=redirects,
)
- self.logger.debug(url_not_exists)
- self.logger.debug(non_exists_resp.status_code)
- self.logger.debug(non_exists_resp.text)
+ exists_resp_text = await exists_resp.text()
+ self.logger.debug(url_exists)
+ self.logger.debug(exists_resp.status)
+ self.logger.debug(exists_resp_text)
- a = exists_resp.text
- b = non_exists_resp.text
+ non_exists_resp = await self.session.get(
+ url_not_exists,
+ headers=headers,
+ allow_redirects=redirects,
+ )
+ non_exists_resp_text = await non_exists_resp.text()
+ self.logger.debug(url_not_exists)
+ self.logger.debug(non_exists_resp.status)
+ self.logger.debug(non_exists_resp_text)
+
+ a = exists_resp_text
+ b = non_exists_resp_text
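+ # split both responses into token sets to compare the two pages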
tokens_a = set(re.split(f'[{self.SEPARATORS}]', a))
tokens_b = set(re.split(f'[{self.SEPARATORS}]', b))
diff --git a/utils/update_site_data.py b/utils/update_site_data.py
index 8683255..12180fb 100755
--- a/utils/update_site_data.py
+++ b/utils/update_site_data.py
@@ -25,7 +25,7 @@ RANKS.update({
'100000000': '100M',
})
-SEMAPHORE = threading.Semaphore(10)
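+# allow up to 20 concurrent rank lookups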
+SEMAPHORE = threading.Semaphore(20)
def get_rank(domain_to_query, site, print_errors=True):
with SEMAPHORE:
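--
Reviewer note: a minimal sketch of how the reworked Submitter is driven,
mirroring the maigret.py hunk above. load_db_and_settings() is a
hypothetical stand-in for however the CLI builds its database and
settings objects, and the Namespace carries only the fields this patch
actually reads (proxy, cookie_file, new_site_to_submit):

    import asyncio
    import logging
    from argparse import Namespace

    from maigret.submit import Submitter

    async def main():
        # hypothetical helper, not part of this patch
        db, settings = load_db_and_settings()
        args = Namespace(
            proxy=None,  # e.g. "socks5://127.0.0.1:9050" to route via aiohttp_socks
            cookie_file=None,  # passed to import_aiohttp_cookies when set
            new_site_to_submit="https://example.com/user/test",
        )
        submitter = Submitter(db=db, logger=logging.getLogger("maigret"),
                              settings=settings, args=args)
        # dialog() returns True when the new site entry should be saved
        if await submitter.dialog(args.new_site_to_submit, args.cookie_file):
            db.save_to_file("data.json")

    asyncio.run(main())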