Merge pull request #115 from soxoj/submit-source-improving

Added some new sites, implemented filtering by source site with `--na…
This commit is contained in:
soxoj
2021-04-29 17:18:31 +03:00
committed by GitHub
6 changed files with 142 additions and 58 deletions
+2 -1
View File
@@ -19,6 +19,7 @@ from .executors import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor
from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite
from .types import CheckError
from .utils import get_random_user_agent
supported_recursive_search_ids = (
@@ -383,7 +384,7 @@ async def maigret(username, site_dict, logger, query_notify=None,
results_site['cookies'] = cookie_jar and cookie_jar.filter_cookies(site.url_main) or None
headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0',
'User-Agent': get_random_user_agent(),
}
headers.update(site.headers)
+1 -1
View File
@@ -275,7 +275,7 @@ async def main():
site_data = get_top_sites_for_id(args.id_type)
if args.new_site_to_submit:
is_submitted = await submit_dialog(db, args.new_site_to_submit, args.cookie_file)
is_submitted = await submit_dialog(db, args.new_site_to_submit, args.cookie_file, logger)
if is_submitted:
db.save_to_file(args.db_file)
+111 -39
View File
@@ -5436,13 +5436,12 @@
},
"Gitmemory": {
"tags": [
"coding",
"github",
"in"
"coding"
],
"checkType": "message",
"absenceStrs": "Oops,404",
"alexaRank": 6827,
"source": "GitHub",
"url": "https://www.gitmemory.com/{username}",
"urlMain": "https://www.gitmemory.com",
"usernameClaimed": "adam",
@@ -5746,12 +5745,11 @@
},
"Gramho": {
"tags": [
"instagram",
"jp",
"photo"
],
"checkType": "status_code",
"alexaRank": 4445,
"source": "Instagram",
"url": "https://gramho.com/explore-hashtag/{username}",
"urlMain": "https://gramho.com/",
"usernameClaimed": "adam",
@@ -7228,13 +7226,12 @@
},
"Libraries": {
"tags": [
"coding",
"github",
"in"
"coding"
],
"regexCheck": "^[^\\.]+$",
"checkType": "status_code",
"alexaRank": 65552,
"source": "GitHub",
"url": "https://libraries.io/github/{username}/",
"urlMain": "https://libraries.io",
"usernameClaimed": "snooppr",
@@ -9825,9 +9822,7 @@
},
"Picuki": {
"tags": [
"instagram",
"photo",
"us"
"photo"
],
"checkType": "message",
"absenceStrs": [
@@ -11722,8 +11717,9 @@
},
"Shutterstock": {
"tags": [
"fi",
"us"
"photo",
"music",
"stock"
],
"checkType": "message",
"absenceStrs": "T\u00e4m\u00e4p\u00e4 yll\u00e4tt\u00e4v\u00e4\u00e4...",
@@ -12244,9 +12240,7 @@
},
"Steam": {
"tags": [
"gaming",
"steam",
"us"
"gaming"
],
"checkType": "message",
"absenceStrs": "The specified profile could not be found",
@@ -12256,14 +12250,28 @@
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"SteamGroup": {
"Steam (by id)": {
"tags": [
"steam",
"us"
"gaming"
],
"type": "steam_id",
"checkType": "message",
"absenceStrs": "The specified profile could not be found",
"alexaRank": 370,
"source": "Steam",
"url": "https://steamcommunity.com/profiles/{username}",
"urlMain": "https://steamcommunity.com/",
"usernameClaimed": "76561197960287930",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"Steam (Group)": {
"tags": [
"gaming"
],
"checkType": "message",
"absenceStrs": "No group could be retrieved for the given URL",
"alexaRank": 370,
"source": "Steam",
"url": "https://steamcommunity.com/groups/{username}",
"urlMain": "https://steamcommunity.com/",
"usernameClaimed": "blue",
@@ -12271,14 +12279,12 @@
},
"Steamid": {
"tags": [
"eg",
"gaming",
"steam",
"us"
"gaming"
],
"checkType": "message",
"absenceStrs": "<div class=\"alert alert-warning\">Profile not found</div>",
"alexaRank": 302717,
"source": "Steam",
"url": "https://steamid.uk/profile/{username}",
"urlMain": "https://steamid.uk/",
"usernameClaimed": "blue",
@@ -12286,15 +12292,13 @@
},
"Steamid (by id)": {
"tags": [
"eg",
"gaming",
"steam",
"us"
"gaming"
],
"type": "steam_id",
"checkType": "message",
"absenceStrs": "<div class=\"alert alert-warning\">Profile not found</div>",
"alexaRank": 302717,
"source": "Steam",
"url": "https://steamid.uk/profile/{username}",
"urlMain": "https://steamid.uk/",
"usernameClaimed": "76561197982198022",
@@ -12302,9 +12306,7 @@
},
"Steamidfinder": {
"tags": [
"gaming",
"steam",
"us"
"gaming"
],
"checkType": "message",
"presenseStrs": [
@@ -12314,6 +12316,7 @@
"could not be found."
],
"alexaRank": 72851,
"source": "Steam",
"url": "https://steamidfinder.com/lookup/{username}",
"urlMain": "https://steamidfinder.com",
"usernameClaimed": "channel",
@@ -12321,9 +12324,7 @@
},
"Steamidfinder (by id)": {
"tags": [
"gaming",
"steam",
"us"
"gaming"
],
"type": "steam_id",
"checkType": "message",
@@ -12334,6 +12335,7 @@
"could not be found."
],
"alexaRank": 72851,
"source": "Steam",
"url": "https://steamidfinder.com/lookup/{username}",
"urlMain": "https://steamidfinder.com",
"usernameClaimed": "76561197982198022",
@@ -14688,6 +14690,7 @@
"\u041f\u043e\u043b\u044c\u0437\u043e\u0432\u0430\u0442\u0435\u043b\u044c \u0441\u043a\u0440\u044b\u043b \u0441\u0432\u043e\u044e \u043f\u0443\u0431\u043b\u0438\u0447\u043d\u0443\u044e \u0441\u0442\u0440\u0430\u043d\u0438\u0446\u0443"
],
"alexaRank": 48,
"source": "Yandex",
"url": "https://reviews.yandex.ru/user/{username}",
"urlMain": "https://yandex.ru/",
"usernameClaimed": "20vpvmmwpnwyb0dpbnjvy3k14c",
@@ -14700,6 +14703,7 @@
],
"checkType": "status_code",
"alexaRank": 48,
"source": "Yandex",
"url": "https://yandex.ru/bugbounty/researchers/{username}/",
"urlMain": "https://yandex.ru/bugbounty/",
"usernameClaimed": "pyrk1",
@@ -14722,18 +14726,21 @@
],
"absenceStrs": "cl-not-found-content__title",
"alexaRank": 48,
"source": "Yandex",
"url": "https://yandex.ru/collections/user/{username}",
"urlMain": "https://yandex.ru/collections/",
"usernameClaimed": "yandex",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"YandexLocal": {
"disabled": true,
"tags": [
"ru"
],
"type": "yandex_public_id",
"checkType": "status_code",
"alexaRank": 48,
"source": "Yandex",
"url": "https://local.yandex.ru/users/{username}",
"urlMain": "https://local.yandex.ru/",
"usernameClaimed": "gp7v6ufryzw3m1nvdj4ycexa8g",
@@ -14747,6 +14754,7 @@
"checkType": "message",
"absenceStrs": "//yastatic.net/market-export/_/i/zero-state/404.svg",
"alexaRank": 48,
"source": "Yandex",
"url": "https://market.yandex.ru/user/{username}",
"urlMain": "https://market.yandex.ru/",
"usernameClaimed": "6j2uh4rhp5d9gqgbynaqy2p75m",
@@ -14763,6 +14771,7 @@
"urlProbe": "https://music.yandex.ru/handlers/library.jsx?owner={username}",
"checkType": "status_code",
"alexaRank": 48,
"source": "Yandex",
"url": "https://music.yandex.ru/users/{username}/playlists",
"urlMain": "https://music.yandex.ru/",
"usernameClaimed": "YandexMusic",
@@ -14785,6 +14794,7 @@
"type": "yandex_public_id",
"checkType": "status_code",
"alexaRank": 48,
"source": "Yandex",
"url": "https://yandex.ru/q/profile/{username}",
"urlMain": "https://yandex.ru/q/",
"usernameClaimed": "blue",
@@ -14796,6 +14806,7 @@
],
"checkType": "status_code",
"alexaRank": 48,
"source": "Yandex",
"url": "https://zen.yandex.ru/{username}",
"urlMain": "https://zen.yandex.ru",
"usernameClaimed": "tema",
@@ -14808,6 +14819,7 @@
"type": "yandex_public_id",
"checkType": "status_code",
"alexaRank": 48,
"source": "Yandex",
"url": "https://zen.yandex.ru/user/{username}",
"urlMain": "https://zen.yandex.ru",
"usernameClaimed": "20vpvmmwpnwyb0dpbnjvy3k14c",
@@ -18124,8 +18136,7 @@
"tracr.co": {
"disabled": true,
"tags": [
"gaming",
"discord"
"gaming"
],
"errors": {
"502 - Bad Gateway": "Site error",
@@ -18134,6 +18145,7 @@
"regexCheck": "^[A-Za-z0-9]{2,32}$",
"checkType": "message",
"absenceStrs": "No search results",
"source": "Discord",
"url": "https://tracr.co/users/1/{username}",
"urlMain": "https://tracr.co/",
"usernameClaimed": "blue",
@@ -18171,8 +18183,7 @@
},
"uID.me (by username)": {
"tags": [
"ru",
"ucoz"
"ru"
],
"checkType": "status_code",
"alexaRank": 24715,
@@ -18183,8 +18194,7 @@
},
"uID.me (by uguid)": {
"tags": [
"ru",
"ucoz"
"ru"
],
"type": "uidme_uguid",
"checkType": "status_code",
@@ -22825,6 +22835,7 @@
],
"engine": "engineRedirect",
"alexaRank": 72,
"source": "GitHub",
"url": "https://gist.github.com/{username}",
"urlMain": "https://gist.github.com",
"usernameUnclaimed": "noonewouldeverusethis7",
@@ -23664,6 +23675,9 @@
"usernameUnclaimed": "noonewouldeverusethis7"
},
"pikabu.monster": {
"tags": [
"ru"
],
"checkType": "message",
"presenseStrs": [
"usertotalcomments",
@@ -23677,6 +23691,64 @@
"urlMain": "https://pikabu.monster",
"usernameClaimed": "Avezenit",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"steamdb.info": {
"tags": [
"gaming"
],
"type": "steam_id",
"checkType": "message",
"presenseStrs": [
"profileForm",
" player-name",
" progress",
" data-not-game="
],
"absenceStrs": [
"error-page",
" Error 404"
],
"source": "Steam",
"url": "https://steamdb.info/calculator/{username}",
"urlMain": "https://steamdb.info",
"usernameClaimed": "76561197978866368",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"Niftygateway": {
"urlProbe": "https://api.niftygateway.com/user/profile-and-offchain-nifties-by-url/?profile_url={username}",
"checkType": "message",
"presenseStrs": [
"profile_url",
"name",
"profile_pic_url",
"verified",
"bio"
],
"absenceStrs": [
"not_found",
" User profile not located in our system."
],
"url": "https://niftygateway.com/profile/{username}",
"urlMain": "https://api.niftygateway.com",
"usernameClaimed": "admin",
"usernameUnclaimed": "noonewouldeverusethis7"
},
"opensea.io": {
"checkType": "message",
"presenseStrs": [
"username\\",
"lastSale",
"publicUsername",
"name",
"user"
],
"absenceStrs": [
"><div width="
],
"url": "https://opensea.io/accounts/{username}",
"urlMain": "https://opensea.io",
"usernameClaimed": "admin",
"usernameUnclaimed": "noonewouldeverusethis7"
}
},
"engines": {
+3 -2
View File
@@ -15,7 +15,7 @@ SUPPORTED_TAGS = [
'discussion', 'sharing', 'writing', 'wiki', 'business', 'shopping', 'sport',
'books', 'news', 'documents', 'travel', 'maps', 'hobby', 'apps', 'classified',
'career', 'geosocial', 'streaming', 'education', 'networking', 'torrent',
'science', 'medicine',
'science', 'medicine', 'reading', 'stock',
]
@@ -199,13 +199,14 @@ class MaigretDatabase:
normalized_tags = list(map(str.lower, tags))
is_name_ok = lambda x: x.name.lower() in normalized_names
is_source_ok = lambda x: x.source and x.source.lower() in normalized_names
is_engine_ok = lambda x: isinstance(x.engine, str) and x.engine.lower() in normalized_tags
is_tags_ok = lambda x: set(x.tags).intersection(set(normalized_tags))
is_disabled_needed = lambda x: not x.disabled or ('disabled' in tags or disabled)
is_id_type_ok = lambda x: x.type == id_type
filter_tags_engines_fun = lambda x: not tags or is_engine_ok(x) or is_tags_ok(x)
filter_names_fun = lambda x: not names or is_name_ok(x)
filter_names_fun = lambda x: not names or is_name_ok(x) or is_source_ok(x)
filter_fun = lambda x: filter_tags_engines_fun(x) and filter_names_fun(x) \
and is_disabled_needed(x) and is_id_type_ok(x)
+15 -15
View File
@@ -3,6 +3,7 @@ import difflib
import requests
from .checking import *
from .utils import get_random_user_agent
DESIRED_STRINGS = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
@@ -11,7 +12,7 @@ DESIRED_STRINGS = ["username", "not found", "пользователь", "profile
SUPPOSED_USERNAMES = ['alex', 'god', 'admin', 'red', 'blue', 'john']
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11.1; rv:55.0) Gecko/20100101 Firefox/55.0',
'User-Agent': get_random_user_agent(),
}
RATIO = 0.6
@@ -125,7 +126,7 @@ async def detect_known_engine(db, url_exists, url_mainpage):
return None
async def check_features_manually(db, url_exists, url_mainpage, cookie_file, redirects=False):
async def check_features_manually(db, url_exists, url_mainpage, cookie_file, logger, redirects=True):
url_parts = url_exists.split('/')
supposed_username = url_parts[-1]
new_name = input(f'Is "{supposed_username}" a valid username? If not, write it manually: ')
@@ -143,7 +144,13 @@ async def check_features_manually(db, url_exists, url_mainpage, cookie_file, red
cookie_dict = {c.key: c.value for c in cookie_jar}
exists_resp = requests.get(url_exists, cookies=cookie_dict, headers=HEADERS, allow_redirects=redirects)
logger.debug(exists_resp.status_code)
logger.debug(exists_resp.text)
non_exists_resp = requests.get(url_not_exists, cookies=cookie_dict, headers=HEADERS, allow_redirects=redirects)
logger.debug(non_exists_resp.status_code)
logger.debug(non_exists_resp.text)
a = exists_resp.text
b = non_exists_resp.text
@@ -187,7 +194,8 @@ async def check_features_manually(db, url_exists, url_mainpage, cookie_file, red
site = MaigretSite(url_mainpage.split('/')[-1], site_data)
return site
async def submit_dialog(db, url_exists, cookie_file):
async def submit_dialog(db, url_exists, cookie_file, logger):
domain_raw = URL_RE.sub('', url_exists).strip().strip('/')
domain_raw = domain_raw.split('/')[0]
@@ -208,19 +216,11 @@ async def submit_dialog(db, url_exists, cookie_file):
sites = await detect_known_engine(db, url_exists, url_mainpage)
if not sites:
print('Unable to detect site engine, lets generate checking features')
sites = [await check_features_manually(db, url_exists, url_mainpage, cookie_file)]
sites = [await check_features_manually(db, url_exists, url_mainpage, cookie_file, logger)]
print(sites[0].__dict__)
logger.debug(sites[0].__dict__)
sem = asyncio.Semaphore(1)
log_level = logging.INFO
logging.basicConfig(
format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
datefmt='%H:%M:%S',
level=log_level
)
logger = logging.getLogger('site-submit')
logger.setLevel(log_level)
found = False
chosen_site = None
@@ -236,9 +236,9 @@ async def submit_dialog(db, url_exists, cookie_file):
print('Try to run this mode again and increase features count or choose others.')
else:
if input(f'Site {chosen_site.name} successfully checked. Do you want to save it in the Maigret DB? [Yn] ').lower() in 'y':
print(chosen_site.json)
logger.debug(chosen_site.json)
site_data = chosen_site.strip_engine_data()
print(site_data.json)
logger.debug(site_data.json)
db.update_site(site_data)
return True
+10
View File
@@ -1,4 +1,10 @@
import re
import random
# User-Agent header values that get_random_user_agent() chooses from.
DEFAULT_USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
]
class CaseConverter:
@@ -76,3 +82,7 @@ def get_dict_ascii_tree(items, prepend='', new_line=True):
text = text[1:]
return text
def get_random_user_agent():
    """Return one User-Agent string picked at random from DEFAULT_USER_AGENTS."""
    user_agent = random.choice(DEFAULT_USER_AGENTS)
    return user_agent