Improve extraction of IDs from URLs; add tests

This commit is contained in:
Soxoj
2021-05-06 22:35:44 +03:00
parent 009d51c380
commit 0e9655c46a
10 changed files with 129 additions and 26 deletions
+29 -13
View File
@@ -60,6 +60,17 @@ def notify_about_errors(search_results: QueryResultWrapper, query_notify):
)
def extract_ids_from_url(url: str, db: MaigretDatabase) -> dict:
    """Collect account identifiers recognizable in *url*.

    Every site in the database is asked to parse the URL; each successful
    extraction contributes one ``identifier -> identifier type`` entry.

    Args:
        url: The URL to inspect.
        db: Sites database whose entries implement ``extract_id_from_url``.

    Returns:
        Mapping of extracted identifier to its type (e.g. ``"username"``).
        Empty when no site matched.
    """
    found = {}
    for site in db.sites:
        extracted = site.extract_id_from_url(url)
        if extracted:
            # a later site with the same identifier overwrites the type
            identifier, id_type = extracted
            found[identifier] = id_type
    return found
def extract_ids_from_page(url, logger, timeout=5) -> dict:
results = {}
# url, headers
@@ -105,10 +116,8 @@ def extract_ids_from_results(results: QueryResultWrapper, db: MaigretDatabase) -
ids_results[u] = utype
for url in dictionary.get('ids_links', []):
for s in db.sites:
u = s.detect_username(url)
if u:
ids_results[u] = 'username'
ids_results.update(extract_ids_from_url(url, db))
return ids_results
@@ -129,10 +138,9 @@ def setup_arguments_parser():
)
parser.add_argument(
"username",
nargs='?',
nargs='*',
metavar="USERNAMES",
action="append",
help="One or more usernames to check with social networks.",
help="One or more usernames to search by.",
)
parser.add_argument(
"--version",
@@ -231,7 +239,9 @@ def setup_arguments_parser():
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080",
)
filter_group = parser.add_argument_group('Site filtering', 'Options to set site search scope')
filter_group = parser.add_argument_group(
'Site filtering', 'Options to set site search scope'
)
filter_group.add_argument(
"-a",
"--all-sites",
@@ -269,7 +279,7 @@ def setup_arguments_parser():
modes_group = parser.add_argument_group(
'Operating modes',
'Various functions except the default search by a username. '
'Modes are executed sequentially in the order of declaration.'
'Modes are executed sequentially in the order of declaration.',
)
modes_group.add_argument(
"--parse",
@@ -296,10 +306,12 @@ def setup_arguments_parser():
"--stats",
action="store_true",
default=False,
help="Show database statistics (most frequent sites engines and tags)."
help="Show database statistics (most frequent sites engines and tags).",
)
output_group = parser.add_argument_group('Output options', 'Options to change verbosity and view of the console output')
output_group = parser.add_argument_group(
'Output options', 'Options to change verbosity and view of the console output'
)
output_group.add_argument(
"--print-not-found",
action="store_true",
@@ -354,7 +366,9 @@ def setup_arguments_parser():
help="Don't show progressbar.",
)
report_group = parser.add_argument_group('Report formats', 'Supported formats of report files')
report_group = parser.add_argument_group(
'Report formats', 'Supported formats of report files'
)
report_group.add_argument(
"-T",
"--txt",
@@ -446,7 +460,9 @@ async def main():
print("Using the proxy: " + args.proxy)
if args.parse_url:
extracted_ids = extract_ids_from_page(args.parse_url, logger, timeout=args.timeout)
extracted_ids = extract_ids_from_page(
args.parse_url, logger, timeout=args.timeout
)
usernames.update(extracted_ids)
if args.tags:
+2
View File
@@ -282,6 +282,8 @@ class QueryNotifyPrint(QueryNotify):
sys.stdout.write("\x1b[1K\r")
print(notify)
return notify
def __str__(self):
"""Convert Object To String.
+2
View File
@@ -14365,6 +14365,7 @@
"ru"
],
"checkType": "response_url",
"regexCheck": "^(?!id\\d)\\w*$",
"alexaRank": 27,
"urlMain": "https://vk.com/",
"url": "https://vk.com/{username}",
@@ -14379,6 +14380,7 @@
"checkType": "response_url",
"alexaRank": 27,
"urlMain": "https://vk.com/",
"regexCheck": "^\\d+$",
"url": "https://vk.com/id{username}",
"source": "VK",
"usernameClaimed": "270433952",
+14 -1
View File
@@ -3,7 +3,7 @@
import copy
import json
import sys
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Tuple
import requests
@@ -146,6 +146,19 @@ class MaigretSite:
return None
def extract_id_from_url(self, url: str) -> Optional[Tuple[str, str]]:
    """Try to pull an account identifier out of *url* for this site.

    Args:
        url: A URL that may point to an account page on this site.

    Returns:
        A ``(identifier, type)`` tuple when this site's compiled URL
        regexp matches, otherwise ``None`` (also when the site has no
        URL regexp at all).
    """
    if not self.url_regexp:
        return None
    matched = self.url_regexp.match(url)
    if matched is None:
        return None
    # the last capture group carries the username/id portion of the URL;
    # a trailing slash is not part of the identifier
    identifier = matched.groups()[-1].rstrip("/")
    return identifier, self.type
@property
def pretty_name(self):
if self.source:
+4 -2
View File
@@ -55,9 +55,11 @@ class URLMatcher:
url_main_part = self.extract_main_part(url)
for c in self.UNSAFE_SYMBOLS:
url_main_part = url_main_part.replace(c, f"\\{c}")
username_regexp = username_regexp or ".+?"
prepared_username_regexp = (username_regexp or ".+?").lstrip('^').rstrip('$')
url_regexp = url_main_part.replace("{username}", f"({username_regexp})")
url_regexp = url_main_part.replace(
"{username}", f"({prepared_username_regexp})"
)
regexp_str = self._HTTP_URL_RE_STR.replace("(.+)", url_regexp)
return re.compile(regexp_str)