mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-13 18:05:39 +00:00
Improve extracting ids from URLs, tests
This commit is contained in:
+29
-13
@@ -60,6 +60,17 @@ def notify_about_errors(search_results: QueryResultWrapper, query_notify):
|
||||
)
|
||||
|
||||
|
||||
def extract_ids_from_url(url: str, db: MaigretDatabase) -> dict:
    """Collect every identifier that the known sites can extract from a URL.

    Each site in ``db.sites`` is asked to parse ``url`` through its
    ``extract_id_from_url`` method; sites returning a falsy result are
    skipped.

    :param url: URL to run against every site.
    :param db: object exposing the iterable ``sites``.
    :return: mapping of extracted identifier to its type. When several sites
        yield the same identifier, the type from the later site wins.
    """
    # Lazily query the sites in database order; each hit is an (id, type) pair.
    candidates = (site.extract_id_from_url(url) for site in db.sites)
    # dict() consumes the pairs in order, so a later duplicate id overrides
    # an earlier one.
    return dict(found for found in candidates if found)
|
||||
|
||||
|
||||
def extract_ids_from_page(url, logger, timeout=5) -> dict:
|
||||
results = {}
|
||||
# url, headers
|
||||
@@ -105,10 +116,8 @@ def extract_ids_from_results(results: QueryResultWrapper, db: MaigretDatabase) -
|
||||
ids_results[u] = utype
|
||||
|
||||
for url in dictionary.get('ids_links', []):
|
||||
for s in db.sites:
|
||||
u = s.detect_username(url)
|
||||
if u:
|
||||
ids_results[u] = 'username'
|
||||
ids_results.update(extract_ids_from_url(url, db))
|
||||
|
||||
return ids_results
|
||||
|
||||
|
||||
@@ -129,10 +138,9 @@ def setup_arguments_parser():
|
||||
)
|
||||
parser.add_argument(
|
||||
"username",
|
||||
nargs='?',
|
||||
nargs='*',
|
||||
metavar="USERNAMES",
|
||||
action="append",
|
||||
help="One or more usernames to check with social networks.",
|
||||
help="One or more usernames to search by.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
@@ -231,7 +239,9 @@ def setup_arguments_parser():
|
||||
help="Make requests over a proxy. e.g. socks5://127.0.0.1:1080",
|
||||
)
|
||||
|
||||
filter_group = parser.add_argument_group('Site filtering', 'Options to set site search scope')
|
||||
filter_group = parser.add_argument_group(
|
||||
'Site filtering', 'Options to set site search scope'
|
||||
)
|
||||
filter_group.add_argument(
|
||||
"-a",
|
||||
"--all-sites",
|
||||
@@ -269,7 +279,7 @@ def setup_arguments_parser():
|
||||
modes_group = parser.add_argument_group(
|
||||
'Operating modes',
|
||||
'Various functions except the default search by a username. '
|
||||
'Modes are executed sequentially in the order of declaration.'
|
||||
'Modes are executed sequentially in the order of declaration.',
|
||||
)
|
||||
modes_group.add_argument(
|
||||
"--parse",
|
||||
@@ -296,10 +306,12 @@ def setup_arguments_parser():
|
||||
"--stats",
|
||||
action="store_true",
|
||||
default=False,
|
||||
help="Show database statistics (most frequent sites engines and tags)."
|
||||
help="Show database statistics (most frequent sites engines and tags).",
|
||||
)
|
||||
|
||||
output_group = parser.add_argument_group('Output options', 'Options to change verbosity and view of the console output')
|
||||
output_group = parser.add_argument_group(
|
||||
'Output options', 'Options to change verbosity and view of the console output'
|
||||
)
|
||||
output_group.add_argument(
|
||||
"--print-not-found",
|
||||
action="store_true",
|
||||
@@ -354,7 +366,9 @@ def setup_arguments_parser():
|
||||
help="Don't show progressbar.",
|
||||
)
|
||||
|
||||
report_group = parser.add_argument_group('Report formats', 'Supported formats of report files')
|
||||
report_group = parser.add_argument_group(
|
||||
'Report formats', 'Supported formats of report files'
|
||||
)
|
||||
report_group.add_argument(
|
||||
"-T",
|
||||
"--txt",
|
||||
@@ -446,7 +460,9 @@ async def main():
|
||||
print("Using the proxy: " + args.proxy)
|
||||
|
||||
if args.parse_url:
|
||||
extracted_ids = extract_ids_from_page(args.parse_url, logger, timeout=args.timeout)
|
||||
extracted_ids = extract_ids_from_page(
|
||||
args.parse_url, logger, timeout=args.timeout
|
||||
)
|
||||
usernames.update(extracted_ids)
|
||||
|
||||
if args.tags:
|
||||
|
||||
@@ -282,6 +282,8 @@ class QueryNotifyPrint(QueryNotify):
|
||||
sys.stdout.write("\x1b[1K\r")
|
||||
print(notify)
|
||||
|
||||
return notify
|
||||
|
||||
def __str__(self):
|
||||
"""Convert Object To String.
|
||||
|
||||
|
||||
@@ -14365,6 +14365,7 @@
|
||||
"ru"
|
||||
],
|
||||
"checkType": "response_url",
|
||||
"regexCheck": "^(?!id\\d)\\w*$",
|
||||
"alexaRank": 27,
|
||||
"urlMain": "https://vk.com/",
|
||||
"url": "https://vk.com/{username}",
|
||||
@@ -14379,6 +14380,7 @@
|
||||
"checkType": "response_url",
|
||||
"alexaRank": 27,
|
||||
"urlMain": "https://vk.com/",
|
||||
"regexCheck": "^\\d+$",
|
||||
"url": "https://vk.com/id{username}",
|
||||
"source": "VK",
|
||||
"usernameClaimed": "270433952",
|
||||
|
||||
+14
-1
@@ -3,7 +3,7 @@
|
||||
import copy
|
||||
import json
|
||||
import sys
|
||||
from typing import Optional, List, Dict, Any
|
||||
from typing import Optional, List, Dict, Any, Tuple
|
||||
|
||||
import requests
|
||||
|
||||
@@ -146,6 +146,19 @@ class MaigretSite:
|
||||
|
||||
return None
|
||||
|
||||
def extract_id_from_url(self, url: str) -> Optional[Tuple[str, str]]:
    """Extract this site's identifier from a URL.

    The URL is matched against ``self.url_regexp`` and the last capture
    group of that pattern is taken as the identifier.

    :param url: URL to test against the site's URL pattern.
    :return: ``(identifier, self.type)`` with trailing slashes stripped from
        the identifier, or ``None`` when the site has no URL pattern, the
        URL does not match, or the last capture group is absent or did not
        take part in the match.
    """
    if not self.url_regexp:
        return None

    match = self.url_regexp.match(url)
    if not match:
        return None

    captured = match.groups()
    # A pattern without groups yields an empty tuple, and a group that did
    # not participate in the match is None; neither carries an identifier,
    # so report "nothing extracted" instead of raising.
    if not captured or captured[-1] is None:
        return None

    return captured[-1].rstrip("/"), self.type
|
||||
|
||||
@property
|
||||
def pretty_name(self):
|
||||
if self.source:
|
||||
|
||||
+4
-2
@@ -55,9 +55,11 @@ class URLMatcher:
|
||||
url_main_part = self.extract_main_part(url)
|
||||
for c in self.UNSAFE_SYMBOLS:
|
||||
url_main_part = url_main_part.replace(c, f"\\{c}")
|
||||
username_regexp = username_regexp or ".+?"
|
||||
prepared_username_regexp = (username_regexp or ".+?").lstrip('^').rstrip('$')
|
||||
|
||||
url_regexp = url_main_part.replace("{username}", f"({username_regexp})")
|
||||
url_regexp = url_main_part.replace(
|
||||
"{username}", f"({prepared_username_regexp})"
|
||||
)
|
||||
regexp_str = self._HTTP_URL_RE_STR.replace("(.+)", url_regexp)
|
||||
|
||||
return re.compile(regexp_str)
|
||||
|
||||
Reference in New Issue
Block a user