mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-07 06:24:35 +00:00
Improved extraction of usernames from links in personal data
This commit is contained in:
+27
-1
@@ -28,4 +28,30 @@ def enrich_link_str(link: str) -> str:
|
||||
link = link.strip()
|
||||
if link.startswith('www.') or (link.startswith('http') and '//' in link):
|
||||
return f'<a class="auto-link" href="{link}">{link}</a>'
|
||||
return link
|
||||
return link
|
||||
|
||||
|
||||
class URLMatcher:
    """Helpers for matching profile URLs against a site's URL template.

    A template is an HTTP(S) URL that contains the literal placeholder
    ``{username}``, for example ``https://www.example.com/users/{username}/``.
    """

    # Scheme plus an optional "www." prefix; the second group captures the
    # rest of the URL. The dot is escaped so that hosts such as
    # "www2.example.com" are not mistaken for a "www." prefix.
    _HTTP_URL_RE_STR = r'^https?://(www\.)?(.+)$'
    HTTP_URL_RE = re.compile(_HTTP_URL_RE_STR)
    # Retained for backward compatibility only: literal parts of a template
    # are now escaped with re.escape(), which covers every metacharacter.
    UNSAFE_SYMBOLS = '.?'

    _USERNAME_PLACEHOLDER = '{username}'
    _DEFAULT_USERNAME_RE = '.+?'

    @classmethod
    def extract_main_part(cls, url: str) -> str:
        """Return *url* without scheme, "www." prefix and trailing slashes.

        Returns an empty string when *url* is not an http(s) URL.
        """
        match = cls.HTTP_URL_RE.search(url)
        if match and match.group(2):
            return match.group(2).rstrip('/')

        return ''

    @staticmethod
    def _strip_anchors(pattern: str) -> str:
        """Drop a leading ``^`` and a trailing unescaped ``$`` from *pattern*."""
        if pattern.startswith('^'):
            pattern = pattern[1:]
        # Leave a trailing "\$" alone: it is most likely an escaped literal.
        if pattern.endswith('$') and not pattern.endswith('\\$'):
            pattern = pattern[:-1]
        return pattern

    @classmethod
    def make_profile_url_regexp(cls, url: str, username_regexp: str = '') -> re.Pattern:
        """Compile a regexp that matches profile URLs built from template *url*.

        Every ``{username}`` placeholder in the template becomes a capturing
        group; with a single placeholder it is the last group of the match.
        The scheme (http/https), an optional "www." prefix and trailing
        slashes of the template are not significant.

        :param url: URL template containing ``{username}``.
        :param username_regexp: regexp a username must satisfy; defaults to
            a lazy ``.+?``. Enclosing ``^``/``$`` anchors are removed because
            the expression is embedded in the middle of a larger pattern,
            where they could never match.
        :return: compiled pattern anchored at both ends of the URL.
        """
        url_main_part = cls.extract_main_part(url)

        inner_regexp = cls._strip_anchors(username_regexp or '')
        username_group = f'({inner_regexp or cls._DEFAULT_USERNAME_RE})'

        # Escape the literal pieces around the placeholder separately, so the
        # placeholder braces themselves are never subject to escaping.
        literal_parts = url_main_part.split(cls._USERNAME_PLACEHOLDER)
        url_regexp = username_group.join(re.escape(part) for part in literal_parts)

        regexp_str = cls._HTTP_URL_RE_STR.replace('(.+)', url_regexp)

        return re.compile(regexp_str)
|
||||
Reference in New Issue
Block a user