Improved extraction of usernames from links in personal data

This commit is contained in:
Soxoj
2021-02-15 01:36:10 +03:00
parent bb4c5dc67a
commit c0956a0e23
6 changed files with 111 additions and 5 deletions
+27 -1
View File
@@ -28,4 +28,30 @@ def enrich_link_str(link: str) -> str:
link = link.strip()
if link.startswith('www.') or (link.startswith('http') and '//' in link):
return f'<a class="auto-link" href="{link}">{link}</a>'
return link
return link
class URLMatcher:
    """Build regular expressions that recognise profile URLs of a site.

    A site is described by a URL template containing the ``{username}``
    placeholder, e.g. ``https://www.flickr.com/photos/{username}/``.
    The helpers below strip the scheme and the optional ``www.`` prefix
    from such a template and turn the remainder into a compiled pattern
    whose group 2 captures the username.
    """

    # Group 1: optional ``www.`` prefix; group 2: everything after it.
    # The literal ``(.+)`` fragment is substituted by
    # ``make_profile_url_regexp`` and therefore must stay verbatim.
    _HTTP_URL_RE_STR = r'^https?://(www\.)?(.+)$'
    HTTP_URL_RE = re.compile(_HTTP_URL_RE_STR)
    # Retained for backward compatibility only: escaping of the URL
    # template is done with ``re.escape`` so that every regex
    # metacharacter is covered, not just these two.
    UNSAFE_SYMBOLS = '.?'

    @classmethod
    def extract_main_part(cls, url: str) -> str:
        """Return *url* without scheme, ``www.`` prefix and trailing slashes.

        Args:
            url: An ``http://`` or ``https://`` URL (or URL template).

        Returns:
            The part after the scheme and optional ``www.`` prefix with
            trailing ``/`` characters removed, or ``''`` when *url* is
            not an http(s) URL.
        """
        match = cls.HTTP_URL_RE.search(url)
        if match is None:
            return ''
        # Group 2 uses ``.+`` so it is never empty when the match succeeds.
        return match.group(2).rstrip('/')

    @classmethod
    def make_profile_url_regexp(cls, url: str, username_regexp: str = '') -> re.Pattern:
        """Compile a pattern matching profile URLs built from template *url*.

        Args:
            url: URL template containing the ``{username}`` placeholder.
            username_regexp: Regexp the username must satisfy; defaults
                to ``.+?`` when empty.

        Returns:
            A compiled pattern anchored at both ends that accepts the
            ``http``/``https`` schemes and an optional ``www.`` prefix.
            Group 2 of a successful match is the username. If *url* is
            not an http(s) URL its main part is empty, so the pattern
            then only matches a bare scheme (optionally with ``www.``).
        """
        url_main_part = cls.extract_main_part(url)
        username_group = f"({username_regexp or '.+?'})"
        # Escape only the literal text around the placeholder: escaping
        # the whole string would also escape the braces of ``{username}``.
        literal_parts = url_main_part.split('{username}')
        url_regexp = username_group.join(re.escape(part) for part in literal_parts)
        regexp_str = cls._HTTP_URL_RE_STR.replace('(.+)', url_regexp)
        return re.compile(regexp_str)