diff --git a/maigret/checking.py b/maigret/checking.py
index d0c5300..1f1b9c2 100644
--- a/maigret/checking.py
+++ b/maigret/checking.py
@@ -97,7 +97,7 @@ async def update_site_dict_from_response(sitename, site_dict, results_info, sema
site_dict[sitename] = process_site_result(response, query_notify, logger, results_info, site_obj)
-# TODO: move info separate module
+# TODO: move to separate class
def detect_error_page(html_text, status_code, fail_flags, ignore_403):
# Detect service restrictions such as a country restriction
for flag, msg in fail_flags.items():
@@ -270,6 +270,8 @@ def process_site_result(response, query_notify, logger, results_info, site: Maig
                new_usernames[v] = k
            results_info['ids_usernames'] = new_usernames
+            import ast  # NOTE(review): local import; literal_eval parses data only, unlike eval()
+            results_info['ids_links'] = ast.literal_eval(extracted_ids_data.get('links', '[]'))
result.ids_data = extracted_ids_data
# Notify caller about results of query.
diff --git a/maigret/maigret.py b/maigret/maigret.py
index ac6bc8b..47d4dc7 100755
--- a/maigret/maigret.py
+++ b/maigret/maigret.py
@@ -325,11 +325,18 @@ async def main():
# TODO: fix no site data issue
if not dictionary:
continue
+
new_usernames = dictionary.get('ids_usernames')
if new_usernames:
for u, utype in new_usernames.items():
usernames[u] = utype
+ for url in dictionary.get('ids_links', []):
+ for s in db.sites:
+ u = s.detect_username(url)
+ if u:
+ usernames[u] = 'username'
+
# reporting for a one username
if args.xmind:
filename = report_filepath_tpl.format(username=username, postfix='.xmind')
diff --git a/maigret/sites.py b/maigret/sites.py
index ba9a4a3..2e03244 100644
--- a/maigret/sites.py
+++ b/maigret/sites.py
@@ -2,11 +2,12 @@
"""Maigret Sites Information"""
import copy
import json
+import re
import sys
import requests
-from .utils import CaseConverter
+from .utils import CaseConverter, URLMatcher
class MaigretEngine:
@@ -21,6 +22,16 @@ class MaigretEngine:
class MaigretSite:
+ NOT_SERIALIZABLE_FIELDS = [
+ 'name',
+ 'engineData',
+ 'requestFuture',
+ 'detectedEngine',
+ 'engineObj',
+ 'stats',
+ 'urlRegexp',
+ ]
+
def __init__(self, name, information):
self.name = name
@@ -57,10 +68,29 @@ class MaigretSite:
# We do not know the popularity, so make site go to bottom of list.
self.alexa_rank = sys.maxsize
+ self.update_detectors()
def __str__(self):
return f"{self.name} ({self.url_main})"
+    def update_detectors(self):
+        self.url_regexp = None  # safe default: not every site defines a 'url' template
+        if 'url' in self.__dict__:
+            url = self.url
+            for group in ['urlMain', 'urlSubpath']:
+                if group in url:
+                    url = url.replace('{'+group+'}', self.__dict__[CaseConverter.camel_to_snake(group)])
+
+            self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check)
+
+    def detect_username(self, url: str) -> str:
+        if self.url_regexp:
+ match_groups = self.url_regexp.match(url)
+ if match_groups:
+ return match_groups.groups()[-1].rstrip('/')
+
+ return None
+
@property
def json(self):
result = {}
@@ -70,7 +100,7 @@ class MaigretSite:
# strip empty elements
if v in (False, '', [], {}, None, sys.maxsize, 'username'):
continue
- if field in ['name', 'engineData', 'requestFuture', 'detectedEngine', 'engineObj', 'stats']:
+ if field in self.NOT_SERIALIZABLE_FIELDS:
continue
result[field] = v
@@ -78,6 +108,7 @@ class MaigretSite:
def update(self, updates: dict) -> MaigretSite:
self.__dict__.update(updates)
+ self.update_detectors()
return self
@@ -95,6 +126,7 @@ class MaigretSite:
self.__dict__[field] = v
self.engine_obj = engine
+ self.update_detectors()
return self
diff --git a/maigret/utils.py b/maigret/utils.py
index 851d3db..f68bdc3 100644
--- a/maigret/utils.py
+++ b/maigret/utils.py
@@ -28,4 +28,30 @@ def enrich_link_str(link: str) -> str:
link = link.strip()
if link.startswith('www.') or (link.startswith('http') and '//' in link):
return f'{link}'
- return link
\ No newline at end of file
+ return link
+
+
+class URLMatcher:
+ _HTTP_URL_RE_STR = '^https?://(www.)?(.+)$'
+ HTTP_URL_RE = re.compile(_HTTP_URL_RE_STR)
+ UNSAFE_SYMBOLS = '.?'
+
+    @classmethod
+    def extract_main_part(cls, url: str) -> str:
+        match = cls.HTTP_URL_RE.search(url)
+        if match and match.group(2):
+            return match.group(2).rstrip('/')
+
+        return ''
+
+    @classmethod
+    def make_profile_url_regexp(cls, url: str, username_regexp: str = '') -> re.Pattern:
+        url_main_part = cls.extract_main_part(url)
+        for c in cls.UNSAFE_SYMBOLS:
+            url_main_part = url_main_part.replace(c, f'\\{c}')
+        username_regexp = username_regexp or '.+?'
+
+        url_regexp = url_main_part.replace('{username}', f'({username_regexp})')
+        regexp_str = cls._HTTP_URL_RE_STR.replace('(.+)', url_regexp)
+
+        return re.compile(regexp_str)
diff --git a/tests/test_sites.py b/tests/test_sites.py
index b25d784..ff33a9a 100644
--- a/tests/test_sites.py
+++ b/tests/test_sites.py
@@ -113,6 +113,14 @@ def test_saving_site_error():
assert amperka.strip_engine_data().json['errors'] == {'error1': 'text1'}
+def test_site_url_detector():
+ db = MaigretDatabase()
+ db.load_from_json(EXAMPLE_DB)
+
+ assert db.sites[0].url_regexp.pattern == r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
+ assert db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test') == 'test'
+
+
def test_ranked_sites_dict():
db = MaigretDatabase()
db.update_site(MaigretSite('3', {'alexaRank': 1000, 'engine': 'ucoz'}))
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 18b9825..e2a1bed 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,5 +1,7 @@
"""Maigret utils test functions"""
-from maigret.utils import CaseConverter, is_country_tag, enrich_link_str
+import itertools
+import re
+from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher
def test_case_convert_camel_to_snake():
@@ -32,3 +34,33 @@ def test_is_country_tag():
def test_enrich_link_str():
assert enrich_link_str('test') == 'test'
assert enrich_link_str(' www.flickr.com/photos/alexaimephotography/') == 'www.flickr.com/photos/alexaimephotography/'
+
+def test_url_extract_main_part():
+ url_main_part = 'flickr.com/photos/alexaimephotography'
+
+ parts = [
+ ['http://', 'https://'],
+ ['www.', ''],
+ [url_main_part],
+ ['/', ''],
+ ]
+
+ url_regexp = re.compile('^https?://(www.)?flickr.com/photos/(.+?)$')
+ for url_parts in itertools.product(*parts):
+ url = ''.join(url_parts)
+ assert URLMatcher.extract_main_part(url) == url_main_part
+ assert not url_regexp.match(url) is None
+
+def test_url_make_profile_url_regexp():
+ url_main_part = 'flickr.com/photos/{username}'
+
+ parts = [
+ ['http://', 'https://'],
+ ['www.', ''],
+ [url_main_part],
+ ['/', ''],
+ ]
+
+ for url_parts in itertools.product(*parts):
+ url = ''.join(url_parts)
+ assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'