Refactoring and linting, added notifications about frequent search errors

2026-05-13 18:05:39 +00:00 · 2021-04-30 12:03:13 +03:00
parent bfaf276f6e
commit bfa6afac32
20 changed files with 1351 additions and 787 deletions
@@ -1,8 +1,9 @@
-# -*- coding: future_annotations -*-
+# ****************************** -*-
 """Maigret Sites Information"""
 import copy
 import json
 import sys
+from typing import Optional

 import requests

@@ -10,12 +11,48 @@ from .utils import CaseConverter, URLMatcher, is_country_tag

 # TODO: move to data.json
 SUPPORTED_TAGS = [
-    'gaming', 'coding', 'photo', 'music', 'blog', 'finance', 'freelance', 'dating',
-    'tech', 'forum', 'porn', 'erotic', 'webcam', 'video', 'movies', 'hacking', 'art',
-    'discussion', 'sharing', 'writing', 'wiki', 'business', 'shopping', 'sport',
-    'books', 'news', 'documents', 'travel', 'maps', 'hobby', 'apps', 'classified',
-    'career', 'geosocial', 'streaming', 'education', 'networking', 'torrent',
-    'science', 'medicine', 'reading', 'stock',
+    "gaming",
+    "coding",
+    "photo",
+    "music",
+    "blog",
+    "finance",
+    "freelance",
+    "dating",
+    "tech",
+    "forum",
+    "porn",
+    "erotic",
+    "webcam",
+    "video",
+    "movies",
+    "hacking",
+    "art",
+    "discussion",
+    "sharing",
+    "writing",
+    "wiki",
+    "business",
+    "shopping",
+    "sport",
+    "books",
+    "news",
+    "documents",
+    "travel",
+    "maps",
+    "hobby",
+    "apps",
+    "classified",
+    "career",
+    "geosocial",
+    "streaming",
+    "education",
+    "networking",
+    "torrent",
+    "science",
+    "medicine",
+    "reading",
+    "stock",
 ]


@@ -32,13 +69,13 @@ class MaigretEngine:

 class MaigretSite:
    NOT_SERIALIZABLE_FIELDS = [
-        'name',
-        'engineData',
-        'requestFuture',
-        'detectedEngine',
-        'engineObj',
-        'stats',
-        'urlRegexp',
+        "name",
+        "engineData",
+        "requestFuture",
+        "detectedEngine",
+        "engineObj",
+        "stats",
+        "urlRegexp",
    ]

    def __init__(self, name, information):
@@ -49,15 +86,15 @@ class MaigretSite:
        self.ignore403 = False
        self.tags = []

-        self.type = 'username'
+        self.type = "username"
        self.headers = {}
        self.errors = {}
        self.activation = {}
-        self.url_subpath = ''
+        self.url_subpath = ""
        self.regex_check = None
        self.url_probe = None
-        self.check_type = ''
-        self.request_head_only = ''
+        self.check_type = ""
+        self.request_head_only = ""
        self.get_params = {}

        self.presense_strs = []
@@ -84,26 +121,29 @@ class MaigretSite:
        return f"{self.name} ({self.url_main})"

    def update_detectors(self):
-        if 'url' in self.__dict__:
+        if "url" in self.__dict__:
            url = self.url
-            for group in ['urlMain', 'urlSubpath']:
+            for group in ["urlMain", "urlSubpath"]:
                if group in url:
-                    url = url.replace('{' + group + '}', self.__dict__[CaseConverter.camel_to_snake(group)])
+                    url = url.replace(
+                        "{" + group + "}",
+                        self.__dict__[CaseConverter.camel_to_snake(group)],
+                    )

            self.url_regexp = URLMatcher.make_profile_url_regexp(url, self.regex_check)

-    def detect_username(self, url: str) -> str:
+    def detect_username(self, url: str) -> Optional[str]:
        if self.url_regexp:
            match_groups = self.url_regexp.match(url)
            if match_groups:
-                return match_groups.groups()[-1].rstrip('/')
+                return match_groups.groups()[-1].rstrip("/")

        return None

    @property
    def pretty_name(self):
        if self.source:
-            return f'{self.name} [{self.source}]'
+            return f"{self.name} [{self.source}]"
        return self.name

    @property
@@ -113,7 +153,7 @@ class MaigretSite:
            # convert to camelCase
            field = CaseConverter.snake_to_camel(k)
            # strip empty elements
-            if v in (False, '', [], {}, None, sys.maxsize, 'username'):
+            if v in (False, "", [], {}, None, sys.maxsize, "username"):
                continue
            if field in self.NOT_SERIALIZABLE_FIELDS:
                continue
@@ -121,13 +161,13 @@ class MaigretSite:

        return result

-    def update(self, updates: dict) -> MaigretSite:
+    def update(self, updates: "dict") -> "MaigretSite":
        self.__dict__.update(updates)
        self.update_detectors()

        return self

-    def update_from_engine(self, engine: MaigretEngine) -> MaigretSite:
+    def update_from_engine(self, engine: MaigretEngine) -> "MaigretSite":
        engine_data = engine.site
        for k, v in engine_data.items():
            field = CaseConverter.camel_to_snake(k)
@@ -145,7 +185,7 @@ class MaigretSite:

        return self

-    def strip_engine_data(self) -> MaigretSite:
+    def strip_engine_data(self) -> "MaigretSite":
        if not self.engine_obj:
            return self

@@ -190,30 +230,47 @@ class MaigretDatabase:
    def sites_dict(self):
        return {site.name: site for site in self._sites}

-    def ranked_sites_dict(self, reverse=False, top=sys.maxsize, tags=[], names=[],
-                          disabled=True, id_type='username'):
+    def ranked_sites_dict(
+        self,
+        reverse=False,
+        top=sys.maxsize,
+        tags=[],
+        names=[],
+        disabled=True,
+        id_type="username",
+    ):
        """
-            Ranking and filtering of the sites list
+        Ranking and filtering of the sites list
        """
        normalized_names = list(map(str.lower, names))
        normalized_tags = list(map(str.lower, tags))

        is_name_ok = lambda x: x.name.lower() in normalized_names
        is_source_ok = lambda x: x.source and x.source.lower() in normalized_names
-        is_engine_ok = lambda x: isinstance(x.engine, str) and x.engine.lower() in normalized_tags
+        is_engine_ok = (
+            lambda x: isinstance(x.engine, str) and x.engine.lower() in normalized_tags
+        )
        is_tags_ok = lambda x: set(x.tags).intersection(set(normalized_tags))
-        is_disabled_needed = lambda x: not x.disabled or ('disabled' in tags or disabled)
+        is_disabled_needed = lambda x: not x.disabled or (
+            "disabled" in tags or disabled
+        )
        is_id_type_ok = lambda x: x.type == id_type

        filter_tags_engines_fun = lambda x: not tags or is_engine_ok(x) or is_tags_ok(x)
        filter_names_fun = lambda x: not names or is_name_ok(x) or is_source_ok(x)

-        filter_fun = lambda x: filter_tags_engines_fun(x) and filter_names_fun(x) \
-                               and is_disabled_needed(x) and is_id_type_ok(x)
+        filter_fun = (
+            lambda x: filter_tags_engines_fun(x)
+            and filter_names_fun(x)
+            and is_disabled_needed(x)
+            and is_id_type_ok(x)
+        )

        filtered_list = [s for s in self.sites if filter_fun(s)]

-        sorted_list = sorted(filtered_list, key=lambda x: x.alexa_rank, reverse=reverse)[:top]
+        sorted_list = sorted(
+            filtered_list, key=lambda x: x.alexa_rank, reverse=reverse
+        )[:top]
        return {site.name: site for site in sorted_list}

    @property
@@ -224,7 +281,7 @@ class MaigretDatabase:
    def engines_dict(self):
        return {engine.name: engine for engine in self._engines}

-    def update_site(self, site: MaigretSite) -> MaigretDatabase:
+    def update_site(self, site: MaigretSite) -> "MaigretDatabase":
        for s in self._sites:
            if s.name == site.name:
                s = site
@@ -233,20 +290,20 @@ class MaigretDatabase:
        self._sites.append(site)
        return self

-    def save_to_file(self, filename: str) -> MaigretDatabase:
+    def save_to_file(self, filename: str) -> "MaigretDatabase":
        db_data = {
-            'sites': {site.name: site.strip_engine_data().json for site in self._sites},
-            'engines': {engine.name: engine.json for engine in self._engines},
+            "sites": {site.name: site.strip_engine_data().json for site in self._sites},
+            "engines": {engine.name: engine.json for engine in self._engines},
        }

        json_data = json.dumps(db_data, indent=4)

-        with open(filename, 'w') as f:
+        with open(filename, "w") as f:
            f.write(json_data)

        return self

-    def load_from_json(self, json_data: dict) -> MaigretDatabase:
+    def load_from_json(self, json_data: dict) -> "MaigretDatabase":
        # Add all of site information from the json file to internal site list.
        site_data = json_data.get("sites", {})
        engines_data = json_data.get("engines", {})
@@ -258,30 +315,32 @@ class MaigretDatabase:
            try:
                maigret_site = MaigretSite(site_name, site_data[site_name])

-                engine = site_data[site_name].get('engine')
+                engine = site_data[site_name].get("engine")
                if engine:
                    maigret_site.update_from_engine(self.engines_dict[engine])

                self._sites.append(maigret_site)
            except KeyError as error:
-                raise ValueError(f"Problem parsing json content for site {site_name}: "
-                                 f"Missing attribute {str(error)}."
-                                 )
+                raise ValueError(
+                    f"Problem parsing json content for site {site_name}: "
+                    f"Missing attribute {str(error)}."
+                )

        return self

-    def load_from_str(self, db_str: str) -> MaigretDatabase:
+    def load_from_str(self, db_str: "str") -> "MaigretDatabase":
        try:
            data = json.loads(db_str)
        except Exception as error:
-            raise ValueError(f"Problem parsing json contents from str"
-                             f"'{db_str[:50]}'...:  {str(error)}."
-                             )
+            raise ValueError(
+                f"Problem parsing json contents from str"
+                f"'{db_str[:50]}'...:  {str(error)}."
+            )

        return self.load_from_json(data)

-    def load_from_url(self, url: str) -> MaigretDatabase:
-        is_url_valid = url.startswith('http://') or url.startswith('https://')
+    def load_from_url(self, url: str) -> "MaigretDatabase":
+        is_url_valid = url.startswith("http://") or url.startswith("https://")

        if not is_url_valid:
            raise FileNotFoundError(f"Invalid data file URL '{url}'.")
@@ -289,38 +348,40 @@ class MaigretDatabase:
        try:
            response = requests.get(url=url)
        except Exception as error:
-            raise FileNotFoundError(f"Problem while attempting to access "
-                                    f"data file URL '{url}':  "
-                                    f"{str(error)}"
-                                    )
+            raise FileNotFoundError(
+                f"Problem while attempting to access "
+                f"data file URL '{url}':  "
+                f"{str(error)}"
+            )

        if response.status_code == 200:
            try:
                data = response.json()
            except Exception as error:
-                raise ValueError(f"Problem parsing json contents at "
-                                 f"'{url}':  {str(error)}."
-                                 )
+                raise ValueError(
+                    f"Problem parsing json contents at " f"'{url}':  {str(error)}."
+                )
        else:
-            raise FileNotFoundError(f"Bad response while accessing "
-                                    f"data file URL '{url}'."
-                                    )
+            raise FileNotFoundError(
+                f"Bad response while accessing " f"data file URL '{url}'."
+            )

        return self.load_from_json(data)

-    def load_from_file(self, filename: str) -> MaigretDatabase:
+    def load_from_file(self, filename: "str") -> "MaigretDatabase":
        try:
-            with open(filename, 'r', encoding='utf-8') as file:
+            with open(filename, "r", encoding="utf-8") as file:
                try:
                    data = json.load(file)
                except Exception as error:
-                    raise ValueError(f"Problem parsing json contents from "
-                                     f"file '{filename}':  {str(error)}."
-                                     )
+                    raise ValueError(
+                        f"Problem parsing json contents from "
+                        f"file '{filename}':  {str(error)}."
+                    )
        except FileNotFoundError as error:
-            raise FileNotFoundError(f"Problem while attempting to access "
-                                    f"data file '{filename}'."
-                                    )
+            raise FileNotFoundError(
+                f"Problem while attempting to access " f"data file '{filename}'."
+            ) from error

        return self.load_from_json(data)

@@ -328,8 +389,8 @@ class MaigretDatabase:
        sites = sites_dict or self.sites_dict
        found_flags = {}
        for _, s in sites.items():
-            if 'presense_flag' in s.stats:
-                flag = s.stats['presense_flag']
+            if "presense_flag" in s.stats:
+                flag = s.stats["presense_flag"]
                found_flags[flag] = found_flags.get(flag, 0) + 1

        return found_flags
@@ -338,7 +399,7 @@ class MaigretDatabase:
        if not sites_dict:
            sites_dict = self.sites_dict()

-        output = ''
+        output = ""
        disabled_count = 0
        total_count = len(sites_dict)
        urls = {}
@@ -349,18 +410,18 @@ class MaigretDatabase:
                disabled_count += 1

            url = URLMatcher.extract_main_part(site.url)
-            if url.startswith('{username}'):
-                url = 'SUBDOMAIN'
-            elif url == '':
-                url = f'{site.url} ({site.engine})'
+            if url.startswith("{username}"):
+                url = "SUBDOMAIN"
+            elif url == "":
+                url = f"{site.url} ({site.engine})"
            else:
-                parts = url.split('/')
-                url = '/' + '/'.join(parts[1:])
+                parts = url.split("/")
+                url = "/" + "/".join(parts[1:])

            urls[url] = urls.get(url, 0) + 1

            if not site.tags:
-                tags['NO_TAGS'] = tags.get('NO_TAGS', 0) + 1
+                tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1

            for tag in site.tags:
                if is_country_tag(tag):
@@ -368,17 +429,17 @@ class MaigretDatabase:
                    continue
                tags[tag] = tags.get(tag, 0) + 1

-        output += f'Enabled/total sites: {total_count - disabled_count}/{total_count}\n'
-        output += 'Top sites\' profile URLs:\n'
+        output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n"
+        output += "Top sites' profile URLs:\n"
        for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:20]:
            if count == 1:
                break
-            output += f'{count}\t{url}\n'
-        output += 'Top sites\' tags:\n'
+            output += f"{count}\t{url}\n"
+        output += "Top sites' tags:\n"
        for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True):
-            mark = ''
-            if not tag in SUPPORTED_TAGS:
-                mark = ' (non-standard)'
-            output += f'{count}\t{tag}{mark}\n'
+            mark = ""
+            if tag not in SUPPORTED_TAGS:
+                mark = " (non-standard)"
+            output += f"{count}\t{tag}{mark}\n"

        return output