Merge branch 'main' of https://github.com/soxoj/maigret into site_adds

This commit is contained in:
kustermariocoding
2022-02-24 09:39:58 +01:00
10 changed files with 567 additions and 12 deletions
+1 -1
View File
@@ -101,7 +101,7 @@ maigret user --tags photo,dating
maigret user1 user2 user3 -a
```
Use `maigret --help` to get full options description. Also options are documented in [the Maigret Wiki](https://github.com/soxoj/maigret/wiki/Command-line-options).
Use `maigret --help` to get full options description. Also options [are documented](https://maigret.readthedocs.io/en/latest/command-line-options.html).
## Demo with page parsing and recursive username search
+70
View File
@@ -0,0 +1,70 @@
.. _development:
Development
==============
How to publish new version of Maigret
-------------------------------------
**Collaborator rights are required; write to Soxoj to get them**.
To publish a new version, you must first create a new branch in the repository
with a bumped version number and an up-to-date changelog. After that you
must create a release, and a GitHub Action will automatically create a new
PyPI package.
- New branch example: https://github.com/soxoj/maigret/commit/e520418f6a25d7edacde2d73b41a8ae7c80ddf39
- Release example: https://github.com/soxoj/maigret/releases/tag/v0.4.1
1. Make a new branch locally with a new version name. Check the current version number here: https://pypi.org/project/maigret/.
**Increase only patch version (third number)** if there are no breaking changes.
.. code-block:: console
git checkout -b 0.4.0
2. Update Maigret version in three files manually:
- setup.py
- maigret/__version__.py
- docs/source/conf.py
3. Create a new empty text section in the beginning of the file `CHANGELOG.md` with a current date:
.. code-block:: console
## [0.4.0] - 2022-01-03
4. Get auto-generated release notes:
- Open https://github.com/soxoj/maigret/releases/new
- Click `Choose a tag`, enter `test`
- Click `Create new tag`
- Press `+ Auto-generate release notes`
- Copy all the text from description text field below
- Paste it into the empty text section in `CHANGELOG.md`
- Remove redundant lines `## What's Changed` and `## New Contributors` section if it exists
- *Close the new release page*
5. Commit all the changes, push, make pull request
.. code-block:: console
git add ...
git commit -m 'Bump to 0.4.0'
git push origin head
6. Merge pull request
7. Create new release
- Open https://github.com/soxoj/maigret/releases/new again
- Click `Choose a tag`
- Enter actual version in format `v0.4.0`
- Also enter actual version in the field `Release title`
- Click `Create new tag`
- Press `+ Auto-generate release notes`
- **Press "Publish release" button**
8. That's all; now you can simply wait for the push to PyPI. You can monitor it on the Actions page: https://github.com/soxoj/maigret/actions/workflows/python-publish.yml
+1
View File
@@ -28,3 +28,4 @@ You may be interested in:
tags
usage-examples
settings
development
+1 -1
View File
@@ -566,7 +566,7 @@ async def main():
# Database statistics
if args.stats:
print(db.get_db_stats(db.sites_dict))
print(db.get_db_stats())
report_dir = path.join(os.getcwd(), args.folderoutput)
+22 -10
View File
@@ -419,9 +419,8 @@ class MaigretDatabase:
results[_id] = _type
return results
def get_db_stats(self, sites_dict):
if not sites_dict:
sites_dict = self.sites_dict()
def get_db_stats(self, is_markdown=False):
sites_dict = self.sites_dict
urls = {}
tags = {}
@@ -429,6 +428,9 @@ class MaigretDatabase:
disabled_count = 0
total_count = len(sites_dict)
message_checks = 0
message_checks_one_factor = 0
for _, site in sites_dict.items():
if site.disabled:
disabled_count += 1
@@ -436,24 +438,34 @@ class MaigretDatabase:
url_type = site.get_url_template()
urls[url_type] = urls.get(url_type, 0) + 1
if site.check_type == 'message':
message_checks += 1
if site.absence_strs and site.presense_strs:
continue
message_checks_one_factor += 1
if not site.tags:
tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1
for tag in filter(lambda x: not is_country_tag(x), site.tags):
tags[tag] = tags.get(tag, 0) + 1
output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n"
output += "Top profile URLs:\n"
for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:20]:
output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n\n"
output += f"Incomplete checks: {message_checks_one_factor}/{message_checks} (false positive risks)\n\n"
top_urls_count = 20
output += f"Top {top_urls_count} profile URLs:\n"
for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:top_urls_count]:
if count == 1:
break
output += f"{count}\t{url}\n"
output += f"- ({count})\t`{url}`\n" if is_markdown else f"{count}\t{url}\n"
output += "Top tags:\n"
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
top_tags_count = 20
output += f"\nTop {top_tags_count} tags:\n"
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:top_tags_count]:
mark = ""
if tag not in self._tags:
mark = " (non-standard)"
output += f"{count}\t{tag}{mark}\n"
output += f"- ({count})\t`{tag}`{mark}\n" if is_markdown else f"{count}\t{tag}{mark}\n"
return output
View File
+152
View File
@@ -0,0 +1,152 @@
#!/usr/bin/env python3
"""Maigret: Supported Site Listing with Alexa ranking and country tags
This module generates the listing of supported sites in file `SITES.md`
and pretty prints file with sites data.
"""
import aiohttp
import asyncio
import json
import sys
import requests
import logging
import threading
import xml.etree.ElementTree as ET
from datetime import datetime
from argparse import ArgumentParser, RawDescriptionHelpFormatter
import tqdm.asyncio
from maigret.maigret import get_response, site_self_check
from maigret.sites import MaigretSite, MaigretDatabase, MaigretEngine
from maigret.utils import CaseConverter
async def check_engine_of_site(site_name, sites_with_engines, future, engine_name, semaphore, logger):
    """Detect whether a site's main page appears to run the given engine.

    Awaits the prepared HTTP request (under *semaphore* to bound concurrency);
    if the engine name occurs anywhere in the response HTML, records
    *site_name* into the shared *sites_with_engines* list.

    Returns True when the engine was detected, False otherwise.
    """
    async with semaphore:
        html_text, _status_code, _error_text, _exception_text = await get_response(
            request_future=future,
            site_name=site_name,
            logger=logger,
        )
        detected = bool(html_text) and engine_name in html_text
        if detected:
            sites_with_engines.append(site_name)
        return detected
if __name__ == '__main__':
    # CLI: choose the sites database file and optionally restrict to one engine.
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
                            )
    parser.add_argument("--base","-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")
    parser.add_argument('--engine', '-e', help='check only selected engine', type=str)
    args = parser.parse_args()

    # Plain INFO-level logging with file:line prefixes for easy tracing.
    log_level = logging.INFO
    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level
    )
    logger = logging.getLogger('engines-check')
    logger.setLevel(log_level)

    # Load the sites database both as Maigret objects and as raw JSON
    # (the raw JSON is needed to reach the 'engines' section directly).
    db = MaigretDatabase()
    sites_subset = db.load_from_file(args.base_file).sites
    sites = {site.name: site for site in sites_subset}

    with open(args.base_file, "r", encoding="utf-8") as data_file:
        sites_info = json.load(data_file)
    engines = sites_info['engines']

    for engine_name, engine_data in engines.items():
        # --engine flag: skip everything except the selected engine.
        if args.engine and args.engine != engine_name:
            continue
        # Engines without 'presenseStrs' markers cannot be auto-detected.
        if not 'presenseStrs' in engine_data:
            print(f'No features to automatically detect sites on engine {engine_name}')
            continue

        engine_obj = MaigretEngine(engine_name, engine_data)

        # setup connections for checking both engine and usernames
        connector = aiohttp.TCPConnector(ssl=False)
        connector.verify_ssl=False
        session = aiohttp.ClientSession(connector=connector)
        sem = asyncio.Semaphore(100)  # at most 100 concurrent page fetches
        loop = asyncio.get_event_loop()
        tasks = []

        # check sites without engine if they look like sites on this engine
        new_engine_sites = []
        for site_name, site_data in sites.items():
            # Only probe sites that have no engine assigned yet.
            if site_data.engine:
                continue
            future = session.get(url=site_data.url_main,
                                 allow_redirects=True,
                                 timeout=10,
                                 )
            check_engine_coro = check_engine_of_site(site_name, new_engine_sites, future, engine_name, sem, logger)
            future = asyncio.ensure_future(check_engine_coro)
            tasks.append(future)

        # progress bar
        for f in tqdm.asyncio.tqdm.as_completed(tasks):
            loop.run_until_complete(f)

        print(f'Total detected {len(new_engine_sites)} sites on engine {engine_name}')

        # dict with new found engine sites
        new_sites = {site_name: sites[site_name] for site_name in new_engine_sites}

        # update sites obj from engine
        for site_name, site in new_sites.items():
            site.request_future = None  # drop stale request object before re-checking
            site.engine = engine_name
            site.update_from_engine(engine_obj)

        async def update_site_data(site_name, site_data, all_sites, logger, no_progressbar):
            # Re-run Maigret's self-check for one site and merge the updates back.
            updates = await site_self_check(site_name, site_data, logger, no_progressbar)
            all_sites[site_name].update(updates)

        tasks = []
        # for new_site_name, new_site_data in new_sites.items():
        #     coro = update_site_data(new_site_name, new_site_data, new_sites, logger)
        #     future = asyncio.ensure_future(coro)
        #     tasks.append(future)
        # asyncio.gather(*tasks)

        # NOTE(review): self-checks run sequentially here (one run_until_complete
        # per site); the commented block above was a parallel variant.
        for new_site_name, new_site_data in new_sites.items():
            coro = update_site_data(new_site_name, new_site_data, new_sites, logger, no_progressbar=True)
            loop.run_until_complete(coro)

        # Persist every site that survived the self-check; disabled sites are dropped.
        updated_sites_count = 0
        for s in new_sites:
            site = new_sites[s]
            site.request_future = None
            if site.disabled:
                print(f'{site.name} failed username checking of engine {engine_name}')
                continue
            site = site.strip_engine_data()
            db.update_site(site)
            updated_sites_count += 1
            # Save after each site so progress survives an interruption.
            db.save_to_file(args.base_file)
            print(f'Site "{s}": ' + json.dumps(site.json, indent=4))

        print(f'Updated total {updated_sites_count} sites!')
        print(f'Checking all sites on engine {engine_name}')
        loop.run_until_complete(session.close())

    print("\nFinished updating supported site listing!")
+280
View File
@@ -0,0 +1,280 @@
#!/usr/bin/env python3
import json
import random
import re
import tqdm.asyncio
from mock import Mock
import requests
from maigret.maigret import *
from maigret.result import QueryStatus
from maigret.sites import MaigretSite
URL_RE = re.compile(r"https?://(www\.)?")
TIMEOUT = 200
async def maigret_check(site, site_data, username, status, logger):
    """Run one Maigret search of *username* against a single site and verify
    the result matches the expected *status* (CLAIMED / AVAILABLE).

    Returns *site_data* when the check matches, False on any mismatch or error.
    """
    # maigret() expects a notify object; a Mock silently absorbs all callbacks.
    query_notify = Mock()
    logger.debug(f'Checking {site}...')

    # Single (username, expected status) pair; loop form kept for easy extension.
    for username, status in [(username, status)]:
        results = await maigret(
            username,
            {site: site_data},
            logger,
            query_notify,
            timeout=TIMEOUT,
            forced=True,
            no_progressbar=True,
        )

        if results[site]['status'].status != status:
            if results[site]['status'].status == QueryStatus.UNKNOWN:
                # Check errored out entirely: log the error context and give up.
                # NOTE(review): msg/etype are captured but unused below.
                msg = site_data.absence_strs
                etype = site_data.check_type
                context = results[site]['status'].context
                logger.debug(f'Error while searching {username} in {site}, must be claimed. Context: {context}')
                # if site_data.get('errors'):
                #     continue
                return False

            if status == QueryStatus.CLAIMED:
                # Expected the username to be found, but it was not.
                logger.debug(f'Not found {username} in {site}, must be claimed')
                logger.debug(results[site])
                pass
            else:
                # Expected the username to be absent, but it was found.
                logger.debug(f'Found {username} in {site}, must be available')
                logger.debug(results[site])
                pass
            return False

    return site_data
async def check_and_add_maigret_site(site_data, semaphore, logger, ok_usernames, bad_usernames):
    """Validate a candidate site with positive and negative username checks,
    and save it to the database if both checks pass.

    Mutates *site_data* (username_claimed / username_unclaimed) while probing.
    NOTE(review): relies on module globals `db`, `args` and `ok_sites`
    defined in the __main__ section below.
    """
    async with semaphore:
        sitename = site_data.name
        positive = False
        negative = False

        # Positive case: at least one known-taken username must be detected as CLAIMED.
        for ok_username in ok_usernames:
            site_data.username_claimed = ok_username
            status = QueryStatus.CLAIMED
            if await maigret_check(sitename, site_data, ok_username, status, logger):
                # print(f'{sitename} positive case is okay')
                positive = True
                break

        # Negative case: a nonsense username must be detected as AVAILABLE.
        for bad_username in bad_usernames:
            site_data.username_unclaimed = bad_username
            status = QueryStatus.AVAILABLE
            if await maigret_check(sitename, site_data, bad_username, status, logger):
                # print(f'{sitename} negative case is okay')
                negative = True
                break

        # Only a site that passes both checks is trustworthy enough to persist.
        if positive and negative:
            site_data = site_data.strip_engine_data()
            db.update_site(site_data)
            print(site_data.json)
            try:
                # Save immediately so a crash later does not lose this site.
                db.save_to_file(args.base_file)
            except Exception as e:
                logging.error(e, exc_info=True)
            print(f'Saved new site {sitename}...')
            ok_sites.append(site_data)
if __name__ == '__main__':
    # CLI: input URL list, database file, engine overrides and verbosity flags.
    parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
                            )
    parser.add_argument("--base", "-b", metavar="BASE_FILE",
                        dest="base_file", default="maigret/resources/data.json",
                        help="JSON file with sites data to update.")
    parser.add_argument("--add-engine", dest="add_engine", help="Additional engine to check")
    parser.add_argument("--only-engine", dest="only_engine", help="Use only this engine from detected to check")
    parser.add_argument('--check', help='only check sites in database', action='store_true')
    parser.add_argument('--random', help='shuffle list of urls', action='store_true', default=False)
    parser.add_argument('--top', help='top count of records in file', type=int, default=10000)
    parser.add_argument('--filter', help='substring to filter input urls', type=str, default='')
    parser.add_argument('--username', help='preferable username to check with', type=str)
    parser.add_argument(
        "--info",
        "-vv",
        action="store_true",
        dest="info",
        default=False,
        help="Display service information.",
    )
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        dest="verbose",
        default=False,
        help="Display extra information and metrics.",
    )
    parser.add_argument(
        "-d",
        "--debug",
        "-vvv",
        action="store_true",
        dest="debug",
        default=False,
        help="Saving debugging information and sites responses in debug.txt.",
    )
    parser.add_argument("urls_file",
                        metavar='URLS_FILE',
                        action="store",
                        help="File with base site URLs"
                        )
    args = parser.parse_args()

    # Map verbosity flags to a log level (most verbose flag wins).
    log_level = logging.ERROR
    if args.debug:
        log_level = logging.DEBUG
    elif args.info:
        log_level = logging.INFO
    elif args.verbose:
        log_level = logging.WARNING

    logging.basicConfig(
        format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
        datefmt='%H:%M:%S',
        level=log_level
    )
    logger = logging.getLogger('engines-check')
    logger.setLevel(log_level)

    # Load the existing sites database and its engines.
    db = MaigretDatabase()
    sites_subset = db.load_from_file(args.base_file).sites
    sites = {site.name: site for site in sites_subset}
    engines = db.engines

    # TODO: usernames extractors
    # Usernames likely to exist on most sites (positive probes)...
    ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john']
    if args.username:
        ok_usernames = [args.username] + ok_usernames
    # ...and one that should exist nowhere (negative probe).
    bad_usernames = ['noonewouldeverusethis7']

    # Candidate site URLs, one per line.
    with open(args.urls_file, 'r') as urls_file:
        urls = urls_file.read().splitlines()

    if args.random:
        random.shuffle(urls)
    urls = urls[:args.top]

    # Serialized database used for a cheap substring duplicate check below.
    raw_maigret_data = json.dumps({site.name: site.json for site in sites_subset})

    new_sites = []
    for site in tqdm.asyncio.tqdm(urls):
        site_lowercase = site.lower()
        # Strip scheme and trailing slashes, keep only the bare domain.
        domain_raw = URL_RE.sub('', site_lowercase).strip().strip('/')
        domain_raw = domain_raw.split('/')[0]

        if args.filter and args.filter not in domain_raw:
            logger.debug('Site %s skipped due to filtering by "%s"', domain_raw, args.filter)
            continue
        # Substring check against the serialized DB: skip already-known domains.
        if domain_raw in raw_maigret_data:
            logger.debug(f'Site {domain_raw} already exists in the Maigret database!')
            continue
        # A quote in the domain would corrupt the JSON database.
        if '"' in domain_raw:
            logger.debug(f'Invalid site {domain_raw}')
            continue

        # scheme://host part only (first three slash-separated segments).
        main_page_url = '/'.join(site.split('/', 3)[:3])
        site_data = {
            'url': site,
            'urlMain': main_page_url,
            'name': domain_raw,
        }

        # Best-effort fetch of the main page; failures just mean "no engine detected".
        try:
            r = requests.get(main_page_url, timeout=5)
        except:
            r = None
            pass

        # An engine is detected when ALL of its marker strings occur in the page.
        detected_engines = []
        for e in engines:
            strs_to_check = e.__dict__.get('presenseStrs')
            if strs_to_check and r and r.text:
                all_strs_in_response = True
                for s in strs_to_check:
                    if not s in r.text:
                        all_strs_in_response = False
                if all_strs_in_response:
                    engine_name = e.__dict__.get('name')
                    detected_engines.append(engine_name)
                    logger.info(f'Detected engine {engine_name} for site {main_page_url}')

        # --only-engine narrows detection; --add-engine is a fallback when nothing matched.
        if args.only_engine and args.only_engine in detected_engines:
            detected_engines = [args.only_engine]
        elif not detected_engines and args.add_engine:
            logging.debug('Could not detect any engine, applying default engine %s...', args.add_engine)
            detected_engines = [args.add_engine]

        def create_site_from_engine(sitename, data, e):
            # Build a MaigretSite seeded with the engine's default check data.
            site = MaigretSite(sitename, data)
            site.update_from_engine(db.engines_dict[e])
            site.engine = e
            return site

        for engine_name in detected_engines:
            site = create_site_from_engine(domain_raw, site_data, engine_name)
            new_sites.append(site)
            logger.debug(site.json)

            # if engine_name == "phpBB":
            #     site_data_with_subpath = dict(site_data)
            #     site_data_with_subpath["urlSubpath"] = "/forum"
            #     site = create_site_from_engine(domain_raw, site_data_with_subpath, engine_name)
            #     new_sites.append(site)

        # except Exception as e:
        #     print(f'Error: {str(e)}')
        #     pass

    print(f'Found {len(new_sites)}/{len(urls)} new sites')

    # --check: just list the candidates and exit without username validation.
    if args.check:
        for s in new_sites:
            print(s.url_main)
        sys.exit(0)

    # Validate candidates concurrently (at most 20 at a time).
    sem = asyncio.Semaphore(20)
    loop = asyncio.get_event_loop()
    ok_sites = []
    tasks = []
    for site in new_sites:
        check_coro = check_and_add_maigret_site(site, sem, logger, ok_usernames, bad_usernames)
        future = asyncio.ensure_future(check_coro)
        tasks.append(future)

    for f in tqdm.asyncio.tqdm.as_completed(tasks, timeout=TIMEOUT):
        try:
            loop.run_until_complete(f)
        except asyncio.exceptions.TimeoutError:
            # Slow site checks are simply abandoned.
            pass

    print(f'Found and saved {len(ok_sites)} sites!')
+36
View File
@@ -0,0 +1,36 @@
import sys
import difflib
import requests
# Fetch the two pages to compare; URLs are passed as CLI arguments.
a = requests.get(sys.argv[1]).text
b = requests.get(sys.argv[2]).text

# Tokenize each page on double quotes (roughly splits HTML into attribute
# values and text chunks) and compute the symmetric differences.
tokens_a = set(a.split('"'))
tokens_b = set(b.split('"'))

a_minus_b = tokens_a.difference(tokens_b)
b_minus_a = tokens_b.difference(tokens_a)

print(a_minus_b)
print(b_minus_a)
print(len(a_minus_b))
print(len(b_minus_a))

# Marker strings that typically distinguish a profile page from a
# "not found" page (includes Russian equivalents).
desired_strings = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
                   "birthday", "репутация", "информация", "e-mail"]
def get_match_ratio(x, candidates=None):
    """Score *x* by its best fuzzy similarity to a set of marker strings.

    Args:
        x: Token to score; lowercased before comparison.
        candidates: Optional iterable of marker strings to match against.
            Defaults to the module-level ``desired_strings`` list, keeping
            the original call sites (``key=get_match_ratio``) unchanged.

    Returns:
        The highest ``difflib.SequenceMatcher`` ratio of *x* against any
        candidate, rounded to 2 decimal places (0.0 means no similarity).
    """
    if candidates is None:
        candidates = desired_strings
    # Generator avoids building an intermediate list just to take max().
    return round(
        max(difflib.SequenceMatcher(a=x.lower(), b=y).ratio() for y in candidates),
        2,
    )
# NOTE(review): RATIO is defined but never used in the visible script —
# presumably intended as a similarity cutoff; confirm or remove.
RATIO = 0.6

# Show the 10 tokens from each page's unique set that look most like
# profile-page markers.
print(sorted(a_minus_b, key=get_match_ratio, reverse=True)[:10])
print(sorted(b_minus_a, key=get_match_ratio, reverse=True)[:10])
+4
View File
@@ -140,4 +140,8 @@ Rank data fetched from Alexa by domains.
site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')
db.save_to_file(args.base_file)
statistics_text = db.get_db_stats(is_markdown=True)
site_file.write('## Statistics\n\n')
site_file.write(statistics_text)
print("\nFinished updating supported site listing!")