diff --git a/README.md b/README.md index f5404eb..074fc88 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ maigret user --tags photo,dating maigret user1 user2 user3 -a ``` -Use `maigret --help` to get full options description. Also options are documented in [the Maigret Wiki](https://github.com/soxoj/maigret/wiki/Command-line-options). +Use `maigret --help` to get full options description. Also options [are documented](https://maigret.readthedocs.io/en/latest/command-line-options.html). ## Demo with page parsing and recursive username search diff --git a/docs/source/development.rst b/docs/source/development.rst new file mode 100644 index 0000000..95ae37a --- /dev/null +++ b/docs/source/development.rst @@ -0,0 +1,70 @@ +.. _development: + +Development +============== + +How to publish new version of Maigret +------------------------------------- + +**Collaborator rights are required, write Soxoj to get them**. + +For new version publishing you must create a new branch in the repository +with a bumped version number and actual changelog first. After that you +must create a release, and a GitHub action automatically creates a new +PyPI package. + +- New branch example: https://github.com/soxoj/maigret/commit/e520418f6a25d7edacde2d73b41a8ae7c80ddf39 +- Release example: https://github.com/soxoj/maigret/releases/tag/v0.4.1 + +1. Make a new branch locally with a new version name. Check the current version number here: https://pypi.org/project/maigret/. +**Increase only patch version (third number)** if there are no breaking changes. + +.. code-block:: console + + git checkout -b 0.4.0 + +2. Update Maigret version in three files manually: + +- setup.py +- maigret/__version__.py +- docs/source/conf.py + +3. Create a new empty text section in the beginning of the file `CHANGELOG.md` with a current date: + +.. code-block:: console + + ## [0.4.0] - 2022-01-03 + +4. 
Get auto-generated release notes: + +- Open https://github.com/soxoj/maigret/releases/new +- Click `Choose a tag`, enter `test` +- Click `Create new tag` +- Press `+ Auto-generate release notes` +- Copy all the text from the description text field below +- Paste it into the empty text section in `CHANGELOG.md` +- Remove the redundant line `## What's Changed` and the `## New Contributors` section if it exists +- *Close the new release page* + +5. Commit all the changes, push, and make a pull request + +.. code-block:: console + + git add ... + git commit -m 'Bump to 0.4.0' + git push origin head + + +6. Merge the pull request + +7. Create a new release + +- Open https://github.com/soxoj/maigret/releases/new again +- Click `Choose a tag` +- Enter the actual version in the format `v0.4.0` +- Also enter the actual version in the field `Release title` +- Click `Create new tag` +- Press `+ Auto-generate release notes` +- **Press "Publish release" button** + +8. That's all; now you can simply wait for the push to PyPI. You can monitor it on the Actions page: https://github.com/soxoj/maigret/actions/workflows/python-publish.yml \ No newline at end of file diff --git a/docs/source/index.rst b/docs/source/index.rst index 273e4cf..85a5562 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -28,3 +28,4 @@ You may be interested in: tags usage-examples settings + development diff --git a/maigret/maigret.py b/maigret/maigret.py index 75b0c7f..224b33c 100755 --- a/maigret/maigret.py +++ b/maigret/maigret.py @@ -566,7 +566,7 @@ async def main(): # Database statistics if args.stats: - print(db.get_db_stats(db.sites_dict)) + print(db.get_db_stats()) report_dir = path.join(os.getcwd(), args.folderoutput) diff --git a/maigret/sites.py b/maigret/sites.py index 5d40076..9ea540d 100644 --- a/maigret/sites.py +++ b/maigret/sites.py @@ -419,9 +419,8 @@ class MaigretDatabase: results[_id] = _type return results - def get_db_stats(self, sites_dict): - if not sites_dict: - sites_dict = self.sites_dict() + def get_db_stats(self, 
is_markdown=False): + sites_dict = self.sites_dict urls = {} tags = {} @@ -429,6 +428,9 @@ class MaigretDatabase: disabled_count = 0 total_count = len(sites_dict) + message_checks = 0 + message_checks_one_factor = 0 + for _, site in sites_dict.items(): if site.disabled: disabled_count += 1 @@ -436,24 +438,34 @@ class MaigretDatabase: url_type = site.get_url_template() urls[url_type] = urls.get(url_type, 0) + 1 + if site.check_type == 'message': + message_checks += 1 + if site.absence_strs and site.presense_strs: + continue + message_checks_one_factor += 1 + if not site.tags: tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1 for tag in filter(lambda x: not is_country_tag(x), site.tags): tags[tag] = tags.get(tag, 0) + 1 - output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n" - output += "Top profile URLs:\n" - for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:20]: + output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n\n" + output += f"Incomplete checks: {message_checks_one_factor}/{message_checks} (false positive risks)\n\n" + + top_urls_count = 20 + output += f"Top {top_urls_count} profile URLs:\n" + for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:top_urls_count]: if count == 1: break - output += f"{count}\t{url}\n" + output += f"- ({count})\t`{url}`\n" if is_markdown else f"{count}\t{url}\n" - output += "Top tags:\n" - for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]: + top_tags_count = 20 + output += f"\nTop {top_tags_count} tags:\n" + for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:top_tags_count]: mark = "" if tag not in self._tags: mark = " (non-standard)" - output += f"{count}\t{tag}{mark}\n" + output += f"- ({count})\t`{tag}`{mark}\n" if is_markdown else f"{count}\t{tag}{mark}\n" return output diff --git a/utils/__init__.py b/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/utils/check_engines.py b/utils/check_engines.py new file mode 100755 index 0000000..40838ca --- /dev/null +++ b/utils/check_engines.py @@ -0,0 +1,152 @@ +#!/usr/bin/env python3 +"""Maigret: Supported Site Listing with Alexa ranking and country tags +This module generates the listing of supported sites in file `SITES.md` +and pretty prints file with sites data. +""" +import aiohttp +import asyncio +import json +import sys +import requests +import logging +import threading +import xml.etree.ElementTree as ET +from datetime import datetime +from argparse import ArgumentParser, RawDescriptionHelpFormatter + +import tqdm.asyncio + +from maigret.maigret import get_response, site_self_check +from maigret.sites import MaigretSite, MaigretDatabase, MaigretEngine +from maigret.utils import CaseConverter + + +async def check_engine_of_site(site_name, sites_with_engines, future, engine_name, semaphore, logger): + async with semaphore: + response = await get_response(request_future=future, + site_name=site_name, + logger=logger) + + html_text, status_code, error_text, expection_text = response + + if html_text and engine_name in html_text: + sites_with_engines.append(site_name) + return True + return False + + +if __name__ == '__main__': + parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter + ) + parser.add_argument("--base","-b", metavar="BASE_FILE", + dest="base_file", default="maigret/resources/data.json", + help="JSON file with sites data to update.") + + parser.add_argument('--engine', '-e', help='check only selected engine', type=str) + + args = parser.parse_args() + + log_level = logging.INFO + logging.basicConfig( + format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s', + datefmt='%H:%M:%S', + level=log_level + ) + logger = logging.getLogger('engines-check') + logger.setLevel(log_level) + + db = MaigretDatabase() + sites_subset = db.load_from_file(args.base_file).sites + sites = {site.name: site for site in sites_subset} + + with 
open(args.base_file, "r", encoding="utf-8") as data_file: + sites_info = json.load(data_file) + engines = sites_info['engines'] + + for engine_name, engine_data in engines.items(): + if args.engine and args.engine != engine_name: + continue + + if not 'presenseStrs' in engine_data: + print(f'No features to automatically detect sites on engine {engine_name}') + continue + + engine_obj = MaigretEngine(engine_name, engine_data) + + # setup connections for checking both engine and usernames + connector = aiohttp.TCPConnector(ssl=False) + connector.verify_ssl=False + session = aiohttp.ClientSession(connector=connector) + + sem = asyncio.Semaphore(100) + loop = asyncio.get_event_loop() + tasks = [] + + # check sites without engine if they look like sites on this engine + new_engine_sites = [] + for site_name, site_data in sites.items(): + if site_data.engine: + continue + + future = session.get(url=site_data.url_main, + allow_redirects=True, + timeout=10, + ) + + check_engine_coro = check_engine_of_site(site_name, new_engine_sites, future, engine_name, sem, logger) + future = asyncio.ensure_future(check_engine_coro) + tasks.append(future) + + # progress bar + for f in tqdm.asyncio.tqdm.as_completed(tasks): + loop.run_until_complete(f) + + print(f'Total detected {len(new_engine_sites)} sites on engine {engine_name}') + # dict with new found engine sites + new_sites = {site_name: sites[site_name] for site_name in new_engine_sites} + + # update sites obj from engine + for site_name, site in new_sites.items(): + site.request_future = None + site.engine = engine_name + site.update_from_engine(engine_obj) + + async def update_site_data(site_name, site_data, all_sites, logger, no_progressbar): + updates = await site_self_check(site_name, site_data, logger, no_progressbar) + all_sites[site_name].update(updates) + + tasks = [] + # for new_site_name, new_site_data in new_sites.items(): + # coro = update_site_data(new_site_name, new_site_data, new_sites, logger) + # future = 
asyncio.ensure_future(coro) + # tasks.append(future) + + # asyncio.gather(*tasks) + for new_site_name, new_site_data in new_sites.items(): + coro = update_site_data(new_site_name, new_site_data, new_sites, logger, no_progressbar=True) + loop.run_until_complete(coro) + + updated_sites_count = 0 + + for s in new_sites: + site = new_sites[s] + site.request_future = None + + if site.disabled: + print(f'{site.name} failed username checking of engine {engine_name}') + continue + + site = site.strip_engine_data() + + db.update_site(site) + updated_sites_count += 1 + db.save_to_file(args.base_file) + + print(f'Site "{s}": ' + json.dumps(site.json, indent=4)) + + print(f'Updated total {updated_sites_count} sites!') + print(f'Checking all sites on engine {engine_name}') + + loop.run_until_complete(session.close()) + + print("\nFinished updating supported site listing!") diff --git a/utils/import_sites.py b/utils/import_sites.py new file mode 100755 index 0000000..5ed2346 --- /dev/null +++ b/utils/import_sites.py @@ -0,0 +1,280 @@ +#!/usr/bin/env python3 +import json +import random +import re + +import tqdm.asyncio +from mock import Mock +import requests + +from maigret.maigret import * +from maigret.result import QueryStatus +from maigret.sites import MaigretSite + +URL_RE = re.compile(r"https?://(www\.)?") +TIMEOUT = 200 + + +async def maigret_check(site, site_data, username, status, logger): + query_notify = Mock() + logger.debug(f'Checking {site}...') + + for username, status in [(username, status)]: + results = await maigret( + username, + {site: site_data}, + logger, + query_notify, + timeout=TIMEOUT, + forced=True, + no_progressbar=True, + ) + + if results[site]['status'].status != status: + if results[site]['status'].status == QueryStatus.UNKNOWN: + msg = site_data.absence_strs + etype = site_data.check_type + context = results[site]['status'].context + + logger.debug(f'Error while searching {username} in {site}, must be claimed. 
Context: {context}') + # if site_data.get('errors'): + # continue + return False + + if status == QueryStatus.CLAIMED: + logger.debug(f'Not found {username} in {site}, must be claimed') + logger.debug(results[site]) + pass + else: + logger.debug(f'Found {username} in {site}, must be available') + logger.debug(results[site]) + pass + return False + + return site_data + + +async def check_and_add_maigret_site(site_data, semaphore, logger, ok_usernames, bad_usernames): + async with semaphore: + sitename = site_data.name + positive = False + negative = False + + for ok_username in ok_usernames: + site_data.username_claimed = ok_username + status = QueryStatus.CLAIMED + if await maigret_check(sitename, site_data, ok_username, status, logger): + # print(f'{sitename} positive case is okay') + positive = True + break + + for bad_username in bad_usernames: + site_data.username_unclaimed = bad_username + status = QueryStatus.AVAILABLE + if await maigret_check(sitename, site_data, bad_username, status, logger): + # print(f'{sitename} negative case is okay') + negative = True + break + + if positive and negative: + site_data = site_data.strip_engine_data() + + db.update_site(site_data) + print(site_data.json) + try: + db.save_to_file(args.base_file) + except Exception as e: + logging.error(e, exc_info=True) + print(f'Saved new site {sitename}...') + ok_sites.append(site_data) + + +if __name__ == '__main__': + parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter + ) + parser.add_argument("--base", "-b", metavar="BASE_FILE", + dest="base_file", default="maigret/resources/data.json", + help="JSON file with sites data to update.") + + parser.add_argument("--add-engine", dest="add_engine", help="Additional engine to check") + + parser.add_argument("--only-engine", dest="only_engine", help="Use only this engine from detected to check") + + parser.add_argument('--check', help='only check sites in database', action='store_true') + + parser.add_argument('--random', 
help='shuffle list of urls', action='store_true', default=False) + + parser.add_argument('--top', help='top count of records in file', type=int, default=10000) + + parser.add_argument('--filter', help='substring to filter input urls', type=str, default='') + + parser.add_argument('--username', help='preferable username to check with', type=str) + + parser.add_argument( + "--info", + "-vv", + action="store_true", + dest="info", + default=False, + help="Display service information.", + ) + parser.add_argument( + "--verbose", + "-v", + action="store_true", + dest="verbose", + default=False, + help="Display extra information and metrics.", + ) + parser.add_argument( + "-d", + "--debug", + "-vvv", + action="store_true", + dest="debug", + default=False, + help="Saving debugging information and sites responses in debug.txt.", + ) + + parser.add_argument("urls_file", + metavar='URLS_FILE', + action="store", + help="File with base site URLs" + ) + + args = parser.parse_args() + + log_level = logging.ERROR + if args.debug: + log_level = logging.DEBUG + elif args.info: + log_level = logging.INFO + elif args.verbose: + log_level = logging.WARNING + + logging.basicConfig( + format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s', + datefmt='%H:%M:%S', + level=log_level + ) + logger = logging.getLogger('engines-check') + logger.setLevel(log_level) + + db = MaigretDatabase() + sites_subset = db.load_from_file(args.base_file).sites + sites = {site.name: site for site in sites_subset} + engines = db.engines + + # TODO: usernames extractors + ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john'] + if args.username: + ok_usernames = [args.username] + ok_usernames + + bad_usernames = ['noonewouldeverusethis7'] + + with open(args.urls_file, 'r') as urls_file: + urls = urls_file.read().splitlines() + if args.random: + random.shuffle(urls) + urls = urls[:args.top] + + raw_maigret_data = json.dumps({site.name: site.json for site in sites_subset}) + + new_sites 
= [] + for site in tqdm.asyncio.tqdm(urls): + site_lowercase = site.lower() + + domain_raw = URL_RE.sub('', site_lowercase).strip().strip('/') + domain_raw = domain_raw.split('/')[0] + + if args.filter and args.filter not in domain_raw: + logger.debug('Site %s skipped due to filtering by "%s"', domain_raw, args.filter) + continue + + if domain_raw in raw_maigret_data: + logger.debug(f'Site {domain_raw} already exists in the Maigret database!') + continue + + if '"' in domain_raw: + logger.debug(f'Invalid site {domain_raw}') + continue + + main_page_url = '/'.join(site.split('/', 3)[:3]) + + site_data = { + 'url': site, + 'urlMain': main_page_url, + 'name': domain_raw, + } + + try: + r = requests.get(main_page_url, timeout=5) + except: + r = None + pass + + detected_engines = [] + + for e in engines: + strs_to_check = e.__dict__.get('presenseStrs') + if strs_to_check and r and r.text: + all_strs_in_response = True + for s in strs_to_check: + if not s in r.text: + all_strs_in_response = False + if all_strs_in_response: + engine_name = e.__dict__.get('name') + detected_engines.append(engine_name) + logger.info(f'Detected engine {engine_name} for site {main_page_url}') + + if args.only_engine and args.only_engine in detected_engines: + detected_engines = [args.only_engine] + elif not detected_engines and args.add_engine: + logging.debug('Could not detect any engine, applying default engine %s...', args.add_engine) + detected_engines = [args.add_engine] + + def create_site_from_engine(sitename, data, e): + site = MaigretSite(sitename, data) + site.update_from_engine(db.engines_dict[e]) + site.engine = e + return site + + for engine_name in detected_engines: + site = create_site_from_engine(domain_raw, site_data, engine_name) + new_sites.append(site) + logger.debug(site.json) + + # if engine_name == "phpBB": + # site_data_with_subpath = dict(site_data) + # site_data_with_subpath["urlSubpath"] = "/forum" + # site = create_site_from_engine(domain_raw, 
site_data_with_subpath, engine_name) + # new_sites.append(site) + + # except Exception as e: + # print(f'Error: {str(e)}') + # pass + + print(f'Found {len(new_sites)}/{len(urls)} new sites') + + if args.check: + for s in new_sites: + print(s.url_main) + sys.exit(0) + + sem = asyncio.Semaphore(20) + loop = asyncio.get_event_loop() + + ok_sites = [] + tasks = [] + for site in new_sites: + check_coro = check_and_add_maigret_site(site, sem, logger, ok_usernames, bad_usernames) + future = asyncio.ensure_future(check_coro) + tasks.append(future) + + for f in tqdm.asyncio.tqdm.as_completed(tasks, timeout=TIMEOUT): + try: + loop.run_until_complete(f) + except asyncio.exceptions.TimeoutError: + pass + + print(f'Found and saved {len(ok_sites)} sites!') diff --git a/utils/sites_diff.py b/utils/sites_diff.py new file mode 100644 index 0000000..32a69f8 --- /dev/null +++ b/utils/sites_diff.py @@ -0,0 +1,36 @@ +import sys +import difflib +import requests + + +a = requests.get(sys.argv[1]).text +b = requests.get(sys.argv[2]).text + + +tokens_a = set(a.split('"')) +tokens_b = set(b.split('"')) + +a_minus_b = tokens_a.difference(tokens_b) +b_minus_a = tokens_b.difference(tokens_a) + +print(a_minus_b) +print(b_minus_a) + +print(len(a_minus_b)) +print(len(b_minus_a)) + +desired_strings = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography", +"birthday", "репутация", "информация", "e-mail"] + + +def get_match_ratio(x): + return round(max([ + difflib.SequenceMatcher(a=x.lower(), b=y).ratio() + for y in desired_strings + ]), 2) + + +RATIO = 0.6 + +print(sorted(a_minus_b, key=get_match_ratio, reverse=True)[:10]) +print(sorted(b_minus_a, key=get_match_ratio, reverse=True)[:10]) \ No newline at end of file diff --git a/utils/update_site_data.py b/utils/update_site_data.py index 12180fb..106444a 100755 --- a/utils/update_site_data.py +++ b/utils/update_site_data.py @@ -140,4 +140,8 @@ Rank data fetched from Alexa by domains. 
site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n') db.save_to_file(args.base_file) + statistics_text = db.get_db_stats(is_markdown=True) + site_file.write('## Statistics\n\n') + site_file.write(statistics_text) + print("\nFinished updating supported site listing!")