mirror of
https://github.com/soxoj/maigret.git
synced 2026-05-07 14:34:33 +00:00
Merge branch 'main' of https://github.com/soxoj/maigret into site_adds
This commit is contained in:
@@ -101,7 +101,7 @@ maigret user --tags photo,dating
|
|||||||
maigret user1 user2 user3 -a
|
maigret user1 user2 user3 -a
|
||||||
```
|
```
|
||||||
|
|
||||||
Use `maigret --help` to get full options description. Also options are documented in [the Maigret Wiki](https://github.com/soxoj/maigret/wiki/Command-line-options).
|
Use `maigret --help` to get the full description of the options. The options are also [documented here](https://maigret.readthedocs.io/en/latest/command-line-options.html).
|
||||||
|
|
||||||
|
|
||||||
## Demo with page parsing and recursive username search
|
## Demo with page parsing and recursive username search
|
||||||
|
|||||||
@@ -0,0 +1,70 @@
|
|||||||
|
.. _development:
|
||||||
|
|
||||||
|
Development
|
||||||
|
==============
|
||||||
|
|
||||||
|
How to publish new version of Maigret
|
||||||
|
-------------------------------------
|
||||||
|
|
||||||
|
**Collaborator rights are required; write to Soxoj to get them**.
|
||||||
|
|
||||||
|
To publish a new version, you must create a new branch in the repository
|
||||||
|
with a bumped version number and an up-to-date changelog first. After that, you
|
||||||
|
must create a release, and a GitHub Action will automatically create a new
|
||||||
|
PyPI package.
|
||||||
|
|
||||||
|
- New branch example: https://github.com/soxoj/maigret/commit/e520418f6a25d7edacde2d73b41a8ae7c80ddf39
|
||||||
|
- Release example: https://github.com/soxoj/maigret/releases/tag/v0.4.1
|
||||||
|
|
||||||
|
1. Make a new branch locally with a new version name. Check the current version number here: https://pypi.org/project/maigret/.
|
||||||
|
**Increase only the patch version (the third number)** if there are no breaking changes.
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
git checkout -b 0.4.0
|
||||||
|
|
||||||
|
2. Update Maigret version in three files manually:
|
||||||
|
|
||||||
|
- setup.py
|
||||||
|
- maigret/__version__.py
|
||||||
|
- docs/source/conf.py
|
||||||
|
|
||||||
|
3. Create a new empty text section at the beginning of the file `CHANGELOG.md` with the current date:
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
## [0.4.0] - 2022-01-03
|
||||||
|
|
||||||
|
4. Get auto-generated release notes:
|
||||||
|
|
||||||
|
- Open https://github.com/soxoj/maigret/releases/new
|
||||||
|
- Click `Choose a tag`, enter `test`
|
||||||
|
- Click `Create new tag`
|
||||||
|
- Press `+ Auto-generate release notes`
|
||||||
|
- Copy all the text from description text field below
|
||||||
|
- Paste it into the empty text section in `CHANGELOG.md`
|
||||||
|
- Remove redundant lines `## What's Changed` and `## New Contributors` section if it exists
|
||||||
|
- *Close the new release page*
|
||||||
|
|
||||||
|
5. Commit all the changes, push, make pull request
|
||||||
|
|
||||||
|
.. code-block:: console
|
||||||
|
|
||||||
|
git add ...
|
||||||
|
git commit -m 'Bump to 0.4.0'
|
||||||
|
git push origin HEAD
|
||||||
|
|
||||||
|
|
||||||
|
6. Merge pull request
|
||||||
|
|
||||||
|
7. Create new release
|
||||||
|
|
||||||
|
- Open https://github.com/soxoj/maigret/releases/new again
|
||||||
|
- Click `Choose a tag`
|
||||||
|
- Enter the actual version in the format `v0.4.0`
|
||||||
|
- Also enter the actual version in the field `Release title`
|
||||||
|
- Click `Create new tag`
|
||||||
|
- Press `+ Auto-generate release notes`
|
||||||
|
- **Press "Publish release" button**
|
||||||
|
|
||||||
|
8. That's all; now you can simply wait for the package to be pushed to PyPI. You can monitor it on the Actions page: https://github.com/soxoj/maigret/actions/workflows/python-publish.yml
|
||||||
@@ -28,3 +28,4 @@ You may be interested in:
|
|||||||
tags
|
tags
|
||||||
usage-examples
|
usage-examples
|
||||||
settings
|
settings
|
||||||
|
development
|
||||||
|
|||||||
+1
-1
@@ -566,7 +566,7 @@ async def main():
|
|||||||
|
|
||||||
# Database statistics
|
# Database statistics
|
||||||
if args.stats:
|
if args.stats:
|
||||||
print(db.get_db_stats(db.sites_dict))
|
print(db.get_db_stats())
|
||||||
|
|
||||||
report_dir = path.join(os.getcwd(), args.folderoutput)
|
report_dir = path.join(os.getcwd(), args.folderoutput)
|
||||||
|
|
||||||
|
|||||||
+22
-10
@@ -419,9 +419,8 @@ class MaigretDatabase:
|
|||||||
results[_id] = _type
|
results[_id] = _type
|
||||||
return results
|
return results
|
||||||
|
|
||||||
def get_db_stats(self, sites_dict):
|
def get_db_stats(self, is_markdown=False):
|
||||||
if not sites_dict:
|
sites_dict = self.sites_dict
|
||||||
sites_dict = self.sites_dict()
|
|
||||||
|
|
||||||
urls = {}
|
urls = {}
|
||||||
tags = {}
|
tags = {}
|
||||||
@@ -429,6 +428,9 @@ class MaigretDatabase:
|
|||||||
disabled_count = 0
|
disabled_count = 0
|
||||||
total_count = len(sites_dict)
|
total_count = len(sites_dict)
|
||||||
|
|
||||||
|
message_checks = 0
|
||||||
|
message_checks_one_factor = 0
|
||||||
|
|
||||||
for _, site in sites_dict.items():
|
for _, site in sites_dict.items():
|
||||||
if site.disabled:
|
if site.disabled:
|
||||||
disabled_count += 1
|
disabled_count += 1
|
||||||
@@ -436,24 +438,34 @@ class MaigretDatabase:
|
|||||||
url_type = site.get_url_template()
|
url_type = site.get_url_template()
|
||||||
urls[url_type] = urls.get(url_type, 0) + 1
|
urls[url_type] = urls.get(url_type, 0) + 1
|
||||||
|
|
||||||
|
if site.check_type == 'message':
|
||||||
|
message_checks += 1
|
||||||
|
if site.absence_strs and site.presense_strs:
|
||||||
|
continue
|
||||||
|
message_checks_one_factor += 1
|
||||||
|
|
||||||
if not site.tags:
|
if not site.tags:
|
||||||
tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1
|
tags["NO_TAGS"] = tags.get("NO_TAGS", 0) + 1
|
||||||
|
|
||||||
for tag in filter(lambda x: not is_country_tag(x), site.tags):
|
for tag in filter(lambda x: not is_country_tag(x), site.tags):
|
||||||
tags[tag] = tags.get(tag, 0) + 1
|
tags[tag] = tags.get(tag, 0) + 1
|
||||||
|
|
||||||
output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n"
|
output += f"Enabled/total sites: {total_count - disabled_count}/{total_count}\n\n"
|
||||||
output += "Top profile URLs:\n"
|
output += f"Incomplete checks: {message_checks_one_factor}/{message_checks} (false positive risks)\n\n"
|
||||||
for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:20]:
|
|
||||||
|
top_urls_count = 20
|
||||||
|
output += f"Top {top_urls_count} profile URLs:\n"
|
||||||
|
for url, count in sorted(urls.items(), key=lambda x: x[1], reverse=True)[:top_urls_count]:
|
||||||
if count == 1:
|
if count == 1:
|
||||||
break
|
break
|
||||||
output += f"{count}\t{url}\n"
|
output += f"- ({count})\t`{url}`\n" if is_markdown else f"{count}\t{url}\n"
|
||||||
|
|
||||||
output += "Top tags:\n"
|
top_tags_count = 20
|
||||||
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:200]:
|
output += f"\nTop {top_tags_count} tags:\n"
|
||||||
|
for tag, count in sorted(tags.items(), key=lambda x: x[1], reverse=True)[:top_tags_count]:
|
||||||
mark = ""
|
mark = ""
|
||||||
if tag not in self._tags:
|
if tag not in self._tags:
|
||||||
mark = " (non-standard)"
|
mark = " (non-standard)"
|
||||||
output += f"{count}\t{tag}{mark}\n"
|
output += f"- ({count})\t`{tag}`{mark}\n" if is_markdown else f"{count}\t{tag}{mark}\n"
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|||||||
Executable
+152
@@ -0,0 +1,152 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Maigret: Supported Site Listing with Alexa ranking and country tags
|
||||||
|
This module generates the listing of supported sites in file `SITES.md`
|
||||||
|
and pretty prints file with sites data.
|
||||||
|
"""
|
||||||
|
import aiohttp
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import requests
|
||||||
|
import logging
|
||||||
|
import threading
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from datetime import datetime
|
||||||
|
from argparse import ArgumentParser, RawDescriptionHelpFormatter
|
||||||
|
|
||||||
|
import tqdm.asyncio
|
||||||
|
|
||||||
|
from maigret.maigret import get_response, site_self_check
|
||||||
|
from maigret.sites import MaigretSite, MaigretDatabase, MaigretEngine
|
||||||
|
from maigret.utils import CaseConverter
|
||||||
|
|
||||||
|
|
||||||
|
async def check_engine_of_site(site_name, sites_with_engines, future, engine_name, semaphore, logger):
|
||||||
|
async with semaphore:
|
||||||
|
response = await get_response(request_future=future,
|
||||||
|
site_name=site_name,
|
||||||
|
logger=logger)
|
||||||
|
|
||||||
|
html_text, status_code, error_text, expection_text = response
|
||||||
|
|
||||||
|
if html_text and engine_name in html_text:
|
||||||
|
sites_with_engines.append(site_name)
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
|
||||||
|
)
|
||||||
|
parser.add_argument("--base","-b", metavar="BASE_FILE",
|
||||||
|
dest="base_file", default="maigret/resources/data.json",
|
||||||
|
help="JSON file with sites data to update.")
|
||||||
|
|
||||||
|
parser.add_argument('--engine', '-e', help='check only selected engine', type=str)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
log_level = logging.INFO
|
||||||
|
logging.basicConfig(
|
||||||
|
format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
|
||||||
|
datefmt='%H:%M:%S',
|
||||||
|
level=log_level
|
||||||
|
)
|
||||||
|
logger = logging.getLogger('engines-check')
|
||||||
|
logger.setLevel(log_level)
|
||||||
|
|
||||||
|
db = MaigretDatabase()
|
||||||
|
sites_subset = db.load_from_file(args.base_file).sites
|
||||||
|
sites = {site.name: site for site in sites_subset}
|
||||||
|
|
||||||
|
with open(args.base_file, "r", encoding="utf-8") as data_file:
|
||||||
|
sites_info = json.load(data_file)
|
||||||
|
engines = sites_info['engines']
|
||||||
|
|
||||||
|
for engine_name, engine_data in engines.items():
|
||||||
|
if args.engine and args.engine != engine_name:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not 'presenseStrs' in engine_data:
|
||||||
|
print(f'No features to automatically detect sites on engine {engine_name}')
|
||||||
|
continue
|
||||||
|
|
||||||
|
engine_obj = MaigretEngine(engine_name, engine_data)
|
||||||
|
|
||||||
|
# setup connections for checking both engine and usernames
|
||||||
|
connector = aiohttp.TCPConnector(ssl=False)
|
||||||
|
connector.verify_ssl=False
|
||||||
|
session = aiohttp.ClientSession(connector=connector)
|
||||||
|
|
||||||
|
sem = asyncio.Semaphore(100)
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
tasks = []
|
||||||
|
|
||||||
|
# check sites without engine if they look like sites on this engine
|
||||||
|
new_engine_sites = []
|
||||||
|
for site_name, site_data in sites.items():
|
||||||
|
if site_data.engine:
|
||||||
|
continue
|
||||||
|
|
||||||
|
future = session.get(url=site_data.url_main,
|
||||||
|
allow_redirects=True,
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
|
|
||||||
|
check_engine_coro = check_engine_of_site(site_name, new_engine_sites, future, engine_name, sem, logger)
|
||||||
|
future = asyncio.ensure_future(check_engine_coro)
|
||||||
|
tasks.append(future)
|
||||||
|
|
||||||
|
# progress bar
|
||||||
|
for f in tqdm.asyncio.tqdm.as_completed(tasks):
|
||||||
|
loop.run_until_complete(f)
|
||||||
|
|
||||||
|
print(f'Total detected {len(new_engine_sites)} sites on engine {engine_name}')
|
||||||
|
# dict with new found engine sites
|
||||||
|
new_sites = {site_name: sites[site_name] for site_name in new_engine_sites}
|
||||||
|
|
||||||
|
# update sites obj from engine
|
||||||
|
for site_name, site in new_sites.items():
|
||||||
|
site.request_future = None
|
||||||
|
site.engine = engine_name
|
||||||
|
site.update_from_engine(engine_obj)
|
||||||
|
|
||||||
|
async def update_site_data(site_name, site_data, all_sites, logger, no_progressbar):
|
||||||
|
updates = await site_self_check(site_name, site_data, logger, no_progressbar)
|
||||||
|
all_sites[site_name].update(updates)
|
||||||
|
|
||||||
|
tasks = []
|
||||||
|
# for new_site_name, new_site_data in new_sites.items():
|
||||||
|
# coro = update_site_data(new_site_name, new_site_data, new_sites, logger)
|
||||||
|
# future = asyncio.ensure_future(coro)
|
||||||
|
# tasks.append(future)
|
||||||
|
|
||||||
|
# asyncio.gather(*tasks)
|
||||||
|
for new_site_name, new_site_data in new_sites.items():
|
||||||
|
coro = update_site_data(new_site_name, new_site_data, new_sites, logger, no_progressbar=True)
|
||||||
|
loop.run_until_complete(coro)
|
||||||
|
|
||||||
|
updated_sites_count = 0
|
||||||
|
|
||||||
|
for s in new_sites:
|
||||||
|
site = new_sites[s]
|
||||||
|
site.request_future = None
|
||||||
|
|
||||||
|
if site.disabled:
|
||||||
|
print(f'{site.name} failed username checking of engine {engine_name}')
|
||||||
|
continue
|
||||||
|
|
||||||
|
site = site.strip_engine_data()
|
||||||
|
|
||||||
|
db.update_site(site)
|
||||||
|
updated_sites_count += 1
|
||||||
|
db.save_to_file(args.base_file)
|
||||||
|
|
||||||
|
print(f'Site "{s}": ' + json.dumps(site.json, indent=4))
|
||||||
|
|
||||||
|
print(f'Updated total {updated_sites_count} sites!')
|
||||||
|
print(f'Checking all sites on engine {engine_name}')
|
||||||
|
|
||||||
|
loop.run_until_complete(session.close())
|
||||||
|
|
||||||
|
print("\nFinished updating supported site listing!")
|
||||||
Executable
+280
@@ -0,0 +1,280 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
import json
|
||||||
|
import random
|
||||||
|
import re
|
||||||
|
|
||||||
|
import tqdm.asyncio
|
||||||
|
from mock import Mock
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from maigret.maigret import *
|
||||||
|
from maigret.result import QueryStatus
|
||||||
|
from maigret.sites import MaigretSite
|
||||||
|
|
||||||
|
URL_RE = re.compile(r"https?://(www\.)?")
|
||||||
|
TIMEOUT = 200
|
||||||
|
|
||||||
|
|
||||||
|
async def maigret_check(site, site_data, username, status, logger):
|
||||||
|
query_notify = Mock()
|
||||||
|
logger.debug(f'Checking {site}...')
|
||||||
|
|
||||||
|
for username, status in [(username, status)]:
|
||||||
|
results = await maigret(
|
||||||
|
username,
|
||||||
|
{site: site_data},
|
||||||
|
logger,
|
||||||
|
query_notify,
|
||||||
|
timeout=TIMEOUT,
|
||||||
|
forced=True,
|
||||||
|
no_progressbar=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
if results[site]['status'].status != status:
|
||||||
|
if results[site]['status'].status == QueryStatus.UNKNOWN:
|
||||||
|
msg = site_data.absence_strs
|
||||||
|
etype = site_data.check_type
|
||||||
|
context = results[site]['status'].context
|
||||||
|
|
||||||
|
logger.debug(f'Error while searching {username} in {site}, must be claimed. Context: {context}')
|
||||||
|
# if site_data.get('errors'):
|
||||||
|
# continue
|
||||||
|
return False
|
||||||
|
|
||||||
|
if status == QueryStatus.CLAIMED:
|
||||||
|
logger.debug(f'Not found {username} in {site}, must be claimed')
|
||||||
|
logger.debug(results[site])
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
logger.debug(f'Found {username} in {site}, must be available')
|
||||||
|
logger.debug(results[site])
|
||||||
|
pass
|
||||||
|
return False
|
||||||
|
|
||||||
|
return site_data
|
||||||
|
|
||||||
|
|
||||||
|
async def check_and_add_maigret_site(site_data, semaphore, logger, ok_usernames, bad_usernames):
|
||||||
|
async with semaphore:
|
||||||
|
sitename = site_data.name
|
||||||
|
positive = False
|
||||||
|
negative = False
|
||||||
|
|
||||||
|
for ok_username in ok_usernames:
|
||||||
|
site_data.username_claimed = ok_username
|
||||||
|
status = QueryStatus.CLAIMED
|
||||||
|
if await maigret_check(sitename, site_data, ok_username, status, logger):
|
||||||
|
# print(f'{sitename} positive case is okay')
|
||||||
|
positive = True
|
||||||
|
break
|
||||||
|
|
||||||
|
for bad_username in bad_usernames:
|
||||||
|
site_data.username_unclaimed = bad_username
|
||||||
|
status = QueryStatus.AVAILABLE
|
||||||
|
if await maigret_check(sitename, site_data, bad_username, status, logger):
|
||||||
|
# print(f'{sitename} negative case is okay')
|
||||||
|
negative = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if positive and negative:
|
||||||
|
site_data = site_data.strip_engine_data()
|
||||||
|
|
||||||
|
db.update_site(site_data)
|
||||||
|
print(site_data.json)
|
||||||
|
try:
|
||||||
|
db.save_to_file(args.base_file)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(e, exc_info=True)
|
||||||
|
print(f'Saved new site {sitename}...')
|
||||||
|
ok_sites.append(site_data)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
parser = ArgumentParser(formatter_class=RawDescriptionHelpFormatter
|
||||||
|
)
|
||||||
|
parser.add_argument("--base", "-b", metavar="BASE_FILE",
|
||||||
|
dest="base_file", default="maigret/resources/data.json",
|
||||||
|
help="JSON file with sites data to update.")
|
||||||
|
|
||||||
|
parser.add_argument("--add-engine", dest="add_engine", help="Additional engine to check")
|
||||||
|
|
||||||
|
parser.add_argument("--only-engine", dest="only_engine", help="Use only this engine from detected to check")
|
||||||
|
|
||||||
|
parser.add_argument('--check', help='only check sites in database', action='store_true')
|
||||||
|
|
||||||
|
parser.add_argument('--random', help='shuffle list of urls', action='store_true', default=False)
|
||||||
|
|
||||||
|
parser.add_argument('--top', help='top count of records in file', type=int, default=10000)
|
||||||
|
|
||||||
|
parser.add_argument('--filter', help='substring to filter input urls', type=str, default='')
|
||||||
|
|
||||||
|
parser.add_argument('--username', help='preferable username to check with', type=str)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--info",
|
||||||
|
"-vv",
|
||||||
|
action="store_true",
|
||||||
|
dest="info",
|
||||||
|
default=False,
|
||||||
|
help="Display service information.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--verbose",
|
||||||
|
"-v",
|
||||||
|
action="store_true",
|
||||||
|
dest="verbose",
|
||||||
|
default=False,
|
||||||
|
help="Display extra information and metrics.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-d",
|
||||||
|
"--debug",
|
||||||
|
"-vvv",
|
||||||
|
action="store_true",
|
||||||
|
dest="debug",
|
||||||
|
default=False,
|
||||||
|
help="Saving debugging information and sites responses in debug.txt.",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument("urls_file",
|
||||||
|
metavar='URLS_FILE',
|
||||||
|
action="store",
|
||||||
|
help="File with base site URLs"
|
||||||
|
)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
log_level = logging.ERROR
|
||||||
|
if args.debug:
|
||||||
|
log_level = logging.DEBUG
|
||||||
|
elif args.info:
|
||||||
|
log_level = logging.INFO
|
||||||
|
elif args.verbose:
|
||||||
|
log_level = logging.WARNING
|
||||||
|
|
||||||
|
logging.basicConfig(
|
||||||
|
format='[%(filename)s:%(lineno)d] %(levelname)-3s %(asctime)s %(message)s',
|
||||||
|
datefmt='%H:%M:%S',
|
||||||
|
level=log_level
|
||||||
|
)
|
||||||
|
logger = logging.getLogger('engines-check')
|
||||||
|
logger.setLevel(log_level)
|
||||||
|
|
||||||
|
db = MaigretDatabase()
|
||||||
|
sites_subset = db.load_from_file(args.base_file).sites
|
||||||
|
sites = {site.name: site for site in sites_subset}
|
||||||
|
engines = db.engines
|
||||||
|
|
||||||
|
# TODO: usernames extractors
|
||||||
|
ok_usernames = ['alex', 'god', 'admin', 'red', 'blue', 'john']
|
||||||
|
if args.username:
|
||||||
|
ok_usernames = [args.username] + ok_usernames
|
||||||
|
|
||||||
|
bad_usernames = ['noonewouldeverusethis7']
|
||||||
|
|
||||||
|
with open(args.urls_file, 'r') as urls_file:
|
||||||
|
urls = urls_file.read().splitlines()
|
||||||
|
if args.random:
|
||||||
|
random.shuffle(urls)
|
||||||
|
urls = urls[:args.top]
|
||||||
|
|
||||||
|
raw_maigret_data = json.dumps({site.name: site.json for site in sites_subset})
|
||||||
|
|
||||||
|
new_sites = []
|
||||||
|
for site in tqdm.asyncio.tqdm(urls):
|
||||||
|
site_lowercase = site.lower()
|
||||||
|
|
||||||
|
domain_raw = URL_RE.sub('', site_lowercase).strip().strip('/')
|
||||||
|
domain_raw = domain_raw.split('/')[0]
|
||||||
|
|
||||||
|
if args.filter and args.filter not in domain_raw:
|
||||||
|
logger.debug('Site %s skipped due to filtering by "%s"', domain_raw, args.filter)
|
||||||
|
continue
|
||||||
|
|
||||||
|
if domain_raw in raw_maigret_data:
|
||||||
|
logger.debug(f'Site {domain_raw} already exists in the Maigret database!')
|
||||||
|
continue
|
||||||
|
|
||||||
|
if '"' in domain_raw:
|
||||||
|
logger.debug(f'Invalid site {domain_raw}')
|
||||||
|
continue
|
||||||
|
|
||||||
|
main_page_url = '/'.join(site.split('/', 3)[:3])
|
||||||
|
|
||||||
|
site_data = {
|
||||||
|
'url': site,
|
||||||
|
'urlMain': main_page_url,
|
||||||
|
'name': domain_raw,
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
r = requests.get(main_page_url, timeout=5)
|
||||||
|
except:
|
||||||
|
r = None
|
||||||
|
pass
|
||||||
|
|
||||||
|
detected_engines = []
|
||||||
|
|
||||||
|
for e in engines:
|
||||||
|
strs_to_check = e.__dict__.get('presenseStrs')
|
||||||
|
if strs_to_check and r and r.text:
|
||||||
|
all_strs_in_response = True
|
||||||
|
for s in strs_to_check:
|
||||||
|
if not s in r.text:
|
||||||
|
all_strs_in_response = False
|
||||||
|
if all_strs_in_response:
|
||||||
|
engine_name = e.__dict__.get('name')
|
||||||
|
detected_engines.append(engine_name)
|
||||||
|
logger.info(f'Detected engine {engine_name} for site {main_page_url}')
|
||||||
|
|
||||||
|
if args.only_engine and args.only_engine in detected_engines:
|
||||||
|
detected_engines = [args.only_engine]
|
||||||
|
elif not detected_engines and args.add_engine:
|
||||||
|
logging.debug('Could not detect any engine, applying default engine %s...', args.add_engine)
|
||||||
|
detected_engines = [args.add_engine]
|
||||||
|
|
||||||
|
def create_site_from_engine(sitename, data, e):
|
||||||
|
site = MaigretSite(sitename, data)
|
||||||
|
site.update_from_engine(db.engines_dict[e])
|
||||||
|
site.engine = e
|
||||||
|
return site
|
||||||
|
|
||||||
|
for engine_name in detected_engines:
|
||||||
|
site = create_site_from_engine(domain_raw, site_data, engine_name)
|
||||||
|
new_sites.append(site)
|
||||||
|
logger.debug(site.json)
|
||||||
|
|
||||||
|
# if engine_name == "phpBB":
|
||||||
|
# site_data_with_subpath = dict(site_data)
|
||||||
|
# site_data_with_subpath["urlSubpath"] = "/forum"
|
||||||
|
# site = create_site_from_engine(domain_raw, site_data_with_subpath, engine_name)
|
||||||
|
# new_sites.append(site)
|
||||||
|
|
||||||
|
# except Exception as e:
|
||||||
|
# print(f'Error: {str(e)}')
|
||||||
|
# pass
|
||||||
|
|
||||||
|
print(f'Found {len(new_sites)}/{len(urls)} new sites')
|
||||||
|
|
||||||
|
if args.check:
|
||||||
|
for s in new_sites:
|
||||||
|
print(s.url_main)
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
sem = asyncio.Semaphore(20)
|
||||||
|
loop = asyncio.get_event_loop()
|
||||||
|
|
||||||
|
ok_sites = []
|
||||||
|
tasks = []
|
||||||
|
for site in new_sites:
|
||||||
|
check_coro = check_and_add_maigret_site(site, sem, logger, ok_usernames, bad_usernames)
|
||||||
|
future = asyncio.ensure_future(check_coro)
|
||||||
|
tasks.append(future)
|
||||||
|
|
||||||
|
for f in tqdm.asyncio.tqdm.as_completed(tasks, timeout=TIMEOUT):
|
||||||
|
try:
|
||||||
|
loop.run_until_complete(f)
|
||||||
|
except asyncio.exceptions.TimeoutError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
print(f'Found and saved {len(ok_sites)} sites!')
|
||||||
@@ -0,0 +1,36 @@
|
|||||||
|
import sys
|
||||||
|
import difflib
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
a = requests.get(sys.argv[1]).text
|
||||||
|
b = requests.get(sys.argv[2]).text
|
||||||
|
|
||||||
|
|
||||||
|
tokens_a = set(a.split('"'))
|
||||||
|
tokens_b = set(b.split('"'))
|
||||||
|
|
||||||
|
a_minus_b = tokens_a.difference(tokens_b)
|
||||||
|
b_minus_a = tokens_b.difference(tokens_a)
|
||||||
|
|
||||||
|
print(a_minus_b)
|
||||||
|
print(b_minus_a)
|
||||||
|
|
||||||
|
print(len(a_minus_b))
|
||||||
|
print(len(b_minus_a))
|
||||||
|
|
||||||
|
desired_strings = ["username", "not found", "пользователь", "profile", "lastname", "firstname", "biography",
|
||||||
|
"birthday", "репутация", "информация", "e-mail"]
|
||||||
|
|
||||||
|
|
||||||
|
def get_match_ratio(x):
|
||||||
|
return round(max([
|
||||||
|
difflib.SequenceMatcher(a=x.lower(), b=y).ratio()
|
||||||
|
for y in desired_strings
|
||||||
|
]), 2)
|
||||||
|
|
||||||
|
|
||||||
|
RATIO = 0.6
|
||||||
|
|
||||||
|
print(sorted(a_minus_b, key=get_match_ratio, reverse=True)[:10])
|
||||||
|
print(sorted(b_minus_a, key=get_match_ratio, reverse=True)[:10])
|
||||||
@@ -140,4 +140,8 @@ Rank data fetched from Alexa by domains.
|
|||||||
site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')
|
site_file.write(f'\nAlexa.com rank data fetched at ({datetime.utcnow()} UTC)\n')
|
||||||
db.save_to_file(args.base_file)
|
db.save_to_file(args.base_file)
|
||||||
|
|
||||||
|
statistics_text = db.get_db_stats(is_markdown=True)
|
||||||
|
site_file.write('## Statistics\n\n')
|
||||||
|
site_file.write(statistics_text)
|
||||||
|
|
||||||
print("\nFinished updating supported site listing!")
|
print("\nFinished updating supported site listing!")
|
||||||
|
|||||||
Reference in New Issue
Block a user