Mirror of https://github.com/soxoj/maigret.git, synced 2026-05-06 14:08:59 +00:00
Introduced --retries flag, made thorough refactoring

- updated sites list
- test scripts linting
@@ -26,6 +26,7 @@ Currently supported more than 2000 sites ([full list](./sites.md)), by default s
* Search by tags (site categories, countries)
* Censorship and captcha detection
* Very few false positives
* Failed requests' restarts

## Installation

@@ -49,7 +50,7 @@ pip3 install .
git clone https://github.com/soxoj/maigret && cd maigret
```

You can use your a free virtual machine, the repo will be automatically cloned:
You can use a free virtual machine, the repo will be automatically cloned:

[](https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/soxoj/maigret&tutorial=README.md) [](https://repl.it/github/soxoj/maigret)
<a href="https://colab.research.google.com/gist//soxoj/879b51bc3b2f8b695abb054090645000/maigret.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" height="40"></a>
@@ -1,5 +1,5 @@
#!/bin/sh
FILES="maigret wizard.py maigret.py"
FILES="maigret wizard.py maigret.py tests"

echo 'black'
black --skip-string-normalization $FILES
@@ -1,5 +1,5 @@
#!/bin/sh
FILES="maigret wizard.py maigret.py"
FILES="maigret wizard.py maigret.py tests"

echo 'syntax errors or undefined names'
flake8 --count --select=E9,F63,F7,F82 --show-source --statistics $FILES
@@ -8,4 +8,4 @@ echo 'warning'
flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --ignore=E731,W503 $FILES

echo 'mypy'
mypy ./maigret
mypy ./maigret ./wizard.py ./tests
+251
-214
@@ -5,7 +5,7 @@ import re
import ssl
import sys
import tqdm
from typing import Tuple, Optional
from typing import Tuple, Optional, Dict, List

import aiohttp
import tqdm.asyncio
@@ -16,9 +16,14 @@ from socid_extractor import extract
from .activation import ParsingActivator, import_aiohttp_cookies
from . import errors
from .errors import CheckError
from .executors import AsyncioSimpleExecutor, AsyncioProgressbarQueueExecutor
from .executors import (
AsyncExecutor,
AsyncioSimpleExecutor,
AsyncioProgressbarQueueExecutor,
)
from .result import QueryResult, QueryStatus
from .sites import MaigretDatabase, MaigretSite
from .types import QueryOptions, QueryResultWrapper
from .utils import get_random_user_agent


@@ -35,12 +40,10 @@ supported_recursive_search_ids = (
unsupported_characters = "#"


async def get_response(
request_future, site_name, logger
) -> Tuple[str, int, Optional[CheckError]]:
async def get_response(request_future, logger) -> Tuple[str, int, Optional[CheckError]]:
html_text = None
status_code = 0
error: Optional[CheckError] = CheckError("Error")
error: Optional[CheckError] = CheckError("Unknown")

try:
response = await request_future
@@ -76,32 +79,12 @@ async def get_response(
):
error = CheckError("SSL", str(e))
else:
logger.warning(f"Unhandled error while requesting {site_name}: {e}")
logger.debug(e, exc_info=True)
error = CheckError("Error", str(e))
error = CheckError("Unexpected", str(e))

# TODO: return only needed information
return str(html_text), status_code, error


async def update_site_dict_from_response(
sitename, site_dict, results_info, logger, query_notify
):
site_obj = site_dict[sitename]
future = site_obj.request_future
if not future:
# ignore: search by incompatible id type
return

response = await get_response(
request_future=future, site_name=sitename, logger=logger
)

return sitename, process_site_result(
response, query_notify, logger, results_info, site_obj
)


# TODO: move to separate class
def detect_error_page(
html_text, status_code, fail_flags, ignore_403
@@ -127,7 +110,7 @@ def detect_error_page(


def process_site_result(
response, query_notify, logger, results_info, site: MaigretSite
response, query_notify, logger, results_info: QueryResultWrapper, site: MaigretSite
):
if not response:
return results_info
@@ -205,6 +188,17 @@ def process_site_result(
logger.debug(presense_flag)
break

def build_result(status, **kwargs):
return QueryResult(
username,
site_name,
url,
status,
query_time=response_time,
tags=fulltags,
**kwargs,
)

if check_error:
logger.debug(check_error)
result = QueryResult(
@@ -218,53 +212,20 @@ def process_site_result(
tags=fulltags,
)
elif check_type == "message":
absence_flags = site.absence_strs
is_absence_flags_list = isinstance(absence_flags, list)
absence_flags_set = (
set(absence_flags) if is_absence_flags_list else {absence_flags}
)
# Checks if the error message is in the HTML
is_absence_detected = any(
[(absence_flag in html_text) for absence_flag in absence_flags_set]
[(absence_flag in html_text) for absence_flag in site.absence_strs]
)
if not is_absence_detected and is_presense_detected:
result = QueryResult(
username,
site_name,
url,
QueryStatus.CLAIMED,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.CLAIMED)
else:
result = QueryResult(
username,
site_name,
url,
QueryStatus.AVAILABLE,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.AVAILABLE)
elif check_type == "status_code":
# Checks if the status code of the response is 2XX
if (not status_code >= 300 or status_code < 200) and is_presense_detected:
result = QueryResult(
username,
site_name,
url,
QueryStatus.CLAIMED,
query_time=response_time,
tags=fulltags,
)
if is_presense_detected and (not status_code >= 300 or status_code < 200):
result = build_result(QueryStatus.CLAIMED)
else:
result = QueryResult(
username,
site_name,
url,
QueryStatus.AVAILABLE,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.AVAILABLE)
elif check_type == "response_url":
# For this detection method, we have turned off the redirect.
# So, there is no need to check the response URL: it will always
@@ -272,23 +233,9 @@ def process_site_result(
# code indicates that the request was successful (i.e. no 404, or
# forward to some odd redirect).
if 200 <= status_code < 300 and is_presense_detected:
result = QueryResult(
username,
site_name,
url,
QueryStatus.CLAIMED,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.CLAIMED)
else:
result = QueryResult(
username,
site_name,
url,
QueryStatus.AVAILABLE,
query_time=response_time,
tags=fulltags,
)
result = build_result(QueryStatus.AVAILABLE)
else:
# It should be impossible to ever get here...
raise ValueError(
@@ -329,109 +276,20 @@ def process_site_result(
return results_info


async def maigret(
username,
site_dict,
logger,
query_notify=None,
proxy=None,
timeout=None,
is_parsing_enabled=False,
id_type="username",
debug=False,
forced=False,
max_connections=100,
no_progressbar=False,
cookies=None,
):
"""Main search func

Checks for existence of username on certain sites.

Keyword Arguments:
username -- Username string will be used for search.
site_dict -- Dictionary containing sites data.
query_notify -- Object with base type of QueryNotify().
This will be used to notify the caller about
query results.
logger -- Standard Python logger object.
timeout -- Time in seconds to wait before timing out request.
Default is no timeout.
is_parsing_enabled -- Extract additional info from account pages.
id_type -- Type of username to search.
Default is 'username', see all supported here:
https://github.com/soxoj/maigret/wiki/Supported-identifier-types
max_connections -- Maximum number of concurrent connections allowed.
Default is 100.
no_progressbar -- Displaying of ASCII progressbar during scanner.
cookies -- Filename of a cookie jar file to use for each request.

Return Value:
Dictionary containing results from report. Key of dictionary is the name
of the social network site, and the value is another dictionary with
the following keys:
url_main: URL of main site.
url_user: URL of user on site (if account exists).
status: QueryResult() object indicating results of test for
account existence.
http_status: HTTP status code of query which checked for existence on
site.
response_text: Text that came back from request. May be None if
there was an HTTP error when checking for existence.
"""

# Notify caller that we are starting the query.
if not query_notify:
query_notify = Mock()

query_notify.start(username, id_type)

# TODO: connector
connector = (
ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
)
# connector = aiohttp.TCPConnector(ssl=False)
connector.verify_ssl = False

cookie_jar = None
if cookies:
logger.debug(f"Using cookies jar file {cookies}")
cookie_jar = await import_aiohttp_cookies(cookies)

session = aiohttp.ClientSession(
connector=connector, trust_env=True, cookie_jar=cookie_jar
)

if logger.level == logging.DEBUG:
future = session.get(url="https://icanhazip.com")
ip, status, check_error = await get_response(future, None, logger)
if ip:
logger.debug(f"My IP is: {ip.strip()}")
else:
logger.debug(f"IP requesting {check_error[0]}: {check_error[1]}")

# Results from analysis of all sites
results_total = {}

# First create futures for all requests. This allows for the requests to run in parallel
for site_name, site in site_dict.items():

if site.type != id_type:
continue

if site.disabled and not forced:
logger.debug(f"Site {site.name} is disabled, skipping...")
continue

# Results from analysis of this specific site
results_site = {}
def make_site_result(
site: MaigretSite, username: str, options: QueryOptions, logger
) -> QueryResultWrapper:
results_site: QueryResultWrapper = {}

# Record URL of main site and username
results_site["site"] = site
results_site["username"] = username
results_site["parsing_enabled"] = is_parsing_enabled
results_site["parsing_enabled"] = options["parsing"]
results_site["url_main"] = site.url_main
results_site["cookies"] = (
cookie_jar and cookie_jar.filter_cookies(site.url_main) or None
options.get("cookie_jar")
and options["cookie_jar"].filter_cookies(site.url_main)
or None
)

headers = {
@@ -442,23 +300,51 @@ async def maigret(

if "url" not in site.__dict__:
logger.error("No URL for site %s", site.name)

# URL of user on site (if it exists)
url = site.url.format(
urlMain=site.url_main, urlSubpath=site.url_subpath, username=username
)

# workaround to prevent slash errors
url = re.sub("(?<!:)/+", "/", url)

# Don't make request if username is invalid for the site
if site.regex_check and re.search(site.regex_check, username) is None:
# No need to do the check at the site: this user name is not allowed.
session = options['session']

# site check is disabled
if site.disabled and not options['forced']:
logger.debug(f"Site {site.name} is disabled, skipping...")
results_site["status"] = QueryResult(
username, site_name, url, QueryStatus.ILLEGAL
username,
site.name,
url,
QueryStatus.ILLEGAL,
error=CheckError("Check is disabled"),
)
# current username type could not be applied
elif site.type != options["id_type"]:
results_site["status"] = QueryResult(
username,
site.name,
url,
QueryStatus.ILLEGAL,
error=CheckError('Unsupported identifier type', f'Want "{site.type}"'),
)
# username is not allowed.
elif site.regex_check and re.search(site.regex_check, username) is None:
results_site["status"] = QueryResult(
username,
site.name,
url,
QueryStatus.ILLEGAL,
error=CheckError(
'Unsupported username format', f'Want "{site.regex_check}"'
),
)
results_site["url_user"] = ""
results_site["http_status"] = ""
results_site["response_text"] = ""
query_notify.update(results_site["status"])
# query_notify.update(results_site["status"])
else:
# URL of user on site (if it exists)
results_site["url_user"] = url
@@ -503,26 +389,130 @@ async def maigret(
url=url_probe,
headers=headers,
allow_redirects=allow_redirects,
timeout=timeout,
timeout=options['timeout'],
)

# Store future in data for access later
# TODO: move to separate obj
site.request_future = future
# Store future request object in the results object
results_site["future"] = future

# Add this site's results into final dictionary with all of the other results.
results_total[site_name] = results_site
return results_site

coroutines = []
for sitename, result_obj in results_total.items():
coroutines.append(
(
update_site_dict_from_response,
[sitename, site_dict, result_obj, logger, query_notify],
{},
)

async def check_site_for_username(
site, username, options: QueryOptions, logger, query_notify, *args, **kwargs
) -> Tuple[str, QueryResultWrapper]:
default_result = make_site_result(site, username, options, logger)
future = default_result.get("future")
if not future:
return site.name, default_result

response = await get_response(request_future=future, logger=logger)

response_result = process_site_result(
response, query_notify, logger, default_result, site
)

return site.name, response_result


async def debug_ip_request(session, logger):
future = session.get(url="https://icanhazip.com")
ip, status, check_error = await get_response(future, logger)
if ip:
logger.debug(f"My IP is: {ip.strip()}")
else:
logger.debug(f"IP requesting {check_error.type}: {check_error.desc}")


def get_failed_sites(results: Dict[str, QueryResultWrapper]) -> List[str]:
sites = []
for sitename, r in results.items():
status = r.get('status', {})
if status and status.error:
if errors.is_permanent(status.error.type):
continue
sites.append(sitename)
return sites

async def maigret(
username: str,
site_dict: Dict[str, MaigretSite],
logger,
query_notify=None,
proxy=None,
timeout=None,
is_parsing_enabled=False,
id_type="username",
debug=False,
forced=False,
max_connections=100,
no_progressbar=False,
cookies=None,
retries=0,
) -> QueryResultWrapper:
"""Main search func

Checks for existence of username on certain sites.

Keyword Arguments:
username -- Username string will be used for search.
site_dict -- Dictionary containing sites data in MaigretSite objects.
query_notify -- Object with base type of QueryNotify().
This will be used to notify the caller about
query results.
logger -- Standard Python logger object.
timeout -- Time in seconds to wait before timing out request.
Default is no timeout.
is_parsing_enabled -- Extract additional info from account pages.
id_type -- Type of username to search.
Default is 'username', see all supported here:
https://github.com/soxoj/maigret/wiki/Supported-identifier-types
max_connections -- Maximum number of concurrent connections allowed.
Default is 100.
no_progressbar -- Displaying of ASCII progressbar during scanner.
cookies -- Filename of a cookie jar file to use for each request.

Return Value:
Dictionary containing results from report. Key of dictionary is the name
of the social network site, and the value is another dictionary with
the following keys:
url_main: URL of main site.
url_user: URL of user on site (if account exists).
status: QueryResult() object indicating results of test for
account existence.
http_status: HTTP status code of query which checked for existence on
site.
response_text: Text that came back from request. May be None if
there was an HTTP error when checking for existence.
"""

# notify caller that we are starting the query.
if not query_notify:
query_notify = Mock()

query_notify.start(username, id_type)

# make http client session
connector = (
ProxyConnector.from_url(proxy) if proxy else aiohttp.TCPConnector(ssl=False)
)
connector.verify_ssl = False

cookie_jar = None
if cookies:
logger.debug(f"Using cookies jar file {cookies}")
cookie_jar = await import_aiohttp_cookies(cookies)

session = aiohttp.ClientSession(
connector=connector, trust_env=True, cookie_jar=cookie_jar
)

if logger.level == logging.DEBUG:
await debug_ip_request(session, logger)

# setup parallel executor
executor: Optional[AsyncExecutor] = None
if no_progressbar:
executor = AsyncioSimpleExecutor(logger=logger)
else:
@@ -530,24 +520,68 @@ async def maigret(
logger=logger, in_parallel=max_connections, timeout=timeout + 0.5
)

results = await executor.run(coroutines)
# make options objects for all the requests
options: QueryOptions = {}
options["cookies"] = cookie_jar
options["session"] = session
options["parsing"] = is_parsing_enabled
options["timeout"] = timeout
options["id_type"] = id_type
options["forced"] = forced

# results from analysis of all sites
all_results: Dict[str, QueryResultWrapper] = {}

sites = list(site_dict.keys())

attempts = retries + 1
while attempts:
tasks_dict = {}

for sitename, site in site_dict.items():
if sitename not in sites:
continue
default_result: QueryResultWrapper = {
'site': site,
'status': QueryResult(
username,
sitename,
'',
QueryStatus.UNKNOWN,
error=CheckError('Request failed'),
),
}
tasks_dict[sitename] = (
check_site_for_username,
[site, username, options, logger, query_notify],
{'default': (sitename, default_result)},
)

cur_results = await executor.run(tasks_dict.values())

# wait for executor timeout errors
await asyncio.sleep(1)

all_results.update(cur_results)

sites = get_failed_sites(dict(cur_results))
attempts -= 1

if not sites:
break

if attempts:
query_notify.warning(
f'Restarting checks for {len(sites)} sites... ({attempts} attempts left)'
)

# closing http client session
await session.close()

# Notify caller that all queries are finished.
# notify caller that all queries are finished
query_notify.finish()

data = {}
for result in results:
# TODO: still can be empty
if result:
try:
data[result[0]] = result[1]
except Exception as e:
logger.error(e, exc_info=True)
logger.info(result)

return data
return all_results


def timeout_check(value):
@@ -575,7 +609,9 @@ def timeout_check(value):
return timeout


async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=False):
async def site_self_check(
site: MaigretSite, logger, semaphore, db: MaigretDatabase, silent=False
):
changes = {
"disabled": False,
}
@@ -602,6 +638,7 @@ async def site_self_check(site, logger, semaphore, db: MaigretDatabase, silent=F
id_type=site.type,
forced=True,
no_progressbar=True,
retries=1,
)

# don't disable entries with other ids types
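The new `retries` parameter above is what the `--retries` CLI flag feeds into: sites whose errors are classified as temporary get re-checked up to that many extra times. A minimal usage sketch of the refactored `maigret()` entry point — the import path and the single example site entry (copied from the test database in this commit) are assumptions, not project documentation:

```python
import asyncio
import logging

from maigret.checking import maigret as search  # module path assumed from this diff
from maigret.sites import MaigretSite

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("maigret-retries-demo")

# One site entry, taken from the example database used by the tests below.
site = MaigretSite(
    "GooglePlayStore",
    {
        "checkType": "status_code",
        "url": "https://play.google.com/store/apps/developer?id={username}",
        "urlMain": "https://play.google.com/store",
        "usernameClaimed": "Facebook_nosuchname",
        "usernameUnclaimed": "noonewouldeverusethis7",
    },
)

async def demo():
    # Temporarily failed checks (timeouts, connection errors) are re-run
    # up to `retries` extra times; permanent errors are not retried.
    return await search(
        "soxoj",
        {site.name: site},
        logger,
        timeout=10,
        retries=2,
        no_progressbar=True,
    )

results = asyncio.run(demo())
print(results["GooglePlayStore"]["status"])
```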
+13
-2
@@ -57,6 +57,17 @@ ERRORS_TYPES = {
'Request timeout': 'Try to increase timeout or to switch to another internet service provider',
}

TEMPORARY_ERRORS_TYPES = [
'Request timeout',
'Unknown',
'Request failed',
'Connecting failure',
'HTTP',
'Proxy',
'Interrupted',
'Connection lost',
]

THRESHOLD = 3 # percent


@@ -64,8 +75,8 @@ def is_important(err_data):
return err_data['perc'] >= THRESHOLD


def is_not_permanent(err_data):
return True
def is_permanent(err_type):
return err_type not in TEMPORARY_ERRORS_TYPES


def detect(text):
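The retry loop in checking.py keys off this classification: error types listed in TEMPORARY_ERRORS_TYPES are worth retrying, everything else is treated as permanent. A small illustration, assuming the package is importable (the "Invalid username" type is a made-up example of a non-temporary error):

```python
from maigret.errors import CheckError, is_permanent

timeout_err = CheckError("Request timeout")  # listed as temporary -> will be retried
format_err = CheckError("Invalid username", "hypothetical permanent error type")

assert not is_permanent(timeout_err.type)
assert is_permanent(format_err.type)
```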
@@ -93,7 +93,7 @@ class AsyncioProgressbarQueueExecutor(AsyncExecutor):
try:
result = await asyncio.wait_for(query_task, timeout=self.timeout)
except asyncio.TimeoutError:
result = None
result = kwargs.get('default')

self.results.append(result)
self.progress.update(1)
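This executor change replaces a bare None on timeout with a caller-supplied default, which is what lets the retry loop record a "Request failed" placeholder for sites that never answered. The pattern in isolation, with illustrative names only:

```python
import asyncio


async def slow_check() -> str:
    await asyncio.sleep(5)
    return "real result"


async def run_with_default(coro, timeout: float, default):
    # Same idea as the executor: a timed-out task yields the provided
    # default result instead of silently producing None.
    try:
        return await asyncio.wait_for(coro, timeout=timeout)
    except asyncio.TimeoutError:
        return default


fallback = ("example.org", {"status": "Request failed"})
print(asyncio.run(run_with_default(slow_check(), 0.1, fallback)))
```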
+15
-2
@@ -59,7 +59,7 @@ def notify_about_errors(search_results, query_notify):
)


async def main():
def setup_arguments_parser():
version_string = '\n'.join(
[
f'%(prog)s {__version__}',
@@ -148,6 +148,14 @@ async def main():
"A longer timeout will be more likely to get results from slow sites. "
"On the other hand, this may cause a long delay to gather all results. ",
)
parser.add_argument(
"--retries",
action="store",
type=int,
metavar='RETRIES',
default=1,
help="Attempts to restart temporary failed requests.",
)
parser.add_argument(
"-n",
"--max-connections",
@@ -334,8 +342,12 @@ async def main():
help=f"Generate a JSON report of specific type: {', '.join(SUPPORTED_JSON_REPORT_FORMATS)}"
" (one report per username).",
)
return parser

args = parser.parse_args()

async def main():
arg_parser = setup_arguments_parser()
args = arg_parser.parse_args()

# Logging
log_level = logging.ERROR
@@ -528,6 +540,7 @@ async def main():
forced=args.use_disabled_sites,
max_connections=args.connections,
no_progressbar=args.no_progressbar,
retries=args.retries,
)

notify_about_errors(results, query_notify)
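The flag itself is a plain argparse integer option; parsed in isolation it behaves as below. This is a standalone sketch mirroring the definition added above, not the full maigret parser:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--retries",
    action="store",
    type=int,
    metavar='RETRIES',
    default=1,
    help="Attempts to restart temporary failed requests.",
)

assert parser.parse_args([]).retries == 1                 # default: one extra attempt
assert parser.parse_args(["--retries", "3"]).retries == 3
```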
+4407
-3417
File diff suppressed because it is too large
+35
-29
@@ -3,7 +3,7 @@
import copy
import json
import sys
from typing import Optional
from typing import Optional, List, Dict, Any

import requests

@@ -57,9 +57,10 @@ SUPPORTED_TAGS = [


class MaigretEngine:
site: Dict[str, Any] = {}

def __init__(self, name, data):
self.name = name
self.site = {}
self.__dict__.update(data)

@property
@@ -78,35 +79,40 @@ class MaigretSite:
"urlRegexp",
]

username_claimed = ""
username_unclaimed = ""
url_subpath = ""
url_main = ""
url = ""
disabled = False
similar_search = False
ignore403 = False
tags: List[str] = []

type = "username"
headers: Dict[str, str] = {}
errors: Dict[str, str] = {}
activation: Dict[str, Any] = {}
regex_check = None
url_probe = None
check_type = ""
request_head_only = ""
get_params: Dict[str, Any] = {}

presense_strs: List[str] = []
absence_strs: List[str] = []
stats: Dict[str, Any] = {}

engine = None
engine_data: Dict[str, Any] = {}
engine_obj: Optional["MaigretEngine"] = None
request_future = None
alexa_rank = None
source = None

def __init__(self, name, information):
self.name = name

self.disabled = False
self.similar_search = False
self.ignore403 = False
self.tags = []

self.type = "username"
self.headers = {}
self.errors = {}
self.activation = {}
self.url_subpath = ""
self.regex_check = None
self.url_probe = None
self.check_type = ""
self.request_head_only = ""
self.get_params = {}

self.presense_strs = []
self.absence_strs = []
self.stats = {}

self.engine = None
self.engine_data = {}
self.engine_obj = None
self.request_future = None
self.alexa_rank = None
self.source = None

for k, v in information.items():
self.__dict__[CaseConverter.camel_to_snake(k)] = v
@@ -193,7 +199,7 @@ class MaigretSite:
self.url_regexp = None

self_copy = copy.deepcopy(self)
engine_data = self_copy.engine_obj.site
engine_data = self_copy.engine_obj and self_copy.engine_obj.site or {}
site_data_keys = list(self_copy.__dict__.keys())

for k in engine_data.keys():
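The MaigretSite change keeps the class-level annotations for typing, while __init__ now assigns fresh containers per instance. The reason is the usual shared-mutable-default pitfall; a minimal illustration with throwaway class names:

```python
class SharedDefaults:
    tags = []  # one list object shared by every instance


class PerInstanceDefaults:
    def __init__(self):
        self.tags = []  # a fresh list per instance


a, b = SharedDefaults(), SharedDefaults()
a.tags.append("ru")
assert b.tags == ["ru"]  # the append leaked into the other instance

c, d = PerInstanceDefaults(), PerInstanceDefaults()
c.tags.append("ru")
assert d.tags == []  # isolated, as expected
```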
+8
-2
@@ -1,5 +1,11 @@
from typing import Callable, Any, Tuple
from typing import Callable, List, Dict, Tuple, Any


# search query
QueryDraft = Tuple[Callable, Any, Any]
QueryDraft = Tuple[Callable, List, Dict]

# options dict
QueryOptions = Dict[str, Any]

# TODO: throw out
QueryResultWrapper = Dict[str, Any]
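QueryDraft is the shape the executors consume: a callable plus the positional and keyword arguments to call it with later. A small sketch with illustrative names, showing how a draft is unpacked:

```python
from typing import Any, Callable, Dict, List, Tuple

QueryDraft = Tuple[Callable, List, Dict]


def check(username: str, *, timeout: float = 5.0) -> str:
    return f"checked {username} (timeout={timeout})"


draft: QueryDraft = (check, ["soxoj"], {"timeout": 3.0})
func, args, kwargs = draft
print(func(*args, **kwargs))  # the executors unpack drafts the same way
```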
+2
-1
@@ -26,7 +26,8 @@ def get_test_reports_filenames():

def remove_test_reports():
reports_list = get_test_reports_filenames()
for f in reports_list: os.remove(f)
for f in reports_list:
os.remove(f)
logging.error(f'Removed test reports {reports_list}')

@@ -44,8 +44,9 @@ async def test_import_aiohttp_cookies():

url = 'https://httpbin.org/cookies'
connector = aiohttp.TCPConnector(ssl=False)
session = aiohttp.ClientSession(connector=connector, trust_env=True,
cookie_jar=cookie_jar)
session = aiohttp.ClientSession(
connector=connector, trust_env=True, cookie_jar=cookie_jar
)

response = await session.get(url=url)
result = json.loads(await response.content.read())
@@ -2,11 +2,16 @@
import pytest
import asyncio
import logging
from maigret.executors import AsyncioSimpleExecutor, AsyncioProgressbarExecutor, \
AsyncioProgressbarSemaphoreExecutor, AsyncioProgressbarQueueExecutor
from maigret.executors import (
AsyncioSimpleExecutor,
AsyncioProgressbarExecutor,
AsyncioProgressbarSemaphoreExecutor,
AsyncioProgressbarQueueExecutor,
)

logger = logging.getLogger(__name__)


async def func(n):
await asyncio.sleep(0.1 * (n % 3))
return n
@@ -20,6 +25,7 @@ async def test_simple_asyncio_executor():
assert executor.execution_time > 0.2
assert executor.execution_time < 0.3


@pytest.mark.asyncio
async def test_asyncio_progressbar_executor():
tasks = [(func, [n], {}) for n in range(10)]
+7
-17
@@ -8,40 +8,30 @@ from maigret.maigret import self_check
from maigret.sites import MaigretDatabase

EXAMPLE_DB = {
'engines': {
},
'engines': {},
'sites': {
"GooglePlayStore": {
"tags": [
"global",
"us"
],
"tags": ["global", "us"],
"disabled": False,
"checkType": "status_code",
"alexaRank": 1,
"url": "https://play.google.com/store/apps/developer?id={username}",
"urlMain": "https://play.google.com/store",
"usernameClaimed": "Facebook_nosuchname",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
},
"Reddit": {
"tags": [
"news",
"social",
"us"
],
"tags": ["news", "social", "us"],
"checkType": "status_code",
"presenseStrs": [
"totalKarma"
],
"presenseStrs": ["totalKarma"],
"disabled": True,
"alexaRank": 17,
"url": "https://www.reddit.com/user/{username}",
"urlMain": "https://www.reddit.com/",
"usernameClaimed": "blue",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
},
},
}
}

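Entries like these are what the checker turns into a concrete profile URL: the "url" template is formatted with the username and duplicated slashes are then collapsed, mirroring the formatting step in checking.py above. A short sketch using the Reddit entry from this test database:

```python
import re

entry = {
    "url": "https://www.reddit.com/user/{username}",
    "urlMain": "https://www.reddit.com/",
}

url = entry["url"].format(
    urlMain=entry["urlMain"], urlSubpath="", username="alexaimephotography"
)
url = re.sub("(?<!:)/+", "/", url)  # keep '://', collapse other duplicated slashes
print(url)  # -> https://www.reddit.com/user/alexaimephotography
```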
+188
-53
@@ -7,8 +7,16 @@ from io import StringIO
import xmind
from jinja2 import Template

from maigret.report import generate_csv_report, generate_txt_report, save_xmind_report, save_html_report, \
save_pdf_report, generate_report_template, generate_report_context, generate_json_report
from maigret.report import (
generate_csv_report,
generate_txt_report,
save_xmind_report,
save_html_report,
save_pdf_report,
generate_report_template,
generate_report_context,
generate_json_report,
)
from maigret.result import QueryResult, QueryStatus

EXAMPLE_RESULTS = {
@@ -17,14 +25,16 @@ EXAMPLE_RESULTS = {
'parsing_enabled': True,
'url_main': 'https://www.github.com/',
'url_user': 'https://www.github.com/test',
'status': QueryResult('test',
'status': QueryResult(
'test',
'GitHub',
'https://www.github.com/test',
QueryStatus.CLAIMED,
tags=['test_tag']),
tags=['test_tag'],
),
'http_status': 200,
'is_similar': False,
'rank': 78
'rank': 78,
}
}

@@ -33,74 +43,196 @@ BAD_RESULT = QueryResult('', '', '', QueryStatus.AVAILABLE)

GOOD_500PX_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_500PX_RESULT.tags = ['photo', 'us', 'global']
GOOD_500PX_RESULT.ids_data = {"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==", "legacy_id": "26403415",
"username": "alexaimephotographycars", "name": "Alex Aim\u00e9",
GOOD_500PX_RESULT.ids_data = {
"uid": "dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==",
"legacy_id": "26403415",
"username": "alexaimephotographycars",
"name": "Alex Aim\u00e9",
"website": "www.flickr.com/photos/alexaimephotography/",
"facebook_link": " www.instagram.com/street.reality.photography/",
"instagram_username": "alexaimephotography", "twitter_username": "Alexaimephotogr"}
"instagram_username": "alexaimephotography",
"twitter_username": "Alexaimephotogr",
}

GOOD_REDDIT_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_REDDIT_RESULT.tags = ['news', 'us']
GOOD_REDDIT_RESULT.ids_data = {"reddit_id": "t5_1nytpy", "reddit_username": "alexaimephotography",
GOOD_REDDIT_RESULT.ids_data = {
"reddit_id": "t5_1nytpy",
"reddit_username": "alexaimephotography",
"fullname": "alexaimephotography",
"image": "https://styles.redditmedia.com/t5_1nytpy/styles/profileIcon_7vmhdwzd3g931.jpg?width=256&height=256&crop=256:256,smart&frame=1&s=4f355f16b4920844a3f4eacd4237a7bf76b2e97e",
"is_employee": "False", "is_nsfw": "False", "is_mod": "True", "is_following": "True",
"has_user_profile": "True", "hide_from_robots": "False",
"created_at": "2019-07-10 12:20:03", "total_karma": "53959", "post_karma": "52738"}
"is_employee": "False",
"is_nsfw": "False",
"is_mod": "True",
"is_following": "True",
"has_user_profile": "True",
"hide_from_robots": "False",
"created_at": "2019-07-10 12:20:03",
"total_karma": "53959",
"post_karma": "52738",
}

GOOD_IG_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_IG_RESULT.tags = ['photo', 'global']
GOOD_IG_RESULT.ids_data = {"instagram_username": "alexaimephotography", "fullname": "Alexaimephotography",
GOOD_IG_RESULT.ids_data = {
"instagram_username": "alexaimephotography",
"fullname": "Alexaimephotography",
"id": "6828488620",
"image": "https://scontent-hel3-1.cdninstagram.com/v/t51.2885-19/s320x320/95420076_1169632876707608_8741505804647006208_n.jpg?_nc_ht=scontent-hel3-1.cdninstagram.com&_nc_ohc=jd87OUGsX4MAX_Ym5GX&tp=1&oh=0f42badd68307ba97ec7fb1ef7b4bfd4&oe=601E5E6F",
"bio": "Photographer \nChild of fine street arts",
"external_url": "https://www.flickr.com/photos/alexaimephotography2020/"}
"external_url": "https://www.flickr.com/photos/alexaimephotography2020/",
}

GOOD_TWITTER_RESULT = copy.deepcopy(GOOD_RESULT)
GOOD_TWITTER_RESULT.tags = ['social', 'us']

TEST = [('alexaimephotographycars', 'username', {
'500px': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
TEST = [
(
'alexaimephotographycars',
'username',
{
'500px': {
'username': 'alexaimephotographycars',
'parsing_enabled': True,
'url_main': 'https://500px.com/',
'url_user': 'https://500px.com/p/alexaimephotographycars',
'ids_usernames': {'alexaimephotographycars': 'username', 'alexaimephotography': 'username',
'Alexaimephotogr': 'username'}, 'status': GOOD_500PX_RESULT, 'http_status': 200,
'is_similar': False, 'rank': 2981},
'Reddit': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
'url_user': 'https://www.reddit.com/user/alexaimephotographycars', 'status': BAD_RESULT,
'http_status': 404, 'is_similar': False, 'rank': 17},
'Twitter': {'username': 'alexaimephotographycars', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
'url_user': 'https://twitter.com/alexaimephotographycars', 'status': BAD_RESULT, 'http_status': 400,
'is_similar': False, 'rank': 55},
'Instagram': {'username': 'alexaimephotographycars', 'parsing_enabled': True,
'ids_usernames': {
'alexaimephotographycars': 'username',
'alexaimephotography': 'username',
'Alexaimephotogr': 'username',
},
'status': GOOD_500PX_RESULT,
'http_status': 200,
'is_similar': False,
'rank': 2981,
},
'Reddit': {
'username': 'alexaimephotographycars',
'parsing_enabled': True,
'url_main': 'https://www.reddit.com/',
'url_user': 'https://www.reddit.com/user/alexaimephotographycars',
'status': BAD_RESULT,
'http_status': 404,
'is_similar': False,
'rank': 17,
},
'Twitter': {
'username': 'alexaimephotographycars',
'parsing_enabled': True,
'url_main': 'https://www.twitter.com/',
'url_user': 'https://twitter.com/alexaimephotographycars',
'status': BAD_RESULT,
'http_status': 400,
'is_similar': False,
'rank': 55,
},
'Instagram': {
'username': 'alexaimephotographycars',
'parsing_enabled': True,
'url_main': 'https://www.instagram.com/',
'url_user': 'https://www.instagram.com/alexaimephotographycars', 'status': BAD_RESULT,
'http_status': 404, 'is_similar': False, 'rank': 29}}), ('alexaimephotography', 'username', {
'500px': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
'url_user': 'https://500px.com/p/alexaimephotography', 'status': BAD_RESULT, 'http_status': 200,
'is_similar': False, 'rank': 2981},
'Reddit': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
'url_user': 'https://www.instagram.com/alexaimephotographycars',
'status': BAD_RESULT,
'http_status': 404,
'is_similar': False,
'rank': 29,
},
},
),
(
'alexaimephotography',
'username',
{
'500px': {
'username': 'alexaimephotography',
'parsing_enabled': True,
'url_main': 'https://500px.com/',
'url_user': 'https://500px.com/p/alexaimephotography',
'status': BAD_RESULT,
'http_status': 200,
'is_similar': False,
'rank': 2981,
},
'Reddit': {
'username': 'alexaimephotography',
'parsing_enabled': True,
'url_main': 'https://www.reddit.com/',
'url_user': 'https://www.reddit.com/user/alexaimephotography',
'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_REDDIT_RESULT, 'http_status': 200,
'is_similar': False, 'rank': 17},
'Twitter': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
'url_user': 'https://twitter.com/alexaimephotography', 'status': BAD_RESULT, 'http_status': 400,
'is_similar': False, 'rank': 55},
'Instagram': {'username': 'alexaimephotography', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
'ids_usernames': {'alexaimephotography': 'username'},
'status': GOOD_REDDIT_RESULT,
'http_status': 200,
'is_similar': False,
'rank': 17,
},
'Twitter': {
'username': 'alexaimephotography',
'parsing_enabled': True,
'url_main': 'https://www.twitter.com/',
'url_user': 'https://twitter.com/alexaimephotography',
'status': BAD_RESULT,
'http_status': 400,
'is_similar': False,
'rank': 55,
},
'Instagram': {
'username': 'alexaimephotography',
'parsing_enabled': True,
'url_main': 'https://www.instagram.com/',
'url_user': 'https://www.instagram.com/alexaimephotography',
'ids_usernames': {'alexaimephotography': 'username'}, 'status': GOOD_IG_RESULT, 'http_status': 200,
'is_similar': False, 'rank': 29}}), ('Alexaimephotogr', 'username', {
'500px': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://500px.com/',
'url_user': 'https://500px.com/p/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 200,
'is_similar': False, 'rank': 2981},
'Reddit': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.reddit.com/',
'url_user': 'https://www.reddit.com/user/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
'is_similar': False, 'rank': 17},
'Twitter': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.twitter.com/',
'url_user': 'https://twitter.com/Alexaimephotogr', 'status': GOOD_TWITTER_RESULT, 'http_status': 400,
'is_similar': False, 'rank': 55},
'Instagram': {'username': 'Alexaimephotogr', 'parsing_enabled': True, 'url_main': 'https://www.instagram.com/',
'url_user': 'https://www.instagram.com/Alexaimephotogr', 'status': BAD_RESULT, 'http_status': 404,
'is_similar': False, 'rank': 29}})]
'ids_usernames': {'alexaimephotography': 'username'},
'status': GOOD_IG_RESULT,
'http_status': 200,
'is_similar': False,
'rank': 29,
},
},
),
(
'Alexaimephotogr',
'username',
{
'500px': {
'username': 'Alexaimephotogr',
'parsing_enabled': True,
'url_main': 'https://500px.com/',
'url_user': 'https://500px.com/p/Alexaimephotogr',
'status': BAD_RESULT,
'http_status': 200,
'is_similar': False,
'rank': 2981,
},
'Reddit': {
'username': 'Alexaimephotogr',
'parsing_enabled': True,
'url_main': 'https://www.reddit.com/',
'url_user': 'https://www.reddit.com/user/Alexaimephotogr',
'status': BAD_RESULT,
'http_status': 404,
'is_similar': False,
'rank': 17,
},
'Twitter': {
'username': 'Alexaimephotogr',
'parsing_enabled': True,
'url_main': 'https://www.twitter.com/',
'url_user': 'https://twitter.com/Alexaimephotogr',
'status': GOOD_TWITTER_RESULT,
'http_status': 400,
'is_similar': False,
'rank': 55,
},
'Instagram': {
'username': 'Alexaimephotogr',
'parsing_enabled': True,
'url_main': 'https://www.instagram.com/',
'url_user': 'https://www.instagram.com/Alexaimephotogr',
'status': BAD_RESULT,
'http_status': 404,
'is_similar': False,
'rank': 29,
},
},
),
]

SUPPOSED_BRIEF = """Search by username alexaimephotographycars returned 1 accounts. Found target's other IDs: alexaimephotography, Alexaimephotogr. Search by username alexaimephotography returned 2 accounts. Search by username Alexaimephotogr returned 1 accounts. Extended info extracted from 3 accounts."""

@@ -187,7 +319,10 @@ def test_save_xmind_report():
assert data['topic']['topics'][0]['title'] == 'Undefined'
assert data['topic']['topics'][1]['title'] == 'test_tag'
assert len(data['topic']['topics'][1]['topics']) == 1
assert data['topic']['topics'][1]['topics'][0]['label'] == 'https://www.github.com/test'
assert (
data['topic']['topics'][1]['topics'][0]['label']
== 'https://www.github.com/test'
)


def test_html_report():
+13
-11
@@ -10,25 +10,21 @@ EXAMPLE_DB = {
"The specified member cannot be found. Please enter a member's entire name.",
],
"checkType": "message",
"errors": {
"You must be logged-in to do that.": "Login required"
"errors": {"You must be logged-in to do that.": "Login required"},
"url": "{urlMain}{urlSubpath}/members/?username={username}",
},
"url": "{urlMain}{urlSubpath}/members/?username={username}"
}
},
},
'sites': {
"Amperka": {
"engine": "XenForo",
"rank": 121613,
"tags": [
"ru"
],
"tags": ["ru"],
"urlMain": "http://forum.amperka.ru",
"usernameClaimed": "adam",
"usernameUnclaimed": "noonewouldeverusethis7"
"usernameUnclaimed": "noonewouldeverusethis7",
},
},
}
}


@@ -116,8 +112,14 @@ def test_site_url_detector():
db = MaigretDatabase()
db.load_from_json(EXAMPLE_DB)

assert db.sites[0].url_regexp.pattern == r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
assert db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test') == 'test'
assert (
db.sites[0].url_regexp.pattern
== r'^https?://(www.)?forum\.amperka\.ru/members/\?username=(.+?)$'
)
assert (
db.sites[0].detect_username('http://forum.amperka.ru/members/?username=test')
== 'test'
)


def test_ranked_sites_dict():
+32
-6
@@ -2,7 +2,13 @@
import itertools
import re

from maigret.utils import CaseConverter, is_country_tag, enrich_link_str, URLMatcher, get_dict_ascii_tree
from maigret.utils import (
CaseConverter,
is_country_tag,
enrich_link_str,
URLMatcher,
get_dict_ascii_tree,
)


def test_case_convert_camel_to_snake():
@@ -45,8 +51,10 @@ def test_is_country_tag():

def test_enrich_link_str():
assert enrich_link_str('test') == 'test'
assert enrich_link_str(
' www.flickr.com/photos/alexaimephotography/') == '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
assert (
enrich_link_str(' www.flickr.com/photos/alexaimephotography/')
== '<a class="auto-link" href="www.flickr.com/photos/alexaimephotography/">www.flickr.com/photos/alexaimephotography/</a>'
)


def test_url_extract_main_part():
@@ -78,15 +86,32 @@ def test_url_make_profile_url_regexp():

for url_parts in itertools.product(*parts):
url = ''.join(url_parts)
assert URLMatcher.make_profile_url_regexp(url).pattern == r'^https?://(www.)?flickr\.com/photos/(.+?)$'
assert (
URLMatcher.make_profile_url_regexp(url).pattern
== r'^https?://(www.)?flickr\.com/photos/(.+?)$'
)


def test_get_dict_ascii_tree():
data = {'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==', 'legacy_id': '26403415', 'username': 'alexaimephotographycars', 'name': 'Alex Aimé', 'created_at': '2018-05-04T10:17:01.000+0000', 'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b', 'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201', 'website': 'www.instagram.com/street.reality.photography/', 'facebook_link': ' www.instagram.com/street.reality.photography/', 'instagram_username': 'Street.Reality.Photography', 'twitter_username': 'Alexaimephotogr'}
data = {
'uid': 'dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==',
'legacy_id': '26403415',
'username': 'alexaimephotographycars',
'name': 'Alex Aimé',
'created_at': '2018-05-04T10:17:01.000+0000',
'image': 'https://drscdn.500px.org/user_avatar/26403415/q%3D85_w%3D300_h%3D300/v2?webp=true&v=2&sig=0235678a4f7b65e007e864033ebfaf5ef6d87fad34f80a8639d985320c20fe3b',
'image_bg': 'https://drscdn.500px.org/user_cover/26403415/q%3D65_m%3D2048/v2?webp=true&v=1&sig=bea411fb158391a4fdad498874ff17088f91257e59dfb376ff67e3a44c3a4201',
'website': 'www.instagram.com/street.reality.photography/',
'facebook_link': ' www.instagram.com/street.reality.photography/',
'instagram_username': 'Street.Reality.Photography',
'twitter_username': 'Alexaimephotogr',
}

ascii_tree = get_dict_ascii_tree(data.items())

assert ascii_tree == """
assert (
ascii_tree
== """
┣╸uid: dXJpOm5vZGU6VXNlcjoyNjQwMzQxNQ==
┣╸legacy_id: 26403415
┣╸username: alexaimephotographycars
@@ -98,3 +123,4 @@ def test_get_dict_ascii_tree():
┣╸facebook_link: www.instagram.com/street.reality.photography/
┣╸instagram_username: Street.Reality.Photography
┗╸twitter_username: Alexaimephotogr"""
)