2024-04-26 10:41:02 +00:00
|
|
|
import difflib
|
2024-01-31 00:48:20 +00:00
|
|
|
import hashlib
|
2024-04-26 10:41:02 +00:00
|
|
|
import re
|
2024-01-31 00:48:20 +00:00
|
|
|
import secrets
|
2024-04-26 10:41:02 +00:00
|
|
|
import string
|
2024-01-31 00:48:20 +00:00
|
|
|
import urllib.parse
|
|
|
|
|
from datetime import datetime
|
|
|
|
|
from functools import lru_cache
|
2024-04-26 10:41:02 +00:00
|
|
|
from urllib.parse import parse_qs, urlparse
|
2024-01-31 00:48:20 +00:00
|
|
|
|
2024-04-26 10:41:02 +00:00
|
|
|
import aiohttp
|
2024-01-31 00:48:20 +00:00
|
|
|
import tld
|
2024-04-26 10:41:02 +00:00
|
|
|
from aiohttp_retry import RetryClient
|
|
|
|
|
from async_lru import alru_cache
|
2024-01-31 00:48:20 +00:00
|
|
|
from bs4 import BeautifulSoup
|
2024-04-26 10:41:02 +00:00
|
|
|
from loguru import logger
|
|
|
|
|
|
2024-07-22 10:10:50 +00:00
|
|
|
from medium_parser import exceptions, retry_options
|
2024-04-26 10:41:02 +00:00
|
|
|
|
|
|
|
|
# Scheme prefix intended for URLs that arrive without one.
# NOTE(review): in the visible part of this module it is only mentioned in
# commented-out code inside correct_url -- confirm whether it is used elsewhere.
DEFAULT_URL_PROTOCOL = "https://"

# Characters accepted by basic_hex_check: ASCII letters (both cases) and digits.
VALID_ID_CHARS = set(string.ascii_letters + string.digits)
|
|
|
|
|
|
2024-04-26 10:41:02 +00:00
|
|
|
# Full host names (sub-domain included) that are treated as Medium publications.
# is_valid_medium_url compares these against the URL's netloc after a leading
# "www." has been stripped.
KNOWN_MEDIUM_CUSTOM_DOMAINS = (
    "javascript.plainenglish.io",
    "blog.llamaindex.ai",
    "code.likeagirl.io",
    "medium.datadriveninvestor.com",
    "blog.det.life",
    "python.plainenglish.io",
    "blog.stackademic.com",
    "ai.gopubby.com",
    "blog.devops.dev",
    "levelup.gitconnected.com",
    "betterhumans.coach.me",
    "ai.plainenglish.io",
)
|
|
|
|
|
# First-level domains that are treated as Medium-hosted.
# is_valid_medium_url compares these against the value returned by get_fld(url).
KNOWN_MEDIUM_DOMAINS = (
    "medium.com",
    "uxplanet.org",
    "osintteam.blog",
    "ahmedelfakharany.com",
    "drlee.io",
    "artificialcorner.com",
    "generativeai.pub",
    "productcoalition.com",
    "towardsdev.com",
    "infosecwriteups.com",
    "towardsdatascience.com",
    "thetaoist.online",
    "devopsquare.com",
    "laceydearie.com",
    "bettermarketing.pub",
    "itnext.io",
    "eand.co",
    "betterprogramming.pub",
    "curiouse.co",
    "betterhumans.pub",
    "uxdesign.cc",
    "thebolditalic.com",
    "arcdigital.media",
    "codeburst.io",
    "psiloveyou.xyz",
    "writingcooperative.com",
    "entrepreneurshandbook.co",
    "prototypr.io",
    "theascent.pub",
    "storiusmag.com"
)
|
|
|
|
|
# Domains that are rejected outright: is_valid_medium_url raises
# exceptions.NotValidMediumURL when either the first-level domain or the
# www-stripped netloc of the URL appears in this tuple.
NOT_MEDIUM_DOMAINS = (
    "github.com",
    "yandex.ru",
    "yandex.kz",
    "youtube.com",
    "nytimes.com",
    "wsj.com",
    "reddit.com",
    "elpais.com",
    "forbes.com",
    "bloomberg.com",
    "lesechos.fr",
    "otz.de",
    "businessinsider.com",
    "buff.ly",
    "delish.com",
    "economist.com",
    "wired.com",
    "rollingstone.com",
)
|
2024-01-31 00:48:20 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_valid_url(url):
    """
    Tell whether ``url`` looks like a usable absolute URL.

    Parameters:
        url (str): The URL to be validated.

    Returns:
        bool: True when ``get_fld`` can extract a first-level domain from the
        URL and the parsed URL has both a scheme and a netloc, False otherwise.
    """
    # Without a resolvable first-level domain the URL is rejected immediately.
    if not get_fld(url):
        return False

    parts = urlparse(url)
    if parts.scheme and parts.netloc:
        return True
    return False
|
|
|
|
|
|
|
|
|
|
|
2024-07-20 10:14:00 +00:00
|
|
|
def getting_percontage_of_match(string: str, matched_string: str) -> float:
    """
    Return how similar two strings are, as a percentage.

    Parameters:
        string: First string to compare (may be None).
        matched_string: Second string to compare (may be None).

    Returns:
        ``difflib.SequenceMatcher`` ratio of the two strings multiplied by 100,
        or 0.0 when either argument is None.
    """
    both_present = string is not None and matched_string is not None
    if not both_present:
        return 0.0

    matcher = difflib.SequenceMatcher(None, string, matched_string)
    return matcher.ratio() * 100
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def generate_random_sha256_hash():
    """
    Return the SHA-256 hex digest of freshly generated random bytes.

    The input is produced by ``secrets.token_bytes()`` (default length), so
    every call yields a new 64-character lowercase hexadecimal string.
    """
    entropy = secrets.token_bytes()
    return hashlib.sha256(entropy).hexdigest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_unix_ms() -> int:
    """Return the current time as whole milliseconds since the Unix epoch."""
    seconds_since_epoch = datetime.now().timestamp()
    # Scale to milliseconds and drop the fractional part.
    return int(seconds_since_epoch * 1000)
|
|
|
|
|
|
|
|
|
|
|
2024-06-07 10:20:52 +00:00
|
|
|
def unquerify_url(url: str) -> str:
    """
    Strip the query string from a URL.

    Args:
        url: The URL to sanitize.

    Returns:
        The URL rebuilt without its query component and with one trailing
        "/" removed, if present. Other components (including the fragment)
        are left as parsed.
    """
    parts = urllib.parse.urlparse(url)
    # Only touch the parse result when there actually is a query to drop.
    stripped = parts._replace(query="") if parts.query else parts
    return urllib.parse.urlunparse(stripped).removesuffix("/")
|
2024-01-31 00:48:20 +00:00
|
|
|
|
|
|
|
|
|
2024-06-07 10:20:52 +00:00
|
|
|
@lru_cache(maxsize=500)
def un_wwwify(url: str):
    """
    Return ``url`` without a single leading "www." prefix.

    Strings that do not start with "www." are returned unchanged. Results are
    memoised (up to 500 distinct inputs).
    """
    # TODO: enhanced type checks
    # removeprefix() is already a no-op when the prefix is absent.
    return url.removeprefix("www.")
|
|
|
|
|
|
|
|
|
|
|
2024-04-26 10:41:02 +00:00
|
|
|
def correct_url(url: str) -> str:
    """
    Log which clean-ups would apply to ``url`` and return ``url`` unchanged.

    The query-less and pagination-less variants are computed only so that the
    debug log can report whether the URL carried query data or pagination.

    Args:
        url: The incoming URL.

    Returns:
        The same ``url`` that was passed in.
    """
    # Safari workaround (disabled): some broken URLs arrive from clients whose
    # user-agent reports the Safari engine. The planned fix was to normalise
    # the scheme with
    #     re.sub(r"https?://", DEFAULT_URL_PROTOCOL, url)
    # TODO: fix

    without_query = unquerify_url(url)
    logger.debug(f"Is URL has query data: {without_query != url}")

    without_pagination = unplaginate_url(without_query)
    logger.debug(f"Is URL has plagination: {without_pagination != without_query}")

    # Disabled idea: prepend the default protocol when scheme/netloc is missing.

    # NOTE(review): the sanitized value is not returned; the caller gets the
    # original URL. Confirm with callers before changing this, since redirect
    # links resolved later depend on their query parameters.
    return url
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def unplaginate_url(url):
    """
    Removes page plaginations from URL

    Drops a trailing "/page/2" (only that exact suffix) and afterwards one
    trailing "/", in that order.
    """
    for suffix in ("/page/2", "/"):
        url = url.removesuffix(suffix)
    return url
|
|
|
|
|
|
|
|
|
|
|
2024-04-26 10:41:02 +00:00
|
|
|
@lru_cache(maxsize=100)
def is_has_valid_medium_post_id(hex_string: str) -> bool:
    """
    Return True when ``extract_hex_string`` finds a post id in ``hex_string``.

    Results are memoised (up to 100 distinct inputs).
    """
    found_id = extract_hex_string(hex_string)
    return found_id is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@lru_cache(maxsize=100)
def basic_hex_check(hex_string: str) -> bool:
    """
    Cheap plausibility check for a Medium post id.

    Despite the name, the character test accepts every character in
    ``VALID_ID_CHARS`` (ASCII letters and digits), not only hexadecimal ones.

    Returns:
        True when every character is in ``VALID_ID_CHARS`` and the length is
        between 8 and 12 inclusive, False otherwise.
    """
    if any(char not in VALID_ID_CHARS for char in hex_string):
        return False

    # A "must be lowercase" check used to live here but is disabled: some ids
    # consist of digits only, for which str.islower() is False, e.g.
    # https://valeman.medium.com/python-vs-r-for-time-series-forecasting-395390432598

    # Accepted id lengths: 8 through 12 characters.
    return 8 <= len(hex_string) <= 12
|
2024-01-31 00:48:20 +00:00
|
|
|
|
|
|
|
|
|
2024-04-26 11:22:52 +00:00
|
|
|
@lru_cache(maxsize=100)
def extract_hex_string(input_string: str) -> str:
    """
    Pull the last 8-12 character hexadecimal token out of ``input_string``.

    Tokens that directly follow a "-" are preferred; only when there are none
    is the whole string searched for stand-alone tokens.

    Returns:
        The last matching token, or None when nothing matches.
    """
    dashed = re.findall(r'-(\b[a-fA-F0-9]{8,12}\b)', input_string)
    # Fall back to the un-anchored search only if the dashed search was empty.
    candidates = dashed or re.findall(r'(\b[a-fA-F0-9]{8,12}\b)', input_string)
    if not candidates:
        return None
    return candidates[-1]
|
2024-04-26 11:22:52 +00:00
|
|
|
|
|
|
|
|
|
2024-04-26 10:41:02 +00:00
|
|
|
async def resolve_medium_short_link(short_url_id: str, timeout: int = 5) -> str:
    """
    Resolve a Medium short-link id to the URL it redirects to.

    Issues a GET to ``https://rsci.app.link/<short_url_id>`` without following
    redirects and reads the target from the response's ``Location`` header.

    Args:
        short_url_id: Identifier taken from the short link's path.
        timeout: Timeout passed to the HTTP request.

    Returns:
        The value of the ``Location`` response header.

    Raises:
        KeyError: If the response carries no ``Location`` header.
    """
    browser_headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"}

    async with aiohttp.ClientSession() as session:
        client = RetryClient(client_session=session, raise_for_status=False, retry_options=retry_options)
        response = await client.get(
            f"https://rsci.app.link/{short_url_id}",
            timeout=timeout,
            headers=browser_headers,
            allow_redirects=False,
        )
        return response.headers["Location"]
|
2024-01-31 00:48:20 +00:00
|
|
|
|
|
|
|
|
|
2024-04-26 10:41:02 +00:00
|
|
|
@alru_cache(maxsize=500)
async def resolve_medium_url(url: str, timeout: int = 5) -> str:
    """
    Work out the Medium post id that ``url`` points to.

    Known wrapper links (Facebook / Google redirects, Google web cache,
    12ft.io, Medium "global-identity-2" redirects and link.medium.com short
    links) are unwrapped and resolved recursively. Anything else is treated as
    a direct post link whose id is the text after the last "-" in the last
    path segment.

    Args:
        url: The URL to resolve.
        timeout: Timeout forwarded to ``resolve_medium_short_link``. The
            recursive calls below do not forward it, so they use the default.

    Returns:
        The post id string, or False when the link cannot be unwrapped or the
        candidate id fails ``is_has_valid_medium_post_id`` (note that this
        differs from the ``str`` annotation).
    """
    logger.debug(f"Trying resolve {url=}, with {timeout=}")
    parsed_url = urlparse(url)
    # Host comparisons below are made without a leading "www.".
    parsed_netloc = un_wwwify(parsed_url.netloc)

    if parsed_url.path.startswith("/p/"):
        logger.debug("URL is Medium 'mobile' link")
        # Everything after the first "/p/" is taken as the post id candidate.
        post_id = parsed_url.path.rsplit("/p/")[1]

    elif parsed_netloc == "l.facebook.com" and parsed_url.path.startswith("/l.php"):
        logger.debug("URL seems like is Facebook redirect (tracking) link")

        # The real target is carried in the single-valued "u" query parameter.
        parsed_query = parse_qs(parsed_url.query)
        if parsed_query.get("u") and len(parsed_query["u"]) == 1:
            post_url = parsed_query["u"][0]
            return await resolve_medium_url(post_url)

        logger.debug("...but we get fucked up...")
        return False

    elif parsed_netloc == "webcache.googleusercontent.com" and parsed_url.path.startswith("/search"):
        logger.debug("URL seems like is Google Web Archive page link")

        # Target is in "q", optionally prefixed with "cache:".
        parsed_query = parse_qs(parsed_url.query)
        if parsed_query.get("q") and len(parsed_query["q"]) == 1:
            post_url = parsed_query["q"][0].removeprefix("cache:")
            return await resolve_medium_url(post_url)

        logger.debug("...but we get fucked up...")
        return False

    elif parsed_netloc == "google.com" and parsed_url.path.startswith("/url"):
        logger.debug("URL seems like is Google redirect (tracking) link")

        # Target is in "url", falling back to "q".
        parsed_query = parse_qs(parsed_url.query)
        if parsed_query.get("url") and len(parsed_query["url"]) == 1:
            logger.debug("..and we got 'url' passed param. Make resolve them....")
            post_url = parsed_query["url"][0]
            return await resolve_medium_url(post_url)
        elif parsed_query.get("q") and len(parsed_query["q"]) == 1:
            logger.debug("..and we got 'q' passed param. Make resolve them....")
            post_url = parsed_query["q"][0]
            return await resolve_medium_url(post_url)

        logger.debug("...but we get fucked up...")
        return False

    elif parsed_netloc == "12ft.io":
        logger.debug("URL seems like is from our partner named 12ft.io")

        # Target is in "q".
        parsed_query = parse_qs(parsed_url.query)
        if parsed_query.get("q") and len(parsed_query["q"]) == 1:
            logger.debug("..and we got 'q' passed param. Make resolve them....")
            post_url = parsed_query["q"][0]
            return await resolve_medium_url(post_url)

        logger.debug("...but we get fucked up...")
        return False

    elif parsed_url.path.startswith("/m/global-identity-2"):
        logger.debug("URL seems like is Medium redirect (tracking) link. Possibly from email subscription")

        # Target is in "redirectUrl".
        parsed_query = parse_qs(parsed_url.query)
        if parsed_query.get("redirectUrl") and len(parsed_query["redirectUrl"]) == 1:
            logger.debug("..and we got 'redirectUrl' passed param. Make resolve them....")
            post_url = parsed_query["redirectUrl"][0]
            return await resolve_medium_url(post_url)

        logger.debug("...but we get fucked up...")
        return False

    elif parsed_netloc == "link.medium.com":
        logger.debug("URL seems like is Medium short (SHORT) redirect (tracking) link. Make resolve them...")
        # The short id is the path without its leading "/"; resolving it needs
        # a network round-trip.
        short_url_id = parsed_url.path.removeprefix("/")
        post_url = await resolve_medium_short_link(short_url_id, timeout)
        return await resolve_medium_url(post_url)

    else:
        logger.debug("We can't determine the URL type. Let's just try to extract the post_id...")
        # Last path segment, then the text after its last "-".
        post_url = parsed_url.path.split("/")[-1]
        post_id = post_url.split("-")[-1]

    # Only the "/p/" and fallback branches reach this point.
    if not is_has_valid_medium_post_id(post_id):
        logger.warning(f"...but hoops, that's invalid post_id: {post_id}")
        return False

    return post_id
|
|
|
|
|
|
|
|
|
|
|
2024-04-25 08:58:00 +00:00
|
|
|
async def resolve_medium_url_old(url: str, timeout: int = 5) -> str:
    """
    Resolve a Medium post id by downloading the page and reading its meta tags.

    The page must declare ``og:type`` equal to "article" and carry an
    ``al:android:url`` meta tag; the last path segment of that tag's URL is
    returned.

    Args:
        url: Page URL to download.
        timeout: Timeout passed to the HTTP request.

    Returns:
        The last path segment of the ``al:android:url`` value, or False when
        the document has no ``<head>``, is not an article, or lacks a usable
        ``al:android:url`` tag (note that this differs from the ``str``
        annotation).
    """
    async with aiohttp.ClientSession() as session:
        retry_client = RetryClient(client_session=session, raise_for_status=False, retry_options=retry_options)
        request = await retry_client.get(url, timeout=timeout)
        response = await request.text()

    soup = BeautifulSoup(response, "html.parser")

    # A document without <head> makes ``soup.head`` None; bail out instead of
    # failing with AttributeError on ``.find`` (same guard as
    # is_valid_medium_url_old).
    head = soup.head
    if head is None:
        return False

    type_meta_tag = head.find("meta", property="og:type")
    if not type_meta_tag or type_meta_tag.get("content") != "article":
        return False

    url_meta_tag = head.find("meta", property="al:android:url")
    if not url_meta_tag or not url_meta_tag.get("content"):
        return False

    parsed_url = urlparse(url_meta_tag["content"])
    # Ignore leading/trailing slashes so the last segment is never empty
    # because of a trailing "/".
    path = parsed_url.path.strip("/")
    parsed_value = path.split("/")[-1]
    return parsed_value
|
|
|
|
|
|
|
|
|
|
|
2024-04-26 10:41:02 +00:00
|
|
|
async def is_valid_medium_url_old(url: str, timeout: int = 5):
    """
    Download ``url`` and report whether the page identifies itself as Medium.

    Args:
        url: Page URL to download.
        timeout: Timeout passed to the HTTP request.

    Returns:
        True when the page's ``og:site_name`` meta tag has content "Medium";
        False when the document has no ``<head>``, the tag is missing, or the
        content differs.

    Raises:
        exceptions.PageLoadingError: If the GET request raises any exception.
    """
    async with aiohttp.ClientSession() as session:
        client = RetryClient(client_session=session, raise_for_status=False, retry_options=retry_options)

        try:
            page = await client.get(url, timeout=timeout)
        except Exception as ex:
            raise exceptions.PageLoadingError(ex) from ex

        html = await page.text()

        document = BeautifulSoup(html, "html.parser")
        if not document.head:
            return False

        site_name_tag = document.head.find("meta", property="og:site_name")
        if not site_name_tag:
            return False

        return site_name_tag.get("content") == "Medium"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@lru_cache(maxsize=500)
def get_fld(url: str):
    """
    Return the first-level domain of ``url`` as reported by ``tld.get_fld``.

    Any exception raised by the lookup is logged at trace level and turned
    into a None result. Results are memoised (up to 500 distinct inputs).
    """
    try:
        return tld.get_fld(url)
    except Exception as ex:
        logger.trace(ex)
    return None
|
|
|
|
|
|
|
|
|
|
|
2024-04-26 10:41:02 +00:00
|
|
|
@alru_cache(maxsize=100)
async def is_valid_medium_url(url: str) -> bool:
    """
    Check if the url is a valid Medium article page

    The decision is made in this order:
      1. Known wrapper/redirect domains are accepted as-is.
      2. Domains listed in NOT_MEDIUM_DOMAINS are rejected with an exception.
      3. Domains listed in KNOWN_MEDIUM_DOMAINS (first-level domain) or
         KNOWN_MEDIUM_CUSTOM_DOMAINS (www-stripped netloc) are accepted.
      4. Everything else is accepted only if ``resolve_medium_url`` yields a
         truthy result.

    Raises:
        exceptions.NotValidMediumURL: If the domain is explicitly blacklisted.
    """
    first_level_domain = get_fld(url)
    url_parts = urlparse(url)
    host = un_wwwify(url_parts.netloc)

    # TODO: country-specific Google redirect hosts, e.g.
    # https://www.google.com.vn/url?sa=i&url=<percent-encoded medium link>&...
    # are not recognised by the check below.
    if first_level_domain in ("12ft.io", "google.com", "facebook.com", "googleusercontent.com"):
        return True

    blacklisted = first_level_domain in NOT_MEDIUM_DOMAINS or host in NOT_MEDIUM_DOMAINS
    if blacklisted:
        raise exceptions.NotValidMediumURL("100% not valid Medium URL")

    whitelisted = first_level_domain in KNOWN_MEDIUM_DOMAINS or host in KNOWN_MEDIUM_CUSTOM_DOMAINS
    if whitelisted:
        return True

    logger.warning(f"url '{url}' wasn't detected in known Medium domains")

    # XXX: Unfortunately, for now we don't know ALL Medium's domains, so we
    # need to resolve links. (A notification about the unknown domain used to
    # be sent here; it is currently disabled.)
    return bool(await resolve_medium_url(url))
|
2024-01-31 00:48:20 +00:00
|
|
|
|
2024-04-26 10:41:02 +00:00
|
|
|
# return False
|