refactor: cython integration & refactor-refactor

This commit is contained in:
ZhymabekRoman 2024-07-27 21:55:13 +05:00
parent f4fef20301
commit 61c9f062fd
18 changed files with 9978 additions and 143 deletions

View file

@ -162,10 +162,6 @@
respond "Access denied" 403
}
route @miro/* {
uri replace @miro/ /
reverse_proxy https://miro.medium.com
}
route /* {
reverse_proxy web:7080

View file

@ -5,11 +5,6 @@
{{ template }}
route @miro/* {
uri replace @miro/ /
reverse_proxy https://miro.medium.com
}
route /* {
reverse_proxy web:7080
}

View file

@ -6,11 +6,6 @@ freedium.cfd {
{{ template }}
route @miro/* {
uri replace @miro/ /
reverse_proxy https://miro.medium.com
}
route /* {
reverse_proxy web:7080
}

View file

@ -2,21 +2,23 @@ FROM python:3.12.3-slim
WORKDIR /app
COPY ./requirements.txt ./
COPY ./requirements-fast.txt ./
COPY ./server ./server
COPY ./core ./core
COPY ./rl_string_helper ./rl_string_helper
COPY ./database-lib ./database-lib
RUN pip install --no-cache-dir wheel
COPY ./rl_string_helper ./rl_string_helper
RUN pip3 install --no-cache-dir ./rl_string_helper
COPY ./database-lib ./database-lib
RUN pip3 install --no-cache-dir ./database-lib
COPY ./core ./core
RUN pip3 install --no-cache-dir ./core
# COPY ./server ./server
COPY ./requirements.txt ./
RUN pip3 install --no-cache-dir -r requirements.txt
COPY ./requirements-fast.txt ./
RUN pip3 install --no-cache-dir -r requirements-fast.txt
EXPOSE 7080

4
DockerfileDante Normal file
View file

@ -0,0 +1,4 @@
FROM shturman/dante:1.4.2
ENV DEBIAN_FRONTEND=noninteractive
RUN apt update && apt upgrade -y && apt install -y curl && apt clean && rm -rf /var/lib/apt/lists/*

View file

@ -11,8 +11,6 @@ from medium_parser import retry_options
from medium_parser.time import get_unix_ms
from medium_parser.utils import generate_random_sha256_hash
socks_proxy = "socks5://wgcf1:1080"
class MediumApi:
__slots__ = ("auth_cookies", "proxy_list", "timeout")
@ -23,16 +21,21 @@ class MediumApi:
self.timeout = timeout
async def query_post_by_id(self, post_id: str):
return await self.query_post_graphql(post_id, self.timeout)
logger.debug("Using graphql implementation")
return await self.query_post_graphql(post_id)
async def query_post_graphql(self, post_id: str):
logger.debug(f"Starting request construction for post {post_id}")
if self.proxy_list:
connector = ProxyConnector.from_url(random.choice(self.proxy_list))
proxy = random.choice(self.proxy_list)
connector = ProxyConnector.from_url(proxy)
logger.debug(f"Using proxy: {proxy}")
else:
connector = None
logger.debug(f"Using connector: {connector}")
headers = {
"X-APOLLO-OPERATION-ID": generate_random_sha256_hash(),
"X-APOLLO-OPERATION-NAME": "FullPostQuery",

View file

@ -14,7 +14,7 @@ from rl_string_helper import RLStringHelper, parse_markups, split_overlapping_ra
from . import jinja_env
from .exceptions import InvalidMediumPostID, InvalidMediumPostURL, InvalidURL, MediumParserException, MediumPostQueryError
from .medium_api import MediumApi
from .api import MediumApi
from .models.html_result import HtmlResult
from .time import convert_datetime_to_human_readable
from .utils import correct_url, extract_hex_string, getting_percontage_of_match, is_has_valid_medium_post_id, is_valid_medium_url, is_valid_url, resolve_medium_url
@ -24,7 +24,7 @@ if typing.TYPE_CHECKING:
class MediumParser:
__slots__ = ("cache", "host_address", "jinja_template", "post_template", "timeout")
__slots__ = ("cache", "host_address", "jinja_template", "post_template", "timeout", "medium_api")
def __init__(self, cache: "AbstractCacheBackend", medium_api: MediumApi, timeout: int, host_address: str, template_folder: str = "./templates"):
self.timeout: int = timeout

View file

@ -18,7 +18,11 @@ services:
networks:
- caddy_net
healthcheck:
test: ["CMD-SHELL", "curl -f http://caddy_freedium:6752/ --max-time 80 --header 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15'"]
test:
[
"CMD-SHELL",
"curl -f http://caddy_freedium:6752/ --max-time 80 --header 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15'"
]
interval: 30s
start_period: 20s
timeout: 80s
@ -29,7 +33,7 @@ services:
wgcf1:
image: neilpang/wgcf-docker:latest
volumes:
# - ./wgcf:/wgcf
# - ./wgcf:/wgcf
- /lib/modules:/lib/modules
- ./scripts/entryWGCF.sh:/entry.sh
- /etc/localtime:/etc/localtime:ro
@ -51,6 +55,9 @@ services:
dante_1:
image: shturman/dante:1.4.2
# build:
# context: ./
# dockerfile: ./DockerfileDante
volumes:
- ./scripts/dante.config:/etc/sockd.conf
- /etc/localtime:/etc/localtime:ro
@ -61,12 +68,19 @@ services:
depends_on:
wgcf1:
condition: service_healthy
# healthcheck:
# test: curl --proxy socks5://localhost:1080 https://google.com
# interval: 5s
# timeout: 2s
# retries: 5
web:
build:
context: ./
dockerfile: ./Dockerfile
command: python3 -m server server
environment:
- "PROXY_LIST=socks5://wgcf1:1080"
volumes:
# - ./.env:/app/.env
# - ./server/user_data/logs:/app/server/user_data/logs
@ -129,7 +143,7 @@ services:
volumes:
- dragonflydata:/data
healthcheck:
test: ["CMD", "redis-cli", "ping"]
test: [ "CMD", "redis-cli", "ping" ]
interval: 30s
start_period: 20s
timeout: 10s
@ -141,7 +155,7 @@ services:
postgres:
image: postgres:16.3-alpine3.20
networks:
- caddy_net
- caddy_net
# ports:
# - 5432:5432
volumes:
@ -167,6 +181,7 @@ volumes:
caddy_config:
dragonflydata:
networks:
caddy_net:
external: true

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,9 @@
# Declaration-only counterpart of the StringAssignmentMixin implementation
# (presumably the .pxd that accompanies the .pyx -- confirm against the file name).
cdef class StringAssignmentMixin:
cdef:
# Rendered text; the implementation refreshes it by joining string_list.
str string
# The text as a list of items; pop/insert operate on this list.
list string_list
# C-level helper that rebuilds `string` from `string_list`.
cdef void __render_string(self)
# Both mutators are declared to return a StringAssignmentMixin.
cpdef StringAssignmentMixin pop(self, int key)
cpdef StringAssignmentMixin insert(self, int key, str value)

View file

@ -0,0 +1,43 @@
# cython: language_level=3str
cdef class StringAssignmentMixin:
    """Mutable, list-backed wrapper around a string.

    The text is kept as a list of single-character items (``string_list``) so
    that items can be inserted, removed and replaced by index. ``string`` holds
    the joined text and is refreshed whenever the object is rendered via
    ``str()``, ``repr()`` or ``encode()``.
    """

    def __cinit__(self, object string):
        # Declared as ``object`` rather than ``str``: with a ``str``-typed
        # argument Cython rejects another StringAssignmentMixin before the body
        # runs, which made the isinstance() branch below unreachable.
        self.string = str(string) if isinstance(string, StringAssignmentMixin) else string
        self.string_list = list(self.string)

    cdef void __render_string(self):
        # Fold the per-item edits back into the cached string.
        self.string = "".join(self.string_list)

    def __len__(self):
        """Return the number of items currently held."""
        return len(self.string_list)

    cpdef StringAssignmentMixin pop(self, int key):
        """Remove the item at index ``key`` and return this instance for chaining."""
        self.string_list.pop(key)
        return self

    def encode(self, str encoding):
        """Render the text and encode it with the ``surrogatepass`` error handler."""
        self.__render_string()
        return self.string.encode(encoding, "surrogatepass")

    cpdef StringAssignmentMixin insert(self, int key, str value):
        """Insert ``value`` before index ``key`` and return this instance for chaining."""
        self.string_list.insert(key, value)
        return self

    def __setitem__(self, object key, str value):
        # ``key`` may be an int or a slice; item assignment has no return value.
        self.string_list[key] = value

    def __getitem__(self, object key):
        """Return the item at an index, or the joined text for a slice."""
        if isinstance(key, slice):
            return "".join(self.string_list[key])
        return self.string_list[key]

    def __str__(self):
        """Render the current items and return them as a plain string."""
        self.__render_string()
        return self.string

    __repr__ = __str__
# Make the class available to Python
StringAssignmentMixin_py = StringAssignmentMixin

View file

@ -3,46 +3,10 @@ from .logger_trace import trace
from .utils import quote_html, quote_symbol
from jinja2 import Environment, DebugUndefined, Template
from rl_string_helper.mixins.string_assignment import StringAssignmentMixin_py as StringAssignmentMixin
jinja_env = Environment(undefined=DebugUndefined)
class StringAssignmentMix:
__slots__ = ("string", "string_list")
def __init__(self, string: str):
self.string = str(string) if isinstance(string, StringAssignmentMix) else string
self.string_list = list(self.string)
def __render_string(self):
self.string = "".join(self.string_list)
def __len__(self):
return len(self.string_list)
def pop(self, key):
self.string_list.pop(key)
return self
def encode(self, encoding: str):
self.__render_string()
return self.string.encode(encoding, "surrogatepass")
def insert(self, key: int, value):
self.string_list.insert(key, value)
return self
def __setitem__(self, key, value):
self.string_list[key] = value
return self
def __getitem__(self, key):
return "".join(self.string_list[key])
def __str__(self):
self.__render_string()
return self.string
__repr__ = __str__
# TODO: more clarified description
"""
@ -53,13 +17,14 @@ the value returned by length might not match the actual number of Unicode charac
Python strings are indexed by Unicode code point, so every character counts as one unit no matter how many UTF-16 code units it takes. So here is a workaround to get the actual number of characters and manipulate them in the string as if it were UTF-16 encoded. See the pre_utf_16_bang and post_utf_16_bang functions.
"""
# TODO: doc! Who will read this noodles lol?
# TODO: check cases when a UTF-16 character can be more than 2 bytes
class RLStringHelper:
__slots__ = ("string", "templates", "replaces", "quote_html_type", "quote_replaces", "_default_bang_char")
def __init__(self, string: str, quote_html_type: list[str] = ["full"], _default_bang_char: str = "R"):
self.string = StringAssignmentMix(quote_symbol(string))
self.string: str = quote_symbol(string)
self.templates = []
self.quote_replaces = []
self.replaces = []
@ -108,8 +73,8 @@ class RLStringHelper:
return string, string_pos_matrix
@trace
def post_utf_16_bang(self, string: str, string_pos_matrix: list, utf_16_bang_list: list):
string = StringAssignmentMix(string)
def post_utf_16_bang(self, string: StringAssignmentMixin, string_pos_matrix: list, utf_16_bang_list: list):
# string = StringAssignmentMixin(str(string))
post_transbang = 0
for bang_pos, char_len, old_pos in utf_16_bang_list:
string, string_pos_matrix = self._delete_char(string, string_pos_matrix, bang_pos - post_transbang, char_len, old_pos - post_transbang)
@ -178,11 +143,11 @@ class RLStringHelper:
return updated_text, string_pos_matrix, utf_16_bang_list
@trace
def _render_replaces(self, string: str, string_pos_matrix: list, utf_16_bang_list: list):
def _render_replaces(self, string: StringAssignmentMixin, string_pos_matrix: list, utf_16_bang_list: list):
if not self.replaces and not self.quote_replaces:
return string, string_pos_matrix, utf_16_bang_list
string = StringAssignmentMix(string)
# string = StringAssignmentMixin(str(string))
replaces = self.replaces + self.quote_replaces
@trace
@ -229,7 +194,8 @@ class RLStringHelper:
@trace
def __str__(self):
string = StringAssignmentMix(self.string)
string = StringAssignmentMixin(self.string)
string_pos_matrix = list(range(len(string)))
updated_text, string_pos_matrix, utf_16_bang_list = self.pre_utf_16_bang(string, string_pos_matrix)
@ -237,7 +203,7 @@ class RLStringHelper:
self.quote_replaces = list(quote_html(str(updated_text), self.quote_html_type))
if not self.templates and not self.replaces and not self.quote_replaces:
return str(self.string)
return self.string
updated_text, string_pos_matrix, utf_16_bang_list = self._render_templates(updated_text, string_pos_matrix, utf_16_bang_list)
updated_text, string_pos_matrix, utf_16_bang_list = self._render_replaces(updated_text, string_pos_matrix, utf_16_bang_list)
@ -247,6 +213,7 @@ class RLStringHelper:
def get_text(self):
return self.__str__()
def split_overlapping_ranges(markups, _retry_count: int = 7):
for _ in range(len(markups) * _retry_count):
new_markups = split_overlapping_range_position(markups)
@ -255,6 +222,7 @@ def split_overlapping_ranges(markups, _retry_count: int = 7):
markups = new_markups
return markups
def split_overlapping_range_position(positions):
if not positions:
return []
@ -264,46 +232,48 @@ def split_overlapping_range_position(positions):
for pos in positions[1:]:
last = result[-1]
if pos["start"] < last["end"]:
if pos["type"] != last["type"]:
if pos["end"] <= last["end"]:
result[-1] = {
"start": last["start"],
"end": pos["start"],
"type": last["type"],
"template": last["template"],
}
result.append(pos.copy())
if pos["end"] < last["end"]:
result.append(
{
"start": pos["end"],
"end": last["end"],
"type": last["type"],
"template": last["template"],
}
)
else:
result[-1] = {
"start": last["start"],
"end": pos["start"],
"type": last["type"],
"template": last["template"],
}
result.append(pos.copy())
else:
result[-1]["end"] = max(last["end"], pos["end"])
else:
if not pos["start"] < last["end"]:
result.append(pos.copy())
continue
if pos["type"] != last["type"]:
if pos["end"] <= last["end"]:
result[-1] = {
"start": last["start"],
"end": pos["start"],
"type": last["type"],
"template": last["template"],
}
result.append(pos.copy())
if pos["end"] < last["end"]:
result.append(
{
"start": pos["end"],
"end": last["end"],
"type": last["type"],
"template": last["template"],
}
)
else:
result[-1] = {
"start": last["start"],
"end": pos["start"],
"type": last["type"],
"template": last["template"],
}
result.append(pos.copy())
else:
result[-1]["end"] = max(last["end"], pos["end"])
return result
def raw_render(**kwargs):
    """Wrap every string keyword value in a ``{% raw %}`` ... ``{% endraw %}`` block.

    Values that are not strings are passed through untouched, and the keyword
    order is preserved.

    Returns:
        dict: the keyword arguments with their string values wrapped.
    """
    return {
        name: f"{{% raw %}}{item}{{% endraw %}}" if isinstance(item, str) else item
        for name, item in kwargs.items()
    }
def parse_markups(markups: list[str]):
markups_out = []

View file

@ -1,25 +1,34 @@
from setuptools import setup, find_packages
from setuptools import setup, find_packages, Extension
from Cython.Build import cythonize
# Function to read the contents of the requirements file
def read_requirements():
with open('requirements.txt', 'r') as req:
with open("requirements.txt", "r") as req:
return req.read().splitlines()
cython_extension_src = "rl_string_helper/mixins/string_assignment.pyx"
cython_extensions = [Extension("rl_string_helper.mixins.string_assignment", [cython_extension_src])]
# cython_extensions = ["rl_string_helper/test.pyx"]
setup(
name='rl_string_helper',
version='0.1.0',
author='Freedium community',
author_email='admin@freedium.cfd',
description='Helper for Medium parser backend',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
url='https://codeberg.org/Freedium-cfd/web',
name="rl_string_helper",
version="0.1.0",
author="Freedium community",
author_email="admin@freedium.cfd",
description="Helper for Medium parser backend",
long_description=open("README.md").read(),
long_description_content_type="text/markdown",
url="https://codeberg.org/Freedium-cfd/web",
packages=find_packages(),
install_requires=read_requirements(),
classifiers=[
'Programming Language :: Python :: 3',
'License :: OSI Approved :: MIT License',
'Operating System :: OS Independent',
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires='>=3.7',
)
ext_modules=cythonize(cython_extensions, force=True, show_all_warnings=True),
python_requires=">=3.7",
zip_safe=False,
include_package_data=True,
)

View file

@ -19,7 +19,7 @@ from xkcdpass import xkcd_password as xp
from server.utils.logger import configure_logger
from medium_parser.core import MediumParser
from server.utils.loguru_handler import InterceptHandler
from medium_parser.api import MediumApi
def wait_for_postgres(max_retries=5, retry_interval=5):
@ -48,7 +48,8 @@ medium_cache.init_db()
# migrate_to_postgres_thread = execute_migrate_to_postgres_in_thread("medium_db_cache.sqlite", "postgresql://postgres:postgres@postgres:5432/postgres")
logger.debug(f"Database length: {medium_cache.all_length()}")
medium_parser = MediumParser(cache=medium_cache, timeout=3, host_address=config.HOST_ADDRESS, auth_cookies=config.MEDIUM_AUTH_COOKIES, template_folder="server/templates")
medium_api = MediumApi(auth_cookies=config.MEDIUM_AUTH_COOKIES, timeout=3, proxy_list=config.PROXY_LIST)
medium_parser = MediumParser(cache=medium_cache, medium_api=medium_api, timeout=3, host_address=config.HOST_ADDRESS, template_folder="server/templates")
redis_storage = redis.Redis(
host=config.REDIS_HOST,

View file

@ -23,7 +23,7 @@ WORKER_TIMEOUT = config("WORKER_TIMEOUT", cast=int, default=120)
CACHE_LIFE_TIME = config("CACHE_LIFE_TIME", cast=int, default=60 * 60 * 5)
HOME_PAGE_MAX_POSTS = config("HOME_PAGE_MAX_POSTS", cast=int, default=15)
HOME_PAGE_MAX_POSTS = config("HOME_PAGE_MAX_POSTS", cast=int, default=25)
ENABLE_ADS_BANNER = config("ENABLE_ADS_BANNER", cast=bool, default=False)
REDIS_HOST = config("REDIS_HOST", default="redis_service")
@ -33,3 +33,6 @@ REDIS_TIMEOUT = config("REDIS_TIMEOUT", cast=int, default=1.75)
SENTRY_SDK_DSN = config("SENTRY_SDK_DSN", default=None)
SENTRY_TRACES_SAMPLE_RATE = config("SENTRY_TRACES_SAMPLE_RATE", cast=float, default=0.2)
SENTRY_PROFILES_SAMPLE_RATE = config("SENTRY_PROFILES_SAMPLE_RATE", cast=float, default=0.2)
# Comma-separated proxy URLs, e.g. "socks5://host-a:1080,socks5://host-b:1080".
_PROXY_LIST = config("PROXY_LIST", cast=str, default="")
# Trim whitespace and drop empty entries so values such as "a, b" or "a,"
# never yield a blank or space-padded proxy URL; an unset variable gives [].
PROXY_LIST = [proxy.strip() for proxy in _PROXY_LIST.split(",") if proxy.strip()]

View file

@ -39,9 +39,9 @@ async def route_processing(path: str, request: Request):
if key_data != config.ADMIN_SECRET_KEY:
return JSONResponse({"message": f"Wrong secret key: {key_data}"}, status_code=403)
# if path.startswith("@miro/"):
# miro_data = path.removeprefix("@miro/")
# return await miro_proxy(miro_data)
if path.startswith("@miro/"):
miro_data = path.removeprefix("@miro/")
return await miro_proxy(miro_data)
if path.startswith("render_iframe/"):
iframe_id = path.removeprefix("render_iframe/")
return await iframe_proxy(iframe_id)
@ -55,7 +55,7 @@ async def main_page():
main_template_rendered = main_template.render(postleter=homepage_template)
base_template_rendered = base_template.render(body_template=main_template_rendered, host_address=config.HOST_ADDRESS)
parsed_template = parse(base_template_rendered)
serialized_template = serialize(parsed_template, encoding='utf-8')
serialized_template = serialize(parsed_template, encoding="utf-8")
return HTMLResponse(serialized_template)

View file

@ -20,11 +20,12 @@ from server.utils.utils import safe_check_redis_connection
@trace
@aio_redis_cache(10 * 60)
async def render_homepage(limit: int = config.HOME_PAGE_MAX_POSTS, as_html: bool = False):
random_post_id_list = list(set([i[0] for i in medium_cache.random(limit)]))
random_post_id_list = list(set([i.key for i in medium_cache.random(limit)]))
outlet_posts_list = []
tasks = []
for post_id in random_post_id_list:
async def fetch_post_metadata(post_id):
try:
logger.debug(f"Fetching post_id: {post_id}")

View file

@ -23,21 +23,20 @@ async def iframe_proxy(iframe_id: str):
request_content_soup = BeautifulSoup(request_content, "html.parser")
iframe_hack_script = '<script src="https://cdn.jsdelivr.net/npm/@iframe-resizer/child"></script>'
new_script_tag = BeautifulSoup(iframe_hack_script, 'html.parser').script
new_script_tag = BeautifulSoup(iframe_hack_script, "html.parser").script
request_content_soup.head.append(new_script_tag)
return Response(content=request_content_soup.prettify(), media_type="text/html", headers=IFRAME_HEADERS)
# async def miro_proxy(miro_data: str):
# async with aiohttp.ClientSession() as client:
# request = await client.get(
# f"https://miro.medium.com/{miro_data}",
# timeout=config.TIMEOUT,
# headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"},
# )
# request_content = await request.read()
# content_type = request.headers["Content-Type"]
# return Response(content=request_content, media_type=content_type)
#
async def miro_proxy(miro_data: str):
    """Fetch ``https://miro.medium.com/<miro_data>`` and relay it to the client.

    Args:
        miro_data: Path to request from the upstream host.

    Returns:
        Response carrying the upstream body, status code and Content-Type.
    """
    async with aiohttp.ClientSession() as client:
        # Using the response as a context manager releases the connection as
        # soon as the body has been read.
        async with client.get(
            f"https://miro.medium.com/{miro_data}",
            timeout=config.TIMEOUT,
            headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"},
        ) as upstream:
            request_content = await upstream.read()
            # The header may be absent; a plain lookup would raise KeyError.
            content_type = upstream.headers.get("Content-Type")
            # Relay upstream failures (e.g. 404) instead of masking them as 200.
            status_code = upstream.status
    return Response(content=request_content, status_code=status_code, media_type=content_type)