2024-09-21 11:09:30 +00:00
from __future__ import annotations
2024-02-19 03:28:05 +00:00
import asyncio
2024-01-31 00:48:20 +00:00
import math
import textwrap
2024-04-26 10:41:02 +00:00
import typing
2024-01-31 00:48:20 +00:00
import urllib . parse
import jinja2
import tld
2024-07-20 10:14:00 +00:00
from asyncer import asyncify
2024-01-31 00:48:20 +00:00
from loguru import logger
2024-09-21 11:09:30 +00:00
from rl_string_helper import RLStringHelper , split_overlapping_ranges
2024-01-31 00:48:20 +00:00
2024-04-26 10:41:02 +00:00
from . import jinja_env
2024-10-22 12:23:26 +00:00
from . api import MediumApi
2024-09-21 11:09:30 +00:00
from . exceptions import (
InvalidMediumPostURL ,
InvalidURL ,
MediumPostQueryError ,
)
2024-10-22 12:23:26 +00:00
from . markups import parse_markups
2024-01-31 00:48:20 +00:00
from . models . html_result import HtmlResult
from . time import convert_datetime_to_human_readable
2024-09-21 11:09:30 +00:00
from . utils import (
correct_url ,
extract_hex_string ,
getting_percontage_of_match ,
is_has_valid_medium_post_id ,
is_valid_medium_url ,
is_valid_url ,
resolve_medium_url ,
)
2024-01-31 00:48:20 +00:00
2024-04-26 10:41:02 +00:00
if typing . TYPE_CHECKING :
2024-07-22 10:10:50 +00:00
from database_lib import AbstractCacheBackend
2024-01-31 00:48:20 +00:00
2024-07-20 07:14:54 +00:00
2024-01-31 00:48:20 +00:00
class MediumParser:
    """Resolve Medium post identifiers, fetch post data and render it as HTML."""

    # Slot names must be valid identifiers and must match the attributes
    # assigned in ``__init__``; padded names such as " cache " would make the
    # class definition itself fail.
    __slots__ = (
        "cache",
        "host_address",
        "jinja_template",
        "post_template",
        "timeout",
        "medium_api",
    )
def __init__(
    self,
    cache: AbstractCacheBackend,
    medium_api: MediumApi,
    timeout: int,
    host_address: str,
    template_folder: str = "./templates",
):
    """Store the collaborators and load the ``post.html`` template.

    Args:
        cache: Backend used to pull, push and delete cached post data.
        medium_api: Client used to query posts by ID.
        timeout: Timeout passed to URL resolution and to the cache/API waits.
        host_address: Base address used to build fallback iframe URLs.
        template_folder: Directory the Jinja2 file-system loader reads from.
    """
    environment = jinja2.Environment(
        loader=jinja2.FileSystemLoader(template_folder)
    )

    self.timeout: int = timeout
    self.cache: AbstractCacheBackend = cache
    self.host_address: str = host_address
    self.jinja_template: jinja2.Environment = environment
    self.post_template: jinja2.Template = environment.get_template("post.html")
    self.medium_api: MediumApi = medium_api
2024-01-31 00:48:20 +00:00
2024-07-20 10:14:00 +00:00
async def resolve(self, unknown: str) -> str:
    """Turn an arbitrary user-supplied string into a Medium post ID.

    The value is first treated as a URL and passed to ``resolve_url``. If
    that fails, it is accepted when ``is_has_valid_medium_post_id`` says it
    carries a post ID, in which case the extracted hex string is returned.

    Raises:
        Exception: whatever ``resolve_url`` raised, when the value is not a
            recognisable post ID either.
    """
    logger.debug(f"We got some unknown data: {unknown=}. Trying resolve them...///")
    try:
        logger.debug("...maybe it's URL. Let's checkout...")
        return await self.resolve_url(unknown)
    except Exception as e:
        logger.exception(e)
        logger.error(f"Error while resolving URL: {e}")

        if not is_has_valid_medium_post_id(unknown):
            logger.error(f"Unknown data: {unknown}")
            raise

        logger.debug("Seems like it's valid post_id")
        return extract_hex_string(unknown)
2024-04-26 10:41:02 +00:00
2024-07-20 10:14:00 +00:00
async def resolve_url(self, url: str) -> str:
    """Resolve a Medium URL to its post ID.

    Raises:
        InvalidURL: if ``url`` fails ``is_valid_url`` or its corrected form
            fails ``is_valid_medium_url``.
        InvalidMediumPostURL: if no post ID could be resolved for the URL.
    """
    sanitized_url = correct_url(url)

    # The raw URL is checked first; the Medium check only runs if it passes.
    url_is_acceptable = is_valid_url(url) and await is_valid_medium_url(sanitized_url)
    if not url_is_acceptable:
        raise InvalidURL(f"Invalid Medium URL: {sanitized_url}")

    post_id = await resolve_medium_url(sanitized_url, self.timeout)
    if post_id:
        return post_id

    raise InvalidMediumPostURL(
        f"Could not find Medium post ID for URL: {sanitized_url}"
    )
2024-01-31 00:48:20 +00:00
2024-07-20 10:14:00 +00:00
async def delete_from_cache(self, post_id: str) -> bool:
    """Delete ``post_id`` from the cache backend and return ``True``."""
    cache_backend = self.cache
    cache_backend.delete(post_id)
    return True
2024-07-20 10:14:00 +00:00
async def get_post_data_from_cache(self, post_id: str):
    """Return the cached post data for ``post_id``, or ``None``.

    ``None`` is returned when nothing usable is cached, when the lookup
    times out, or when it raises; failures are logged, never propagated.
    """

    async def _read_cache():
        logger.debug("Using cache backend")
        cached_entry = self.cache.pull(post_id)

        decoded = None
        if cached_entry:
            logger.debug("post query was found on cache")
            decoded = cached_entry.json()
        if decoded:
            return decoded

        logger.debug(f"No data found in cache by {post_id}")
        return None

    # NOTE(review): ``_read_cache`` contains no ``await``, so the timeout
    # presumably cannot interrupt a slow ``cache.pull`` — confirm.
    try:
        return await asyncio.wait_for(_read_cache(), timeout=self.timeout)
    except asyncio.TimeoutError:
        logger.debug("Timeout while waiting for cache")
    except Exception as e:
        logger.exception(e)
        logger.error(f"Error while waiting for cache: {e}")
    return None
2024-01-31 00:48:20 +00:00
2024-07-20 10:14:00 +00:00
async def get_post_data_from_api(self, post_id: str):
    """Query the Medium API for ``post_id`` and return the response, or ``None``.

    ``None`` is returned when the query raises or when it does not finish
    within ``self.timeout``; failures are logged, never propagated.
    """

    async def _get_from_api():
        logger.debug("Using API to gather post data")
        try:
            return await self.medium_api.query_post_by_id(post_id)
        except Exception as ex:
            logger.debug("Error while querying post data from Medium API")
            logger.exception(ex)
            return None

    try:
        return await asyncio.wait_for(_get_from_api(), timeout=self.timeout)
    except asyncio.TimeoutError:
        # These messages used to say "cache" (copied from the cache getter),
        # which pointed log readers at the wrong component.
        logger.debug("Timeout while waiting for Medium API")
    except Exception as e:
        logger.error(f"Error while waiting for Medium API: {e}")
    return None
2024-01-31 00:48:20 +00:00
2024-07-20 10:14:00 +00:00
async def query_get(self, post_id: str, use_cache: bool, force_cache: bool = False):
    """Fetch post data from the cache and/or the API.

    Args:
        post_id: Post to fetch.
        use_cache: Whether to try the cache first.
        force_cache: When true, never fall back to the API.

    Returns:
        ``(post_data, cache_used)``. ``cache_used`` is ``False`` only when
        the API fallback was taken.
    """
    post_data = None
    if use_cache:
        post_data = await self.get_post_data_from_cache(post_id)

    if post_data or force_cache:
        return post_data, True

    logger.debug("Getting value from cache failed, using API")
    api_data = await self.get_post_data_from_api(post_id)
    return api_data, False
2024-09-21 11:09:30 +00:00
async def query(
    self,
    post_id: str,
    use_cache: bool = True,
    retry: int = 2,
    force_cache: bool = False,
):
    """Fetch and validate the data of a post, retrying on empty results.

    Args:
        post_id: Post to fetch.
        use_cache: Whether the cache may be consulted.
        retry: Maximum number of attempts.
        force_cache: When true, the API is never queried.

    Returns:
        The post data dictionary (it has a non-empty ``data.post``).

    Raises:
        MediumPostQueryError: if no valid post data was obtained.
    """
    logger.debug(f"Medium QUERY: {use_cache=}, {retry=}, {force_cache=}")
    post_data, is_cache_used = None, False
    reason = None

    for attempt in range(retry):
        # Start every attempt with a clean slate. Previously a reason left
        # over from a failed attempt made a later, successful attempt raise.
        reason = None
        try:
            post_data, is_cache_used = await self.query_get(
                post_id, use_cache, force_cache
            )
        except Exception as e:
            post_data = None
            reason = f"Query raised an exception: {e}"
            logger.error(f"Attempt {attempt + 1} failed with exception: {e}")
            # Back off only when another attempt will actually follow.
            if attempt + 1 < retry:
                logger.debug(f"Retrying in {2 ** attempt} seconds...")
                await asyncio.sleep(2 ** attempt)
            continue

        if not post_data:
            reason = "No post data returned"
        elif not isinstance(post_data, dict):
            reason = f"Post data is not a dictionary: {post_data=}"
        elif post_data.get("error"):
            reason = f"Post data contains an error: {post_data=}"
        elif not post_data.get("data"):
            reason = f"Post data missing 'data' key: {post_data=}"
        elif not post_data.get("data", {}).get("post"):
            reason = f"Post data missing 'data.post' key: {post_data=}"

        if reason is None:
            logger.debug("Post data was successfully queried")
        if post_data:
            # Either valid, or non-empty but malformed. As before, only an
            # empty result or an exception triggers another attempt.
            break

    if reason is not None or not post_data:
        raise MediumPostQueryError(
            f"Could not query post by ID from API: {post_id}. Reason: {reason or 'Unknown'}"
        )

    if not is_cache_used:
        logger.debug("Pushing post data to cache")
        self.cache.push(post_id, post_data)

    logger.trace("Query: done")
    return post_data
2024-01-31 00:48:20 +00:00
2024-09-21 11:09:30 +00:00
def _parse_and_render_content_html_post(
    self,
    content: dict,
    title: str,
    subtitle: str,
    preview_image_id: str,
    highlights: list,
    tags: list,
    post_data: dict,
) -> tuple[list, str, str]:
    """Render the body paragraphs of a post into HTML fragments.

    Paragraphs from ``content["bodyModel"]["paragraphs"]`` are rendered in
    order according to their ``type``. Consecutive ``ULI``, ``OLI`` and
    ``PRE`` paragraphs and ``OUTSET_ROW`` image rows are folded into one
    fragment. Within the first four paragraphs, ones that repeat the title,
    a tag, the subtitle or the preview image are skipped.

    Args:
        content: Post content holding ``bodyModel.paragraphs``.
        title: Post title; replaced by the matching heading text when it
            ends with an ellipsis.
        subtitle: Post subtitle; may be replaced by the matching paragraph
            text or cleared.
        preview_image_id: ID of the preview image, skipped if it opens the
            post.
        highlights: Highlights to mark inside matching paragraphs.
        tags: Post tags (their ``displayTitle`` is compared to H4 text).
        post_data: Full query result, used to look up iframe media
            resources given by reference.

    Returns:
        ``(fragments, title, subtitle)``.
    """
    paragraphs = content["bodyModel"]["paragraphs"]
    tags_list = [tag["displayTitle"] for tag in tags]
    out_paragraphs: list[str] = []

    # Heading type -> (size classes, top padding added after the first fragment).
    header_styles = {
        "H2": ("text-1xl md:text-2xl", "pt-12"),
        "H3": ("text-1xl md:text-2xl", "pt-12"),
        "H4": ("text-l md:text-xl", "pt-8"),
    }
    # List item type -> (wrapping tag, list style class).
    list_styles = {"ULI": ("ul", "list-disc"), "OLI": ("ol", "list-decimal")}

    header_source = '<{{ tag }} id={{ id }} class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 {{ size_class }} {{ css_class }}">{{ text }}</{{ tag }}>'
    image_source = '<div class="mt-7"><img loading="eager" alt="{{ paragraph.metadata.alt }}" class="pt-5 m-auto" role="presentation" referrerpolicy="no-referrer" src="https://miro.medium.com/v2/resize:fit:700/{{ paragraph.metadata.id }}"></div>'
    image_caption_source = "<figcaption class='mt-3 text-sm text-center text-gray-500 dark:text-gray-200'>{{ text }}</figcaption>"
    image_row_source = '<div class="mx-5"><div class="flex flex-row justify-center">{{ images }}</div></div>'
    paragraph_source = '<p class="{{ css_class }}">{{ text }}</p>'
    list_source = '<{{ tag }} class="pl-8 mt-2 {{ list_class }}">{{ li }}</{{ tag }}>'
    list_item_source = "<li class='mt-3'>{{ text }}</li>"
    pre_source = '<pre class="flex flex-col justify-center border mt-7 dark:border-gray-700">{{ code_block }}</pre>'
    code_source = '<code class="p-2 bg-gray-100 dark:bg-gray-900 overflow-x-auto {{ code_css_class }}">{{ text }}</code>'
    bq_source = '<blockquote style="box-shadow: inset 3px 0 0 0 rgb(209 207 239 / var(--tw-bg-opacity));" class="px-5 pt-3 pb-3 mt-5"><p class="font-italic">{{ text }}</p></blockquote>'
    pq_source = '<blockquote class="ml-5 text-2xl text-gray-600 mt-7 dark:text-gray-300"><p>{{ text }}</p></blockquote>'
    # TODO: redirect all Medium embeding articles to Fredium
    mixtape_source = """
<div class="items-center p-2 overflow-hidden border border-gray-300 mt-7">
  <a rel="noopener follow" href="{{ url }}" target="_blank">
    <div class="flex flex-row justify-between p-2 overflow-hidden">
      <div class="flex flex-col justify-center p-2">
        <h2 class="text-base font-bold text-black dark:text-gray-100">{{ embed_title }}</h2>
        <div class="block mt-2">
          <h3 class="text-sm text-grey-darker">{{ embed_description }}</h3>
        </div>
        <div class="mt-5">
          <p class="text-xs text-grey-darker">{{ embed_site }}</p>
        </div>
      </div>
      <div class="relative flex h-40 flew-row w-60">
        <div class="absolute inset-0 bg-center bg-cover" style="background-image: url('https://miro.medium.com/v2/resize:fit:320/{{ paragraph.mixtapeMetadata.thumbnailImageId }}'); background-repeat: no-repeat;" referrerpolicy="no-referrer"></div>
      </div>
    </div>
  </a>
</div>"""
    iframe_sized_source = """<div class="mt-7"><div>
<iframe class="w-full" src="{{ src }}" referrerpolicy="no-referrer" width="{{ iframe_width }}" height="{{ iframe_height }}" allowfullscreen="" frameborder="0" scrolling="no"></iframe>
</div></div>"""
    iframe_plain_source = '<div class="mt-7"><iframe class="w-full" src="{{ src }}" width="{{ iframe_width }}" height="{{ iframe_height }}" referrerpolicy="no-referrer" allowfullscreen="" frameborder="0" scrolling="no"></iframe></div>'

    compiled_templates: dict = {}

    def get_template(source: str):
        """Compile each template source at most once per call, on first use."""
        if source not in compiled_templates:
            compiled_templates[source] = jinja_env.from_string(source)
        return compiled_templates[source]

    def parse_paragraph_text(
        text: str, markups: list, is_code: bool = False
    ) -> RLStringHelper:
        """Build a string helper for ``text`` with its markups applied."""
        if text is None:
            # Same fallback the main loop always used for text-less paragraphs.
            text, markups = "", []
        # Hotfix, workaround for code block
        has_code_block = any(markup["type"] == "CODE" for markup in markups)
        quote_html_type = ["minimal"] if is_code or has_code_block else ["full"]
        text_formater = RLStringHelper(text, quote_html_type=quote_html_type)
        for markup in split_overlapping_ranges(parse_markups(markups)):
            text_formater.set_template(
                markup["start"], markup["end"], markup["template"]
            )
        return text_formater

    def collect_run(start: int, run_type: str, is_code: bool = False):
        """Format consecutive ``run_type`` paragraphs from ``start``.

        Returns the formatted texts and the index just past the run.
        """
        texts = []
        position = start
        while position < len(paragraphs) and paragraphs[position]["type"] == run_type:
            item = paragraphs[position]
            formatter = parse_paragraph_text(
                item["text"], item["markups"], is_code=is_code
            )
            texts.append(formatter.get_text())
            position += 1
        return texts, position

    current_pos = 0
    while current_pos < len(paragraphs):
        paragraph = paragraphs[current_pos]
        paragraph_type = paragraph["type"]
        logger.trace(f"Current paragraph #{current_pos} data: {paragraph}")

        # Skip leading paragraphs that only repeat post metadata.
        if current_pos < 4:
            if paragraph_type in ["H3", "H4", "H2"]:
                if getting_percontage_of_match(paragraph["text"], title) > 80:
                    if title.endswith("…"):
                        logger.trace("Title was detected, replace...")
                        title = paragraph["text"]
                    else:
                        logger.trace("Title was detected, ignore...")
                    current_pos += 1
                    continue
            if paragraph_type == "H4" and paragraph["text"] in tags_list:
                logger.trace("Tag was detected, ignore...")
                current_pos += 1
                continue
            if paragraph_type in ["H4", "P"]:
                is_paragraph_subtitle = (
                    getting_percontage_of_match(paragraph["text"], subtitle) > 80
                )
                if is_paragraph_subtitle and not subtitle.endswith("…"):
                    logger.trace("Subtitle was detected, ignore...")
                    subtitle = paragraph["text"]
                    current_pos += 1
                    continue
                elif (
                    subtitle
                    and subtitle.endswith("…")
                    and len(paragraph["text"]) > 100
                ):
                    subtitle = ""
            elif paragraph_type == "IMG":
                if (
                    paragraph["metadata"]
                    and paragraph["metadata"]["id"] == preview_image_id
                ):
                    logger.trace("Preview image was detected, ignore...")
                    current_pos += 1
                    continue

        text_formater = parse_paragraph_text(paragraph["text"], paragraph["markups"])

        for highlight in highlights:
            for highlight_paragraph in highlight["paragraphs"]:
                if highlight_paragraph["name"] == paragraph["name"]:
                    logger.trace("Apply highlight to this paragraph")
                    if highlight_paragraph["text"] != text_formater.get_text():
                        logger.warning(
                            "Highlighted text and paragraph text are not the same! Skip..."
                        )
                        break
                    quote_markup_template = (
                        '<mark class="bg-emerald-300">{{ text }}</mark>'
                    )
                    text_formater.set_template(
                        highlight["startOffset"],
                        highlight["endOffset"],
                        quote_markup_template,
                    )
                    break

        if paragraph_type in header_styles:
            size_class, top_padding = header_styles[paragraph_type]
            out_paragraphs.append(
                get_template(header_source).render(
                    tag=paragraph_type.lower(),
                    id=paragraph["name"],
                    text=text_formater.get_text(),
                    size_class=size_class,
                    # Top padding only when something was rendered before.
                    css_class=top_padding if out_paragraphs else "",
                )
            )
        elif paragraph_type == "IMG":
            image_template = get_template(image_source)
            layout = paragraph["layout"]
            if layout == "OUTSET_ROW":
                row = [image_template.render(paragraph=paragraph)]
                next_pos = current_pos + 1
                while (
                    next_pos < len(paragraphs)
                    and paragraphs[next_pos]["layout"] == "OUTSET_ROW_CONTINUE"
                ):
                    row.append(image_template.render(paragraph=paragraphs[next_pos]))
                    next_pos += 1
                out_paragraphs.append(
                    get_template(image_row_source).render(images="".join(row))
                )
                # Land on the last image of the row; the loop tail advances.
                current_pos = next_pos - 1
            elif layout == "FULL_WIDTH":
                logger.warning("IMG: not implemented FULL_WIDTH layout")
                current_pos += 1
                continue
            else:
                out_paragraphs.append(image_template.render(paragraph=paragraph))
                # NOTE(review): nesting was reconstructed from a de-indented
                # source; confirm captions belong to this branch only.
                if paragraph["text"]:
                    out_paragraphs.append(
                        get_template(image_caption_source).render(
                            text=text_formater.get_text()
                        )
                    )
        elif paragraph_type == "P":
            css_class = ["leading-8"]
            if paragraph.get("hasDropCap", False):
                # It is unclear how this logic is triggered; sometimes it looks
                # like two letters end up in the drop cap, as here:
                # https://medium.com/write-a-catalyst/trumps-gaza-proposal-a-negotiation-tactic-for-real-change-0291df856c77
                css_class.extend(
                    [
                        "first-letter:text-7xl",
                        "first-letter:float-left",
                        "first-letter:mr-2",
                        "first-letter:pt-2",
                    ]
                )
            # Guard the first paragraph: index -1 would inspect the LAST one.
            follows_heading = current_pos > 0 and paragraphs[current_pos - 1][
                "type"
            ] in ["H4", "H3"]
            css_class.append("mt-3" if follows_heading else "mt-7")
            out_paragraphs.append(
                get_template(paragraph_source).render(
                    text=text_formater.get_text(), css_class=" ".join(css_class)
                )
            )
        elif paragraph_type in list_styles:
            list_tag, list_class = list_styles[paragraph_type]
            item_texts, next_pos = collect_run(current_pos, paragraph_type)
            li_template = get_template(list_item_source)
            rendered_items = [li_template.render(text=text) for text in item_texts]
            out_paragraphs.append(
                get_template(list_source).render(
                    tag=list_tag, list_class=list_class, li="".join(rendered_items)
                )
            )
            current_pos = next_pos - 1
        elif paragraph_type == "PRE":
            code_metadata = paragraph["codeBlockMetadata"]
            if code_metadata and code_metadata["lang"] is not None:
                code_css_class = f'language-{code_metadata["lang"]}'
            else:
                code_css_class = "nohighlight"
            code_lines, next_pos = collect_run(current_pos, "PRE", is_code=True)
            code_block_rendered = get_template(code_source).render(
                text="\n".join(code_lines), code_css_class=code_css_class
            )
            out_paragraphs.append(
                get_template(pre_source).render(code_block=code_block_rendered)
            )
            current_pos = next_pos - 1
        elif paragraph_type == "BQ":
            bq_template_rendered = get_template(bq_source).render(
                text=text_formater.get_text()
            )
            logger.trace(bq_template_rendered)
            out_paragraphs.append(bq_template_rendered)
        elif paragraph_type == "PQ":
            pq_template_rendered = get_template(pq_source).render(
                text=text_formater.get_text()
            )
            logger.trace(pq_template_rendered)
            out_paragraphs.append(pq_template_rendered)
        elif paragraph_type == "MIXTAPE_EMBED":
            if paragraph.get("mixtapeMetadata") is None:
                logger.warning(
                    "Ignore MIXTAPE_EMBED paragraph type, since we can't get url"
                )
                current_pos += 1
                continue
            url = paragraph["mixtapeMetadata"]["href"]

            text_raw = paragraph["text"]
            if len(paragraph["markups"]) != 3:
                logger.warning(
                    "Ignore MIXTAPE_EMBED paragraph type, since we can't split text"
                )
                current_pos += 1
                continue
            title_range = paragraph["markups"][1]
            description_range = paragraph["markups"][2]
            logger.trace(f"{title_range=}")
            logger.trace(f"{description_range=}")
            embed_title = text_raw[title_range["start"] : title_range["end"]]
            embed_description = text_raw[
                description_range["start"] : description_range["end"]
            ]
            logger.trace(f"{embed_title=}")
            logger.trace(f"{embed_description=}")
            try:
                embed_site = tld.get_fld(url)
            except Exception as ex:
                logger.warning(
                    f"Can't get embed site fld: {ex}. Using custom logic..."
                )
                embed_site = urllib.parse.urlparse(url).hostname
            logger.trace(f"{embed_site=}")
            out_paragraphs.append(
                get_template(mixtape_source).render(
                    paragraph=paragraph,
                    url=url,
                    embed_title=embed_title,
                    embed_description=embed_description,
                    embed_site=embed_site,
                )
            )
        elif paragraph_type == "IFRAME":
            logger.debug("Processing IFRAME paragraph")
            # ``or {}`` also covers keys that are present but set to None.
            iframe = paragraph.get("iframe") or {}
            media_resource = iframe.get("mediaResource") or {}
            # If mediaResource is just a reference, look it up in post_data.
            media_resource_ref = media_resource.get("__ref")
            if (
                media_resource_ref
                and not media_resource.get("id")
                and not media_resource.get("iframeSrc")
            ):
                logger.debug(f"Found media resource reference: {media_resource_ref}")
                data_payload = post_data.get("data", {})
                if media_resource_ref in data_payload:
                    media_resource = data_payload[media_resource_ref]
                    logger.debug(
                        f"Found media resource for ref: {media_resource_ref}"
                    )
                else:
                    logger.warning(
                        f"Could not find media resource for ref: {media_resource_ref}"
                    )

            src = media_resource.get("iframeSrc")
            iframe_id = media_resource.get("id")
            if not src and iframe_id:
                logger.debug(f"Using fallback iframe URL with ID: {iframe_id}")
                src = f"{self.host_address}/render_iframe/{iframe_id}"
            if not src:
                logger.warning("No iframe source found, skipping iframe")
                current_pos += 1
                continue

            # Dimensions on the media resource win; paragraph.iframe is the fallback.
            iframe_width = media_resource.get("iframeWidth") or iframe.get(
                "iframeWidth"
            )
            iframe_height = media_resource.get("iframeHeight") or iframe.get(
                "iframeHeight"
            )
            logger.debug(f"Iframe dimensions: {iframe_width}x{iframe_height}")

            has_dimensions = iframe_width and iframe_height and iframe_width > 0
            iframe_source = (
                iframe_sized_source if has_dimensions else iframe_plain_source
            )
            out_paragraphs.append(
                get_template(iframe_source).render(
                    src=src,
                    iframe_width=iframe_width or "100%",
                    iframe_height=iframe_height or "100%",
                )
            )
        else:
            logger.error(f"Unknown {paragraph['type']}: {paragraph}")

        current_pos += 1

    return out_paragraphs, title, subtitle
2024-07-20 10:14:00 +00:00
async def render_as_html(self, post_id: str):
    """Query the post identified by ``post_id`` and render it as HTML.

    Exceptions raised by the query or by the rendering propagate unchanged.
    """
    post_data = await self.query(post_id)
    return await self._render_as_html(post_data, post_id)
2024-09-21 11:09:30 +00:00
async def generate_metadata(
    self, post_data: dict, post_id: str, as_dict: bool = False
) -> tuple | dict[str, str]:
    """Extract display metadata from ``post_data["data"]["post"]``.

    Args:
        post_data: Query result containing ``data.post``.
        post_id: Post ID, included only in the dictionary form.
        as_dict: Return a dictionary instead of a tuple.

    Returns:
        With ``as_dict``: a dict with ``post_id`` followed by the fields
        below. Otherwise a tuple ``(title, subtitle, description, url,
        creator, collection, reading_time, free_access, updated_at,
        first_published_at, preview_image_id, tags)``.
    """
    post = post_data["data"]["post"]

    title = RLStringHelper(post["title"], ["minimal"]).get_text()
    subtitle = RLStringHelper(post["previewContent"]["subtitle"]).get_text()
    shortened_subtitle = textwrap.shorten(subtitle, width=100, placeholder="...")
    description = RLStringHelper(shortened_subtitle).get_text()
    preview_image_id = post["previewImage"]["id"]
    creator = post["creator"]
    collection = post["collection"]
    url = post["mediumUrl"]

    reading_time = math.ceil(post["readingTime"])
    free_access = "No" if post["isLocked"] else "Yes"
    updated_at = convert_datetime_to_human_readable(post["updatedAt"])
    first_published_at = convert_datetime_to_human_readable(post["firstPublishedAt"])
    tags = post["tags"]

    field_names = (
        "title",
        "subtitle",
        "description",
        "url",
        "creator",
        "collection",
        "reading_time",
        "free_access",
        "updated_at",
        "first_published_at",
        "preview_image_id",
        "tags",
    )
    field_values = (
        title,
        subtitle,
        description,
        url,
        creator,
        collection,
        reading_time,
        free_access,
        updated_at,
        first_published_at,
        preview_image_id,
        tags,
    )

    if as_dict:
        return {"post_id": post_id, **dict(zip(field_names, field_values))}
    return field_values
2024-01-31 00:48:20 +00:00
2024-07-20 10:14:00 +00:00
async def _render_as_html(self, post_data: dict, post_id: str) -> "HtmlResult":
    """Render queried post data into an ``HtmlResult``.

    The body is rendered in a worker thread via ``asyncify`` while the
    metadata task is scheduled on the event loop.

    Args:
        post_data: Query result containing ``data.post``.
        post_id: ID of the post, forwarded to ``generate_metadata``.

    Returns:
        ``HtmlResult(page_title, description, url, rendered_post_html)``.
    """
    post = post_data["data"]["post"]
    raw_title = post["title"]
    raw_subtitle = post["previewContent"]["subtitle"]

    metadata_task = asyncio.create_task(self.generate_metadata(post_data, post_id))

    content, parsed_title, parsed_subtitle = await asyncify(
        self._parse_and_render_content_html_post
    )(
        post["content"],
        raw_title,
        raw_subtitle,
        post["previewImage"]["id"],
        post["highlights"],
        post["tags"],
        post_data,
    )

    (
        title,
        subtitle,
        description,
        url,
        creator,
        collection,
        reading_time,
        free_access,
        updated_at,
        first_published_at,
        preview_image_id,
        tags,
    ) = await metadata_task

    # The parser may replace a truncated title or adjust the subtitle. Those
    # results used to be overwritten by the metadata unpack above. Apply them
    # when they differ from the raw input, escaped the same way
    # ``generate_metadata`` escapes its values.
    if parsed_title != raw_title:
        title = RLStringHelper(parsed_title, ["minimal"]).get_text()
    if parsed_subtitle != raw_subtitle:
        subtitle = RLStringHelper(parsed_subtitle).get_text()

    post_page_title_raw = "{{ title }} | by {{ creator.name }}"
    if collection:
        post_page_title_raw += " | in {{ collection.name }}"
    post_page_title = jinja_env.from_string(post_page_title_raw)
    post_page_title_rendered = post_page_title.render(
        title=title, creator=creator, collection=collection
    )

    post_context = {
        "subtitle": subtitle,
        "title": title,
        "url": url,
        "creator": creator,
        "collection": collection,
        "readingTime": reading_time,
        "freeAccess": free_access,
        "updatedAt": updated_at,
        "firstPublishedAt": first_published_at,
        "previewImageId": preview_image_id,
        "content": content,
        "tags": tags,
    }
    post_template_rendered = self.post_template.render(post_context)

    return HtmlResult(
        post_page_title_rendered, description, url, post_template_rendered
    )
2024-01-31 00:48:20 +00:00
async def render_as_markdown(self) -> str:
    """Placeholder for Markdown output; always raises ``NotImplementedError``."""
    message = "Markdown rendering is not implemented. Please use HTML rendering instead"
    raise NotImplementedError(message)