parser: handle href anchor jumps correctly and update tests

This commit is contained in:
ZhymabekRoman 2024-09-23 12:27:11 +05:00
parent e87ecbe882
commit e43d5f8442
7 changed files with 80 additions and 65 deletions

View file

@ -38,7 +38,7 @@ services:
- dante_1
- dante_2
# volumes:
# - ./web:/app/web
# - ../web:/app/web
# - ./core/medium_parser/:/app/medium_parser
# - ./core/rl_string_helper/:/app/rl_string_helper
ports:

View file

@ -220,8 +220,11 @@ class MediumParser:
def parse_paragraph_text(
text: str, markups: list, is_code: bool = False
) -> RLStringHelper:
if is_code:
quote_html_type = ["minimal"]
# Hotfix: workaround for paragraphs that contain an inline CODE markup
has_code_block = any(markup["type"] == "CODE" for markup in markups)
if is_code or has_code_block:
# quote_html_type = ["minimal"]
quote_html_type = None
else:
quote_html_type = ["full"]
text_formater = RLStringHelper(text, quote_html_type=quote_html_type)
@ -315,10 +318,12 @@ class MediumParser:
if out_paragraphs:
css_class.append("pt-12")
header_template = jinja_env.from_string(
'<h2 class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 text-1xl md:text-2xl {{ css_class }}">{{ text }}</h2>'
'<h2 id={{ id }} class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 text-1xl md:text-2xl {{ css_class }}">{{ text }}</h2>'
)
header_template_rendered = header_template.render(
text=text_formater.get_text(), css_class="".join(css_class)
id=paragraph["name"],
text=text_formater.get_text(),
css_class="".join(css_class),
)
out_paragraphs.append(header_template_rendered)
elif paragraph["type"] == "H3":
@ -326,10 +331,12 @@ class MediumParser:
if out_paragraphs:
css_class.append("pt-12")
header_template = jinja_env.from_string(
'<h3 class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 text-1xl md:text-2xl {{ css_class }}">{{ text }}</h3>'
'<h3 id={{ id }} class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 text-1xl md:text-2xl {{ css_class }}">{{ text }}</h3>'
)
header_template_rendered = header_template.render(
text=text_formater.get_text(), css_class="".join(css_class)
id=paragraph["name"],
text=text_formater.get_text(),
css_class="".join(css_class),
)
out_paragraphs.append(header_template_rendered)
elif paragraph["type"] == "H4":
@ -337,10 +344,12 @@ class MediumParser:
if out_paragraphs:
css_class.append("pt-8")
header_template = jinja_env.from_string(
'<h4 class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 text-l md:text-xl {{ css_class }}">{{ text }}</h4>'
'<h4 id={{ id }} class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 text-l md:text-xl {{ css_class }}">{{ text }}</h4>'
)
header_template_rendered = header_template.render(
text=text_formater.get_text(), css_class="".join(css_class)
id=paragraph["name"],
text=text_formater.get_text(),
css_class="".join(css_class),
)
out_paragraphs.append(header_template_rendered)
elif paragraph["type"] == "IMG":

View file

@ -1,3 +1,5 @@
import jinja2
from medium_parser import jinja_env_debug
@ -9,18 +11,25 @@ def raw_render(**kwargs):
return kwargs
def parse_markups(markups: list[str]):
def parse_markups(
markups: list[dict[str, str | jinja2.Template]]
) -> list[dict[str, str | jinja2.Template]]:
markups_out = []
for markup in markups:
if markup["type"] == "A":
if markup["anchorType"] == "LINK":
target: str = ""
if not markup.get("href", "").startswith("#"):
target = "_blank"
template = jinja_env_debug.from_string(
'<a style="text-decoration: underline;" rel="{{rel}}" title="{{title}}" href="{{href}}" target="_blank">{{text}}</a>'
'<a style="text-decoration: underline;" rel="{{rel}}" title="{{title}}" href="{{href}}" target="{{target}}">{{text}}</a>'
)
template = template.render(
template_rendered = template.render(
raw_render(
rel=markup.get("rel", ""),
target=target,
title=markup.get("title", ""),
href=markup["href"],
)
@ -29,21 +38,21 @@ def parse_markups(markups: list[str]):
template = jinja_env_debug.from_string(
'<a style="text-decoration: underline;" href="https://medium.com/u/{{userId}}">{{text}}</a>'
)
template = template.render(userId=markup["userId"])
template_rendered = template.render(userId=markup["userId"])
else:
continue
elif markup["type"] == "STRONG":
template = "<strong>{{text}}</strong>"
template_rendered = "<strong>{{text}}</strong>"
elif markup["type"] == "EM":
template = "<em>{{text}}</em>"
template_rendered = "<em>{{text}}</em>"
elif markup["type"] == "CODE":
template = (
template_rendered = (
"<code class='p-1.5 bg-gray-300 dark:bg-gray-600'>{{text}}</code>"
)
else:
continue
template = jinja_env_debug.from_string(template)
template = jinja_env_debug.from_string(template_rendered)
markup["template"] = template
markups_out.append(markup)

View file

@ -1,2 +1,7 @@
from .string_helper import RLStringHelper, split_overlapping_ranges
from .string_helper import (
RLStringHelper,
UTF16Handler,
StringAssignmentMixin,
split_overlapping_ranges,
)
from .utils import quote_html, quote_symbol

View file

@ -0,0 +1 @@
Noah dragged his two printers out from <code class='test'>Settings ⚙️ < Printers & Scanners 🖨</code> and dropped them in Dock or Desktop, I don't remember — but you can drag to both the places.

View file

@ -3,6 +3,8 @@ import re
from loguru import logger
from rl_string_helper import (
RLStringHelper,
UTF16Handler,
StringAssignmentMixin,
quote_html,
split_overlapping_ranges,
)
@ -133,52 +135,37 @@ class TestRLStringHelper:
for markup in parsed_markups:
helper.set_template(markup["start"], markup["end"], markup["template"])
expected_output = (
"<span>The <b>quick <i>(brown)</b> <u>fox</u></i><u> jumps</u> "
'over <code>13 lazy</code><a href="#"><code> </code>dogs!</a></span>'
)
expected_output = """<span>The </span><b><span>quick </span></b><i><b><span>(brown)</span></b></i><i><span> </span></i><u><i><span>fox</span></i></u><u><span> jumps</span></u><span> over </span><code><span>13 lazy</span></code><a href="#"><code><span> </span></code></a><a href="#"><span>dogs!</span></a>"""
assert str(helper) == expected_output
def test_nmultibyte_emoji(self):
from medium_parser.markups import parse_markups
text = "Noah dragged his two printers out from Settings ⚙️ < Printers & Scanners 🖨️ and dropped them in Dock or Desktop, I dont remember — but you can drag to both the places."
string_helper = StringAssignmentMixin(text)
utf_handler = UTF16Handler()
string_pos_matrix = list(range(len(text)))
updated_text, string_pos_matrix, utf_16_bang_list = utf_handler.pre_utf_16_bang(
string_helper, string_pos_matrix
)
updated_text, string_pos_matrix = utf_handler.post_utf_16_bang(
updated_text, string_pos_matrix, utf_16_bang_list
)
assert str(updated_text) == text
from icecream import ic
data = {
"__typename": "Paragraph",
"id": "236e7049b537_33",
"name": "ba8c",
"href": None,
"text": "Noah dragged his two printers out from Settings ⚙️ < Printers & Scanners \ud83d\udda8 and dropped them in Dock or Desktop, I dont remember — but you can drag to both the places.",
"iframe": None,
"layout": None,
"markups": [
{
"__typename": "Markup",
"name": None,
"type": "CODE",
"start": 39,
"end": 76,
"href": None,
"title": None,
"rel": None,
"anchorType": None,
"userId": None,
"creatorIds": None,
}
],
"metadata": None,
"mixtapeMetadata": None,
"type": "P",
"hasDropCap": None,
"dropCapImage": None,
"codeBlockMetadata": None,
}
helper = RLStringHelper(data["text"])
parsed_markups = split_overlapping_ranges(parse_markups(data["markups"]))
for markup in parsed_markups:
helper.set_template(markup["start"], markup["end"], markup["template"])
print(str(helper))
string_helper = RLStringHelper(text, None)
string_helper.set_template(0, 100000, "<span>{{text}}</span>")
assert str(string_helper) == f"<span>{text}</span>".replace("", "'")
assert str(helper) == data["text"]
string_helper = RLStringHelper(text, ["None"])
string_helper.set_template(39, 76, "<code class='test'>{{text}}</code>")
with open("test.txt", "w") as f:
f.write(str(string_helper) + "\n")
assert str(
string_helper
) == f"{text[:39]}<code class='test'>Settings ⚙️ < Printers & Scanners 🖨</code>{text[76:]}".replace(
"", "'"
)
def test_basic_replace(self):
# Replace A to B - ONE to ONE char

16
test.py
View file

@ -1,6 +1,8 @@
import requests
server_url = input("Enter your Freedium instance server URL (for example: http://localhost:6752/ ): ")
server_url = input(
"Enter your Freedium instance server URL (for example: http://localhost:6752/ ): "
)
# List of some problematic Medium posts
url_for_test = {
@ -9,18 +11,20 @@ url_for_test = {
"https://levelup.gitconnected.com/some-linux-commands-that-can-boost-your-work-efficiency-dramatically-9dc802a10618",
"https://leshchuk.medium.com/http-cache-on-rails-nginx-stack-950fee2f8eef",
"https://medium.com/swlh/35-actionable-tips-to-grow-your-medium-blog-4e4017b89905",
"https://valeman.medium.com/benchmarking-neural-prophet-part-i-neural-prophet-vs-prophet-252990763468",
"https://valeman.medium.com/benchmarking-neural-prophet-part-i-neural-prophet-vs-prophet-252990763468", # Stil have a problems
"https://valeman.medium.com/python-vs-r-for-time-series-forecasting-395390432598",
"https://medium.com/@aleb/how-to-generate-random-user-agents-with-an-api-22aad3d232cb",
"https://medium.com/angular-in-depth/the-best-way-to-unsubscribe-rxjs-observable-in-the-angular-applications-d8f9aa42f6a0",
"515dd5a43948", # full address: https://medium.com/macoclock/12-macos-apps-so-good-you-will-wonder-how-they-are-free-515dd5a43948
"515dd5a43948", # full address: https://medium.com/macoclock/12-macos-apps-so-good-you-will-wonder-how-they-are-free-515dd5a43948
"https://anudeep-vysyaraju.medium.com/how-any-gitamite-can-get-free-linkedin-premium-membership-d4222bd1a0b3", # <--- Check for non properly aligned emojies
"http://freedium.cfd/https://johndanielraines.medium.com/be-an-engineer-not-a-frameworker-c58fe28d0c88",
"https://medium.com/coding-beauty/parseint-strange-behavior-cdff5e1f9ff7" # <---- Title renderer some weird characters
"https://johndanielraines.medium.com/be-an-engineer-not-a-frameworker-c58fe28d0c88",
"https://medium.com/coding-beauty/parseint-strange-behavior-cdff5e1f9ff7", # <---- Title renderer some weird characters
"https://medium.com/macoclock/the-11-craziest-and-most-advanced-macos-tips-tricks-ive-ever-seen-cd842ce3f0a3", # Still have some problem on paragraph_id 236e7049b537_43
}
blacklist_url = {"51e23c5a2aac"}
def main():
for url in url_for_test:
print(f"Processing: {url}")
@ -29,5 +33,5 @@ def main():
raise ValueError(f"Can't process URL: {url}")
if __name__ == '__main__':
if __name__ == "__main__":
main()