mirror of
https://codeberg.org/Freedium-cfd/web.git
synced 2026-03-11 09:04:37 +00:00
parser: correct href jumps handle && test update
This commit is contained in:
parent
e87ecbe882
commit
e43d5f8442
7 changed files with 80 additions and 65 deletions
|
|
@ -38,7 +38,7 @@ services:
|
|||
- dante_1
|
||||
- dante_2
|
||||
# volumes:
|
||||
# - ./web:/app/web
|
||||
# - ../web:/app/web
|
||||
# - ./core/medium_parser/:/app/medium_parser
|
||||
# - ./core/rl_string_helper/:/app/rl_string_helper
|
||||
ports:
|
||||
|
|
|
|||
|
|
@ -220,8 +220,11 @@ class MediumParser:
|
|||
def parse_paragraph_text(
|
||||
text: str, markups: list, is_code: bool = False
|
||||
) -> RLStringHelper:
|
||||
if is_code:
|
||||
quote_html_type = ["minimal"]
|
||||
# Hotfix, workaround for code block
|
||||
has_code_block = any(markup["type"] == "CODE" for markup in markups)
|
||||
if is_code or has_code_block:
|
||||
# quote_html_type = ["minimal"]
|
||||
quote_html_type = None
|
||||
else:
|
||||
quote_html_type = ["full"]
|
||||
text_formater = RLStringHelper(text, quote_html_type=quote_html_type)
|
||||
|
|
@ -315,10 +318,12 @@ class MediumParser:
|
|||
if out_paragraphs:
|
||||
css_class.append("pt-12")
|
||||
header_template = jinja_env.from_string(
|
||||
'<h2 class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 text-1xl md:text-2xl {{ css_class }}">{{ text }}</h2>'
|
||||
'<h2 id={{ id }} class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 text-1xl md:text-2xl {{ css_class }}">{{ text }}</h2>'
|
||||
)
|
||||
header_template_rendered = header_template.render(
|
||||
text=text_formater.get_text(), css_class="".join(css_class)
|
||||
id=paragraph["name"],
|
||||
text=text_formater.get_text(),
|
||||
css_class="".join(css_class),
|
||||
)
|
||||
out_paragraphs.append(header_template_rendered)
|
||||
elif paragraph["type"] == "H3":
|
||||
|
|
@ -326,10 +331,12 @@ class MediumParser:
|
|||
if out_paragraphs:
|
||||
css_class.append("pt-12")
|
||||
header_template = jinja_env.from_string(
|
||||
'<h3 class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 text-1xl md:text-2xl {{ css_class }}">{{ text }}</h3>'
|
||||
'<h3 id={{ id }} class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 text-1xl md:text-2xl {{ css_class }}">{{ text }}</h3>'
|
||||
)
|
||||
header_template_rendered = header_template.render(
|
||||
text=text_formater.get_text(), css_class="".join(css_class)
|
||||
id=paragraph["name"],
|
||||
text=text_formater.get_text(),
|
||||
css_class="".join(css_class),
|
||||
)
|
||||
out_paragraphs.append(header_template_rendered)
|
||||
elif paragraph["type"] == "H4":
|
||||
|
|
@ -337,10 +344,12 @@ class MediumParser:
|
|||
if out_paragraphs:
|
||||
css_class.append("pt-8")
|
||||
header_template = jinja_env.from_string(
|
||||
'<h4 class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 text-l md:text-xl {{ css_class }}">{{ text }}</h4>'
|
||||
'<h4 id={{ id }} class="font-bold font-sans break-normal text-gray-900 dark:text-gray-100 text-l md:text-xl {{ css_class }}">{{ text }}</h4>'
|
||||
)
|
||||
header_template_rendered = header_template.render(
|
||||
text=text_formater.get_text(), css_class="".join(css_class)
|
||||
id=paragraph["name"],
|
||||
text=text_formater.get_text(),
|
||||
css_class="".join(css_class),
|
||||
)
|
||||
out_paragraphs.append(header_template_rendered)
|
||||
elif paragraph["type"] == "IMG":
|
||||
|
|
|
|||
|
|
@ -1,3 +1,5 @@
|
|||
import jinja2
|
||||
|
||||
from medium_parser import jinja_env_debug
|
||||
|
||||
|
||||
|
|
@ -9,18 +11,25 @@ def raw_render(**kwargs):
|
|||
return kwargs
|
||||
|
||||
|
||||
def parse_markups(markups: list[str]):
|
||||
def parse_markups(
|
||||
markups: list[dict[str, str | jinja2.Template]]
|
||||
) -> list[dict[str, str | jinja2.Template]]:
|
||||
markups_out = []
|
||||
|
||||
for markup in markups:
|
||||
if markup["type"] == "A":
|
||||
if markup["anchorType"] == "LINK":
|
||||
target: str = ""
|
||||
if not markup.get("href", "").startswith("#"):
|
||||
target = "_blank"
|
||||
|
||||
template = jinja_env_debug.from_string(
|
||||
'<a style="text-decoration: underline;" rel="{{rel}}" title="{{title}}" href="{{href}}" target="_blank">{{text}}</a>'
|
||||
'<a style="text-decoration: underline;" rel="{{rel}}" title="{{title}}" href="{{href}}" target="{{target}}">{{text}}</a>'
|
||||
)
|
||||
template = template.render(
|
||||
template_rendered = template.render(
|
||||
raw_render(
|
||||
rel=markup.get("rel", ""),
|
||||
target=target,
|
||||
title=markup.get("title", ""),
|
||||
href=markup["href"],
|
||||
)
|
||||
|
|
@ -29,21 +38,21 @@ def parse_markups(markups: list[str]):
|
|||
template = jinja_env_debug.from_string(
|
||||
'<a style="text-decoration: underline;" href="https://medium.com/u/{{userId}}">{{text}}</a>'
|
||||
)
|
||||
template = template.render(userId=markup["userId"])
|
||||
template_rendered = template.render(userId=markup["userId"])
|
||||
else:
|
||||
continue
|
||||
elif markup["type"] == "STRONG":
|
||||
template = "<strong>{{text}}</strong>"
|
||||
template_rendered = "<strong>{{text}}</strong>"
|
||||
elif markup["type"] == "EM":
|
||||
template = "<em>{{text}}</em>"
|
||||
template_rendered = "<em>{{text}}</em>"
|
||||
elif markup["type"] == "CODE":
|
||||
template = (
|
||||
template_rendered = (
|
||||
"<code class='p-1.5 bg-gray-300 dark:bg-gray-600'>{{text}}</code>"
|
||||
)
|
||||
else:
|
||||
continue
|
||||
|
||||
template = jinja_env_debug.from_string(template)
|
||||
template = jinja_env_debug.from_string(template_rendered)
|
||||
markup["template"] = template
|
||||
markups_out.append(markup)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,2 +1,7 @@
|
|||
from .string_helper import RLStringHelper, split_overlapping_ranges
|
||||
from .string_helper import (
|
||||
RLStringHelper,
|
||||
UTF16Handler,
|
||||
StringAssignmentMixin,
|
||||
split_overlapping_ranges,
|
||||
)
|
||||
from .utils import quote_html, quote_symbol
|
||||
|
|
|
|||
1
rl_string_helper/test.txt
Normal file
1
rl_string_helper/test.txt
Normal file
|
|
@ -0,0 +1 @@
|
|||
Noah dragged his two printers out from <code class='test'>Settings ⚙️ < Printers & Scanners 🖨</code>️ and dropped them in Dock or Desktop, I don't remember — but you can drag to both the places.
|
||||
|
|
@ -3,6 +3,8 @@ import re
|
|||
from loguru import logger
|
||||
from rl_string_helper import (
|
||||
RLStringHelper,
|
||||
UTF16Handler,
|
||||
StringAssignmentMixin,
|
||||
quote_html,
|
||||
split_overlapping_ranges,
|
||||
)
|
||||
|
|
@ -133,52 +135,37 @@ class TestRLStringHelper:
|
|||
for markup in parsed_markups:
|
||||
helper.set_template(markup["start"], markup["end"], markup["template"])
|
||||
|
||||
expected_output = (
|
||||
"<span>The <b>quick <i>(brown)</b> <u>fox</u></i><u> jumps</u> "
|
||||
'over <code>13 lazy</code><a href="#"><code> </code>dogs!</a></span>'
|
||||
)
|
||||
expected_output = """<span>The </span><b><span>quick </span></b><i><b><span>(brown)</span></b></i><i><span> </span></i><u><i><span>fox</span></i></u><u><span> jumps</span></u><span> over </span><code><span>13 lazy</span></code><a href="#"><code><span> </span></code></a><a href="#"><span>dogs!</span></a>"""
|
||||
|
||||
assert str(helper) == expected_output
|
||||
|
||||
def test_nmultibyte_emoji(self):
|
||||
from medium_parser.markups import parse_markups
|
||||
text = "Noah dragged his two printers out from Settings ⚙️ < Printers & Scanners 🖨️ and dropped them in Dock or Desktop, I don’t remember — but you can drag to both the places."
|
||||
string_helper = StringAssignmentMixin(text)
|
||||
utf_handler = UTF16Handler()
|
||||
string_pos_matrix = list(range(len(text)))
|
||||
updated_text, string_pos_matrix, utf_16_bang_list = utf_handler.pre_utf_16_bang(
|
||||
string_helper, string_pos_matrix
|
||||
)
|
||||
updated_text, string_pos_matrix = utf_handler.post_utf_16_bang(
|
||||
updated_text, string_pos_matrix, utf_16_bang_list
|
||||
)
|
||||
assert str(updated_text) == text
|
||||
from icecream import ic
|
||||
|
||||
data = {
|
||||
"__typename": "Paragraph",
|
||||
"id": "236e7049b537_33",
|
||||
"name": "ba8c",
|
||||
"href": None,
|
||||
"text": "Noah dragged his two printers out from Settings ⚙️ < Printers & Scanners \ud83d\udda8️ and dropped them in Dock or Desktop, I don’t remember — but you can drag to both the places.",
|
||||
"iframe": None,
|
||||
"layout": None,
|
||||
"markups": [
|
||||
{
|
||||
"__typename": "Markup",
|
||||
"name": None,
|
||||
"type": "CODE",
|
||||
"start": 39,
|
||||
"end": 76,
|
||||
"href": None,
|
||||
"title": None,
|
||||
"rel": None,
|
||||
"anchorType": None,
|
||||
"userId": None,
|
||||
"creatorIds": None,
|
||||
}
|
||||
],
|
||||
"metadata": None,
|
||||
"mixtapeMetadata": None,
|
||||
"type": "P",
|
||||
"hasDropCap": None,
|
||||
"dropCapImage": None,
|
||||
"codeBlockMetadata": None,
|
||||
}
|
||||
helper = RLStringHelper(data["text"])
|
||||
parsed_markups = split_overlapping_ranges(parse_markups(data["markups"]))
|
||||
for markup in parsed_markups:
|
||||
helper.set_template(markup["start"], markup["end"], markup["template"])
|
||||
print(str(helper))
|
||||
string_helper = RLStringHelper(text, None)
|
||||
string_helper.set_template(0, 100000, "<span>{{text}}</span>")
|
||||
assert str(string_helper) == f"<span>{text}</span>".replace("’", "'")
|
||||
|
||||
assert str(helper) == data["text"]
|
||||
string_helper = RLStringHelper(text, ["None"])
|
||||
string_helper.set_template(39, 76, "<code class='test'>{{text}}</code>")
|
||||
with open("test.txt", "w") as f:
|
||||
f.write(str(string_helper) + "\n")
|
||||
assert str(
|
||||
string_helper
|
||||
) == f"{text[:39]}<code class='test'>Settings ⚙️ < Printers & Scanners 🖨</code>️{text[76:]}".replace(
|
||||
"’", "'"
|
||||
)
|
||||
|
||||
def test_basic_replace(self):
|
||||
# Replace A to B - ONE to ONE char
|
||||
|
|
|
|||
16
test.py
16
test.py
|
|
@ -1,6 +1,8 @@
|
|||
import requests
|
||||
|
||||
server_url = input("Enter your Freedium instance server URL (for example: http://localhost:6752/ ): ")
|
||||
server_url = input(
|
||||
"Enter your Freedium instance server URL (for example: http://localhost:6752/ ): "
|
||||
)
|
||||
|
||||
# List of some problematic Medium posts
|
||||
url_for_test = {
|
||||
|
|
@ -9,18 +11,20 @@ url_for_test = {
|
|||
"https://levelup.gitconnected.com/some-linux-commands-that-can-boost-your-work-efficiency-dramatically-9dc802a10618",
|
||||
"https://leshchuk.medium.com/http-cache-on-rails-nginx-stack-950fee2f8eef",
|
||||
"https://medium.com/swlh/35-actionable-tips-to-grow-your-medium-blog-4e4017b89905",
|
||||
"https://valeman.medium.com/benchmarking-neural-prophet-part-i-neural-prophet-vs-prophet-252990763468",
|
||||
"https://valeman.medium.com/benchmarking-neural-prophet-part-i-neural-prophet-vs-prophet-252990763468", # Still has problems
|
||||
"https://valeman.medium.com/python-vs-r-for-time-series-forecasting-395390432598",
|
||||
"https://medium.com/@aleb/how-to-generate-random-user-agents-with-an-api-22aad3d232cb",
|
||||
"https://medium.com/angular-in-depth/the-best-way-to-unsubscribe-rxjs-observable-in-the-angular-applications-d8f9aa42f6a0",
|
||||
"515dd5a43948", # full address: https://medium.com/macoclock/12-macos-apps-so-good-you-will-wonder-how-they-are-free-515dd5a43948
|
||||
"515dd5a43948", # full address: https://medium.com/macoclock/12-macos-apps-so-good-you-will-wonder-how-they-are-free-515dd5a43948
|
||||
"https://anudeep-vysyaraju.medium.com/how-any-gitamite-can-get-free-linkedin-premium-membership-d4222bd1a0b3", # <--- Check for improperly aligned emojis
|
||||
"http://freedium.cfd/https://johndanielraines.medium.com/be-an-engineer-not-a-frameworker-c58fe28d0c88",
|
||||
"https://medium.com/coding-beauty/parseint-strange-behavior-cdff5e1f9ff7" # <---- Title renderer some weird characters
|
||||
"https://johndanielraines.medium.com/be-an-engineer-not-a-frameworker-c58fe28d0c88",
|
||||
"https://medium.com/coding-beauty/parseint-strange-behavior-cdff5e1f9ff7", # <---- Title renders some weird characters
|
||||
"https://medium.com/macoclock/the-11-craziest-and-most-advanced-macos-tips-tricks-ive-ever-seen-cd842ce3f0a3", # Still has a problem with paragraph_id 236e7049b537_43
|
||||
}
|
||||
|
||||
blacklist_url = {"51e23c5a2aac"}
|
||||
|
||||
|
||||
def main():
|
||||
for url in url_for_test:
|
||||
print(f"Processing: {url}")
|
||||
|
|
@ -29,5 +33,5 @@ def main():
|
|||
raise ValueError(f"Can't process URL: {url}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
|
|||
Loading…
Reference in a new issue