diff --git a/docker-compose/docker-compose.main.yml b/docker-compose/docker-compose.main.yml index 5ddb3fc..85025e9 100644 --- a/docker-compose/docker-compose.main.yml +++ b/docker-compose/docker-compose.main.yml @@ -38,7 +38,7 @@ services: - dante_1 - dante_2 # volumes: - # - ./web:/app/web + # - ../web:/app/web # - ./core/medium_parser/:/app/medium_parser # - ./core/rl_string_helper/:/app/rl_string_helper ports: diff --git a/medium-parser/medium_parser/core.py b/medium-parser/medium_parser/core.py index ed12b83..663d9e3 100644 --- a/medium-parser/medium_parser/core.py +++ b/medium-parser/medium_parser/core.py @@ -220,8 +220,11 @@ class MediumParser: def parse_paragraph_text( text: str, markups: list, is_code: bool = False ) -> RLStringHelper: - if is_code: - quote_html_type = ["minimal"] + # Hotfix, workaround for code block + has_code_block = any(markup["type"] == "CODE" for markup in markups) + if is_code or has_code_block: + # quote_html_type = ["minimal"] + quote_html_type = None else: quote_html_type = ["full"] text_formater = RLStringHelper(text, quote_html_type=quote_html_type) @@ -315,10 +318,12 @@ class MediumParser: if out_paragraphs: css_class.append("pt-12") header_template = jinja_env.from_string( - '

{{ text }}

' + '

{{ text }}

' ) header_template_rendered = header_template.render( - text=text_formater.get_text(), css_class="".join(css_class) + id=paragraph["name"], + text=text_formater.get_text(), + css_class="".join(css_class), ) out_paragraphs.append(header_template_rendered) elif paragraph["type"] == "H3": @@ -326,10 +331,12 @@ class MediumParser: if out_paragraphs: css_class.append("pt-12") header_template = jinja_env.from_string( - '

{{ text }}

' + '

{{ text }}

' ) header_template_rendered = header_template.render( - text=text_formater.get_text(), css_class="".join(css_class) + id=paragraph["name"], + text=text_formater.get_text(), + css_class="".join(css_class), ) out_paragraphs.append(header_template_rendered) elif paragraph["type"] == "H4": @@ -337,10 +344,12 @@ class MediumParser: if out_paragraphs: css_class.append("pt-8") header_template = jinja_env.from_string( - '

{{ text }}

' + '

{{ text }}

' ) header_template_rendered = header_template.render( - text=text_formater.get_text(), css_class="".join(css_class) + id=paragraph["name"], + text=text_formater.get_text(), + css_class="".join(css_class), ) out_paragraphs.append(header_template_rendered) elif paragraph["type"] == "IMG": diff --git a/medium-parser/medium_parser/markups.py b/medium-parser/medium_parser/markups.py index 1b1a0cc..7714c78 100644 --- a/medium-parser/medium_parser/markups.py +++ b/medium-parser/medium_parser/markups.py @@ -1,3 +1,5 @@ +import jinja2 + from medium_parser import jinja_env_debug @@ -9,18 +11,25 @@ def raw_render(**kwargs): return kwargs -def parse_markups(markups: list[str]): +def parse_markups( + markups: list[dict[str, str | jinja2.Template]] +) -> list[dict[str, str | jinja2.Template]]: markups_out = [] for markup in markups: if markup["type"] == "A": if markup["anchorType"] == "LINK": + target: str = "" + if not markup.get("href", "").startswith("#"): + target = "_blank" + template = jinja_env_debug.from_string( - '{{text}}' + '{{text}}' ) - template = template.render( + template_rendered = template.render( raw_render( rel=markup.get("rel", ""), + target=target, title=markup.get("title", ""), href=markup["href"], ) @@ -29,21 +38,21 @@ def parse_markups(markups: list[str]): template = jinja_env_debug.from_string( '{{text}}' ) - template = template.render(userId=markup["userId"]) + template_rendered = template.render(userId=markup["userId"]) else: continue elif markup["type"] == "STRONG": - template = "{{text}}" + template_rendered = "{{text}}" elif markup["type"] == "EM": - template = "{{text}}" + template_rendered = "{{text}}" elif markup["type"] == "CODE": - template = ( + template_rendered = ( "{{text}}" ) else: continue - template = jinja_env_debug.from_string(template) + template = jinja_env_debug.from_string(template_rendered) markup["template"] = template markups_out.append(markup) diff --git a/rl_string_helper/rl_string_helper/__init__.py 
b/rl_string_helper/rl_string_helper/__init__.py index c22dee0..8476adb 100644 --- a/rl_string_helper/rl_string_helper/__init__.py +++ b/rl_string_helper/rl_string_helper/__init__.py @@ -1,2 +1,7 @@ -from .string_helper import RLStringHelper, split_overlapping_ranges +from .string_helper import ( + RLStringHelper, + UTF16Handler, + StringAssignmentMixin, + split_overlapping_ranges, +) from .utils import quote_html, quote_symbol diff --git a/rl_string_helper/test.txt b/rl_string_helper/test.txt new file mode 100644 index 0000000..c18b1df --- /dev/null +++ b/rl_string_helper/test.txt @@ -0,0 +1 @@ +Noah dragged his two printers out from Settings ⚙️ < Printers & Scanners 🖨️ and dropped them in Dock or Desktop, I don't remember — but you can drag to both the places. diff --git a/rl_string_helper/tests/test_rl_string_helper.py b/rl_string_helper/tests/test_rl_string_helper.py index e56adca..27e766d 100644 --- a/rl_string_helper/tests/test_rl_string_helper.py +++ b/rl_string_helper/tests/test_rl_string_helper.py @@ -3,6 +3,8 @@ import re from loguru import logger from rl_string_helper import ( RLStringHelper, + UTF16Handler, + StringAssignmentMixin, quote_html, split_overlapping_ranges, ) @@ -133,52 +135,37 @@ class TestRLStringHelper: for markup in parsed_markups: helper.set_template(markup["start"], markup["end"], markup["template"]) - expected_output = ( - "The quick (brown) fox jumps " - 'over 13 lazy dogs!' - ) + expected_output = """The quick (brown) fox jumps over 13 lazy dogs!""" + assert str(helper) == expected_output def test_nmultibyte_emoji(self): - from medium_parser.markups import parse_markups + text = "Noah dragged his two printers out from Settings ⚙️ < Printers & Scanners 🖨️ and dropped them in Dock or Desktop, I don’t remember — but you can drag to both the places." 
+ string_helper = StringAssignmentMixin(text) + utf_handler = UTF16Handler() + string_pos_matrix = list(range(len(text))) + updated_text, string_pos_matrix, utf_16_bang_list = utf_handler.pre_utf_16_bang( + string_helper, string_pos_matrix + ) + updated_text, string_pos_matrix = utf_handler.post_utf_16_bang( + updated_text, string_pos_matrix, utf_16_bang_list + ) + assert str(updated_text) == text + from icecream import ic - data = { - "__typename": "Paragraph", - "id": "236e7049b537_33", - "name": "ba8c", - "href": None, - "text": "Noah dragged his two printers out from Settings ⚙️ < Printers & Scanners \ud83d\udda8️ and dropped them in Dock or Desktop, I don’t remember — but you can drag to both the places.", - "iframe": None, - "layout": None, - "markups": [ - { - "__typename": "Markup", - "name": None, - "type": "CODE", - "start": 39, - "end": 76, - "href": None, - "title": None, - "rel": None, - "anchorType": None, - "userId": None, - "creatorIds": None, - } - ], - "metadata": None, - "mixtapeMetadata": None, - "type": "P", - "hasDropCap": None, - "dropCapImage": None, - "codeBlockMetadata": None, - } - helper = RLStringHelper(data["text"]) - parsed_markups = split_overlapping_ranges(parse_markups(data["markups"])) - for markup in parsed_markups: - helper.set_template(markup["start"], markup["end"], markup["template"]) - print(str(helper)) + string_helper = RLStringHelper(text, None) + string_helper.set_template(0, 100000, "{{text}}") + assert str(string_helper) == f"{text}".replace("’", "'") - assert str(helper) == data["text"] + string_helper = RLStringHelper(text, ["None"]) + string_helper.set_template(39, 76, "{{text}}") + with open("test.txt", "w") as f: + f.write(str(string_helper) + "\n") + assert str( + string_helper + ) == f"{text[:39]}Settings ⚙️ < Printers & Scanners 🖨️{text[76:]}".replace( + "’", "'" + ) def test_basic_replace(self): # Replace A to B - ONE to ONE char diff --git a/test.py b/test.py index 76acb3a..f716178 100644 --- a/test.py +++ 
b/test.py @@ -1,6 +1,8 @@ import requests -server_url = input("Enter your Freedium instance server URL (for example: http://localhost:6752/ ): ") +server_url = input( + "Enter your Freedium instance server URL (for example: http://localhost:6752/ ): " +) # List of some problematic Medium posts url_for_test = { @@ -9,18 +11,20 @@ url_for_test = { "https://levelup.gitconnected.com/some-linux-commands-that-can-boost-your-work-efficiency-dramatically-9dc802a10618", "https://leshchuk.medium.com/http-cache-on-rails-nginx-stack-950fee2f8eef", "https://medium.com/swlh/35-actionable-tips-to-grow-your-medium-blog-4e4017b89905", - "https://valeman.medium.com/benchmarking-neural-prophet-part-i-neural-prophet-vs-prophet-252990763468", + "https://valeman.medium.com/benchmarking-neural-prophet-part-i-neural-prophet-vs-prophet-252990763468", # Still has problems "https://valeman.medium.com/python-vs-r-for-time-series-forecasting-395390432598", "https://medium.com/@aleb/how-to-generate-random-user-agents-with-an-api-22aad3d232cb", "https://medium.com/angular-in-depth/the-best-way-to-unsubscribe-rxjs-observable-in-the-angular-applications-d8f9aa42f6a0", - "515dd5a43948", # full address: https://medium.com/macoclock/12-macos-apps-so-good-you-will-wonder-how-they-are-free-515dd5a43948 + "515dd5a43948", # full address: https://medium.com/macoclock/12-macos-apps-so-good-you-will-wonder-how-they-are-free-515dd5a43948 "https://anudeep-vysyaraju.medium.com/how-any-gitamite-can-get-free-linkedin-premium-membership-d4222bd1a0b3", # <--- Check for non properly aligned emojies - "http://freedium.cfd/https://johndanielraines.medium.com/be-an-engineer-not-a-frameworker-c58fe28d0c88", - "https://medium.com/coding-beauty/parseint-strange-behavior-cdff5e1f9ff7" # <---- Title renderer some weird characters + "https://johndanielraines.medium.com/be-an-engineer-not-a-frameworker-c58fe28d0c88", + "https://medium.com/coding-beauty/parseint-strange-behavior-cdff5e1f9ff7", # <---- Title renders some
weird characters + "https://medium.com/macoclock/the-11-craziest-and-most-advanced-macos-tips-tricks-ive-ever-seen-cd842ce3f0a3", # Still has some problems on paragraph_id 236e7049b537_43 } blacklist_url = {"51e23c5a2aac"} + def main(): for url in url_for_test: print(f"Processing: {url}") @@ -29,5 +33,5 @@ def main(): raise ValueError(f"Can't process URL: {url}") -if __name__ == '__main__': +if __name__ == "__main__": main()