refs #1247: Remove DOCTYPE before HTML is parsed

2026-03-11 08:54:34 +00:00 · 2025-09-24 10:57:31 +02:00 · 2025-09-24 10:57:31 +02:00 · a9faca8ab6
commit a9faca8ab6
parent 8c85ab600c
1 changed file with 3 additions and 2 deletions
--- a/app/utils/search.py
+++ b/app/utils/search.py
@@ -157,8 +157,9 @@ class Search:
                                       user_agent=self.user_agent)

        # Produce cleanable html soup from response
-        get_body_safed = get_body.text.replace("&lt;","andlt;").replace("&gt;","andgt;")
-        html_soup = bsoup(get_body_safed, 'html.parser').html
+        get_body_safed = re.sub(r'<!DOCTYPE[^>]*>\s*', '', get_body.text, flags=re.IGNORECASE)
+        get_body_safed = get_body_safed.replace("&lt;","andlt;").replace("&gt;","andgt;")
+        html_soup = bsoup(get_body_safed, 'html.parser')

        # Replace current soup if view_image is active
        # FIXME: Broken since the user agent changes as of 16 Jan 2025