fix: don't append wikiless params to wikipedia.org replacements

This commit is contained in:
RoyalOughtness 2025-07-16 13:52:21 -07:00
parent e4cabe3e5b
commit 8a24e4a03c
No known key found for this signature in database
GPG key ID: 0EEB398E8A6BA550
3 changed files with 24 additions and 7 deletions

View file

@ -1,4 +1,5 @@
import cssutils
from app.utils.misc import SKIP_PREFIX
from bs4 import BeautifulSoup
from bs4.element import ResultSet, Tag
from cryptography.fernet import Fernet
@ -690,7 +691,12 @@ class Filter:
link_str.find('medium.com') + len('medium.com'):]
new_desc.string = link_str
else:
new_desc.string = link_str.replace(site, alt)
# start replacing after scheme//
repl_start = link_str.find('//') + 1
# end replacement on the next / that's found
repl_end = link_str.find('/', repl_start)
replace_site = link_str[repl_start:] if repl_end == -1 else link_str[repl_start:repl_end]
new_desc.string = link_str.replace(replace_site, alt)
link_desc.replace_with(new_desc)

View file

@ -11,6 +11,8 @@ from bs4 import BeautifulSoup as bsoup
from cryptography.fernet import Fernet
from flask import Request
SKIP_PREFIX = ['//www.', '//mobile.', '//m.']
ddg_favicon_site = 'http://icons.duckduckgo.com/ip2'
empty_gif = base64.b64decode(

View file

@ -1,6 +1,6 @@
from app.models.config import Config
from app.models.endpoint import Endpoint
from app.utils.misc import list_to_dict
from app.utils.misc import list_to_dict, SKIP_PREFIX
from bs4 import BeautifulSoup, NavigableString
import copy
from flask import current_app
@ -12,7 +12,6 @@ import re
import warnings
SKIP_ARGS = ['ref_src', 'utm']
SKIP_PREFIX = ['//www.', '//mobile.', '//m.']
GOOG_STATIC = 'www.gstatic.com'
G_M_LOGO_URL = 'https://www.gstatic.com/m/images/icons/googleg.gif'
GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo'
@ -197,6 +196,11 @@ def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str:
for site_key in site_alts.keys():
site_alt = f'{parsed_link.scheme}://{site_key}'
# If a site alt is already present in the link, skip updating it
if site_alts[site_key] in link:
break
if not hostname or site_alt not in hostcomp or not site_alts[site_key]:
continue
@ -204,7 +208,12 @@ def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str:
# a 2-char language code) to be passed as a URL param to Wikiless
# in order to preserve the language setting.
params = ''
if 'wikipedia' in hostname and len(subdomain) == 2:
# Fix edge case for simple.wikipedia.org where substitution with a language subdomain
# breaks the link
if 'wikipedia' in hostname and subdomain == 'simple' and 'wikipedia' in site_alts[site_key]:
break
elif 'wikipedia' in hostname and len(subdomain) == 2:
hostname = f'{subdomain}.{hostname}'
params = f'?lang={subdomain}'
elif 'medium' in hostname and len(subdomain) > 0:
@ -220,13 +229,13 @@ def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str:
link = '//'.join(link.split('//')[1:])
for prefix in SKIP_PREFIX:
# replace the first occurrence of the prefix
link = link.replace(prefix, '//', 1)
if parsed_alt.scheme:
# If a scheme is specified, remove everything before the
# first occurrence of it
link = f'{parsed_alt.scheme}{link.split(parsed_alt.scheme, 1)[-1]}'
else:
# Otherwise, replace the first occurrence of the prefix
link = link.replace(prefix, '//', 1)
break
return link