diff --git a/app/filter.py b/app/filter.py index 42d175f..d68bf80 100644 --- a/app/filter.py +++ b/app/filter.py @@ -1,4 +1,5 @@ import cssutils +from app.utils.misc import SKIP_PREFIX from bs4 import BeautifulSoup from bs4.element import ResultSet, Tag from cryptography.fernet import Fernet @@ -690,7 +691,12 @@ class Filter: link_str.find('medium.com') + len('medium.com'):] new_desc.string = link_str else: - new_desc.string = link_str.replace(site, alt) + # start replacing after scheme// + repl_start = link_str.find('//') + 1 + # end replacement on the next / that's found + repl_end = link_str.find('/', repl_start) + replace_site = link_str[repl_start:] if repl_end == -1 else link_str[repl_start:repl_end] + new_desc.string = link_str.replace(replace_site, alt) link_desc.replace_with(new_desc) diff --git a/app/utils/misc.py b/app/utils/misc.py index b0c74c2..8503044 100644 --- a/app/utils/misc.py +++ b/app/utils/misc.py @@ -11,6 +11,8 @@ from bs4 import BeautifulSoup as bsoup from cryptography.fernet import Fernet from flask import Request +SKIP_PREFIX = ['//www.', '//mobile.', '//m.'] + ddg_favicon_site = 'http://icons.duckduckgo.com/ip2' empty_gif = base64.b64decode( diff --git a/app/utils/results.py b/app/utils/results.py index d7b3991..ace1c3b 100644 --- a/app/utils/results.py +++ b/app/utils/results.py @@ -1,6 +1,6 @@ from app.models.config import Config from app.models.endpoint import Endpoint -from app.utils.misc import list_to_dict +from app.utils.misc import list_to_dict, SKIP_PREFIX from bs4 import BeautifulSoup, NavigableString import copy from flask import current_app @@ -12,7 +12,6 @@ import re import warnings SKIP_ARGS = ['ref_src', 'utm'] -SKIP_PREFIX = ['//www.', '//mobile.', '//m.'] GOOG_STATIC = 'www.gstatic.com' G_M_LOGO_URL = 'https://www.gstatic.com/m/images/icons/googleg.gif' GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo' @@ -197,6 +196,11 @@ def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str: for site_key in 
site_alts.keys(): site_alt = f'{parsed_link.scheme}://{site_key}' + + # If a site alt is already present in the link, skip updating it + if site_alts[site_key] in link: + break + if not hostname or site_alt not in hostcomp or not site_alts[site_key]: continue @@ -204,7 +208,12 @@ def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str: # a 2-char language code) to be passed as a URL param to Wikiless # in order to preserve the language setting. params = '' - if 'wikipedia' in hostname and len(subdomain) == 2: + + # Fix edge case for simple.wikipedia.org where substitution with a language subdomain + # breaks the link + if 'wikipedia' in hostname and subdomain == 'simple' and 'wikipedia' in site_alts[site_key]: + break + elif 'wikipedia' in hostname and len(subdomain) == 2: hostname = f'{subdomain}.{hostname}' params = f'?lang={subdomain}' elif 'medium' in hostname and len(subdomain) > 0: @@ -220,13 +229,13 @@ def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str: link = '//'.join(link.split('//')[1:]) for prefix in SKIP_PREFIX: + # replace the first occurrence of the prefix + link = link.replace(prefix, '//', 1) if parsed_alt.scheme: # If a scheme is specified, remove everything before the # first occurence of it link = f'{parsed_alt.scheme}{link.split(parsed_alt.scheme, 1)[-1]}' - else: - # Otherwise, replace the first occurrence of the prefix - link = link.replace(prefix, '//', 1) + break return link