mirror of
https://github.com/benbusby/whoogle-search.git
synced 2026-03-11 08:54:34 +00:00
This commit introduces support for the Mullvad Leta search backend, which is now enabled by default. It includes a new conversion function to transform Leta's search results into a format compatible with Whoogle. The README has been updated to reflect this change, detailing the limitations of the Leta backend and how to switch back to Google if needed. Additionally, the configuration model has been updated to include a setting for enabling/disabling the Leta backend.
469 lines
16 KiB
Python
469 lines
16 KiB
Python
from app.models.config import Config
|
|
from app.models.endpoint import Endpoint
|
|
from app.utils.misc import list_to_dict
|
|
from bs4 import BeautifulSoup, NavigableString, MarkupResemblesLocatorWarning
|
|
import warnings
|
|
import copy
|
|
from flask import current_app
|
|
import html
|
|
import os
|
|
import urllib.parse as urlparse
|
|
from urllib.parse import parse_qs
|
|
import re
|
|
warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning)
|
|
|
|
# Query-string keys that filter_link_args() drops from result links
SKIP_ARGS = ['ref_src', 'utm']

# Host prefixes that get_site_alt() collapses to a bare '//'
SKIP_PREFIX = ['//www.', '//mobile.', '//m.']

# Google static asset host and logo locations
GOOG_STATIC = 'www.gstatic.com'
G_M_LOGO_URL = 'https://www.gstatic.com/m/images/icons/googleg.gif'
GOOG_IMG = '/images/branding/searchlogo/1x/googlelogo'
LOGO_URL = f'{GOOG_IMG}_desk'

# Base64 PNG data URI
BLANK_B64 = (
    'data:image/png;base64,'
    'iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAQAAAAnOwc2AAAAD0lEQVR42mNkw'
    'AIYh7IgAAVVAAuInjI5AAAAAElFTkSuQmCC'
)
|
|
|
|
# Ad keywords: labels (in many languages) that has_ad_content() compares
# against the text of an element to decide whether it is an ad
BLACKLIST = [
    # "Ad" / "Advertisement"
    'ad', 'ads', 'anuncio', 'annuncio', 'annonce', 'Anzeige',
    '广告', '廣告', 'Reklama', 'Реклама', 'Anunț', '광고',
    'annons', 'Annonse', 'Iklan', '広告', 'Augl.', 'Mainos',
    'Advertentie', 'إعلان', 'Գովազդ', 'विज्ञापन', 'Reklam', 'آگهی',
    'Reklāma', 'Reklaam', 'Διαφήμιση', 'מודעה', 'Hirdetés', 'Anúncio',
    'Quảng cáo', 'โฆษณา',
    # "Sponsored"
    'sponsored', 'patrocinado', 'gesponsert', 'Sponzorováno', '스폰서',
    'Gesponsord', 'Sponsorisé',
]
|
|
|
|
# Map of site domain -> alternative frontend host. Each alternative can be
# overridden through the environment variable named in its os.getenv() call.
SITE_ALTS = {
    'twitter.com': os.getenv('WHOOGLE_ALT_TW', 'farside.link/nitter'),
    'youtube.com': os.getenv('WHOOGLE_ALT_YT', 'farside.link/invidious'),
    'reddit.com': os.getenv('WHOOGLE_ALT_RD', 'farside.link/libreddit'),
    # Both Medium-hosted domains share the same alternative
    'medium.com': os.getenv('WHOOGLE_ALT_MD', 'farside.link/scribe'),
    'levelup.gitconnected.com': os.getenv('WHOOGLE_ALT_MD',
                                          'farside.link/scribe'),
    'imgur.com': os.getenv('WHOOGLE_ALT_IMG', 'farside.link/rimgo'),
    'wikipedia.org': os.getenv('WHOOGLE_ALT_WIKI', 'farside.link/wikiless'),
    'imdb.com': os.getenv('WHOOGLE_ALT_IMDB', 'farside.link/libremdb'),
    'quora.com': os.getenv('WHOOGLE_ALT_QUORA', 'farside.link/quetre'),
    'stackoverflow.com': os.getenv('WHOOGLE_ALT_SO',
                                   'farside.link/anonymousoverflow'),
}

# Include custom site redirects from WHOOGLE_REDIRECTS. The value is split
# on both ',' and ':' and handed to list_to_dict (presumably yielding
# alternating key/value pairs -- verify against list_to_dict).
SITE_ALTS.update(
    list_to_dict(
        re.split(',|:', os.getenv('WHOOGLE_REDIRECTS', ''))
    )
)
|
|
|
|
|
|
def contains_cjko(s: str) -> bool:
    """Report whether a string contains Chinese, Japanese, or Korean
    characters, by searching it for any character that falls inside a set
    of Unicode ranges.

    Args:
        s (str): string to be checked

    Returns:
        bool: True if s contains at least one such character, otherwise False
    """
    cjk_ranges = (
        '\u4e00-\u9fff',  # Chinese characters
        '\u3040-\u309f',  # Japanese hiragana
        '\u30a0-\u30ff',  # Japanese katakana
        '\u4e00-\u9faf',  # Japanese kanji
        '\uac00-\ud7af',  # Korean hangul syllables
        '\u1100-\u11ff',  # Korean hangul jamo
    )
    # All ranges are merged into a single regex character class
    char_class = '[' + ''.join(cjk_ranges) + ']'
    return re.search(char_class, s) is not None
|
|
|
|
|
|
def bold_search_terms(response: str, query: str) -> BeautifulSoup:
    """Wraps all search terms in bold tags (<b>). If any terms are wrapped
    in quotes, the quoted phrase is treated as a single term.

    Args:
        response: The initial response body for the query
        query: The original search query

    Returns:
        BeautifulSoup: modified soup object with bold items
    """
    soup = BeautifulSoup(response, 'html.parser')

    def replace_any_case(element: NavigableString, target_word: str) -> None:
        # Replace all instances of the word, but maintaining the same case in
        # the replacement

        # Text nodes exactly as long as the term are left as they are
        if len(element) == len(target_word):
            return

        # Ensure target word is escaped for regex
        target_word = re.escape(target_word)

        # CJK text has no usable word boundaries, so \b is only applied to
        # terms without Chinese, Japanese, or Korean characters
        if contains_cjko(target_word):
            reg_pattern = fr'((?![{{}}<>-]){target_word}(?![{{}}<>-]))'
        else:
            reg_pattern = fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b'

        # Never touch terms containing special characters, nor the contents
        # of <style> elements
        if re.match(r'.*[@_!#$%^&*()<>?/\|}{~:].*', target_word) or (
                element.parent and element.parent.name == 'style'):
            return

        # The substituted text is re-parsed so the new <b> tags become real
        # elements in the tree instead of escaped text
        element.replace_with(BeautifulSoup(
            re.sub(reg_pattern,
                   r'<b>\1</b>',
                   element,
                   flags=re.I), 'html.parser')
        )

    # Split all words out of query, grouping the ones wrapped in quotes
    for word in re.split(r'\s+(?=[^"]*(?:"[^"]*"[^"]*)*$)', query):
        word = re.sub(r'[@_!#$%^&*()<>?/\|}{~:]+', '', word)

        # An empty term (empty query, trailing whitespace, or a token made
        # only of special characters) would compile to an empty regex, match
        # every text node and insert empty <b></b> pairs throughout the page
        if not word:
            continue

        target = soup.find_all(
            string=re.compile(re.escape(word), re.I))
        for nav_str in target:
            replace_any_case(nav_str, word)

    return soup
|
|
|
|
|
|
def has_ad_content(element: str) -> bool:
    """Inspects an HTML element for ad related content

    The element text and every BLACKLIST keyword are both reduced to their
    alphabetic characters and upper-cased before being compared, so keywords
    that contain punctuation, spaces, or combining marks (e.g. 'Augl.' or
    'Quảng cáo') can still be matched.

    Args:
        element: The HTML element to inspect

    Returns:
        bool: True/False for the element containing an ad

    """
    def letters_only(text: str) -> str:
        # Keep alphabetic characters only, case-normalized for comparison
        return ''.join(filter(str.isalpha, text)).upper()

    # The info symbol is treated as an ad marker wherever it appears
    if 'ⓘ' in element:
        return True

    element_key = letters_only(element)
    # Text without any letters never counts as an ad label
    return bool(element_key) and any(
        element_key == letters_only(keyword) for keyword in BLACKLIST)
|
|
|
|
|
|
def get_first_link(soup) -> str:
    """Retrieves the first result link from the query response

    Anchors nested inside a <details> element (collapsible sections) are
    ignored, as are anchors whose href is not an absolute http(s) URL.

    Args:
        soup: The BeautifulSoup response body

    Returns:
        str: The href of the first matching link, or an empty string if
             there is none

    """
    candidates = (
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if not anchor.find_parent('details')
        and anchor['href'].startswith(('http://', 'https://'))
    )
    return next(candidates, '')
|
|
|
|
|
|
def get_site_alt(link: str, site_alts: dict = SITE_ALTS) -> str:
    """Returns an alternative to a particular site, if one is configured

    Only the first matching entry of site_alts is applied.

    Args:
        link: A string result URL to check against the site_alts map
        site_alts: A map of site alternatives to replace with. defaults to
                   SITE_ALTS

    Returns:
        str: An updated (or ignored) result link

    """
    # Need to replace full hostname with alternative to encapsulate
    # subdomains as well
    parsed_link = urlparse.urlparse(link)

    # Extract subdomain separately from the domain+tld. The subdomain
    # is used for wikiless translations.
    host_parts = parsed_link.netloc.split('.')
    subdomain = host_parts[0] if len(host_parts) > 2 else ''
    hostname = '.'.join(host_parts[-2:])

    # Without a hostname there is nothing that could be replaced
    if not hostname:
        return link

    # The full scheme + hostname is used when comparing against the list of
    # available alternative services, due to how Medium links are constructed.
    # (i.e. for medium.com: "https://something.medium.com" should match,
    # "https://medium.com/..." should match, but "philomedium.com" should not)
    hostcomp = f'{parsed_link.scheme}://{hostname}'

    for site_key, alt_service in site_alts.items():
        site_alt = f'{parsed_link.scheme}://{site_key}'
        # Skip sites that do not match, or that have no alternative set
        if site_alt not in hostcomp or not alt_service:
            continue

        # Wikipedia -> Wikiless replacements require the subdomain (if it's
        # a 2-char language code) to be passed as a URL param to Wikiless
        # in order to preserve the language setting.
        params = ''
        if 'wikipedia' in hostname and len(subdomain) == 2:
            hostname = f'{subdomain}.{hostname}'
            params = f'?lang={subdomain}'
        elif 'medium' in hostname and subdomain:
            hostname = f'{subdomain}.{hostname}'

        alt_scheme = urlparse.urlparse(alt_service).scheme
        link = link.replace(hostname, alt_service) + params

        # If a scheme is specified in the alternative, this results in a
        # replaced link that looks like "https://http://altservice.tld".
        # In this case, we can remove the original scheme from the result
        # and use the one specified for the alt.
        if alt_scheme:
            link = '//'.join(link.split('//')[1:])

        for prefix in SKIP_PREFIX:
            if alt_scheme:
                # If a scheme is specified, remove everything before the
                # first occurrence of it
                link = f'{alt_scheme}{link.split(alt_scheme, 1)[-1]}'
            else:
                # Otherwise, replace the first occurrence of the prefix
                link = link.replace(prefix, '//', 1)

        # Stop after the first alternative has been applied
        break

    return link
|
|
|
|
|
|
def filter_link_args(link: str) -> str:
    """Filters out unnecessary URL args from a result link

    Query arguments whose name is listed in SKIP_ARGS are removed. The URL
    is rebuilt from its parsed components, so only the query string changes
    and a '#fragment' stays at the end of the link.

    Args:
        link: The string result link to check for extraneous URL params

    Returns:
        str: An updated (or ignored) result link

    """
    parsed_link = urlparse.urlparse(link)
    link_args = parse_qs(parsed_link.query)

    # Nothing parseable in the query string: return the link untouched
    if not link_args:
        return link

    safe_args = {
        arg: values
        for arg, values in link_args.items()
        if arg not in SKIP_ARGS
    }

    # Swap in the filtered query. An empty query also drops the '?'.
    filtered_query = urlparse.urlencode(safe_args, doseq=True)
    return parsed_link._replace(query=filtered_query).geturl()
|
|
|
|
|
|
def append_nojs(result: BeautifulSoup) -> None:
    """Appends a no-Javascript alternative for a search result

    A new <a> element pointing at the Endpoint.window route (with nojs=1
    and the result's href as the location) is appended to the result.

    Args:
        result: The search result to append a no-JS link to

    Returns:
        None

    """
    # NOTE(review): the href is interpolated without URL-encoding -- confirm
    # the window endpoint copes with '&' or '#' inside the location value
    target = result['href']

    nojs_link = BeautifulSoup(features='html.parser').new_tag('a')
    nojs_link['href'] = f'{Endpoint.window}?nojs=1&location={target}'
    nojs_link.string = ' NoJS Link'

    result.append(nojs_link)
|
|
|
|
|
|
def append_anon_view(result: BeautifulSoup, config: Config) -> None:
    """Appends an 'anonymous view' for a search result, where all site
    contents are viewed through Whoogle as a proxy.

    Args:
        result: The search result to append an anon view link to
        config: The user config. Its nojs setting decides whether the link
                requests Javascript removal, and its localization language
                selects the translated link text.

    Returns:
        None

    """
    nojs_flag = 'nojs=1' if config.nojs else 'nojs=0'

    # Link text comes from the app's translations for the user's language
    language = config.get_localization_lang()
    translation = current_app.config['TRANSLATIONS'][language]

    av_link = BeautifulSoup(features='html.parser').new_tag('a')
    av_link['href'] = (
        f'{Endpoint.window}?{nojs_flag}&location={result["href"]}'
    )
    av_link.string = f'{translation["anon-view"]}'
    av_link['class'] = 'anon-view'

    result.append(av_link)
|
|
|
|
def check_currency(response: str) -> dict:
    """Check whether the results have currency conversion

    The conversion card is located through its disclaimer link
    (https://g.co/gfd). An empty dict is returned whenever the card is
    missing or its contents cannot be parsed as two "<amount> <label>" pairs.

    Args:
        response: Search query Result

    Returns:
        dict: Consists of currency names and values (keys currencyValue1,
              currencyLabel1, currencyValue2, currencyLabel2), or an empty
              dict if no usable conversion was found

    """
    soup = BeautifulSoup(response, 'html.parser')
    card = soup.find('a', {'href': 'https://g.co/gfd'})
    if card is None:
        return {}

    # Climb from the disclaimer link up to the enclosing 'ZINbbc' element
    while 'ZINbbc' not in card.attrs.get('class', ()):
        if not card.parent:
            return {}
        card = card.parent

    fields = card.find_all(class_='BNeawe')
    if len(fields) < 2:
        return {}

    # Each field is expected to read "<amount> <label>"; the first one may
    # end with a trailing '='
    currency1 = fields[0].text.rstrip('=').split(' ', 1)
    currency2 = fields[1].text.split(' ', 1)
    if len(currency1) < 2 or len(currency2) < 2:
        return {}

    # Handle differences in currency formatting
    # i.e. "5.000" vs "5,000"
    # (amounts shorter than three characters cannot use a decimal comma)
    if len(currency2[0]) >= 3 and currency2[0][-3] == ',':
        currency1[0] = currency1[0].replace('.', '')
        currency1[0] = currency1[0].replace(',', '.')
        currency2[0] = currency2[0].replace('.', '')
        currency2[0] = currency2[0].replace(',', '.')
    else:
        currency1[0] = currency1[0].replace(',', '')
        currency2[0] = currency2[0].replace(',', '')

    try:
        currency1_value = float(re.sub(r'[^\d\.]', '', currency1[0]))
        currency2_value = float(re.sub(r'[^\d\.]', '', currency2[0]))
    except ValueError:
        # No digits left, or not a valid number: treat as "no conversion"
        return {}

    return {'currencyValue1': currency1_value,
            'currencyLabel1': currency1[1],
            'currencyValue2': currency2_value,
            'currencyLabel2': currency2[1]
            }
|
|
|
|
|
|
def add_currency_card(soup: BeautifulSoup,
                      conversion_details: dict) -> BeautifulSoup:
    """Adds the currency conversion boxes
    to response of the search query

    The soup is returned unchanged when the disclaimer link (or its
    'nXE3Ob' ancestor) cannot be found, or when either amount is zero and
    no conversion factor can be computed.

    Args:
        soup: Parsed search result
        conversion_details: Dictionary of currency
                            related information

    Returns:
        BeautifulSoup
    """
    # Element before which the code will be changed
    # (This is the 'disclaimer' link)
    anchor = soup.find('a', {'href': 'https://g.co/gfd'})

    # Climb to the enclosing 'nXE3Ob' element, stopping at the tree root
    while anchor is not None and \
            'nXE3Ob' not in anchor.attrs.get('class', ()):
        anchor = anchor.parent
    if anchor is None:
        return soup

    value1 = conversion_details['currencyValue1']
    value2 = conversion_details['currencyValue2']
    if not value1 or not value2:
        # A zero amount would lead to a division by zero below
        return soup

    # Creating the conversion factor
    conversion_factor = value1 / value2

    def new_row(box_id: str, value, handler: str, label_text: str) -> tuple:
        # Build one numeric <input> together with its <label>
        input_box = soup.new_tag('input')
        input_box['id'] = box_id
        input_box['type'] = 'number'
        input_box['class'] = 'cb'
        input_box['value'] = value
        input_box['oninput'] = handler

        label_box = soup.new_tag('label')
        label_box['for'] = box_id
        label_box['class'] = 'cb_label'
        label_box.append(label_text)
        return input_box, label_box

    # Currency to be converted from
    input_box1, label_box1 = new_row(
        'cb1', value1,
        f'convert(1, 2, {1 / conversion_factor})',
        conversion_details['currencyLabel1'])

    # Currency to be converted to
    input_box2, label_box2 = new_row(
        'cb2', value2,
        f'convert(2, 1, {conversion_factor})',
        conversion_details['currencyLabel2'])

    # Creating a new div for the input boxes
    conversion_box = soup.new_tag('div')
    conversion_box['class'] = 'conversion_box'
    for child in (input_box1, label_box1, soup.new_tag('br'),
                  input_box2, label_box2):
        conversion_box.append(child)

    anchor.insert_before(conversion_box)
    return soup
|
|
|
|
|
|
def get_tabs_content(tabs: dict,
                     full_query: str,
                     search_type: str,
                     preferences: str,
                     translation: dict,
                     use_leta: bool = False) -> dict:
    """Takes the default tabs content and updates it according to the query.

    The tabs passed in are not modified; a deep copy is updated and returned.

    Args:
        tabs: The default content for the tabs
        full_query: The original search query
        search_type: The current search_type
        preferences: Value appended to each tab query as a "preferences"
                     parameter (skipped when empty)
        translation: The translation to get the names of the tabs
        use_leta: Whether Mullvad Leta backend is being used

    Returns:
        dict: contains the name, the href and if the tab is selected or not
    """
    # Maps queries drop everything from the first '-site:' filter onwards
    map_query = full_query.split('-site:', 1)[0]

    tabs = copy.deepcopy(tabs)

    # If using Leta, remove unsupported tabs (images, videos, news, maps)
    if use_leta:
        tabs = {'all': tabs['all']} if 'all' in tabs else {}

    # Parts of the query that are the same for every tab
    base_query = full_query.replace(f'&tbm={search_type}', '')
    preferences_arg = f"&preferences={preferences}" if preferences else ''

    for tab_id, tab_content in tabs.items():
        # update name to desired language
        if tab_id in translation:
            tab_content['name'] = translation[tab_id]

        # update href with query
        tbm = tab_content['tbm']
        tbm_arg = '' if tbm is None else f"&tbm={tbm}"
        tab_content['href'] = tab_content['href'].format(
            query=f"{base_query}{tbm_arg}{preferences_arg}",
            map_query=map_query)

        # update if selected tab (default all tab is selected)
        if tbm == search_type:
            tabs['all']['selected'] = False
            tab_content['selected'] = True

    return tabs
|