mirror of
https://github.com/benbusby/whoogle-search.git
synced 2026-03-11 08:54:34 +00:00
- Add modern Google Images parsing (udm=2) and use view_image to render extracted image results, with Chrome UA and forced image endpoint for tbm=isch/udm=2.
- Normalize layouts (image grid width) and inject styling tweaks; remove broken image pagination/next link with TODO left for proper paging.
This commit is contained in:
parent
ff3a44b91e
commit
6c7ca7c082
4 changed files with 261 additions and 37 deletions
191
app/filter.py
191
app/filter.py
|
|
@ -5,7 +5,8 @@ from cryptography.fernet import Fernet
|
|||
from flask import render_template
|
||||
import html
|
||||
import urllib.parse as urlparse
|
||||
from urllib.parse import parse_qs
|
||||
import os
|
||||
from urllib.parse import parse_qs, urlencode, urlunparse
|
||||
import re
|
||||
|
||||
from app.models.g_classes import GClasses
|
||||
|
|
@ -208,6 +209,9 @@ class Filter:
|
|||
header = self.soup.find('header')
|
||||
if header:
|
||||
header.decompose()
|
||||
# Remove broken "Dark theme" toggle snippets that occasionally slip
|
||||
# into the footer.
|
||||
self.remove_dark_theme_toggle(self.soup)
|
||||
self.remove_site_blocks(self.soup)
|
||||
return self.soup
|
||||
|
||||
|
|
@ -292,6 +296,22 @@ class Filter:
|
|||
if GClasses.result_class_a in p_cls:
|
||||
break
|
||||
|
||||
def remove_dark_theme_toggle(self, soup: BeautifulSoup) -> None:
    """Removes stray "Dark theme" toggle/link fragments from the page.

    Only text nodes that consist solely of the toggle label (for example
    "Dark theme" or "Dark theme: Off") are treated as toggle fragments.
    Text that merely mentions the phrase -- such as the title or snippet
    of a search result about dark themes -- is left untouched.

    Args:
        soup: The parsed page to clean up. It is modified in place.
    """
    # Anchored on purpose: bs4 applies the pattern with ``search()``, so an
    # unanchored expression would also hit ordinary results that happen to
    # contain the words "dark theme" and delete their enclosing element.
    # NOTE(review): assumes the toggle label is its own text node, optionally
    # followed by ": <state>" -- confirm against live markup.
    toggle_label = re.compile(
        r'^\s*Dark theme(?:\s*:\s*[^\n]{0,24})?\s*$', re.I)
    container_tags = ['div', 'span', 'p', 'a', 'li', 'section']

    for node in soup.find_all(string=toggle_label):
        try:
            # Remove the nearest wrapping element so no empty link or list
            # item is left behind; fall back to the direct parent.
            target = node.find_parent(container_tags) or node.parent
            if target:
                target.decompose()
            else:
                # Detached text node (e.g. its container was already
                # removed by an earlier iteration).
                node.extract()
        except Exception:
            # Deliberate best effort: a malformed fragment must never
            # break rendering of the whole results page.
            continue
|
||||
|
||||
def remove_site_blocks(self, soup) -> None:
|
||||
if not self.config.block or not soup.body:
|
||||
return
|
||||
|
|
@ -531,10 +551,32 @@ class Filter:
|
|||
)
|
||||
css = f"{css_html_tag}{css}"
|
||||
css = re.sub('body{(.*?)}',
|
||||
'body{padding:0 8px;margin:0 auto;max-width:736px;}',
|
||||
'body{padding:0 12px;margin:0 auto;max-width:1200px;}',
|
||||
css)
|
||||
style.string = css
|
||||
|
||||
# Normalize the max width between result types so the page doesn't
|
||||
# jump in size when switching tabs.
|
||||
if not self.mobile:
|
||||
max_width_css = (
|
||||
'body, #cnt, #center_col, .main, .e9EfHf, #searchform, '
|
||||
'.GyAeWb, .s6JM6d {'
|
||||
'max-width:1200px;'
|
||||
'margin:0 auto;'
|
||||
'padding-left:12px;'
|
||||
'padding-right:12px;'
|
||||
'}'
|
||||
)
|
||||
# Build the style tag using a fresh soup to avoid cases where the
|
||||
# current soup lacks the helper methods (e.g., non-root elements).
|
||||
factory_soup = BeautifulSoup('', 'html.parser')
|
||||
extra_style = factory_soup.new_tag('style')
|
||||
extra_style.string = max_width_css
|
||||
if self.soup.head:
|
||||
self.soup.head.append(extra_style)
|
||||
else:
|
||||
self.soup.insert(0, extra_style)
|
||||
|
||||
def update_link(self, link: Tag) -> None:
|
||||
"""Update internal link paths with encrypted path, otherwise remove
|
||||
unnecessary redirects and/or marketing params from the url
|
||||
|
|
@ -738,16 +780,113 @@ class Filter:
|
|||
desc_node.replace_with(new_desc)
|
||||
|
||||
def view_image(self, soup) -> BeautifulSoup:
|
||||
"""Replaces the soup with a new one that handles mobile results and
|
||||
adds the link of the image full res to the results.
|
||||
"""Parses image results from Google Images and rewrites them into the
|
||||
lightweight Whoogle image results template.
|
||||
|
||||
Args:
|
||||
soup: A BeautifulSoup object containing the image mobile results.
|
||||
|
||||
Returns:
|
||||
BeautifulSoup: The new BeautifulSoup object
|
||||
Google now serves image results via the modern udm=2 endpoint, where
|
||||
the raw HTML contains only placeholder thumbnails. The actual image
|
||||
URLs live inside serialized data blobs in script tags. We extract that
|
||||
data and pair it with the visible result cards.
|
||||
"""
|
||||
|
||||
def _decode_url(url: str) -> str:
    """Decodes a URL taken from a serialized script blob.

    JavaScript string escapes (``\\uXXXX`` and ``\\xXX``, in either hex
    case) are turned back into the characters they stand for, then HTML
    entities are unescaped.

    Args:
        url: The raw, possibly escaped URL. May be empty or None.

    Returns:
        str: The decoded URL, or an empty string for empty input.
    """
    if not url:
        return ''

    def _unescape(match):
        code_point = int(match.group(1) or match.group(2), 16)
        # A lone surrogate half is not a valid character on its own; keep
        # the escape as written instead of producing an unencodable string.
        if 0xD800 <= code_point <= 0xDFFF:
            return match.group(0)
        return chr(code_point)

    # Previously only "\u003d" and "\u0026" were handled, so any other
    # escaped character (or an upper-case hex digit) stayed in the URL.
    decoded = re.sub(
        r'\\u([0-9a-fA-F]{4})|\\x([0-9a-fA-F]{2})', _unescape, url)
    return html.unescape(decoded)
|
||||
|
||||
def _extract_image_data(modern_soup: BeautifulSoup) -> dict:
    """Builds a docid -> {img_tbn, img_url} lookup from script blobs.

    The text of every script tag that has a single string child is joined
    and scanned for serialized image entries. Each entry yields the
    document id, the thumbnail URL and the full-size image URL. When a
    document id occurs more than once, the last entry wins.
    """
    blob = ' '.join(
        tag.string for tag in modern_soup.find_all('script') if tag.string
    )
    entry_re = re.compile(
        r'\[0,"(?P<docid>[^"]+)",\["(?P<thumb>https://encrypted-tbn[^"]+)"'
        r'(?:,\d+,\d+)?\],\["(?P<full>https?://[^"]+?)"'
        r'(?:,\d+,\d+)?\]',
        re.DOTALL
    )
    return {
        hit.group('docid'): {
            'img_tbn': _decode_url(hit.group('thumb')),
            'img_url': _decode_url(hit.group('full'))
        }
        for hit in entry_re.finditer(blob)
    }
|
||||
|
||||
def _parse_modern_results(modern_soup: BeautifulSoup) -> list:
    """Converts modern image result cards into template-ready dicts.

    Each card (a div carrying ``data-attrid="images universal"`` and a
    ``data-docid``) is paired with the image URLs extracted from the
    page's script blobs. Cards for which no image can be determined are
    skipped, mirroring the legacy parser which drops items without a
    thumbnail. Exact duplicates are emitted only once.

    Args:
        modern_soup: The parsed image results page.

    Returns:
        list: Dicts with the keys ``domain``, ``img_url``, ``web_page``
        and ``img_tbn``; empty when the page has no result cards.
    """
    cards = modern_soup.find_all(
        'div',
        attrs={
            'data-attrid': 'images universal',
            'data-docid': True
        }
    )
    if not cards:
        return []

    meta_map = _extract_image_data(modern_soup)
    parsed = []
    seen = set()

    for card in cards:
        meta = meta_map.get(card.get('data-docid'), {})
        img_url = meta.get('img_url')
        img_tbn = meta.get('img_tbn')

        # Fall back to the inline image if we failed to map the docid.
        # Only absolute http(s) sources count: inline placeholders are
        # not real thumbnails. "data-src" is tried after "src", as the
        # legacy parser does.
        if not img_tbn:
            img_tag = card.find('img')
            if img_tag:
                for attr in ('src', 'data-src'):
                    candidate_src = img_tag.get(attr)
                    if candidate_src and candidate_src.startswith('http'):
                        img_tbn = candidate_src
                        break

        # A card without any image would render as an empty tile.
        if not (img_url or img_tbn):
            continue

        web_page = card.get('data-lpage') or ''
        if not web_page:
            link = card.find('a', href=True)
            if link:
                web_page = link['href']

        key = (img_url, img_tbn, web_page)
        if key in seen:
            continue
        seen.add(key)

        parsed.append({
            'domain': urlparse.urlparse(web_page).netloc
            if web_page else '',
            'img_url': img_url or img_tbn or '',
            'web_page': web_page,
            'img_tbn': img_tbn or img_url or ''
        })
    return parsed
|
||||
|
||||
# Try parsing the modern (udm=2) layout first
|
||||
modern_results = _parse_modern_results(soup)
|
||||
if modern_results:
|
||||
# TODO: Implement proper image pagination. Google images uses
|
||||
# infinite scroll with `ijn` offsets; we need a clean,
|
||||
# de-duplicated pagination strategy before exposing a Next link.
|
||||
next_link = None
|
||||
return BeautifulSoup(
|
||||
render_template(
|
||||
'imageresults.html',
|
||||
length=len(modern_results),
|
||||
results=modern_results,
|
||||
view_label="View Image",
|
||||
next_link=next_link
|
||||
),
|
||||
features='html.parser'
|
||||
)
|
||||
|
||||
# get some tags that are unchanged between mobile and pc versions
|
||||
cor_suggested = soup.find_all('table', attrs={'class': "By0U9"})
|
||||
next_pages = soup.find('table', attrs={'class': "uZgmoc"})
|
||||
|
|
@ -761,7 +900,11 @@ class Filter:
|
|||
results_all = results_div.find_all('div', attrs={'class': "lIMUZd"})
|
||||
|
||||
for item in results_all:
|
||||
urls = item.find('a')['href'].split('&imgrefurl=')
|
||||
link = item.find('a', href=True)
|
||||
if not link:
|
||||
continue
|
||||
|
||||
urls = link['href'].split('&imgrefurl=')
|
||||
|
||||
# Skip urls that are not two-element lists
|
||||
if len(urls) != 2:
|
||||
|
|
@ -776,7 +919,16 @@ class Filter:
|
|||
except IndexError:
|
||||
web_page = urlparse.unquote(urls[1])
|
||||
|
||||
img_tbn = urlparse.unquote(item.find('a').find('img')['src'])
|
||||
img_tag = link.find('img')
|
||||
if not img_tag:
|
||||
continue
|
||||
|
||||
img_tbn = urlparse.unquote(
|
||||
img_tag.get('src') or img_tag.get('data-src', '')
|
||||
)
|
||||
|
||||
if not img_tbn:
|
||||
continue
|
||||
|
||||
results.append({
|
||||
'domain': urlparse.urlparse(web_page).netloc,
|
||||
|
|
@ -793,11 +945,18 @@ class Filter:
|
|||
|
||||
# replace correction suggested by google object if exists
|
||||
if len(cor_suggested):
|
||||
soup.find_all(
|
||||
suggested_tables = soup.find_all(
|
||||
'table',
|
||||
attrs={'class': "By0U9"}
|
||||
)[0].replaceWith(cor_suggested[0])
|
||||
# replace next page object at the bottom of the page
|
||||
soup.find_all('table',
|
||||
attrs={'class': "uZgmoc"})[0].replaceWith(next_pages)
|
||||
)
|
||||
if suggested_tables:
|
||||
suggested_tables[0].replaceWith(cor_suggested[0])
|
||||
|
||||
# replace next page object at the bottom of the page, when present
|
||||
next_page_tables = soup.find_all('table', attrs={'class': "uZgmoc"})
|
||||
if next_pages and next_page_tables:
|
||||
next_page_tables[0].replaceWith(next_pages)
|
||||
|
||||
# TODO: Reintroduce pagination for legacy image layout if needed.
|
||||
|
||||
return soup
|
||||
|
|
|
|||
|
|
@ -147,6 +147,10 @@ def gen_query(query, args, config) -> str:
|
|||
# Pass along type of results (news, images, books, etc)
|
||||
if 'tbm' in args:
|
||||
param_dict['tbm'] = '&tbm=' + args.get('tbm')
|
||||
# Google Images now expects the modern udm=2 layout; force it when
|
||||
# requesting images to avoid redirects to the new AI/text layout.
|
||||
if args.get('tbm') == 'isch' and 'udm' not in args:
|
||||
param_dict['udm'] = '&udm=2'
|
||||
|
||||
# Get results page start value (10 per page, ie page 2 start val = 20)
|
||||
if 'start' in args:
|
||||
|
|
@ -212,8 +216,18 @@ class Request:
|
|||
"""
|
||||
|
||||
def __init__(self, normal_ua, root_path, config: Config, http_client=None):
|
||||
self.search_url = 'https://www.google.com/search?gbv=1&num=' + str(
|
||||
os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10)) + '&q='
|
||||
results_per_page = str(os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10))
|
||||
self.search_url = (
|
||||
'https://www.google.com/search?gbv=1&num='
|
||||
f'{results_per_page}&q='
|
||||
)
|
||||
# Google Images rejects the lightweight gbv=1 interface. Use the
|
||||
# modern udm=2 entrypoint specifically for image searches to avoid the
|
||||
# "update your browser" interstitial.
|
||||
self.image_search_url = (
|
||||
'https://www.google.com/search?udm=2&num='
|
||||
f'{results_per_page}&q='
|
||||
)
|
||||
# Optionally send heartbeat to Tor to determine availability
|
||||
# Only when Tor is enabled in config to avoid unnecessary socket usage
|
||||
if config.tor:
|
||||
|
|
@ -235,6 +249,13 @@ class Request:
|
|||
if not self.mobile:
|
||||
self.modified_user_agent_mobile = gen_user_agent(config, True)
|
||||
|
||||
# Dedicated modern UA to use when Google rejects legacy ones (e.g. Images)
|
||||
self.image_user_agent = (
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
|
||||
'AppleWebKit/537.36 (KHTML, like Gecko) '
|
||||
'Chrome/127.0.0.0 Safari/537.36'
|
||||
)
|
||||
|
||||
# Set up proxy configuration
|
||||
proxy_path = os.environ.get('WHOOGLE_PROXY_LOC', '')
|
||||
if proxy_path:
|
||||
|
|
@ -332,6 +353,13 @@ class Request:
|
|||
else:
|
||||
modified_user_agent = self.modified_user_agent
|
||||
|
||||
# Some Google endpoints (notably Images) now refuse legacy user agents.
|
||||
# If an image search is detected and the generated UA isn't Chromium-
|
||||
# like, switch to a modern Chrome string to avoid the "update your
|
||||
# browser" interstitial.
|
||||
if (('tbm=isch' in query) or ('udm=2' in query)) and 'Chrome' not in modified_user_agent:
|
||||
modified_user_agent = self.image_user_agent
|
||||
|
||||
headers = {
|
||||
'User-Agent': modified_user_agent,
|
||||
'Accept': ('text/html,application/xhtml+xml,application/xml;'
|
||||
|
|
@ -345,16 +373,23 @@ class Request:
|
|||
'Sec-Fetch-Site': 'none',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-User': '?1',
|
||||
'Sec-Fetch-Dest': 'document',
|
||||
'Sec-CH-UA': (
|
||||
'"Not/A)Brand";v="8", '
|
||||
'"Chromium";v="127", '
|
||||
'"Google Chrome";v="127"'
|
||||
),
|
||||
'Sec-CH-UA-Mobile': '?0',
|
||||
'Sec-CH-UA-Platform': '"macOS"'
|
||||
'Sec-Fetch-Dest': 'document'
|
||||
}
|
||||
# Only attach client hints when using a Chromium-like user agent to
|
||||
# avoid sending conflicting information that can trigger unsupported
|
||||
# browser pages.
|
||||
if 'Chrome' in headers['User-Agent']:
|
||||
headers.update({
|
||||
'Sec-CH-UA': (
|
||||
'"Not/A)Brand";v="8", '
|
||||
'"Chromium";v="127", '
|
||||
'"Google Chrome";v="127"'
|
||||
),
|
||||
'Sec-CH-UA-Mobile': '?0',
|
||||
'Sec-CH-UA-Platform': '"Windows"'
|
||||
})
|
||||
|
||||
|
||||
# Add Accept-Language header tied to the current config if requested
|
||||
if self.lang_interface:
|
||||
headers['Accept-Language'] = (
|
||||
|
|
@ -393,9 +428,13 @@ class Request:
|
|||
"Error raised during Tor connection validation",
|
||||
disable=True)
|
||||
|
||||
search_base = base_url or self.search_url
|
||||
if not base_url and ('tbm=isch' in query or 'udm=2' in query):
|
||||
search_base = self.image_search_url
|
||||
|
||||
try:
|
||||
response = self.http_client.get(
|
||||
(base_url or self.search_url) + query,
|
||||
search_base + query,
|
||||
headers=headers,
|
||||
cookies=consent_cookies)
|
||||
except httpx.HTTPError as e:
|
||||
|
|
@ -406,6 +445,6 @@ class Request:
|
|||
attempt += 1
|
||||
if attempt > 10:
|
||||
raise TorError("Tor query failed -- max attempts exceeded 10")
|
||||
return self.send((base_url or self.search_url), query, attempt)
|
||||
return self.send(search_base, query, attempt)
|
||||
|
||||
return response
|
||||
|
|
|
|||
|
|
@ -10,9 +10,9 @@
|
|||
background-color: #fff;
|
||||
}
|
||||
body {
|
||||
padding: 0 8px;
|
||||
padding: 0 12px;
|
||||
margin: 0 auto;
|
||||
max-width: 736px;
|
||||
max-width: 1200px;
|
||||
}
|
||||
a {
|
||||
text-decoration: none;
|
||||
|
|
@ -167,6 +167,7 @@
|
|||
border-collapse: collapse;
|
||||
border-spacing: 0;
|
||||
width: 100%;
|
||||
table-layout: fixed;
|
||||
}
|
||||
.X6ZCif {
|
||||
color: #202124;
|
||||
|
|
@ -209,15 +210,20 @@
|
|||
text-align: center;
|
||||
}
|
||||
.RAyV4b {
|
||||
line-height: 140px;
|
||||
overflow: "hidden";
|
||||
height: 220px;
|
||||
line-height: 220px;
|
||||
overflow: hidden;
|
||||
text-align: center;
|
||||
}
|
||||
.t0fcAb {
|
||||
text-align: center;
|
||||
margin: auto;
|
||||
vertical-align: middle;
|
||||
object-fit: contain;
|
||||
object-fit: cover;
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
max-height: 220px;
|
||||
display: block;
|
||||
}
|
||||
.Tor4Ec {
|
||||
padding-top: 2px;
|
||||
|
|
@ -313,6 +319,24 @@
|
|||
a .CVA68e:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
.e3goi {
|
||||
width: 25%;
|
||||
padding: 10px;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
.svla5d {
|
||||
max-width: 100%;
|
||||
}
|
||||
@media (max-width: 900px) {
|
||||
.e3goi {
|
||||
width: 50%;
|
||||
}
|
||||
}
|
||||
@media (max-width: 600px) {
|
||||
.e3goi {
|
||||
width: 100%;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
<div>
|
||||
<div>
|
||||
|
|
|
|||
|
|
@ -140,7 +140,8 @@ class Search:
|
|||
root_url=root_url,
|
||||
mobile=mobile,
|
||||
config=self.config,
|
||||
query=self.query)
|
||||
query=self.query,
|
||||
page_url=self.request.url)
|
||||
full_query = gen_query(self.query,
|
||||
self.request_params,
|
||||
self.config)
|
||||
|
|
@ -148,8 +149,10 @@ class Search:
|
|||
|
||||
# force mobile search when view image is true and
|
||||
# the request is not already made by a mobile
|
||||
view_image = ('tbm=isch' in full_query
|
||||
and self.config.view_image)
|
||||
is_image_query = ('tbm=isch' in full_query) or ('udm=2' in full_query)
|
||||
# Always parse image results when hitting the images endpoint (udm=2)
|
||||
# to avoid Google returning only text/AI blocks.
|
||||
view_image = is_image_query
|
||||
|
||||
client = self.user_request or g.user_request
|
||||
get_body = client.send(query=full_query,
|
||||
|
|
@ -194,4 +197,3 @@ class Search:
|
|||
link['href'] += param_str
|
||||
|
||||
return str(formatted_results)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue