- Add modern Google Images parsing (udm=2) and use view_image to render extracted image results, with Chrome UA and forced image endpoint for tbm=isch/udm=2.

- Normalize layouts (image grid width) and inject styling tweaks; remove broken image pagination/next link with TODO left for proper paging.
This commit is contained in:
Don-Swanson 2025-11-26 22:21:23 -06:00
parent ff3a44b91e
commit 6c7ca7c082
No known key found for this signature in database
GPG key ID: C6A6ACD574A005E5
4 changed files with 261 additions and 37 deletions

View file

@ -5,7 +5,8 @@ from cryptography.fernet import Fernet
from flask import render_template
import html
import urllib.parse as urlparse
from urllib.parse import parse_qs
import os
from urllib.parse import parse_qs, urlencode, urlunparse
import re
from app.models.g_classes import GClasses
@ -208,6 +209,9 @@ class Filter:
header = self.soup.find('header')
if header:
header.decompose()
# Remove broken "Dark theme" toggle snippets that occasionally slip
# into the footer.
self.remove_dark_theme_toggle(self.soup)
self.remove_site_blocks(self.soup)
return self.soup
@ -292,6 +296,22 @@ class Filter:
if GClasses.result_class_a in p_cls:
break
def remove_dark_theme_toggle(self, soup: BeautifulSoup) -> None:
    """Strip leftover "Dark theme" toggle fragments from ``soup``.

    Every text node containing "Dark theme" (case-insensitive) is located
    and its nearest enclosing ``div``/``span``/``p``/``a``/``li``/``section``
    is destroyed. When no such ancestor exists the direct parent is
    destroyed instead, and when the node has no parent at all only the
    text node itself is detached.

    Args:
        soup: The parsed document to clean up; it is modified in place.

    Returns:
        None
    """
    # NOTE(review): the search covers the entire soup handed in, not only a
    # footer element, so any text containing "Dark theme" is removed along
    # with its container -- confirm this is the intended scope.
    container_names = ['div', 'span', 'p', 'a', 'li', 'section']
    toggle_text = re.compile(r'Dark theme', re.I)

    def _is_container(tag) -> bool:
        return tag.name in container_names

    for text_node in soup.find_all(string=toggle_text):
        try:
            doomed = text_node.find_parent(_is_container) or text_node.parent
            if not doomed:
                # Orphaned text node: nothing to destroy but the node itself.
                text_node.extract()
                continue
            doomed.decompose()
        except Exception:
            # Best-effort cleanup: a node that cannot be removed (for
            # example one already destroyed with an earlier container) is
            # skipped without aborting the rest of the pass.
            continue
def remove_site_blocks(self, soup) -> None:
if not self.config.block or not soup.body:
return
@ -531,10 +551,32 @@ class Filter:
)
css = f"{css_html_tag}{css}"
css = re.sub('body{(.*?)}',
'body{padding:0 8px;margin:0 auto;max-width:736px;}',
'body{padding:0 12px;margin:0 auto;max-width:1200px;}',
css)
style.string = css
# Normalize the max width between result types so the page doesn't
# jump in size when switching tabs.
if not self.mobile:
max_width_css = (
'body, #cnt, #center_col, .main, .e9EfHf, #searchform, '
'.GyAeWb, .s6JM6d {'
'max-width:1200px;'
'margin:0 auto;'
'padding-left:12px;'
'padding-right:12px;'
'}'
)
# Build the style tag using a fresh soup to avoid cases where the
# current soup lacks the helper methods (e.g., non-root elements).
factory_soup = BeautifulSoup('', 'html.parser')
extra_style = factory_soup.new_tag('style')
extra_style.string = max_width_css
if self.soup.head:
self.soup.head.append(extra_style)
else:
self.soup.insert(0, extra_style)
def update_link(self, link: Tag) -> None:
"""Update internal link paths with encrypted path, otherwise remove
unnecessary redirects and/or marketing params from the url
@ -738,16 +780,113 @@ class Filter:
desc_node.replace_with(new_desc)
def view_image(self, soup) -> BeautifulSoup:
"""Replaces the soup with a new one that handles mobile results and
adds the link of the image full res to the results.
"""Parses image results from Google Images and rewrites them into the
lightweight Whoogle image results template.
Args:
soup: A BeautifulSoup object containing the image mobile results.
Returns:
BeautifulSoup: The new BeautifulSoup object
Google now serves image results via the modern udm=2 endpoint, where
the raw HTML contains only placeholder thumbnails. The actual image
URLs live inside serialized data blobs in script tags. We extract that
data and pair it with the visible result cards.
"""
def _decode_url(url: str) -> str:
if not url:
return ''
# Decode common escaped characters found in the script blobs
return html.unescape(
url.replace('\\u003d', '=').replace('\\u0026', '&')
)
def _extract_image_data(modern_soup: BeautifulSoup) -> dict:
    """Build a ``docid -> {'img_tbn', 'img_url'}`` map from script blobs.

    The text of every ``<script>`` tag that exposes a single string is
    joined with spaces and scanned for serialized image entries. When a
    docid occurs more than once, the last occurrence wins.

    Args:
        modern_soup: The parsed udm=2 image results page.

    Returns:
        A dict keyed by docid whose values hold the decoded thumbnail URL
        (``img_tbn``) and the decoded full-size URL (``img_url``).
    """
    entry_re = re.compile(
        r'\[0,"(?P<docid>[^"]+)",\["(?P<thumb>https://encrypted-tbn[^"]+)"'
        r'(?:,\d+,\d+)?\],\["(?P<full>https?://[^"]+?)"'
        r'(?:,\d+,\d+)?\]',
        re.DOTALL
    )
    script_bodies = [
        tag.string for tag in modern_soup.find_all('script') if tag.string
    ]
    combined = ' '.join(script_bodies)
    return {
        hit.group('docid'): {
            'img_tbn': _decode_url(hit.group('thumb')),
            'img_url': _decode_url(hit.group('full'))
        }
        for hit in entry_re.finditer(combined)
    }
def _parse_modern_results(modern_soup: BeautifulSoup) -> list:
    """Turn the udm=2 result cards into template-ready result dicts.

    Each ``div`` carrying ``data-attrid="images universal"`` and a
    ``data-docid`` is paired with the image URLs extracted from the script
    blobs. Cards that resolve to nothing at all, or that duplicate an
    earlier (image, thumbnail, page) triple, are dropped.

    Args:
        modern_soup: The parsed udm=2 image results page.

    Returns:
        A list of dicts with the keys ``domain``, ``img_url``,
        ``web_page`` and ``img_tbn``; empty when no cards are found.
    """
    card_attrs = {'data-attrid': 'images universal', 'data-docid': True}
    cards = modern_soup.find_all('div', attrs=card_attrs)
    if not cards:
        return []

    meta_map = _extract_image_data(modern_soup)
    emitted = set()
    entries = []
    for card in cards:
        meta = meta_map.get(card.get('data-docid'), {})
        full_src = meta.get('img_url')
        thumb_src = meta.get('img_tbn')

        # No blob entry for this docid: fall back to the card's inline
        # <img>, but only when its src is an absolute http(s) URL.
        if not thumb_src:
            inline_img = card.find('img')
            inline_src = inline_img.get('src') if inline_img else None
            if inline_src and inline_src.startswith('http'):
                thumb_src = inline_src

        # Prefer the landing page recorded on the card, then its first
        # link.
        landing = card.get('data-lpage') or ''
        if not landing:
            anchor = card.find('a', href=True)
            if anchor:
                landing = anchor['href']

        # NOTE(review): a card with a landing page but no image at all
        # still passes this check and is emitted with empty image fields
        # -- confirm that is intended.
        fingerprint = (full_src, thumb_src, landing)
        if fingerprint in emitted or not any(fingerprint):
            continue
        emitted.add(fingerprint)

        domain = urlparse.urlparse(landing).netloc if landing else ''
        entries.append({
            'domain': domain,
            'img_url': full_src or thumb_src or '',
            'web_page': landing,
            'img_tbn': thumb_src or full_src or ''
        })
    return entries
# Try parsing the modern (udm=2) layout first
modern_results = _parse_modern_results(soup)
if modern_results:
# TODO: Implement proper image pagination. Google images uses
# infinite scroll with `ijn` offsets; we need a clean,
# de-duplicated pagination strategy before exposing a Next link.
next_link = None
return BeautifulSoup(
render_template(
'imageresults.html',
length=len(modern_results),
results=modern_results,
view_label="View Image",
next_link=next_link
),
features='html.parser'
)
# get some tags that are unchanged between mobile and pc versions
cor_suggested = soup.find_all('table', attrs={'class': "By0U9"})
next_pages = soup.find('table', attrs={'class': "uZgmoc"})
@ -761,7 +900,11 @@ class Filter:
results_all = results_div.find_all('div', attrs={'class': "lIMUZd"})
for item in results_all:
urls = item.find('a')['href'].split('&imgrefurl=')
link = item.find('a', href=True)
if not link:
continue
urls = link['href'].split('&imgrefurl=')
# Skip urls that are not two-element lists
if len(urls) != 2:
@ -776,7 +919,16 @@ class Filter:
except IndexError:
web_page = urlparse.unquote(urls[1])
img_tbn = urlparse.unquote(item.find('a').find('img')['src'])
img_tag = link.find('img')
if not img_tag:
continue
img_tbn = urlparse.unquote(
img_tag.get('src') or img_tag.get('data-src', '')
)
if not img_tbn:
continue
results.append({
'domain': urlparse.urlparse(web_page).netloc,
@ -793,11 +945,18 @@ class Filter:
# replace correction suggested by google object if exists
if len(cor_suggested):
soup.find_all(
suggested_tables = soup.find_all(
'table',
attrs={'class': "By0U9"}
)[0].replaceWith(cor_suggested[0])
# replace next page object at the bottom of the page
soup.find_all('table',
attrs={'class': "uZgmoc"})[0].replaceWith(next_pages)
)
if suggested_tables:
suggested_tables[0].replaceWith(cor_suggested[0])
# replace next page object at the bottom of the page, when present
next_page_tables = soup.find_all('table', attrs={'class': "uZgmoc"})
if next_pages and next_page_tables:
next_page_tables[0].replaceWith(next_pages)
# TODO: Reintroduce pagination for legacy image layout if needed.
return soup

View file

@ -147,6 +147,10 @@ def gen_query(query, args, config) -> str:
# Pass along type of results (news, images, books, etc)
if 'tbm' in args:
param_dict['tbm'] = '&tbm=' + args.get('tbm')
# Google Images now expects the modern udm=2 layout; force it when
# requesting images to avoid redirects to the new AI/text layout.
if args.get('tbm') == 'isch' and 'udm' not in args:
param_dict['udm'] = '&udm=2'
# Get results page start value (10 per page, ie page 2 start val = 20)
if 'start' in args:
@ -212,8 +216,18 @@ class Request:
"""
def __init__(self, normal_ua, root_path, config: Config, http_client=None):
self.search_url = 'https://www.google.com/search?gbv=1&num=' + str(
os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10)) + '&q='
results_per_page = str(os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10))
self.search_url = (
'https://www.google.com/search?gbv=1&num='
f'{results_per_page}&q='
)
# Google Images rejects the lightweight gbv=1 interface. Use the
# modern udm=2 entrypoint specifically for image searches to avoid the
# "update your browser" interstitial.
self.image_search_url = (
'https://www.google.com/search?udm=2&num='
f'{results_per_page}&q='
)
# Optionally send heartbeat to Tor to determine availability
# Only when Tor is enabled in config to avoid unnecessary socket usage
if config.tor:
@ -235,6 +249,13 @@ class Request:
if not self.mobile:
self.modified_user_agent_mobile = gen_user_agent(config, True)
# Dedicated modern UA to use when Google rejects legacy ones (e.g. Images)
self.image_user_agent = (
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/127.0.0.0 Safari/537.36'
)
# Set up proxy configuration
proxy_path = os.environ.get('WHOOGLE_PROXY_LOC', '')
if proxy_path:
@ -332,6 +353,13 @@ class Request:
else:
modified_user_agent = self.modified_user_agent
# Some Google endpoints (notably Images) now refuse legacy user agents.
# If an image search is detected and the generated UA isn't Chromium-
# like, retry with a modern Chrome string to avoid the "update your
# browser" interstitial.
if (('tbm=isch' in query) or ('udm=2' in query)) and 'Chrome' not in modified_user_agent:
modified_user_agent = self.image_user_agent
headers = {
'User-Agent': modified_user_agent,
'Accept': ('text/html,application/xhtml+xml,application/xml;'
@ -345,16 +373,23 @@ class Request:
'Sec-Fetch-Site': 'none',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-User': '?1',
'Sec-Fetch-Dest': 'document',
'Sec-CH-UA': (
'"Not/A)Brand";v="8", '
'"Chromium";v="127", '
'"Google Chrome";v="127"'
),
'Sec-CH-UA-Mobile': '?0',
'Sec-CH-UA-Platform': '"macOS"'
'Sec-Fetch-Dest': 'document'
}
# Only attach client hints when using a Chromium-like user agent to
# avoid sending conflicting information that can trigger unsupported
# browser pages.
if 'Chrome' in headers['User-Agent']:
headers.update({
'Sec-CH-UA': (
'"Not/A)Brand";v="8", '
'"Chromium";v="127", '
'"Google Chrome";v="127"'
),
'Sec-CH-UA-Mobile': '?0',
'Sec-CH-UA-Platform': '"Windows"'
})
# Add Accept-Language header tied to the current config if requested
if self.lang_interface:
headers['Accept-Language'] = (
@ -393,9 +428,13 @@ class Request:
"Error raised during Tor connection validation",
disable=True)
search_base = base_url or self.search_url
if not base_url and ('tbm=isch' in query or 'udm=2' in query):
search_base = self.image_search_url
try:
response = self.http_client.get(
(base_url or self.search_url) + query,
search_base + query,
headers=headers,
cookies=consent_cookies)
except httpx.HTTPError as e:
@ -406,6 +445,6 @@ class Request:
attempt += 1
if attempt > 10:
raise TorError("Tor query failed -- max attempts exceeded 10")
return self.send((base_url or self.search_url), query, attempt)
return self.send(search_base, query, attempt)
return response

View file

@ -10,9 +10,9 @@
background-color: #fff;
}
body {
padding: 0 8px;
padding: 0 12px;
margin: 0 auto;
max-width: 736px;
max-width: 1200px;
}
a {
text-decoration: none;
@ -167,6 +167,7 @@
border-collapse: collapse;
border-spacing: 0;
width: 100%;
table-layout: fixed;
}
.X6ZCif {
color: #202124;
@ -209,15 +210,20 @@
text-align: center;
}
.RAyV4b {
line-height: 140px;
overflow: "hidden";
height: 220px;
line-height: 220px;
overflow: hidden;
text-align: center;
}
.t0fcAb {
text-align: center;
margin: auto;
vertical-align: middle;
object-fit: contain;
object-fit: cover;
max-width: 100%;
height: auto;
max-height: 220px;
display: block;
}
.Tor4Ec {
padding-top: 2px;
@ -313,6 +319,24 @@
a .CVA68e:hover {
text-decoration: underline;
}
.e3goi {
width: 25%;
padding: 10px;
box-sizing: border-box;
}
.svla5d {
max-width: 100%;
}
@media (max-width: 900px) {
.e3goi {
width: 50%;
}
}
@media (max-width: 600px) {
.e3goi {
width: 100%;
}
}
</style>
<div>
<div>

View file

@ -140,7 +140,8 @@ class Search:
root_url=root_url,
mobile=mobile,
config=self.config,
query=self.query)
query=self.query,
page_url=self.request.url)
full_query = gen_query(self.query,
self.request_params,
self.config)
@ -148,8 +149,10 @@ class Search:
# force mobile search when view image is true and
# the request is not already made by a mobile
view_image = ('tbm=isch' in full_query
and self.config.view_image)
is_image_query = ('tbm=isch' in full_query) or ('udm=2' in full_query)
# Always parse image results when hitting the images endpoint (udm=2)
# to avoid Google returning only text/AI blocks.
view_image = is_image_query
client = self.user_request or g.user_request
get_body = client.send(query=full_query,
@ -194,4 +197,3 @@ class Search:
link['href'] += param_str
return str(formatted_results)