- Add modern Google Images parsing (udm=2) and use view_image to render extracted image results, with Chrome UA and forced image endpoint for tbm=isch/udm=2.

- Normalize layouts (page max-width, image grid width) and inject styling tweaks; remove the broken image pagination "Next" link, leaving a TODO for proper paging.
2026-03-11 08:54:34 +00:00 · 2025-11-26 22:21:23 -06:00 · 2025-11-26 22:21:23 -06:00 · 6c7ca7c082
commit 6c7ca7c082
parent ff3a44b91e
4 changed files with 261 additions and 37 deletions
--- a/app/filter.py
+++ b/app/filter.py
@ -5,7 +5,8 @@ from cryptography.fernet import Fernet
 from flask import render_template
 import html
 import urllib.parse as urlparse
-from urllib.parse import parse_qs
+import os
+from urllib.parse import parse_qs, urlencode, urlunparse
 import re

 from app.models.g_classes import GClasses
@ -208,6 +209,9 @@ class Filter:
        header = self.soup.find('header')
        if header:
            header.decompose()
+        # Remove broken "Dark theme" toggle snippets that occasionally slip
+        # into the footer.
+        self.remove_dark_theme_toggle(self.soup)
        self.remove_site_blocks(self.soup)
        return self.soup

@ -292,6 +296,22 @@ class Filter:
            if GClasses.result_class_a in p_cls:
                break

+    def remove_dark_theme_toggle(self, soup: BeautifulSoup) -> None:
+        """Removes stray Dark theme toggle/link fragments that can appear
+        in the footer."""
+        for node in soup.find_all(string=re.compile(r'Dark theme', re.I)):
+            try:
+                parent = node.find_parent(
+                    lambda tag: tag.name in ['div', 'span', 'p', 'a', 'li',
+                                             'section'])
+                target = parent or node.parent
+                if target:
+                    target.decompose()
+                else:
+                    node.extract()
+            except Exception:
+                continue
+
    def remove_site_blocks(self, soup) -> None:
        if not self.config.block or not soup.body:
            return
@ -531,10 +551,32 @@ class Filter:
            )
            css = f"{css_html_tag}{css}"
            css = re.sub('body{(.*?)}',
-                         'body{padding:0 8px;margin:0 auto;max-width:736px;}',
+                         'body{padding:0 12px;margin:0 auto;max-width:1200px;}',
                         css)
            style.string = css

+        # Normalize the max width between result types so the page doesn't
+        # jump in size when switching tabs.
+        if not self.mobile:
+            max_width_css = (
+                'body, #cnt, #center_col, .main, .e9EfHf, #searchform, '
+                '.GyAeWb, .s6JM6d {'
+                'max-width:1200px;'
+                'margin:0 auto;'
+                'padding-left:12px;'
+                'padding-right:12px;'
+                '}'
+            )
+            # Build the style tag using a fresh soup to avoid cases where the
+            # current soup lacks the helper methods (e.g., non-root elements).
+            factory_soup = BeautifulSoup('', 'html.parser')
+            extra_style = factory_soup.new_tag('style')
+            extra_style.string = max_width_css
+            if self.soup.head:
+                self.soup.head.append(extra_style)
+            else:
+                self.soup.insert(0, extra_style)
+
    def update_link(self, link: Tag) -> None:
        """Update internal link paths with encrypted path, otherwise remove
        unnecessary redirects and/or marketing params from the url
@ -738,16 +780,113 @@ class Filter:
            desc_node.replace_with(new_desc)

    def view_image(self, soup) -> BeautifulSoup:
-        """Replaces the soup with a new one that handles mobile results and
-        adds the link of the image full res to the results.
+        """Parses image results from Google Images and rewrites them into the
+        lightweight Whoogle image results template.

-        Args:
-            soup: A BeautifulSoup object containing the image mobile results.
-
-        Returns:
-            BeautifulSoup: The new BeautifulSoup object
+        Google now serves image results via the modern udm=2 endpoint, where
+        the raw HTML contains only placeholder thumbnails. The actual image
+        URLs live inside serialized data blobs in script tags. We extract that
+        data and pair it with the visible result cards.
        """

+        def _decode_url(url: str) -> str:
+            if not url:
+                return ''
+            # Decode common escaped characters found in the script blobs
+            return html.unescape(
+                url.replace('\\u003d', '=').replace('\\u0026', '&')
+            )
+
+        def _extract_image_data(modern_soup: BeautifulSoup) -> dict:
+            """Extracts docid -> {img_url, img_tbn} from serialized scripts."""
+            scripts_text = ' '.join(
+                script.string for script in modern_soup.find_all('script')
+                if script.string
+            )
+            pattern = re.compile(
+                r'\[0,"(?P<docid>[^"]+)",\["(?P<thumb>https://encrypted-tbn[^"]+)"'
+                r'(?:,\d+,\d+)?\],\["(?P<full>https?://[^"]+?)"'
+                r'(?:,\d+,\d+)?\]',
+                re.DOTALL
+            )
+            results_map = {}
+            for match in pattern.finditer(scripts_text):
+                docid = match.group('docid')
+                thumb = _decode_url(match.group('thumb'))
+                full = _decode_url(match.group('full'))
+                results_map[docid] = {
+                    'img_tbn': thumb,
+                    'img_url': full
+                }
+            return results_map
+
+        def _parse_modern_results(modern_soup: BeautifulSoup) -> list:
+            cards = modern_soup.find_all(
+                'div',
+                attrs={
+                    'data-attrid': 'images universal',
+                    'data-docid': True
+                }
+            )
+            if not cards:
+                return []
+
+            meta_map = _extract_image_data(modern_soup)
+            parsed = []
+            seen = set()
+
+            for card in cards:
+                docid = card.get('data-docid')
+                meta = meta_map.get(docid, {})
+                img_url = meta.get('img_url')
+                img_tbn = meta.get('img_tbn')
+
+                # Fall back to the inline src if we failed to map the docid
+                if not img_tbn:
+                    img_tag = card.find('img')
+                    if img_tag:
+                        candidate_src = img_tag.get('src')
+                        if candidate_src and candidate_src.startswith('http'):
+                            img_tbn = candidate_src
+
+                web_page = card.get('data-lpage') or ''
+                if not web_page:
+                    link = card.find('a', href=True)
+                    if link:
+                        web_page = link['href']
+
+                key = (img_url, img_tbn, web_page)
+                if not any(key) or key in seen:
+                    continue
+                seen.add(key)
+
+                parsed.append({
+                    'domain': urlparse.urlparse(web_page).netloc
+                    if web_page else '',
+                    'img_url': img_url or img_tbn or '',
+                    'web_page': web_page,
+                    'img_tbn': img_tbn or img_url or ''
+                })
+            return parsed
+
+        # Try parsing the modern (udm=2) layout first
+        modern_results = _parse_modern_results(soup)
+        if modern_results:
+            # TODO: Implement proper image pagination. Google images uses
+            # infinite scroll with `ijn` offsets; we need a clean,
+            # de-duplicated pagination strategy before exposing a Next link.
+            next_link = None
+            return BeautifulSoup(
+                render_template(
+                    'imageresults.html',
+                    length=len(modern_results),
+                    results=modern_results,
+                    view_label="View Image",
+                    next_link=next_link
+                ),
+                features='html.parser'
+            )
+
        # get some tags that are unchanged between mobile and pc versions
        cor_suggested = soup.find_all('table', attrs={'class': "By0U9"})
        next_pages = soup.find('table', attrs={'class': "uZgmoc"})
@ -761,7 +900,11 @@ class Filter:
            results_all = results_div.find_all('div', attrs={'class': "lIMUZd"})

        for item in results_all:
-            urls = item.find('a')['href'].split('&imgrefurl=')
+            link = item.find('a', href=True)
+            if not link:
+                continue
+
+            urls = link['href'].split('&imgrefurl=')

            # Skip urls that are not two-element lists
            if len(urls) != 2:
@ -776,7 +919,16 @@ class Filter:
            except IndexError:
                web_page = urlparse.unquote(urls[1])

-            img_tbn = urlparse.unquote(item.find('a').find('img')['src'])
+            img_tag = link.find('img')
+            if not img_tag:
+                continue
+
+            img_tbn = urlparse.unquote(
+                img_tag.get('src') or img_tag.get('data-src', '')
+            )
+
+            if not img_tbn:
+                continue

            results.append({
                'domain': urlparse.urlparse(web_page).netloc,
@ -793,11 +945,18 @@ class Filter:

        # replace correction suggested by google object if exists
        if len(cor_suggested):
-            soup.find_all(
+            suggested_tables = soup.find_all(
                'table',
                attrs={'class': "By0U9"}
-            )[0].replaceWith(cor_suggested[0])
-        # replace next page object at the bottom of the page
-        soup.find_all('table',
-                      attrs={'class': "uZgmoc"})[0].replaceWith(next_pages)
+            )
+            if suggested_tables:
+                suggested_tables[0].replaceWith(cor_suggested[0])
+
+        # replace next page object at the bottom of the page, when present
+        next_page_tables = soup.find_all('table', attrs={'class': "uZgmoc"})
+        if next_pages and next_page_tables:
+            next_page_tables[0].replaceWith(next_pages)
+
+        # TODO: Reintroduce pagination for legacy image layout if needed.
+
        return soup
--- a/app/request.py
+++ b/app/request.py
@ -147,6 +147,10 @@ def gen_query(query, args, config) -> str:
    # Pass along type of results (news, images, books, etc)
    if 'tbm' in args:
        param_dict['tbm'] = '&tbm=' + args.get('tbm')
+        # Google Images now expects the modern udm=2 layout; force it when
+        # requesting images to avoid redirects to the new AI/text layout.
+        if args.get('tbm') == 'isch' and 'udm' not in args:
+            param_dict['udm'] = '&udm=2'

    # Get results page start value (10 per page, ie page 2 start val = 20)
    if 'start' in args:
@ -212,8 +216,18 @@ class Request:
    """

    def __init__(self, normal_ua, root_path, config: Config, http_client=None):
-        self.search_url = 'https://www.google.com/search?gbv=1&num=' + str(
-            os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10)) + '&q='
+        results_per_page = str(os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10))
+        self.search_url = (
+            'https://www.google.com/search?gbv=1&num='
+            f'{results_per_page}&q='
+        )
+        # Google Images rejects the lightweight gbv=1 interface. Use the
+        # modern udm=2 entrypoint specifically for image searches to avoid the
+        # "update your browser" interstitial.
+        self.image_search_url = (
+            'https://www.google.com/search?udm=2&num='
+            f'{results_per_page}&q='
+        )
        # Optionally send heartbeat to Tor to determine availability
        # Only when Tor is enabled in config to avoid unnecessary socket usage
        if config.tor:
@ -235,6 +249,13 @@ class Request:
        if not self.mobile:
            self.modified_user_agent_mobile = gen_user_agent(config, True)

+        # Dedicated modern UA to use when Google rejects legacy ones (e.g. Images)
+        self.image_user_agent = (
+            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
+            'AppleWebKit/537.36 (KHTML, like Gecko) '
+            'Chrome/127.0.0.0 Safari/537.36'
+        )
+
        # Set up proxy configuration
        proxy_path = os.environ.get('WHOOGLE_PROXY_LOC', '')
        if proxy_path:
@ -332,6 +353,13 @@ class Request:
            else:
                modified_user_agent = self.modified_user_agent

+        # Some Google endpoints (notably Images) now refuse legacy user agents.
+        # If an image search is detected and the generated UA isn't Chromium-
+        # like, substitute a modern Chrome string before sending the request
+        # to avoid the "update your browser" interstitial.
+        if (('tbm=isch' in query) or ('udm=2' in query)) and 'Chrome' not in modified_user_agent:
+            modified_user_agent = self.image_user_agent
+
        headers = {
            'User-Agent': modified_user_agent,
            'Accept': ('text/html,application/xhtml+xml,application/xml;'
@ -345,16 +373,23 @@ class Request:
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
-            'Sec-Fetch-Dest': 'document',
-            'Sec-CH-UA': (
-                '"Not/A)Brand";v="8", '
-                '"Chromium";v="127", '
-                '"Google Chrome";v="127"'
-            ),
-            'Sec-CH-UA-Mobile': '?0',
-            'Sec-CH-UA-Platform': '"macOS"'
+            'Sec-Fetch-Dest': 'document'
        }
+        # Only attach client hints when using a Chromium-like user agent to
+        # avoid sending conflicting information that can trigger unsupported
+        # browser pages.
+        if 'Chrome' in headers['User-Agent']:
+            headers.update({
+                'Sec-CH-UA': (
+                    '"Not/A)Brand";v="8", '
+                    '"Chromium";v="127", '
+                    '"Google Chrome";v="127"'
+                ),
+                'Sec-CH-UA-Mobile': '?0',
+                'Sec-CH-UA-Platform': '"Windows"'
+            })

+ 
        # Add Accept-Language header tied to the current config if requested
        if self.lang_interface:
            headers['Accept-Language'] = (
@ -393,9 +428,13 @@ class Request:
                    "Error raised during Tor connection validation",
                    disable=True)

+        search_base = base_url or self.search_url
+        if not base_url and ('tbm=isch' in query or 'udm=2' in query):
+            search_base = self.image_search_url
+
        try:
            response = self.http_client.get(
-                (base_url or self.search_url) + query,
+                search_base + query,
                headers=headers,
                cookies=consent_cookies)
        except httpx.HTTPError as e:
@ -406,6 +445,6 @@ class Request:
            attempt += 1
            if attempt > 10:
                raise TorError("Tor query failed -- max attempts exceeded 10")
-            return self.send((base_url or self.search_url), query, attempt)
+            return self.send(search_base, query, attempt)

        return response
--- a/app/templates/imageresults.html
+++ b/app/templates/imageresults.html
@ -10,9 +10,9 @@
      background-color: #fff;
    }
    body {
-      padding: 0 8px;
+      padding: 0 12px;
      margin: 0 auto;
-      max-width: 736px;
+      max-width: 1200px;
    }
    a {
      text-decoration: none;
@ -167,6 +167,7 @@
      border-collapse: collapse;
      border-spacing: 0;
      width: 100%;
+      table-layout: fixed;
    }
    .X6ZCif {
      color: #202124;
@ -209,15 +210,20 @@
      text-align: center;
    }
    .RAyV4b {
-      line-height: 140px;
-      overflow: "hidden";
+      height: 220px;
+      line-height: 220px;
+      overflow: hidden;
      text-align: center;
    }
    .t0fcAb {
      text-align: center;
      margin: auto;
      vertical-align: middle;
-      object-fit: contain;
+      object-fit: cover;
+      max-width: 100%;
+      height: auto;
+      max-height: 220px;
+      display: block;
    }
    .Tor4Ec {
      padding-top: 2px;
@ -313,6 +319,24 @@
    a .CVA68e:hover {
      text-decoration: underline;
    }
+    .e3goi {
+      width: 25%;
+      padding: 10px;
+      box-sizing: border-box;
+    }
+    .svla5d {
+      max-width: 100%;
+    }
+    @media (max-width: 900px) {
+      .e3goi {
+        width: 50%;
+      }
+    }
+    @media (max-width: 600px) {
+      .e3goi {
+        width: 100%;
+      }
+    }
  </style>
  <div>
    <div>
--- a/app/utils/search.py
+++ b/app/utils/search.py
@ -140,7 +140,8 @@ class Search:
                                root_url=root_url,
                                mobile=mobile,
                                config=self.config,
-                                query=self.query)
+                                query=self.query,
+                                page_url=self.request.url)
        full_query = gen_query(self.query,
                               self.request_params,
                               self.config)
@ -148,8 +149,10 @@ class Search:

        # force mobile search when view image is true and
        # the request is not already made by a mobile
-        view_image = ('tbm=isch' in full_query
-                      and self.config.view_image)
+        is_image_query = ('tbm=isch' in full_query) or ('udm=2' in full_query)
+        # Always parse image results when hitting the images endpoint (udm=2)
+        # to avoid Google returning only text/AI blocks.
+        view_image = is_image_query

        client = self.user_request or g.user_request
        get_body = client.send(query=full_query,
@ -194,4 +197,3 @@ class Search:
            link['href'] += param_str

        return str(formatted_results)
-