From 9b3a6ce550cd5f9438f4b187601a22a90e3d7194 Mon Sep 17 00:00:00 2001 From: Don-Swanson <32144818+Don-Swanson@users.noreply.github.com> Date: Sun, 23 Nov 2025 20:35:08 -0600 Subject: [PATCH] Update README and codebase to enhance User Agent handling - Revised README to reflect changes in Google search behavior and Whoogle's response strategies. - Implemented a User Agent pool for improved request handling, including fallback mechanisms. - Added configuration options for displaying the User Agent in search results. - Introduced a command-line tool for generating custom User Agent strings. - Enhanced request headers to include additional parameters for better compatibility with Google services. --- README.md | 157 ++++++++++++- app/__init__.py | 11 + app/models/config.py | 9 +- app/request.py | 117 +++++++--- app/routes.py | 8 + app/templates/footer.html | 3 + app/templates/index.html | 5 + app/utils/misc.py | 2 +- app/utils/ua_generator.py | 359 ++++++++++++++++++++++++++++++ misc/check_google_user_agents.py | 363 +++++++++++++++++++++++++++++++ misc/generate_uas.py | 210 ++++++++++++++++++ test/conftest.py | 35 +++ test/mock_google.py | 136 ++++++++++++ 13 files changed, 1379 insertions(+), 36 deletions(-) create mode 100644 app/utils/ua_generator.py create mode 100755 misc/check_google_user_agents.py create mode 100755 misc/generate_uas.py create mode 100644 test/mock_google.py diff --git a/README.md b/README.md index 7e7b864..037771f 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ >[!WARNING] > ->As of 16 January, 2025, Google seemingly no longer supports performing search queries without JavaScript enabled. This is a fundamental part of how Whoogle +>Since 16 January, 2025, Google has been attacking the ability to perform search queries without JavaScript enabled. 
This is a fundamental part of how Whoogle >works -- Whoogle requests the JavaScript-free search results, then filters out garbage from the results page and proxies all external content for the user. > ->This is possibly a breaking change that will mean the end for Whoogle. I'll continue monitoring the status of their JS-free results and looking into workarounds, ->and will make another post if a solution is found (or not). +>This is possibly a breaking change that may mean the end for Whoogle. We'll continue fighting back and releasing workarounds until all workarounds are +>exhausted or a better method is found. ___ @@ -68,7 +68,12 @@ Contents - POST request search and suggestion queries (when possible) - View images at full res without site redirect (currently mobile only) - Light/Dark/System theme modes (with support for [custom CSS theming](https://github.com/benbusby/whoogle-search/wiki/User-Contributed-CSS-Themes)) -- Randomly generated User Agent +- Auto-generated Opera User Agents with random rotation + - 10 unique Opera-based UAs generated on startup from 115 language variants + - Randomly rotated for each search request to avoid detection patterns + - Cached across restarts with configurable refresh options + - Fallback to safe default UA if generation fails + - Optional display of current UA in search results footer - Easy to install/deploy - DDG-style bang (i.e. `! `) searches - User-defined [custom bangs](#custom-bangs) @@ -437,9 +442,12 @@ There are a few optional environment variables available for customizing a Whoog | WHOOGLE_PROXY_PASS | The password of the proxy server. | | WHOOGLE_PROXY_TYPE | The type of the proxy server. Can be "socks5", "socks4", or "http". | | WHOOGLE_PROXY_LOC | The location of the proxy server (host or ip). | -| WHOOGLE_USER_AGENT | The desktop user agent to use. Defaults to a randomly generated one. | -| WHOOGLE_USER_AGENT_MOBILE | The mobile user agent to use. Defaults to a randomly generated one. 
| +| WHOOGLE_USER_AGENT | The desktop user agent to use when using 'env_conf' option. Leave empty to use auto-generated Opera UAs. | +| WHOOGLE_USER_AGENT_MOBILE | The mobile user agent to use when using 'env_conf' option. Leave empty to use auto-generated Opera UAs. | | WHOOGLE_USE_CLIENT_USER_AGENT | Enable to use your own user agent for all requests. Defaults to false. | +| WHOOGLE_UA_CACHE_PERSISTENT | Whether to persist auto-generated UAs across restarts. Set to '0' to regenerate on each startup. Default '1'. | +| WHOOGLE_UA_CACHE_REFRESH_DAYS | Auto-refresh UA cache after N days. Set to '0' to never refresh (cache persists indefinitely). Default '0'. | +| WHOOGLE_UA_LIST_FILE | Path to text file containing custom UA strings (one per line). When set, uses these instead of auto-generated UAs. | | WHOOGLE_REDIRECTS | Specify sites that should be redirected elsewhere. See [custom redirecting](#custom-redirecting). | | EXPOSE_PORT | The port where Whoogle will be exposed. | | HTTPS_ONLY | Enforce HTTPS. (See [here](https://github.com/benbusby/whoogle-search#https-enforcement)) | @@ -491,6 +499,7 @@ These environment variables allow setting default config values, but can be over | WHOOGLE_CONFIG_PREFERENCES_ENCRYPTED | Encrypt preferences token, requires preferences key | | WHOOGLE_CONFIG_PREFERENCES_KEY | Key to encrypt preferences in URL (REQUIRED to show url) | | WHOOGLE_CONFIG_ANON_VIEW | Include the "anonymous view" option for each search result | +| WHOOGLE_CONFIG_SHOW_USER_AGENT | Display the User Agent string used for search in results footer | ## Usage Same as most search engines, with the exception of filtering by time range. @@ -662,6 +671,141 @@ Whoogle can optionally serve a single bundled CSS and JS to reduce the number of - When disabled (default), templates load individual CSS/JS files for easier development. - Note: Theme CSS (`*-theme.css`) are still loaded separately to honor user theme selection. 
+## User Agent Generator Tool + +A standalone command-line tool is available for generating Opera User Agent strings on demand: + +```bash +# Generate 10 User Agent strings (default) +python misc/generate_uas.py + +# Generate custom number of UAs +python misc/generate_uas.py 20 +``` + +This tool is useful for: +- Testing different UA strings +- Generating UAs for other projects +- Verifying UA generation patterns +- Debugging UA-related issues + +## Using Custom User Agent Lists + +Instead of using auto-generated Opera UA strings, you can provide your own list of User Agent strings for Whoogle to use. + +### Setup + +1. Create a text file with your preferred UA strings (one per line): + +``` +Opera/9.80 (J2ME/MIDP; Opera Mini/4.2.13337/22.478; U; en) Presto/2.4.15 Version/10.00 +Opera/9.80 (Android; Linux; Opera Mobi/498; U; en) Presto/2.12.423 Version/10.1 +Opera/9.30 (Nintendo Wii; U; ; 3642; en) +``` + +2. Set the `WHOOGLE_UA_LIST_FILE` environment variable to point to your file: + +```bash +# Docker +docker run -e WHOOGLE_UA_LIST_FILE=/config/my_user_agents.txt ... + +# Docker Compose +environment: + - WHOOGLE_UA_LIST_FILE=/config/my_user_agents.txt + +# Manual/systemd +export WHOOGLE_UA_LIST_FILE=/path/to/my_user_agents.txt +``` + +### Priority Order + +Whoogle uses the following priority when loading User Agent strings: + +1. **Custom UA list file** (if `WHOOGLE_UA_LIST_FILE` is set and valid) +2. **Cached auto-generated UAs** (if cache exists and is valid) +3. 
**Newly generated UAs** (if no cache or cache expired) + +### Tips + +- You can use the output from `misc/check_google_user_agents.py` as your custom UA list +- Generate a list with `python misc/generate_uas.py 50 2>/dev/null > my_uas.txt` +- Mix different UA types (Opera, Firefox, Chrome) for more variety +- Keep the file readable by Whoogle (proper permissions) +- One UA string per line, blank lines are ignored + +### Example Workflow + +```bash +# Generate and test UAs, save working ones +python misc/generate_uas.py 100 2>/dev/null > candidate_uas.txt +python misc/check_google_user_agents.py candidate_uas.txt --output working_uas.txt + +# Use the working UAs with Whoogle +export WHOOGLE_UA_LIST_FILE=./working_uas.txt +./run +``` + +## User Agent Testing Tool + +Whoogle now includes a comprehensive testing tool (`misc/check_google_user_agents.py`) to verify which User Agent strings successfully return Google search results without triggering blocks, JavaScript-only pages, or browser upgrade prompts. 
+ +### Usage + +```bash +# Test all UAs from a file +python misc/check_google_user_agents.py UAs.txt + +# Save working UAs to a file (appends incrementally) +python misc/check_google_user_agents.py UAs.txt --output working_uas.txt + +# Use a specific search query +python misc/check_google_user_agents.py UAs.txt --query "python programming" + +# Verbose mode to see detailed results +python misc/check_google_user_agents.py UAs.txt --output working.txt --verbose + +# Adjust delay between requests (default: 0.5 seconds) +python misc/check_google_user_agents.py UAs.txt --delay 1.0 + +# Set request timeout (default: 10 seconds) +python misc/check_google_user_agents.py UAs.txt --timeout 15.0 +``` + +### Features + +- **Incremental Results**: Working UAs are saved immediately to the output file (append mode), so progress is preserved even if interrupted +- **Duplicate Detection**: Automatically skips UAs already in the output file when resuming +- **Random Query Cycling**: By default, cycles through diverse search queries to simulate realistic usage patterns +- **Rate Limit Detection**: Detects and reports Google rate limiting with recovery instructions +- **Comprehensive Validation**: Checks for: + - HTTP status codes (blocks, server errors, rate limits) + - Block markers (unusual traffic, upgrade browser messages) + - Success markers (actual search result HTML elements) + - JavaScript-only pages and redirects + - Response size validation + +### Testing Methodology + +The tool evaluates UAs against multiple criteria: + +1. **HTTP Status**: Rejects 4xx/5xx errors, detects 429 rate limits +2. **Block Detection**: Searches for Google's block messages (CAPTCHA, unusual traffic, etc.) +3. **JavaScript Detection**: Identifies JS-only pages and noscript redirects +4. **Result Validation**: Confirms presence of actual search result HTML elements +5. 
**Content Analysis**: Validates response size and structure + +This tool was used to discover and validate the working Opera UA patterns that power Whoogle's auto-generation feature. + +## Known Issues + +### User Agent Strings and Image Search + +**Issue**: Most, if not all, of the auto-generated Opera User Agent strings may fail when performing **image searches** on Google. This appears to be a limitation with how Google's image search validates User Agent strings. + +**Impact**: +- Regular web searches work correctly with generated UAs +- Image search may return errors or no results + ## Contributing Under the hood, Whoogle is a basic Flask app with the following structure: @@ -675,6 +819,7 @@ Under the hood, Whoogle is a basic Flask app with the following structure: - `results.py`: Utility functions for interpreting/modifying individual search results - `search.py`: Creates and handles new search queries - `session.py`: Miscellaneous methods related to user sessions + - `ua_generator.py`: Auto-generates Opera User Agent strings with pattern-based randomization - `templates/` - `index.html`: The home page template - `display.html`: The search results template diff --git a/app/__init__.py b/app/__init__.py index 34dad77..0cf9b66 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -3,6 +3,7 @@ from app.request import send_tor_signal from app.utils.session import generate_key from app.utils.bangs import gen_bangs_json, load_all_bangs from app.utils.misc import gen_file_hash, read_config_bool +from app.utils.ua_generator import load_ua_pool from base64 import b64encode from bs4 import MarkupResemblesLocatorWarning from datetime import datetime, timedelta @@ -107,6 +108,16 @@ if not os.path.exists(app.config['BANG_PATH']): if not os.path.exists(app.config['BUILD_FOLDER']): os.makedirs(app.config['BUILD_FOLDER']) +# Initialize User Agent pool +app.config['UA_CACHE_PATH'] = os.path.join(app.config['CONFIG_PATH'], 'ua_cache.json') +try: + app.config['UA_POOL'] = 
load_ua_pool(app.config['UA_CACHE_PATH'], count=10) +except Exception as e: + # If UA pool loading fails, log warning and set empty pool + # The gen_user_agent function will handle the fallback + print(f"Warning: Could not initialize UA pool: {e}") + app.config['UA_POOL'] = [] + # Session values app_key_path = os.path.join(app.config['CONFIG_PATH'], 'whoogle.key') if os.path.exists(app_key_path): diff --git a/app/models/config.py b/app/models/config.py index 08d0e63..ed56af8 100644 --- a/app/models/config.py +++ b/app/models/config.py @@ -45,6 +45,7 @@ class Config: self.user_agent = kwargs.get('user_agent', default_ua_option) self.custom_user_agent = kwargs.get('custom_user_agent', '') self.use_custom_user_agent = kwargs.get('use_custom_user_agent', False) + self.show_user_agent = read_config_bool('WHOOGLE_CONFIG_SHOW_USER_AGENT') # Add user agent related keys to safe_keys self.safe_keys = [ @@ -63,7 +64,8 @@ class Config: 'tbs', 'user_agent', 'custom_user_agent', - 'use_custom_user_agent' + 'use_custom_user_agent', + 'show_user_agent' ] app_config = current_app.config @@ -97,7 +99,10 @@ class Config: if kwargs: mutable_attrs = self.get_mutable_attrs() for attr in mutable_attrs: - if attr in kwargs.keys(): + if attr == 'show_user_agent': + # Handle show_user_agent as boolean + self.show_user_agent = bool(kwargs.get(attr)) + elif attr in kwargs.keys(): setattr(self, attr, kwargs[attr]) elif attr not in kwargs.keys() and mutable_attrs[attr] == bool: setattr(self, attr, False) diff --git a/app/request.py b/app/request.py index 734ea95..a4dcaa9 100644 --- a/app/request.py +++ b/app/request.py @@ -1,6 +1,7 @@ from app.models.config import Config from app.utils.misc import read_config_bool from app.services.provider import get_http_client +from app.utils.ua_generator import load_ua_pool, get_random_ua, DEFAULT_FALLBACK_UA from datetime import datetime from defusedxml import ElementTree as ET import random @@ -16,8 +17,32 @@ MAPS_URL = 'https://maps.google.com/maps' 
AUTOCOMPLETE_URL = ('https://suggestqueries.google.com/' 'complete/search?client=toolbar&') -MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0' -DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0' +DEFAULT_DESKTOP_UA = ( + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) ' + 'Gecko/20100101 Firefox/131.0' +) +DEFAULT_MOBILE_UA = ( + 'Mozilla/5.0 (Linux; Android 14; Pixel 8 Pro) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/127.0.0.0 Mobile Safari/537.36' +) + +DESKTOP_UAS = [ + DEFAULT_DESKTOP_UA, + 'Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_5) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/127.0.0.0 Safari/537.36' +] +MOBILE_UAS = [ + DEFAULT_MOBILE_UA, + 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_5 like Mac OS X) ' + 'AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.5 ' + 'Mobile/15E148 Safari/604.1', + 'Mozilla/5.0 (Linux; Android 13; SM-S918B) ' + 'AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/125.0.0.0 Mobile Safari/537.36' +] # Valid query params VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source', 'nfpr'] @@ -73,8 +98,8 @@ def send_tor_signal(signal: Signal) -> bool: def gen_user_agent(config, is_mobile) -> str: - # Define the default PlayStation Portable user agent (replaces Lynx) - DEFAULT_UA = 'Mozilla/4.0 (PSP (PlayStation Portable); 2.00)' + # Modern defaults mimic widely-used browsers so Google returns full results. 
+ default_ua = DEFAULT_MOBILE_UA if is_mobile else DEFAULT_DESKTOP_UA # If using custom user agent, return the custom string if config.user_agent == 'custom' and config.custom_user_agent: @@ -93,18 +118,39 @@ def gen_user_agent(config, is_mobile) -> str: # If env vars are not set, fall back to default return DEFAULT_UA - # If using default user agent + # If using default user agent - use auto-generated Opera UA pool if config.user_agent == 'default': - return DEFAULT_UA + try: + # Try to load UA pool from cache (lazy loading if not in app.config) + # First check if we have access to Flask app context + try: + from flask import current_app + if hasattr(current_app, 'config') and 'UA_POOL' in current_app.config: + ua_pool = current_app.config['UA_POOL'] + else: + # Fall back to loading from disk + config_path = os.environ.get('CONFIG_VOLUME', + os.path.join(os.path.dirname(os.path.abspath(__file__)), + 'static', 'config')) + cache_path = os.path.join(config_path, 'ua_cache.json') + ua_pool = load_ua_pool(cache_path, count=10) + except (ImportError, RuntimeError): + # No Flask context available, load from disk + config_path = os.environ.get('CONFIG_VOLUME', + os.path.join(os.path.dirname(os.path.abspath(__file__)), + 'static', 'config')) + cache_path = os.path.join(config_path, 'ua_cache.json') + ua_pool = load_ua_pool(cache_path, count=10) + + return get_random_ua(ua_pool) + except Exception as e: + # If anything goes wrong, fall back to default Opera UA + print(f"Warning: Could not load UA pool, using fallback Opera UA: {e}") + return DEFAULT_FALLBACK_UA # If no custom user agent is set, generate a random one (for backwards compatibility) - firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox' - linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux' - - if is_mobile: - return MOBILE_UA.format("Mozilla", firefox) - - return DESKTOP_UA.format("Mozilla", linux, firefox) + candidates = MOBILE_UAS if is_mobile else DESKTOP_UAS + return 
random.choice(candidates) def gen_query(query, args, config) -> str: @@ -324,23 +370,39 @@ class Request: modified_user_agent = self.modified_user_agent headers = { - 'User-Agent': modified_user_agent + 'User-Agent': modified_user_agent, + 'Accept': ('text/html,application/xhtml+xml,application/xml;' + 'q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8'), + 'Accept-Language': 'en-US,en;q=0.9', + 'Accept-Encoding': 'gzip, deflate, br', + 'Connection': 'keep-alive', + 'Cache-Control': 'max-age=0', + 'Pragma': 'no-cache', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Site': 'none', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-User': '?1', + 'Sec-Fetch-Dest': 'document', + 'Sec-CH-UA': ( + '"Not/A)Brand";v="8", ' + '"Chromium";v="127", ' + '"Google Chrome";v="127"' + ), + 'Sec-CH-UA-Mobile': '?0', + 'Sec-CH-UA-Platform': '"macOS"' } - # Adding the Accept-Language to the Header if possible + # Add Accept-Language header tied to the current config if requested if self.lang_interface: - headers.update({'Accept-Language': - self.lang_interface.replace('lang_', '') - + ';q=1.0'}) + headers['Accept-Language'] = ( + self.lang_interface.replace('lang_', '') + ';q=1.0' + ) - # view is suppressed correctly - now = datetime.now() - consent_cookie = 'CONSENT=PENDING+987; SOCS=CAESHAgBEhIaAB' - # Prefer header-based cookies to avoid httpx per-request cookies deprecation - if 'Cookie' in headers: - headers['Cookie'] += '; ' + consent_cookie - else: - headers['Cookie'] = consent_cookie + # Consent cookies keep Google from showing the interstitial consent wall + consent_cookies = { + 'CONSENT': 'PENDING+987', + 'SOCS': 'CAESHAgBEhIaAB' + } # Validate Tor conn and request new identity if the last one failed if self.tor and not send_tor_signal( @@ -371,7 +433,8 @@ class Request: try: response = self.http_client.get( (base_url or self.search_url) + query, - headers=headers) + headers=headers, + cookies=consent_cookies) except httpx.HTTPError as e: raise diff --git a/app/routes.py 
b/app/routes.py index d68a90c..59a14f0 100644 --- a/app/routes.py +++ b/app/routes.py @@ -544,6 +544,13 @@ def search(): 'results': results }) + # Get the user agent that was used for the search + used_user_agent = '' + if search_util.user_request: + used_user_agent = search_util.user_request.modified_user_agent + elif hasattr(g, 'user_request') and g.user_request: + used_user_agent = g.user_request.modified_user_agent + return render_template( 'display.html', has_update=app.config['HAS_UPDATE'], @@ -565,6 +572,7 @@ def search(): ) and not search_util.search_type, # Standard search queries only response=cleanresponse, version_number=app.config['VERSION_NUMBER'], + used_user_agent=used_user_agent, search_header=render_template( 'header.html', home_url=home_url, diff --git a/app/templates/footer.html b/app/templates/footer.html index f9ed4cc..4495990 100644 --- a/app/templates/footer.html +++ b/app/templates/footer.html @@ -5,5 +5,8 @@ {% if has_update %} || Update Available 🟢 {% endif %} + {% if config.show_user_agent and used_user_agent %} +
User Agent: {{ used_user_agent }} + {% endif %}

diff --git a/app/templates/index.html b/app/templates/index.html index 09dd943..d4544fa 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -258,6 +258,11 @@ +
+ + +
diff --git a/app/utils/misc.py b/app/utils/misc.py index d4e49d7..4688e53 100644 --- a/app/utils/misc.py +++ b/app/utils/misc.py @@ -36,7 +36,7 @@ def fetch_favicon(url: str) -> bytes: bytes - the favicon bytes, or a placeholder image if one was not returned """ - response = get(f'{ddg_favicon_site}/{urlparse(url).netloc}.ico') + response = httpx.get(f'{ddg_favicon_site}/{urlparse(url).netloc}.ico') if response.status_code == 200 and len(response.content) > 0: tmp_mem = io.BytesIO() diff --git a/app/utils/ua_generator.py b/app/utils/ua_generator.py new file mode 100644 index 0000000..f31851d --- /dev/null +++ b/app/utils/ua_generator.py @@ -0,0 +1,359 @@ +""" +User Agent Generator for Opera-based UA strings. + +This module generates realistic Opera User Agent strings based on patterns +found in working UA strings that successfully bypass Google's restrictions. +""" + +import json +import os +import random +from datetime import datetime, timedelta +from typing import List, Dict + + +# Default fallback UA if generation fails +DEFAULT_FALLBACK_UA = "Opera/9.30 (Nintendo Wii; U; ; 3642; en)" + +# Opera UA Pattern Templates +OPERA_PATTERNS = [ + # Opera Mini (J2ME/MIDP) + "Opera/9.80 (J2ME/MIDP; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}", + + # Opera Mobile (Android) + "Opera/9.80 (Android; Linux; Opera Mobi/{build}; U; {lang}) Presto/{presto} Version/{final}", + + # Opera Mobile (iPhone) + "Opera/9.80 (iPhone; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}", + + # Opera Mobile (iPad) + "Opera/9.80 (iPad; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}", + + # Opera on Nintendo Wii + "Opera/9.30 (Nintendo Wii; U; ; {code}; {lang})", + + # Opera Mobile (S60/SymbOS) + "Opera/9.80 (S60; SymbOS; Opera Mobi/{build}; U; {lang}) Presto/{presto} Version/{final}", + + # Opera Mobile (Series 60) + "Opera/9.80 (Series 60; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}", + + 
# Opera Mobile (BlackBerry) + "Opera/9.80 (BlackBerry; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}", + + # Opera Mobile (Windows Mobile) + "Opera/9.80 (Windows Mobile; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}", +] + +# Randomization pools based on working UAs +OPERA_MINI_VERSIONS = [ + "4.0", "4.1.11321", "4.1.12965", "4.1.13573", "4.1.13907", "4.1.14287", + "4.1.15082", "4.2.13057", "4.2.13221", "4.2.13265", "4.2.13337", + "4.2.13400", "4.2.13918", "4.2.13943", "4.2.14320", "4.2.14409", + "4.2.14753", "4.2.14881", "4.2.14885", "4.2.14912", "4.2.15066", + "4.2.15410", "4.2.16007", "4.2.16320", "4.2.18887", "4.2.19634", + "4.2.21465", "4.2.22228", "4.2.23453", "4.2.24721", "4.3.13337", + "4.3.24214", "4.4.26736", "4.4.29476", "4.5.33867", "4.5.40312", + "5.0.15650", "5.0.16823", "5.0.17381", "5.0.17443", "5.0.18635", + "5.0.18741", "5.0.19683", "5.0.19693", "5.0.20873", "5.0.22349", + "5.1.21051", "5.1.21126", "5.1.21214", "5.1.21415", "5.1.21594", + "5.1.21595", "5.1.22296", "5.1.22303", "5.1.22396", "5.1.22460", + "5.1.22783", "5.1.22784", "6.0.24095", "6.0.24212", "6.0.24455", + "6.1.25375", "6.1.25378", "6.1.25759", "6.24093", "6.24096", + "6.24209", "6.24288", "6.5.26955", "6.5.29702", "7.0.29952", + "7.1.32052", "7.1.32444", "7.1.32694", "7.29530", "7.5.33361", + "7.6.35766", "9.80", "36.2.2254" +] + +OPERA_MOBI_BUILDS = [ + "27", "49", "447", "498", "1181", "1209", "3730", + "ADR-1011151731", "ADR-1012211514", "ADR-1012221546", "ADR-1012272315", + "SYB-1103211396", "SYB-1104061449", "SYB-1107071606", + "ADR-1111101157" +] + +BUILD_NUMBERS = [ + "18.678", "18.684", "18.738", "18.794", "19.892", "19.916", + "20.2477", "20.2479", "20.2485", "20.2489", "21.529", "22.387", + "22.394", "22.401", "22.414", "22.453", "22.478", "23.317", + "23.333", "23.334", "23.377", "23.390", "24.741", "24.743", + "24.746", "24.783", "24.838", "24.871", "24.899", "25.657", + "25.677", "25.729", "25.872", "26.1305", 
"27.1366", "27.1407", + "27.1573", "28.2075", "28.2555", "28.2647", "28.2766", "29.3594", + "30.3316", "31.1350", "35.2883", "35.5706", "37.6584", "119.132", + "170.51", "170.54", "764", "870", "886", "490", "503" +] + +PRESTO_VERSIONS = [ + "2.2.0", "2.4.15", "2.4.154.15", "2.4.18", "2.5.25", "2.5.28", + "2.6.35", "2.7.60", "2.7.81", "2.8.119", "2.8.149", "2.8.191", + "2.9.201", "2.12.423" +] + +FINAL_VERSIONS = [ + "10.00", "10.1", "10.5", "10.54", "10.5454", "11.00", "11.10", + "12.02", "12.16", "13.00" +] + +LANGUAGES = [ + # English variants + "en", "en-US", "en-GB", "en-CA", "en-AU", "en-NZ", "en-ZA", "en-IN", "en-SG", + # Western European + "de", "de-DE", "de-AT", "de-CH", + "fr", "fr-FR", "fr-CA", "fr-BE", "fr-CH", "fr-LU", + "es", "es-ES", "es-MX", "es-AR", "es-CO", "es-CL", "es-PE", "es-VE", "es-LA", + "it", "it-IT", "it-CH", + "pt", "pt-PT", "pt-BR", + "nl", "nl-NL", "nl-BE", + # Nordic languages + "da", "da-DK", + "sv", "sv-SE", + "no", "no-NO", "nb", "nn", + "fi", "fi-FI", + "is", "is-IS", + # Eastern European + "pl", "pl-PL", + "cs", "cs-CZ", + "sk", "sk-SK", + "hu", "hu-HU", + "ro", "ro-RO", + "bg", "bg-BG", + "hr", "hr-HR", + "sr", "sr-RS", + "sl", "sl-SI", + "uk", "uk-UA", + "ru", "ru-RU", + # Asian languages + "zh", "zh-CN", "zh-TW", "zh-HK", + "ja", "ja-JP", + "ko", "ko-KR", + "th", "th-TH", + "vi", "vi-VN", + "id", "id-ID", + "ms", "ms-MY", + "fil", "tl", + # Middle Eastern + "tr", "tr-TR", + "ar", "ar-SA", "ar-AE", "ar-EG", + "he", "he-IL", + "fa", "fa-IR", + # Other + "hi", "hi-IN", + "bn", "bn-IN", + "ta", "ta-IN", + "te", "te-IN", + "mr", "mr-IN", + "el", "el-GR", + "ca", "ca-ES", + "eu", "eu-ES" +] + +WII_CODES = [ + "1038-58", "1309-9", "1621", "2047-7", "2071", "2077-4", "3642" +] + + +def generate_opera_ua() -> str: + """ + Generate a single random Opera User Agent string. 
+ + Returns: + str: A randomly generated Opera UA string + """ + pattern = random.choice(OPERA_PATTERNS) + + # Determine which parameters to use based on the pattern + params = { + 'lang': random.choice(LANGUAGES) + } + + # Nintendo Wii pattern + if "Nintendo Wii" in pattern: + params['code'] = random.choice(WII_CODES) + else: + # Other patterns + if '{version}' in pattern: + params['version'] = random.choice(OPERA_MINI_VERSIONS) + + if '{build}' in pattern: + # Use MOBI build for "Opera Mobi", regular build for "Opera Mini" + if "Opera Mobi" in pattern: + params['build'] = random.choice(OPERA_MOBI_BUILDS) + else: + params['build'] = random.choice(BUILD_NUMBERS) + + if '{presto}' in pattern: + params['presto'] = random.choice(PRESTO_VERSIONS) + + if '{final}' in pattern: + params['final'] = random.choice(FINAL_VERSIONS) + + return pattern.format(**params) + + +def generate_ua_pool(count: int = 10) -> List[str]: + """ + Generate a pool of unique Opera User Agent strings. + + Args: + count: Number of UA strings to generate (default: 10) + + Returns: + List[str]: List of unique UA strings + """ + ua_pool = set() + + # Keep generating until we have enough unique UAs + # Add safety limit to prevent infinite loop + max_attempts = count * 100 + attempts = 0 + + try: + while len(ua_pool) < count and attempts < max_attempts: + ua = generate_opera_ua() + ua_pool.add(ua) + attempts += 1 + except Exception: + # If generation fails entirely, return at least the default fallback + if not ua_pool: + return [DEFAULT_FALLBACK_UA] + + # If we couldn't generate enough, fill remaining with default + result = list(ua_pool) + while len(result) < count: + result.append(DEFAULT_FALLBACK_UA) + + return result + + +def save_ua_pool(uas: List[str], cache_path: str) -> None: + """ + Save UA pool to cache file. 
+ + Args: + uas: List of UA strings to save + cache_path: Path to cache file + """ + cache_data = { + 'generated_at': datetime.now().isoformat(), + 'user_agents': uas + } + + # Ensure directory exists + cache_dir = os.path.dirname(cache_path) + if cache_dir and not os.path.exists(cache_dir): + os.makedirs(cache_dir, exist_ok=True) + + with open(cache_path, 'w', encoding='utf-8') as f: + json.dump(cache_data, f, indent=2) + + +def load_custom_ua_list(file_path: str) -> List[str]: + """ + Load custom UA list from a text file. + + Args: + file_path: Path to text file containing UA strings (one per line) + + Returns: + List[str]: List of UA strings, or empty list if file is invalid + """ + try: + with open(file_path, 'r', encoding='utf-8') as f: + uas = [line.strip() for line in f if line.strip()] + + # Validate that we have at least one UA + if not uas: + return [] + + return uas + except (FileNotFoundError, PermissionError, UnicodeDecodeError): + return [] + + +def load_ua_pool(cache_path: str, count: int = 10) -> List[str]: + """ + Load UA pool from custom list file, cache, or generate new one. + + Priority order: + 1. Custom UA list file (if WHOOGLE_UA_LIST_FILE is set) + 2. Cached auto-generated UAs + 3. 
Newly generated UAs + + Args: + cache_path: Path to cache file + count: Number of UAs to generate if cache is invalid (default: 10) + + Returns: + List[str]: List of UA strings + """ + # Check for custom UA list file first (highest priority) + custom_ua_file = os.environ.get('WHOOGLE_UA_LIST_FILE', '').strip() + if custom_ua_file: + custom_uas = load_custom_ua_list(custom_ua_file) + if custom_uas: + # Custom list loaded successfully + return custom_uas + else: + # Custom file specified but invalid, log warning and fall back + print(f"Warning: Custom UA list file '{custom_ua_file}' not found or invalid, falling back to auto-generated UAs") + + # Check if we should use cache + use_cache = os.environ.get('WHOOGLE_UA_CACHE_PERSISTENT', '1') == '1' + refresh_days = int(os.environ.get('WHOOGLE_UA_CACHE_REFRESH_DAYS', '0')) + + # If cache disabled, always generate new + if not use_cache: + uas = generate_ua_pool(count) + save_ua_pool(uas, cache_path) + return uas + + # Try to load from cache + if os.path.exists(cache_path): + try: + with open(cache_path, 'r', encoding='utf-8') as f: + cache_data = json.load(f) + + # Check if cache is expired (if refresh_days > 0) + if refresh_days > 0: + generated_at = datetime.fromisoformat(cache_data['generated_at']) + age_days = (datetime.now() - generated_at).days + + if age_days >= refresh_days: + # Cache expired, generate new + uas = generate_ua_pool(count) + save_ua_pool(uas, cache_path) + return uas + + # Cache is valid, return it + return cache_data['user_agents'] + except (json.JSONDecodeError, KeyError, ValueError): + # Cache file is corrupted, generate new + pass + + # No valid cache, generate new + uas = generate_ua_pool(count) + save_ua_pool(uas, cache_path) + return uas + + +def get_random_ua(ua_pool: List[str]) -> str: + """ + Get a random UA from the pool. 
+ + Args: + ua_pool: List of UA strings + + Returns: + str: Random UA string from the pool + """ + if not ua_pool: + # Fallback to generating one if pool is empty + try: + return generate_opera_ua() + except Exception: + # If generation fails, use default fallback + return DEFAULT_FALLBACK_UA + + return random.choice(ua_pool) + diff --git a/misc/check_google_user_agents.py b/misc/check_google_user_agents.py new file mode 100755 index 0000000..54e2603 --- /dev/null +++ b/misc/check_google_user_agents.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python3 +""" +Test User Agent strings against Google to find which ones return actual search results +instead of JavaScript pages or upgrade browser messages. + +Usage: + python check_google_user_agents.py UA_FILE [--output OUTPUT_FILE] [--query QUERY] +""" + +import argparse +import random +import sys +import time +from typing import List, Tuple +import requests + +# Common search queries to cycle through for more realistic testing +DEFAULT_SEARCH_QUERIES = [ + "python programming", + "weather today", + "news", + "how to cook pasta", + "best movies 2025", + "restaurants near me", + "translate hello", + "calculator", + "time", + "maps", + "images", + "videos", + "shopping", + "travel", + "sports scores", + "stock market", + "recipes", + "music", + "books", + "technology", + "AI", + "AI programming", + "Why does google hate users?" +] + +# Markers that indicate blocked/JS pages +BLOCK_MARKERS = [ + "unusual traffic", + "sorry but your computer", + "solve the captcha", + "request looks automated", + "g-recaptcha", + "upgrade your browser", + "browser is not supported", + "please upgrade", + "isn't supported", + "isn\"t supported", # With escaped quote + "upgrade to a recent version", + "update your browser", + "your browser isn't supported", +] + +# Markers that indicate actual search results +SUCCESS_MARKERS = [ + '