mirror of
https://github.com/benbusby/whoogle-search.git
synced 2026-03-11 08:54:34 +00:00
Compare commits
28 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2949510d68 | ||
|
|
255f1a2c12 | ||
|
|
4852e5b64f | ||
|
|
9c5b3150aa | ||
|
|
6c7ca7c082 | ||
|
|
ff3a44b91e | ||
|
|
b3c09ade5c | ||
|
|
a2ec4e9f22 | ||
|
|
db6d031e86 | ||
|
|
c96f5ada2e | ||
|
|
ccdeb60fc0 | ||
|
|
20ed493671 | ||
|
|
20753224f3 | ||
|
|
71a2c10e58 | ||
|
|
9ff2d2f90a | ||
|
|
0f000a676b | ||
|
|
7b56aa053b | ||
|
|
f9f54115e3 | ||
|
|
c008090d83 | ||
|
|
6bcde23501 | ||
|
|
3698d9065e | ||
|
|
cffef7aa15 | ||
|
|
178d67a73f | ||
|
|
65326e37b4 | ||
|
|
490fc6c4f9 | ||
|
|
9b3a6ce550 | ||
|
|
5f17b82735 | ||
|
|
00d8aec2fb |
33 changed files with 2542 additions and 629 deletions
53
.github/workflows/buildx.yml
vendored
53
.github/workflows/buildx.yml
vendored
|
|
@ -3,7 +3,7 @@ name: buildx
|
|||
on:
|
||||
workflow_run:
|
||||
workflows: ["docker_main"]
|
||||
branches: [main]
|
||||
branches: [main, updates]
|
||||
types:
|
||||
- completed
|
||||
push:
|
||||
|
|
@ -20,20 +20,26 @@ jobs:
|
|||
- name: Wait for tests to succeed
|
||||
if: ${{ github.event.workflow_run.conclusion != 'success' && startsWith(github.ref, 'refs/tags') != true }}
|
||||
run: exit 1
|
||||
- name: Debug workflow context
|
||||
run: |
|
||||
echo "Event name: ${{ github.event_name }}"
|
||||
echo "Ref: ${{ github.ref }}"
|
||||
echo "Actor: ${{ github.actor }}"
|
||||
echo "Branch: ${{ github.event.workflow_run.head_branch }}"
|
||||
echo "Conclusion: ${{ github.event.workflow_run.conclusion }}"
|
||||
- name: checkout code
|
||||
uses: actions/checkout@v2
|
||||
- name: install buildx
|
||||
id: buildx
|
||||
uses: crazy-max/ghaction-docker-buildx@v1
|
||||
with:
|
||||
version: latest
|
||||
uses: actions/checkout@v4
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v1
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
- name: Login to ghcr.io
|
||||
uses: docker/login-action@v1
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
|
|
@ -50,42 +56,37 @@ jobs:
|
|||
# docker buildx build --push \
|
||||
# --tag ghcr.io/benbusby/whoogle-search:latest \
|
||||
# --platform linux/amd64,linux/arm64 .
|
||||
- name: build and push updates branch (update-testing tag)
|
||||
if: github.event_name == 'workflow_run' && github.event.workflow_run.head_branch == 'updates' && github.event.workflow_run.conclusion == 'success' && (github.event.workflow_run.actor.login == 'benbusby' || github.event.workflow_run.actor.login == 'Don-Swanson')
|
||||
run: |
|
||||
docker buildx build --push \
|
||||
--tag benbusby/whoogle-search:update-testing \
|
||||
--tag ghcr.io/benbusby/whoogle-search:update-testing \
|
||||
--platform linux/amd64,linux/arm64 .
|
||||
- name: build and push release (version + latest)
|
||||
if: github.event_name == 'release' && github.event.release.prerelease == false && (github.actor == 'benbusby' || github.actor == 'Don-Swanson')
|
||||
run: |
|
||||
TAG="${{ github.event.release.tag_name }}"
|
||||
VERSION="${TAG#v}"
|
||||
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
|
||||
docker buildx ls
|
||||
docker buildx build --push \
|
||||
--tag benbusby/whoogle-search:${VERSION} \
|
||||
--tag benbusby/whoogle-search:latest \
|
||||
--platform linux/amd64,linux/arm/v7,linux/arm64 .
|
||||
docker buildx build --push \
|
||||
--tag ghcr.io/benbusby/whoogle-search:${VERSION} \
|
||||
--tag ghcr.io/benbusby/whoogle-search:latest \
|
||||
--platform linux/amd64,linux/arm/v7,linux/arm64 .
|
||||
--platform linux/amd64,linux/arm64 .
|
||||
- name: build and push pre-release (version only)
|
||||
if: github.event_name == 'release' && github.event.release.prerelease == true && (github.actor == 'benbusby' || github.actor == 'Don-Swanson')
|
||||
run: |
|
||||
TAG="${{ github.event.release.tag_name }}"
|
||||
VERSION="${TAG#v}"
|
||||
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
|
||||
docker buildx ls
|
||||
docker buildx build --push \
|
||||
--tag benbusby/whoogle-search:${VERSION} \
|
||||
--platform linux/amd64,linux/arm/v7,linux/arm64 .
|
||||
docker buildx build --push \
|
||||
--tag ghcr.io/benbusby/whoogle-search:${VERSION} \
|
||||
--platform linux/amd64,linux/arm/v7,linux/arm64 .
|
||||
--platform linux/amd64,linux/arm64 .
|
||||
- name: build and push tag
|
||||
if: startsWith(github.ref, 'refs/tags')
|
||||
run: |
|
||||
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
|
||||
docker buildx ls
|
||||
docker buildx build --push \
|
||||
--tag benbusby/whoogle-search:${GITHUB_REF#refs/*/v}\
|
||||
--platform linux/amd64,linux/arm/v7,linux/arm64 .
|
||||
docker buildx build --push \
|
||||
--tag ghcr.io/benbusby/whoogle-search:${GITHUB_REF#refs/*/v}\
|
||||
--platform linux/amd64,linux/arm/v7,linux/arm64 .
|
||||
--tag benbusby/whoogle-search:${GITHUB_REF#refs/*/v} \
|
||||
--tag ghcr.io/benbusby/whoogle-search:${GITHUB_REF#refs/*/v} \
|
||||
--platform linux/amd64,linux/arm64 .
|
||||
|
|
|
|||
5
.github/workflows/docker_main.yml
vendored
5
.github/workflows/docker_main.yml
vendored
|
|
@ -3,7 +3,7 @@ name: docker_main
|
|||
on:
|
||||
workflow_run:
|
||||
workflows: ["tests"]
|
||||
branches: [main]
|
||||
branches: [main, updates]
|
||||
types:
|
||||
- completed
|
||||
|
||||
|
|
@ -11,9 +11,10 @@ on:
|
|||
jobs:
|
||||
on-success:
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ github.event.workflow_run.conclusion == 'success' }}
|
||||
steps:
|
||||
- name: checkout code
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v4
|
||||
- name: build and test (docker)
|
||||
run: |
|
||||
docker build --tag whoogle-search:test .
|
||||
|
|
|
|||
2
.github/workflows/pypi.yml
vendored
2
.github/workflows/pypi.yml
vendored
|
|
@ -80,4 +80,4 @@ jobs:
|
|||
if: steps.check_tag.outputs.is_stable == 'true'
|
||||
uses: pypa/gh-action-pypi-publish@master
|
||||
with:
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
23
Dockerfile
23
Dockerfile
|
|
@ -1,4 +1,14 @@
|
|||
FROM python:3.12.6-alpine3.20 AS builder
|
||||
# NOTE: ARMv7 support has been dropped due to lack of pre-built cryptography wheels for Alpine/musl.
|
||||
# To restore ARMv7 support for local builds:
|
||||
# 1. Change requirements.txt:
|
||||
# cryptography==3.3.2; platform_machine == 'armv7l'
|
||||
# cryptography==46.0.1; platform_machine != 'armv7l'
|
||||
# pyOpenSSL==19.1.0; platform_machine == 'armv7l'
|
||||
# pyOpenSSL==25.3.0; platform_machine != 'armv7l'
|
||||
# 2. Add linux/arm/v7 to --platform flag when building:
|
||||
# docker buildx build --platform linux/amd64,linux/arm/v7,linux/arm64 .
|
||||
|
||||
FROM python:3.12-alpine3.22 AS builder
|
||||
|
||||
RUN apk --no-cache add \
|
||||
build-base \
|
||||
|
|
@ -12,13 +22,16 @@ COPY requirements.txt .
|
|||
RUN pip install --upgrade pip
|
||||
RUN pip install --prefix /install --no-warn-script-location --no-cache-dir -r requirements.txt
|
||||
|
||||
FROM python:3.12.6-alpine3.20
|
||||
FROM python:3.12-alpine3.22
|
||||
|
||||
RUN apk add --no-cache tor curl openrc libstdc++
|
||||
# Remove bridge package to avoid CVEs (not needed for Docker containers)
|
||||
RUN apk add --no-cache --no-scripts tor curl openrc libstdc++ && \
|
||||
apk del --no-cache bridge || true
|
||||
# git go //for obfs4proxy
|
||||
# libcurl4-openssl-dev
|
||||
|
||||
RUN apk --no-cache upgrade
|
||||
RUN pip install --upgrade pip
|
||||
RUN apk --no-cache upgrade && \
|
||||
apk del --no-cache --rdepends bridge || true
|
||||
|
||||
# uncomment to build obfs4proxy
|
||||
# RUN git clone https://gitlab.com/yawning/obfs4.git
|
||||
|
|
|
|||
|
|
@ -1,137 +0,0 @@
|
|||
# Mullvad Leta Backend Integration
|
||||
|
||||
## Overview
|
||||
|
||||
Whoogle Search now supports using Mullvad Leta (https://leta.mullvad.net) as an alternative search backend. This provides an additional privacy-focused search option that routes queries through Mullvad's infrastructure.
|
||||
|
||||
## Features
|
||||
|
||||
- **Backend Selection**: Users can choose between Google (default) and Mullvad Leta as the search backend
|
||||
- **Privacy-Focused**: Leta is designed for privacy and doesn't track searches
|
||||
- **Seamless Integration**: Results from Leta are automatically converted to Whoogle's display format
|
||||
- **Automatic Tab Filtering**: Image, video, news, and map tabs are automatically hidden when using Leta (as these are not supported)
|
||||
|
||||
## Limitations
|
||||
|
||||
When using the Mullvad Leta backend, the following search types are **NOT supported**:
|
||||
- Image search (`tbm=isch`)
|
||||
- Video search (`tbm=vid`)
|
||||
- News search (`tbm=nws`)
|
||||
- Map search
|
||||
|
||||
Attempting to use these search types with Leta enabled will show an error message and redirect to the home page.
|
||||
|
||||
## Configuration
|
||||
|
||||
### Via Web Interface
|
||||
|
||||
1. Click the "Config" button on the Whoogle home page
|
||||
2. Scroll down to find the "Use Mullvad Leta Backend" checkbox
|
||||
3. **Leta is enabled by default** - uncheck the box to use Google instead
|
||||
4. Click "Apply" to save your settings
|
||||
|
||||
### Via Environment Variable
|
||||
|
||||
Leta is **enabled by default**. To disable it and use Google instead:
|
||||
```bash
|
||||
WHOOGLE_CONFIG_USE_LETA=0
|
||||
```
|
||||
|
||||
To explicitly enable it (though it's already default):
|
||||
```bash
|
||||
WHOOGLE_CONFIG_USE_LETA=1
|
||||
```
|
||||
|
||||
## Implementation Details
|
||||
|
||||
### Files Modified
|
||||
|
||||
1. **app/models/config.py**
|
||||
- Added `use_leta` configuration option
|
||||
- Added to `safe_keys` list for URL parameter passing
|
||||
|
||||
2. **app/request.py**
|
||||
- Modified `Request.__init__()` to use Leta URL when configured
|
||||
- Added `gen_query_leta()` function to format queries for Leta's API
|
||||
- Leta uses different query parameters than Google:
|
||||
- `engine=google` (or `brave`)
|
||||
- `country=XX` (lowercase country code)
|
||||
- `language=XX` (language code without `lang_` prefix)
|
||||
- `lastUpdated=d|w|m|y` (time period filter)
|
||||
- `page=N` (pagination, 1-indexed)
|
||||
|
||||
3. **app/filter.py**
|
||||
- Added `convert_leta_to_whoogle()` method to parse Leta's HTML structure
|
||||
- Modified `clean()` method to detect and convert Leta results
|
||||
- Leta results use `<article>` tags with specific classes that are converted to Whoogle's format
|
||||
|
||||
4. **app/routes.py**
|
||||
- Added validation to prevent unsupported search types when using Leta
|
||||
- Shows user-friendly error message when attempting image/video/news/map searches with Leta
|
||||
|
||||
5. **app/utils/results.py**
|
||||
- Modified `get_tabs_content()` to accept `use_leta` parameter
|
||||
- Filters out non-web search tabs when Leta is enabled
|
||||
|
||||
6. **app/templates/index.html**
|
||||
- Added checkbox in settings panel for enabling/disabling Leta backend
|
||||
- Includes helpful tooltip explaining Leta's limitations
|
||||
|
||||
## Technical Details
|
||||
|
||||
### Query Parameter Mapping
|
||||
|
||||
| Google Parameter | Leta Parameter | Notes |
|
||||
|-----------------|----------------|-------|
|
||||
| `q=<query>` | `q=<query>` | Same format |
|
||||
| `gl=<country>` | `country=<code>` | Lowercase country code |
|
||||
| `lr=<lang>` | `language=<code>` | Without `lang_` prefix |
|
||||
| `tbs=qdr:d` | `lastUpdated=d` | Time filters mapped |
|
||||
| `start=10` | `page=2` | Converted to 1-indexed pages |
|
||||
| `tbm=isch/vid/nws` | N/A | Not supported |
|
||||
|
||||
### Leta HTML Structure
|
||||
|
||||
Leta returns results in this structure:
|
||||
```html
|
||||
<article class="svelte-fmlk7p">
|
||||
<a href="<result-url>">
|
||||
<h3>Result Title</h3>
|
||||
</a>
|
||||
<cite>display-url.com</cite>
|
||||
<p class="result__body">Result snippet/description</p>
|
||||
</article>
|
||||
```
|
||||
|
||||
This is converted to Whoogle's expected format for consistent display.
|
||||
|
||||
## Testing
|
||||
|
||||
To test the Leta integration:
|
||||
|
||||
1. Enable Leta in settings
|
||||
2. Perform a regular web search - should see results from Leta
|
||||
3. Try to access an image/video/news tab - should see error message
|
||||
4. Check pagination works correctly
|
||||
5. Verify country and language filters work
|
||||
6. Test time period filters (past day/week/month/year)
|
||||
|
||||
## Environment Variables
|
||||
|
||||
- `WHOOGLE_CONFIG_USE_LETA`: Set to `0` to disable Leta and use Google instead (default: `1` - Leta enabled)
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
Potential improvements for future versions:
|
||||
- Add Brave as an alternative engine option (Leta supports both Google and Brave)
|
||||
- Implement image search support if Leta adds this capability
|
||||
- Add per-query backend selection (bang-style syntax)
|
||||
- Cache Leta results for improved performance
|
||||
|
||||
## Notes
|
||||
|
||||
- Leta's search results are cached on their end, so you may see "cached X days ago" messages
|
||||
- Leta requires no API key or authentication
|
||||
- Leta respects Tor configuration if enabled in Whoogle
|
||||
- User agent settings apply to Leta requests as well
|
||||
|
||||
314
README.md
314
README.md
|
|
@ -1,12 +1,10 @@
|
|||
>[!WARNING]
|
||||
>
|
||||
>**Mullvad Leta Backend Now Available!**
|
||||
>Since 16 January, 2025, Google has been attacking the ability to perform search queries without JavaScript enabled. This is a fundamental part of how Whoogle
|
||||
>works -- Whoogle requests the JavaScript-free search results, then filters out garbage from the results page and proxies all external content for the user.
|
||||
>
|
||||
>As of 16 January, 2025, Google seemingly no longer supports performing search queries without JavaScript enabled. We have made multiple workarounds, but as of 2 October 2025, Google has killed off all remaining methods we had to retrieve results from them originally. While we work to rebuild and hopefully find new ways to continue on, we have released a stopgap which uses [Mullvad Leta](https://leta.mullvad.net) (an alternative privacy-focused search backend) as the default (but disable-able) backend leveraging their Google results.
|
||||
>
|
||||
>**Leta is now enabled by default**. It provides anonymous search results through Mullvad's infrastructure without requiring JavaScript. While Leta doesn't support image, video, news, or map searches, it provides privacy-focused web search results.
|
||||
>
|
||||
>To switch back to Google (if it becomes available again), you can disable Leta in the config settings or set `WHOOGLE_CONFIG_USE_LETA=0` in your environment variables. See [LETA_INTEGRATION.md](LETA_INTEGRATION.md) for more details.
|
||||
>This is possibly a breaking change that may mean the end for Whoogle. We'll continue fighting back and releasing workarounds until all workarounds are
|
||||
>exhausted or a better method is found. If you know of a better way, please review and comment in our Way Forward Discussion
|
||||
|
||||
___
|
||||
|
||||
|
|
@ -42,8 +40,9 @@ Contents
|
|||
1. [Arch/AUR](#arch-linux--arch-based-distributions)
|
||||
1. [Helm/Kubernetes](#helm-chart-for-kubernetes)
|
||||
4. [Environment Variables and Configuration](#environment-variables)
|
||||
5. [Usage](#usage)
|
||||
6. [Extra Steps](#extra-steps)
|
||||
5. [Google Custom Search (BYOK)](#google-custom-search-byok)
|
||||
6. [Usage](#usage)
|
||||
7. [Extra Steps](#extra-steps)
|
||||
1. [Set Primary Search Engine](#set-whoogle-as-your-primary-search-engine)
|
||||
2. [Custom Redirecting](#custom-redirecting)
|
||||
2. [Custom Bangs](#custom-bangs)
|
||||
|
|
@ -52,13 +51,12 @@ Contents
|
|||
5. [Using with Firefox Containers](#using-with-firefox-containers)
|
||||
6. [Reverse Proxying](#reverse-proxying)
|
||||
1. [Nginx](#nginx)
|
||||
7. [Contributing](#contributing)
|
||||
8. [FAQ](#faq)
|
||||
9. [Public Instances](#public-instances)
|
||||
10. [Screenshots](#screenshots)
|
||||
8. [Contributing](#contributing)
|
||||
9. [FAQ](#faq)
|
||||
10. [Public Instances](#public-instances)
|
||||
11. [Screenshots](#screenshots)
|
||||
|
||||
## Features
|
||||
- **Mullvad Leta backend support** - Privacy-focused alternative to Google (enabled by default)
|
||||
- No ads or sponsored content
|
||||
- No JavaScript\*
|
||||
- No cookies\*\*
|
||||
|
|
@ -71,7 +69,12 @@ Contents
|
|||
- POST request search and suggestion queries (when possible)
|
||||
- View images at full res without site redirect (currently mobile only)
|
||||
- Light/Dark/System theme modes (with support for [custom CSS theming](https://github.com/benbusby/whoogle-search/wiki/User-Contributed-CSS-Themes))
|
||||
- Randomly generated User Agent
|
||||
- Auto-generated Opera User Agents with random rotation
|
||||
- 10 unique Opera-based UAs generated on startup from 115 language variants
|
||||
- Randomly rotated for each search request to avoid detection patterns
|
||||
- Cached across restarts with configurable refresh options
|
||||
- Fallback to safe default UA if generation fails
|
||||
- Optional display of current UA in search results footer
|
||||
- Easy to install/deploy
|
||||
- DDG-style bang (i.e. `!<tag> <query>`) searches
|
||||
- User-defined [custom bangs](#custom-bangs)
|
||||
|
|
@ -86,6 +89,17 @@ Contents
|
|||
<sup>***If deployed to a remote server, or configured to send requests through a VPN, Tor, proxy, etc.</sup>
|
||||
|
||||
## Install
|
||||
|
||||
### Supported Platforms
|
||||
Official Docker images are built for:
|
||||
- **linux/amd64** (x86_64)
|
||||
- **linux/arm64** (ARM 64-bit, Raspberry Pi 3/4/5, Apple Silicon)
|
||||
|
||||
**Note**: ARMv7 support (32-bit ARM, Raspberry Pi 2) was dropped in v1.2.0 due to incompatibility with modern security libraries on Alpine Linux. Users with ARMv7 devices can either:
|
||||
- Use an older version (v1.1.x or earlier)
|
||||
- Build locally with pinned dependencies (see notes in Dockerfile)
|
||||
- Upgrade to a 64-bit OS if hardware supports it (Raspberry Pi 3+)
|
||||
|
||||
There are a few different ways to begin using the app, depending on your preferences:
|
||||
|
||||
___
|
||||
|
|
@ -440,9 +454,12 @@ There are a few optional environment variables available for customizing a Whoog
|
|||
| WHOOGLE_PROXY_PASS | The password of the proxy server. |
|
||||
| WHOOGLE_PROXY_TYPE | The type of the proxy server. Can be "socks5", "socks4", or "http". |
|
||||
| WHOOGLE_PROXY_LOC | The location of the proxy server (host or ip). |
|
||||
| WHOOGLE_USER_AGENT | The desktop user agent to use. Defaults to a randomly generated one. |
|
||||
| WHOOGLE_USER_AGENT_MOBILE | The mobile user agent to use. Defaults to a randomly generated one. |
|
||||
| WHOOGLE_USER_AGENT | The desktop user agent to use when using the 'env_conf' option. Leave empty to use auto-generated Opera UAs. |
|
||||
| WHOOGLE_USER_AGENT_MOBILE | The mobile user agent to use when using the 'env_conf' option. Leave empty to use auto-generated Opera UAs. |
|
||||
| WHOOGLE_USE_CLIENT_USER_AGENT | Enable to use your own user agent for all requests. Defaults to false. |
|
||||
| WHOOGLE_UA_CACHE_PERSISTENT | Whether to persist auto-generated UAs across restarts. Set to '0' to regenerate on each startup. Default '1'. |
|
||||
| WHOOGLE_UA_CACHE_REFRESH_DAYS | Auto-refresh UA cache after N days. Set to '0' to never refresh (cache persists indefinitely). Default '0'. |
|
||||
| WHOOGLE_UA_LIST_FILE | Path to text file containing custom UA strings (one per line). When set, uses these instead of auto-generated UAs. |
|
||||
| WHOOGLE_REDIRECTS | Specify sites that should be redirected elsewhere. See [custom redirecting](#custom-redirecting). |
|
||||
| EXPOSE_PORT | The port where Whoogle will be exposed. |
|
||||
| HTTPS_ONLY | Enforce HTTPS. (See [here](https://github.com/benbusby/whoogle-search#https-enforcement)) |
|
||||
|
|
@ -459,7 +476,6 @@ There are a few optional environment variables available for customizing a Whoog
|
|||
| WHOOGLE_AUTOCOMPLETE | Controls visibility of autocomplete/search suggestions. Default on -- use '0' to disable. |
|
||||
| WHOOGLE_MINIMAL | Remove everything except basic result cards from all search queries. |
|
||||
| WHOOGLE_CSP | Sets a default set of 'Content-Security-Policy' headers |
|
||||
| WHOOGLE_RESULTS_PER_PAGE | Set the number of results per page |
|
||||
| WHOOGLE_TOR_SERVICE | Enable/disable the Tor service on startup. Default on -- use '0' to disable. |
|
||||
| WHOOGLE_TOR_USE_PASS | Use password authentication for tor control port. |
|
||||
| WHOOGLE_TOR_CONF | The absolute path to the config file containing the password for the tor control port. Default: ./misc/tor/control.conf. WHOOGLE_TOR_USE_PASS must be 1 for this to work. |
|
||||
|
|
@ -494,7 +510,104 @@ These environment variables allow setting default config values, but can be over
|
|||
| WHOOGLE_CONFIG_PREFERENCES_ENCRYPTED | Encrypt preferences token, requires preferences key |
|
||||
| WHOOGLE_CONFIG_PREFERENCES_KEY | Key to encrypt preferences in URL (REQUIRED to show url) |
|
||||
| WHOOGLE_CONFIG_ANON_VIEW | Include the "anonymous view" option for each search result |
|
||||
| WHOOGLE_CONFIG_USE_LETA | Use Mullvad Leta as search backend (default: enabled). Set to 0 to use Google instead |
|
||||
| WHOOGLE_CONFIG_SHOW_USER_AGENT | Display the User Agent string used for search in results footer |
|
||||
|
||||
### Google Custom Search (BYOK) Environment Variables
|
||||
|
||||
These environment variables configure the "Bring Your Own Key" feature for Google Custom Search API:
|
||||
|
||||
| Variable | Description |
|
||||
| -------------------- | ----------------------------------------------------------------------------------------- |
|
||||
| WHOOGLE_CSE_API_KEY | Your Google API key with Custom Search API enabled |
|
||||
| WHOOGLE_CSE_ID | Your Custom Search Engine ID (cx parameter) |
|
||||
| WHOOGLE_USE_CSE | Enable Custom Search API by default (set to '1' to enable) |
|
||||
|
||||
## Google Custom Search (BYOK)
|
||||
|
||||
If Google blocks traditional search scraping (captchas, IP bans), you can use your own Google Custom Search Engine credentials as a fallback. This uses Google's official API with your own quota.
|
||||
|
||||
### Why Use This?
|
||||
|
||||
- **Reliability**: Official API never gets blocked or rate-limited (within quota)
|
||||
- **Speed**: Direct JSON responses are faster than HTML scraping
|
||||
- **Fallback**: Works when all scraping workarounds fail
|
||||
- **Privacy**: Your searches still don't go through third parties—they go directly to Google with your own API key
|
||||
|
||||
### Limitations vs Standard Whoogle
|
||||
|
||||
| Feature | Standard Scraping | CSE API |
|
||||
|------------------|--------------------------|---------------------|
|
||||
| Daily limit | None (until blocked) | 100 free, then paid |
|
||||
| Image search | ✅ Full support | ✅ Supported |
|
||||
| News/Videos tabs | ✅ | ❌ Web results only |
|
||||
| Speed | Slower (HTML parsing) | Faster (JSON) |
|
||||
| Reliability | Can be blocked | Always works |
|
||||
|
||||
### Setup Steps
|
||||
|
||||
#### 1. Create a Custom Search Engine
|
||||
1. Go to [Programmable Search Engine](https://programmablesearchengine.google.com/controlpanel/all)
|
||||
2. Click **"Add"** to create a new search engine
|
||||
3. Under "What to search?", select **"Search the entire web"**
|
||||
4. Give it a name (e.g., "My Whoogle CSE")
|
||||
5. Click **"Create"**
|
||||
6. Copy your **Search Engine ID**
|
||||
|
||||
#### 2. Get an API Key
|
||||
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
|
||||
2. Create a new project or select an existing one
|
||||
3. Go to **APIs & Services** → **Library**
|
||||
4. Search for **"Custom Search API"** and click **Enable**
|
||||
5. Go to **APIs & Services** → **Credentials**
|
||||
6. Click **"Create Credentials"** → **"API Key"**
|
||||
7. Copy your API key (looks like `AIza...`)
|
||||
|
||||
#### 3. (Recommended) Restrict Your API Key
|
||||
To prevent misuse if your key is exposed:
|
||||
1. Click on your API key in Credentials
|
||||
2. Under **"API restrictions"**, select **"Restrict key"**
|
||||
3. Choose only **"Custom Search API"**
|
||||
4. Under **"Application restrictions"**, consider adding IP restrictions if using on a server
|
||||
5. Click **Save**
|
||||
|
||||
#### 4. Configure Whoogle
|
||||
|
||||
**Option A: Via Settings UI**
|
||||
1. Open your Whoogle instance
|
||||
2. Click the **Config** button
|
||||
3. Scroll to "Google Custom Search (BYOK)" section
|
||||
4. Enter your API Key and CSE ID
|
||||
5. Check "Use Custom Search API"
|
||||
6. Click **Apply**
|
||||
|
||||
**Option B: Via Environment Variables**
|
||||
```bash
|
||||
WHOOGLE_CSE_API_KEY=AIza...
|
||||
WHOOGLE_CSE_ID=23f...
|
||||
WHOOGLE_USE_CSE=1
|
||||
```
|
||||
|
||||
### Pricing & Avoiding Charges
|
||||
|
||||
| Tier | Queries | Cost |
|
||||
|------|------------------|-----------------------|
|
||||
| Free | 100/day | $0 |
|
||||
| Paid | Up to 10,000/day | $5 per 1,000 queries |
|
||||
|
||||
**⚠️ To avoid unexpected charges:**
|
||||
|
||||
1. **Don't add a payment method** to Google Cloud (safest option—API stops at 100/day)
|
||||
2. **Set a billing budget alert**: [Billing → Budgets & Alerts](https://console.cloud.google.com/billing/budgets)
|
||||
3. **Cap API usage**: APIs & Services → Custom Search API → Quotas → Set "Queries per day" to 100
|
||||
4. **Monitor usage**: APIs & Services → Custom Search API → Metrics
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
| Error | Cause | Solution |
|
||||
|---------------------|---------------------------|-----------------------------------------------------------------|
|
||||
| "API key not valid" | Invalid or restricted key | Check key in Cloud Console, ensure Custom Search API is enabled |
|
||||
| "Quota exceeded" | Hit 100/day limit | Wait until midnight PT, or enable billing |
|
||||
| "Invalid CSE ID" | Wrong cx parameter | Copy ID from Programmable Search Engine control panel |
|
||||
|
||||
## Usage
|
||||
Same as most search engines, with the exception of filtering by time range.
|
||||
|
|
@ -666,6 +779,140 @@ Whoogle can optionally serve a single bundled CSS and JS to reduce the number of
|
|||
- When disabled (default), templates load individual CSS/JS files for easier development.
|
||||
- Note: Theme CSS (`*-theme.css`) are still loaded separately to honor user theme selection.
|
||||
|
||||
## User Agent Generator Tool
|
||||
|
||||
A standalone command-line tool is available for generating Opera User Agent strings on demand:
|
||||
|
||||
```bash
|
||||
# Generate 10 User Agent strings (default)
|
||||
python misc/generate_uas.py
|
||||
|
||||
# Generate custom number of UAs
|
||||
python misc/generate_uas.py 20
|
||||
```
|
||||
|
||||
This tool is useful for:
|
||||
- Testing different UA strings
|
||||
- Generating UAs for other projects
|
||||
- Verifying UA generation patterns
|
||||
- Debugging UA-related issues
|
||||
|
||||
## Using Custom User Agent Lists
|
||||
|
||||
Instead of using auto-generated Opera UA strings, you can provide your own list of User Agent strings for Whoogle to use.
|
||||
|
||||
### Setup
|
||||
|
||||
1. Create a text file with your preferred UA strings (one per line):
|
||||
|
||||
```
|
||||
Opera/9.80 (J2ME/MIDP; Opera Mini/4.2.13337/22.478; U; en) Presto/2.4.15 Version/10.00
|
||||
Opera/9.80 (Android; Linux; Opera Mobi/498; U; en) Presto/2.12.423 Version/10.1
|
||||
```
|
||||
|
||||
2. Set the `WHOOGLE_UA_LIST_FILE` environment variable to point to your file:
|
||||
|
||||
```bash
|
||||
# Docker
|
||||
docker run -e WHOOGLE_UA_LIST_FILE=/config/my_user_agents.txt ...
|
||||
|
||||
# Docker Compose
|
||||
environment:
|
||||
- WHOOGLE_UA_LIST_FILE=/config/my_user_agents.txt
|
||||
|
||||
# Manual/systemd
|
||||
export WHOOGLE_UA_LIST_FILE=/path/to/my_user_agents.txt
|
||||
```
|
||||
|
||||
### Priority Order
|
||||
|
||||
Whoogle uses the following priority when loading User Agent strings:
|
||||
|
||||
1. **Custom UA list file** (if `WHOOGLE_UA_LIST_FILE` is set and valid)
|
||||
2. **Cached auto-generated UAs** (if cache exists and is valid)
|
||||
3. **Newly generated UAs** (if no cache or cache expired)
|
||||
|
||||
### Tips
|
||||
|
||||
- You can use the output from `misc/check_google_user_agents.py` as your custom UA list
|
||||
- Generate a list with `python misc/generate_uas.py 50 2>/dev/null > my_uas.txt`
|
||||
- Mix different UA types (Opera, Firefox, Chrome) for more variety
|
||||
- Keep the file readable by Whoogle (proper permissions)
|
||||
- One UA string per line, blank lines are ignored
|
||||
|
||||
### Example Workflow
|
||||
|
||||
```bash
|
||||
# Generate and test UAs, save working ones
|
||||
python misc/generate_uas.py 100 2>/dev/null > candidate_uas.txt
|
||||
python misc/check_google_user_agents.py candidate_uas.txt --output working_uas.txt
|
||||
|
||||
# Use the working UAs with Whoogle
|
||||
export WHOOGLE_UA_LIST_FILE=./working_uas.txt
|
||||
./run
|
||||
```
|
||||
|
||||
## User Agent Testing Tool
|
||||
|
||||
Whoogle now includes a comprehensive testing tool (`misc/check_google_user_agents.py`) to verify which User Agent strings successfully return Google search results without triggering blocks, JavaScript-only pages, or browser upgrade prompts.
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
# Test all UAs from a file
|
||||
python misc/check_google_user_agents.py UAs.txt
|
||||
|
||||
# Save working UAs to a file (appends incrementally)
|
||||
python misc/check_google_user_agents.py UAs.txt --output working_uas.txt
|
||||
|
||||
# Use a specific search query
|
||||
python misc/check_google_user_agents.py UAs.txt --query "python programming"
|
||||
|
||||
# Verbose mode to see detailed results
|
||||
python misc/check_google_user_agents.py UAs.txt --output working.txt --verbose
|
||||
|
||||
# Adjust delay between requests (default: 0.5 seconds)
|
||||
python misc/check_google_user_agents.py UAs.txt --delay 1.0
|
||||
|
||||
# Set request timeout (default: 10 seconds)
|
||||
python misc/check_google_user_agents.py UAs.txt --timeout 15.0
|
||||
```
|
||||
|
||||
### Features
|
||||
|
||||
- **Incremental Results**: Working UAs are saved immediately to the output file (append mode), so progress is preserved even if interrupted
|
||||
- **Duplicate Detection**: Automatically skips UAs already in the output file when resuming
|
||||
- **Random Query Cycling**: By default, cycles through diverse search queries to simulate realistic usage patterns
|
||||
- **Rate Limit Detection**: Detects and reports Google rate limiting with recovery instructions
|
||||
- **Comprehensive Validation**: Checks for:
|
||||
- HTTP status codes (blocks, server errors, rate limits)
|
||||
- Block markers (unusual traffic, upgrade browser messages)
|
||||
- Success markers (actual search result HTML elements)
|
||||
- JavaScript-only pages and redirects
|
||||
- Response size validation
|
||||
|
||||
### Testing Methodology
|
||||
|
||||
The tool evaluates UAs against multiple criteria:
|
||||
|
||||
1. **HTTP Status**: Rejects 4xx/5xx errors, detects 429 rate limits
|
||||
2. **Block Detection**: Searches for Google's block messages (CAPTCHA, unusual traffic, etc.)
|
||||
3. **JavaScript Detection**: Identifies JS-only pages and noscript redirects
|
||||
4. **Result Validation**: Confirms presence of actual search result HTML elements
|
||||
5. **Content Analysis**: Validates response size and structure
|
||||
|
||||
This tool was used to discover and validate the working Opera UA patterns that power Whoogle's auto-generation feature.
|
||||
|
||||
## Known Issues
|
||||
|
||||
### User Agent Strings and Image Search
|
||||
|
||||
**Issue**: Most, if not all, of the auto-generated Opera User Agent strings may fail when performing **image searches** on Google. This appears to be a limitation with how Google's image search validates User Agent strings.
|
||||
|
||||
**Impact**:
|
||||
- Regular web searches work correctly with generated UAs
|
||||
- Image search may return errors or no results
|
||||
|
||||
## Contributing
|
||||
|
||||
Under the hood, Whoogle is a basic Flask app with the following structure:
|
||||
|
|
@ -679,6 +926,7 @@ Under the hood, Whoogle is a basic Flask app with the following structure:
|
|||
- `results.py`: Utility functions for interpreting/modifying individual search results
|
||||
- `search.py`: Creates and handles new search queries
|
||||
- `session.py`: Miscellaneous methods related to user sessions
|
||||
- `ua_generator.py`: Auto-generates Opera User Agent strings with pattern-based randomization
|
||||
- `templates/`
|
||||
- `index.html`: The home page template
|
||||
- `display.html`: The search results template
|
||||
|
|
@ -717,20 +965,6 @@ def contains(x: list, y: int) -> bool:
|
|||
Whoogle currently supports translations using [`translations.json`](https://github.com/benbusby/whoogle-search/blob/main/app/static/settings/translations.json). Language values in this file need to match the "value" of the corresponding language in [`languages.json`](https://github.com/benbusby/whoogle-search/blob/main/app/static/settings/languages.json) (e.g. "lang_en" for English, "lang_es" for Spanish, etc.). After you add a new set of translations to `translations.json`, open a PR with your changes and they will be merged in as soon as possible.
|
||||
|
||||
## FAQ
|
||||
|
||||
**What is Mullvad Leta and why is it the default?**
|
||||
|
||||
Mullvad Leta is a privacy-focused search service provided by [Mullvad VPN](https://mullvad.net/en/leta). As of January 2025, Google disabled JavaScript-free search results, which breaks Whoogle's core functionality. Leta provides an excellent alternative that:
|
||||
|
||||
- Doesn't require JavaScript
|
||||
- Provides privacy-focused search results through Mullvad's infrastructure
|
||||
- Uses Google's search index (so results are similar to what you'd expect)
|
||||
- Doesn't track or log your searches
|
||||
|
||||
**Limitations:** Leta only supports regular web search - no images, videos, news, or maps. If you need these features and Google's JavaScript-free search becomes available again, you can disable Leta in settings or set `WHOOGLE_CONFIG_USE_LETA=0`.
|
||||
|
||||
For more details, see [LETA_INTEGRATION.md](LETA_INTEGRATION.md).
|
||||
|
||||
**What's the difference between this and [Searx](https://github.com/asciimoo/searx)?**
|
||||
|
||||
Whoogle is intended to only ever be deployed to private instances by individuals of any background, with as little effort as possible. Prior knowledge of/experience with the command line or deploying applications is not necessary to deploy Whoogle, which isn't the case with Searx. As a result, Whoogle is missing some features of Searx in order to be as easy to deploy as possible.
|
||||
|
|
@ -750,12 +984,8 @@ A lot of the app currently piggybacks on Google's existing support for fetching
|
|||
| Website | Country | Language | Cloudflare |
|
||||
|-|-|-|-|
|
||||
| [https://search.garudalinux.org](https://search.garudalinux.org) | 🇫🇮 FI | Multi-choice | ✅ |
|
||||
| [https://search.sethforprivacy.com](https://search.sethforprivacy.com) | 🇩🇪 DE | English | |
|
||||
| [https://whoogle.privacydev.net](https://whoogle.privacydev.net) | 🇫🇷 FR | English | |
|
||||
| [https://wg.vern.cc](https://wg.vern.cc) | 🇺🇸 US | English | |
|
||||
| [https://whoogle.lunar.icu](https://whoogle.lunar.icu) | 🇩🇪 DE | Multi-choice | ✅ |
|
||||
| [https://whoogle.4040940.xyz/](https://whoogle.4040940.xyz/) | 🇺🇸 US | English | ✅ |
|
||||
|
||||
|
||||
|
||||
* A checkmark in the "Cloudflare" category here refers to the use of the reverse proxy, [Cloudflare](https://cloudflare.com). The checkmark is not listed for a site that only uses Cloudflare DNS; it is listed only for sites that use the proxying service, which grants Cloudflare the ability to monitor traffic to the website.
|
||||
|
|
@ -764,17 +994,7 @@ A lot of the app currently piggybacks on Google's existing support for fetching
|
|||
|
||||
| Website | Country | Language |
|
||||
|-|-|-|
|
||||
| [http://whoglqjdkgt2an4tdepberwqz3hk7tjo4kqgdnuj77rt7nshw2xqhqad.onion](http://whoglqjdkgt2an4tdepberwqz3hk7tjo4kqgdnuj77rt7nshw2xqhqad.onion) | 🇺🇸 US | Multi-choice
|
||||
| [http://nuifgsnbb2mcyza74o7illtqmuaqbwu4flam3cdmsrnudwcmkqur37qd.onion](http://nuifgsnbb2mcyza74o7illtqmuaqbwu4flam3cdmsrnudwcmkqur37qd.onion) | 🇩🇪 DE | English
|
||||
| [http://whoogle.vernccvbvyi5qhfzyqengccj7lkove6bjot2xhh5kajhwvidqafczrad.onion](http://whoogle.vernccvbvyi5qhfzyqengccj7lkove6bjot2xhh5kajhwvidqafczrad.onion/) | 🇺🇸 US | English |
|
||||
| [http://whoogle.g4c3eya4clenolymqbpgwz3q3tawoxw56yhzk4vugqrl6dtu3ejvhjid.onion](http://whoogle.g4c3eya4clenolymqbpgwz3q3tawoxw56yhzk4vugqrl6dtu3ejvhjid.onion/) | 🇫🇷 FR | English |
|
||||
| [http://whoogle.daturab6drmkhyeia4ch5gvfc2f3wgo6bhjrv3pz6n7kxmvoznlkq4yd.onion](http://whoogle.daturab6drmkhyeia4ch5gvfc2f3wgo6bhjrv3pz6n7kxmvoznlkq4yd.onion/) | 🇩🇪 DE | Multi-choice | |
|
||||
|
||||
#### I2P Instances
|
||||
|
||||
| Website | Country | Language |
|
||||
|-|-|-|
|
||||
| [http://verneks7rfjptpz5fpii7n7nrxilsidi2qxepeuuf66c3tsf4nhq.b32.i2p](http://verneks7rfjptpz5fpii7n7nrxilsidi2qxepeuuf66c3tsf4nhq.b32.i2p) | 🇺🇸 US | English |
|
||||
**None** of the existing Onion-accessible sites appear to be live anymore.
|
||||
|
||||
## Screenshots
|
||||
#### Desktop
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ from app.request import send_tor_signal
|
|||
from app.utils.session import generate_key
|
||||
from app.utils.bangs import gen_bangs_json, load_all_bangs
|
||||
from app.utils.misc import gen_file_hash, read_config_bool
|
||||
from app.utils.ua_generator import load_ua_pool
|
||||
from base64 import b64encode
|
||||
from bs4 import MarkupResemblesLocatorWarning
|
||||
from datetime import datetime, timedelta
|
||||
|
|
@ -11,19 +12,19 @@ from flask import Flask
|
|||
import json
|
||||
import logging.config
|
||||
import os
|
||||
import sys
|
||||
from stem import Signal
|
||||
import threading
|
||||
import warnings
|
||||
|
||||
from werkzeug.middleware.proxy_fix import ProxyFix
|
||||
|
||||
from app.utils.misc import read_config_bool
|
||||
from app.services.http_client import HttpxClient
|
||||
from app.services.provider import close_all_clients
|
||||
from app.version import __version__
|
||||
|
||||
app = Flask(__name__, static_folder=os.path.dirname(
|
||||
os.path.abspath(__file__)) + '/static')
|
||||
app = Flask(__name__, static_folder=os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), 'static'))
|
||||
|
||||
app.wsgi_app = ProxyFix(app.wsgi_app)
|
||||
|
||||
|
|
@ -75,7 +76,10 @@ app.config['CONFIG_DISABLE'] = read_config_bool('WHOOGLE_CONFIG_DISABLE')
|
|||
app.config['SESSION_FILE_DIR'] = os.path.join(
|
||||
app.config['CONFIG_PATH'],
|
||||
'session')
|
||||
app.config['MAX_SESSION_SIZE'] = 4000 # Sessions won't exceed 4KB
|
||||
# Maximum session file size in bytes (4KB limit to prevent abuse and disk exhaustion)
|
||||
# Session files larger than this are ignored during cleanup to avoid processing
|
||||
# potentially malicious or corrupted files
|
||||
app.config['MAX_SESSION_SIZE'] = 4000
|
||||
app.config['BANG_PATH'] = os.getenv(
|
||||
'CONFIG_VOLUME',
|
||||
os.path.join(app.config['STATIC_FOLDER'], 'bangs'))
|
||||
|
|
@ -107,18 +111,63 @@ if not os.path.exists(app.config['BANG_PATH']):
|
|||
if not os.path.exists(app.config['BUILD_FOLDER']):
|
||||
os.makedirs(app.config['BUILD_FOLDER'])
|
||||
|
||||
# Session values
|
||||
app_key_path = os.path.join(app.config['CONFIG_PATH'], 'whoogle.key')
|
||||
if os.path.exists(app_key_path):
|
||||
# Initialize User Agent pool
|
||||
app.config['UA_CACHE_PATH'] = os.path.join(app.config['CONFIG_PATH'], 'ua_cache.json')
|
||||
try:
|
||||
app.config['UA_POOL'] = load_ua_pool(app.config['UA_CACHE_PATH'], count=10)
|
||||
except Exception as e:
|
||||
# If UA pool loading fails, log warning and set empty pool
|
||||
# The gen_user_agent function will handle the fallback
|
||||
print(f"Warning: Could not initialize UA pool: {e}")
|
||||
app.config['UA_POOL'] = []
|
||||
|
||||
# Session values - Secret key management
|
||||
# Priority: environment variable → file → generate new
|
||||
def get_secret_key():
|
||||
"""Load or generate secret key with validation.
|
||||
|
||||
Priority order:
|
||||
1. WHOOGLE_SECRET_KEY environment variable
|
||||
2. Existing key file
|
||||
3. Generate new key and save to file
|
||||
|
||||
Returns:
|
||||
str: Valid secret key for Flask sessions
|
||||
"""
|
||||
# Check environment variable first
|
||||
env_key = os.getenv('WHOOGLE_SECRET_KEY', '').strip()
|
||||
if env_key:
|
||||
# Validate env key has minimum length
|
||||
if len(env_key) >= 32:
|
||||
return env_key
|
||||
else:
|
||||
print(f"Warning: WHOOGLE_SECRET_KEY too short ({len(env_key)} chars, need 32+). Using file/generated key instead.", file=sys.stderr)
|
||||
|
||||
# Check file-based key
|
||||
app_key_path = os.path.join(app.config['CONFIG_PATH'], 'whoogle.key')
|
||||
if os.path.exists(app_key_path):
|
||||
try:
|
||||
with open(app_key_path, 'r', encoding='utf-8') as f:
|
||||
key = f.read().strip()
|
||||
# Validate file key
|
||||
if len(key) >= 32:
|
||||
return key
|
||||
else:
|
||||
print(f"Warning: Key file too short, regenerating", file=sys.stderr)
|
||||
except (PermissionError, IOError) as e:
|
||||
print(f"Warning: Could not read key file: {e}", file=sys.stderr)
|
||||
|
||||
# Generate new key
|
||||
new_key = str(b64encode(os.urandom(32)))
|
||||
try:
|
||||
with open(app_key_path, 'r', encoding='utf-8') as f:
|
||||
app.config['SECRET_KEY'] = f.read()
|
||||
except PermissionError:
|
||||
app.config['SECRET_KEY'] = str(b64encode(os.urandom(32)))
|
||||
else:
|
||||
app.config['SECRET_KEY'] = str(b64encode(os.urandom(32)))
|
||||
with open(app_key_path, 'w', encoding='utf-8') as key_file:
|
||||
key_file.write(app.config['SECRET_KEY'])
|
||||
with open(app_key_path, 'w', encoding='utf-8') as key_file:
|
||||
key_file.write(new_key)
|
||||
except (PermissionError, IOError) as e:
|
||||
print(f"Warning: Could not save key file: {e}. Key will not persist across restarts.", file=sys.stderr)
|
||||
|
||||
return new_key
|
||||
|
||||
app.config['SECRET_KEY'] = get_secret_key()
|
||||
app.config['PERMANENT_SESSION_LIFETIME'] = timedelta(days=365)
|
||||
|
||||
# NOTE: SESSION_COOKIE_SAMESITE must be set to 'lax' to allow the user's
|
||||
|
|
|
|||
376
app/filter.py
376
app/filter.py
|
|
@ -5,7 +5,8 @@ from cryptography.fernet import Fernet
|
|||
from flask import render_template
|
||||
import html
|
||||
import urllib.parse as urlparse
|
||||
from urllib.parse import parse_qs
|
||||
import os
|
||||
from urllib.parse import parse_qs, urlencode, urlunparse
|
||||
import re
|
||||
|
||||
from app.models.g_classes import GClasses
|
||||
|
|
@ -111,8 +112,10 @@ def clean_css(css: str, page_url: str) -> str:
|
|||
|
||||
|
||||
class Filter:
|
||||
# Limit used for determining if a result is a "regular" result or a list
|
||||
# type result (such as "people also asked", "related searches", etc)
|
||||
# Minimum number of child div elements that indicates a collapsible section
|
||||
# Regular search results typically have fewer child divs (< 7)
|
||||
# Special sections like "People also ask", "Related searches" have more (>= 7)
|
||||
# This threshold helps identify and collapse these extended result sections
|
||||
RESULT_CHILD_LIMIT = 7
|
||||
|
||||
def __init__(
|
||||
|
|
@ -142,127 +145,6 @@ class Filter:
|
|||
def elements(self):
|
||||
return self._elements
|
||||
|
||||
def convert_leta_to_whoogle(self, soup) -> BeautifulSoup:
|
||||
"""Converts Leta search results HTML to Whoogle-compatible format
|
||||
|
||||
Args:
|
||||
soup: BeautifulSoup object containing Leta results
|
||||
|
||||
Returns:
|
||||
BeautifulSoup: Converted HTML in Whoogle format
|
||||
"""
|
||||
# Find all Leta result articles
|
||||
articles = soup.find_all('article', class_='svelte-fmlk7p')
|
||||
|
||||
if not articles:
|
||||
# No results found, return empty results page
|
||||
return soup
|
||||
|
||||
# Create a new container for results with proper Whoogle CSS class
|
||||
main_div = BeautifulSoup(features='html.parser').new_tag('div', attrs={'id': 'main'})
|
||||
|
||||
for article in articles:
|
||||
# Extract data from Leta article
|
||||
link_tag = article.find('a', href=True)
|
||||
if not link_tag:
|
||||
continue
|
||||
|
||||
url = link_tag.get('href', '')
|
||||
title_tag = article.find('h3')
|
||||
title = title_tag.get_text(strip=True) if title_tag else ''
|
||||
|
||||
snippet_tag = article.find('p', class_='result__body')
|
||||
snippet = snippet_tag.get_text(strip=True) if snippet_tag else ''
|
||||
|
||||
cite_tag = article.find('cite')
|
||||
display_url = cite_tag.get_text(strip=True) if cite_tag else url
|
||||
|
||||
# Create Whoogle-style result div with proper CSS class
|
||||
result_div = BeautifulSoup(features='html.parser').new_tag(
|
||||
'div', attrs={'class': [GClasses.result_class_a]}
|
||||
)
|
||||
result_outer = BeautifulSoup(features='html.parser').new_tag('div')
|
||||
|
||||
# Create a div for the title link
|
||||
title_div = BeautifulSoup(features='html.parser').new_tag('div')
|
||||
result_link = BeautifulSoup(features='html.parser').new_tag('a', href=url)
|
||||
result_title = BeautifulSoup(features='html.parser').new_tag('h3')
|
||||
result_title.string = title
|
||||
result_link.append(result_title)
|
||||
title_div.append(result_link)
|
||||
|
||||
# Create a div for the URL display with cite
|
||||
url_div = BeautifulSoup(features='html.parser').new_tag('div')
|
||||
result_cite = BeautifulSoup(features='html.parser').new_tag('cite')
|
||||
result_cite.string = display_url
|
||||
url_div.append(result_cite)
|
||||
|
||||
# Create a div for snippet
|
||||
result_snippet = BeautifulSoup(features='html.parser').new_tag('div')
|
||||
snippet_span = BeautifulSoup(features='html.parser').new_tag('span')
|
||||
snippet_span.string = snippet
|
||||
result_snippet.append(snippet_span)
|
||||
|
||||
# Assemble the result with proper structure
|
||||
result_outer.append(title_div)
|
||||
result_outer.append(url_div)
|
||||
result_outer.append(result_snippet)
|
||||
result_div.append(result_outer)
|
||||
main_div.append(result_div)
|
||||
|
||||
# Find and preserve pagination elements from Leta
|
||||
navigation = soup.find('div', class_='navigation')
|
||||
if navigation:
|
||||
# Convert Leta's "Next" button to Whoogle-style pagination
|
||||
next_button = navigation.find('button', attrs={'data-cy': 'next-button'})
|
||||
if next_button:
|
||||
next_form = next_button.find_parent('form')
|
||||
if next_form:
|
||||
# Extract the page number from hidden input
|
||||
page_input = next_form.find('input', attrs={'name': 'page'})
|
||||
if page_input:
|
||||
next_page = page_input.get('value', '2')
|
||||
# Create footer for pagination
|
||||
footer = BeautifulSoup(features='html.parser').new_tag('footer')
|
||||
nav_table = BeautifulSoup(features='html.parser').new_tag('table')
|
||||
nav_tr = BeautifulSoup(features='html.parser').new_tag('tr')
|
||||
nav_td = BeautifulSoup(features='html.parser').new_tag('td')
|
||||
|
||||
# Calculate start value for Whoogle pagination
|
||||
start_val = (int(next_page) - 1) * 10
|
||||
next_link = BeautifulSoup(features='html.parser').new_tag('a', href=f'search?q={self.query}&start={start_val}')
|
||||
next_link.string = 'Next »'
|
||||
|
||||
nav_td.append(next_link)
|
||||
nav_tr.append(nav_td)
|
||||
nav_table.append(nav_tr)
|
||||
footer.append(nav_table)
|
||||
main_div.append(footer)
|
||||
|
||||
# Clear the original soup body and add our converted results
|
||||
if soup.body:
|
||||
soup.body.clear()
|
||||
# Add inline style to body for proper width constraints
|
||||
if not soup.body.get('style'):
|
||||
soup.body['style'] = 'padding: 0 20px; margin: 0 auto; max-width: 1000px;'
|
||||
soup.body.append(main_div)
|
||||
else:
|
||||
# If no body, create one with proper styling
|
||||
new_body = BeautifulSoup(features='html.parser').new_tag(
|
||||
'body',
|
||||
attrs={'style': 'padding: 0 20px; margin: 0 auto; max-width: 1000px;'}
|
||||
)
|
||||
new_body.append(main_div)
|
||||
if soup.html:
|
||||
soup.html.append(new_body)
|
||||
else:
|
||||
# Create minimal HTML structure
|
||||
html_tag = BeautifulSoup(features='html.parser').new_tag('html')
|
||||
html_tag.append(new_body)
|
||||
soup.append(html_tag)
|
||||
|
||||
return soup
|
||||
|
||||
def encrypt_path(self, path, is_element=False) -> str:
|
||||
# Encrypts path to avoid plaintext results in logs
|
||||
if is_element:
|
||||
|
|
@ -276,13 +158,9 @@ class Filter:
|
|||
|
||||
def clean(self, soup) -> BeautifulSoup:
|
||||
self.soup = soup
|
||||
|
||||
# Check if this is a Leta result page and convert it
|
||||
if self.config.use_leta and self.soup.find('article', class_='svelte-fmlk7p'):
|
||||
self.soup = self.convert_leta_to_whoogle(self.soup)
|
||||
|
||||
self.main_divs = self.soup.find('div', {'id': 'main'})
|
||||
self.remove_ads()
|
||||
self.remove_ai_overview()
|
||||
self.remove_block_titles()
|
||||
self.remove_block_url()
|
||||
self.collapse_sections()
|
||||
|
|
@ -332,6 +210,9 @@ class Filter:
|
|||
header = self.soup.find('header')
|
||||
if header:
|
||||
header.decompose()
|
||||
# Remove broken "Dark theme" toggle snippets that occasionally slip
|
||||
# into the footer.
|
||||
self.remove_dark_theme_toggle(self.soup)
|
||||
self.remove_site_blocks(self.soup)
|
||||
return self.soup
|
||||
|
||||
|
|
@ -341,7 +222,7 @@ class Filter:
|
|||
Returns:
|
||||
None (The soup object is modified directly)
|
||||
"""
|
||||
if not div:
|
||||
if not div or not isinstance(div, Tag):
|
||||
return
|
||||
|
||||
for d in div.find_all('div', recursive=True):
|
||||
|
|
@ -416,6 +297,22 @@ class Filter:
|
|||
if GClasses.result_class_a in p_cls:
|
||||
break
|
||||
|
||||
def remove_dark_theme_toggle(self, soup: BeautifulSoup) -> None:
|
||||
"""Removes stray Dark theme toggle/link fragments that can appear
|
||||
in the footer."""
|
||||
for node in soup.find_all(string=re.compile(r'Dark theme', re.I)):
|
||||
try:
|
||||
parent = node.find_parent(
|
||||
lambda tag: tag.name in ['div', 'span', 'p', 'a', 'li',
|
||||
'section'])
|
||||
target = parent or node.parent
|
||||
if target:
|
||||
target.decompose()
|
||||
else:
|
||||
node.extract()
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
def remove_site_blocks(self, soup) -> None:
|
||||
if not self.config.block or not soup.body:
|
||||
return
|
||||
|
|
@ -427,6 +324,48 @@ class Filter:
|
|||
result.string.replace_with(result.string.replace(
|
||||
search_string, ''))
|
||||
|
||||
def remove_ai_overview(self) -> None:
|
||||
"""Removes Google's AI Overview/SGE results from search results
|
||||
|
||||
Returns:
|
||||
None (The soup object is modified directly)
|
||||
"""
|
||||
if not self.main_divs:
|
||||
return
|
||||
|
||||
# Patterns that identify AI Overview sections
|
||||
ai_patterns = [
|
||||
'AI Overview',
|
||||
'AI responses may include mistakes',
|
||||
]
|
||||
|
||||
# Result div classes - check both original Google classes and mapped ones
|
||||
# since this runs before CSS class replacement
|
||||
result_classes = [GClasses.result_class_a] # 'ZINbbc'
|
||||
result_classes.extend(GClasses.result_classes.get(
|
||||
GClasses.result_class_a, [])) # ['Gx5Zad']
|
||||
|
||||
# Collect divs to remove first to avoid modifying while iterating
|
||||
divs_to_remove = []
|
||||
|
||||
for div in self.main_divs.find_all('div', recursive=True):
|
||||
# Check if this div or its children contain AI Overview markers
|
||||
div_text = div.get_text()
|
||||
if any(pattern in div_text for pattern in ai_patterns):
|
||||
# Walk up to find the top-level result div
|
||||
parent = div
|
||||
while parent:
|
||||
p_cls = parent.attrs.get('class') or []
|
||||
if any(rc in p_cls for rc in result_classes):
|
||||
if parent not in divs_to_remove:
|
||||
divs_to_remove.append(parent)
|
||||
break
|
||||
parent = parent.parent
|
||||
|
||||
# Remove collected divs
|
||||
for div in divs_to_remove:
|
||||
div.decompose()
|
||||
|
||||
def remove_ads(self) -> None:
|
||||
"""Removes ads found in the list of search result divs
|
||||
|
||||
|
|
@ -498,6 +437,11 @@ class Filter:
|
|||
if not self.main_divs:
|
||||
return
|
||||
|
||||
# Skip collapsing for CSE (Custom Search Engine) results
|
||||
# CSE results have a data-cse attribute on the main container
|
||||
if self.soup.find(attrs={'data-cse': 'true'}):
|
||||
return
|
||||
|
||||
# Loop through results and check for the number of child divs in each
|
||||
for result in self.main_divs.find_all():
|
||||
result_children = pull_child_divs(result)
|
||||
|
|
@ -655,10 +599,32 @@ class Filter:
|
|||
)
|
||||
css = f"{css_html_tag}{css}"
|
||||
css = re.sub('body{(.*?)}',
|
||||
'body{padding:0 8px;margin:0 auto;max-width:736px;}',
|
||||
'body{padding:0 12px;margin:0 auto;max-width:1200px;}',
|
||||
css)
|
||||
style.string = css
|
||||
|
||||
# Normalize the max width between result types so the page doesn't
|
||||
# jump in size when switching tabs.
|
||||
if not self.mobile:
|
||||
max_width_css = (
|
||||
'body, #cnt, #center_col, .main, .e9EfHf, #searchform, '
|
||||
'.GyAeWb, .s6JM6d {'
|
||||
'max-width:1200px;'
|
||||
'margin:0 auto;'
|
||||
'padding-left:12px;'
|
||||
'padding-right:12px;'
|
||||
'}'
|
||||
)
|
||||
# Build the style tag using a fresh soup to avoid cases where the
|
||||
# current soup lacks the helper methods (e.g., non-root elements).
|
||||
factory_soup = BeautifulSoup('', 'html.parser')
|
||||
extra_style = factory_soup.new_tag('style')
|
||||
extra_style.string = max_width_css
|
||||
if self.soup.head:
|
||||
self.soup.head.append(extra_style)
|
||||
else:
|
||||
self.soup.insert(0, extra_style)
|
||||
|
||||
def update_link(self, link: Tag) -> None:
|
||||
"""Update internal link paths with encrypted path, otherwise remove
|
||||
unnecessary redirects and/or marketing params from the url
|
||||
|
|
@ -678,9 +644,6 @@ class Filter:
|
|||
|
||||
# Remove any elements that direct to unsupported Google pages
|
||||
if any(url in link_netloc for url in unsupported_g_pages):
|
||||
# FIXME: The "Shopping" tab requires further filtering (see #136)
|
||||
# Temporarily removing all links to that tab for now.
|
||||
|
||||
# Replaces the /url google unsupported link to the direct url
|
||||
link['href'] = link_netloc
|
||||
parent = link.parent
|
||||
|
|
@ -865,16 +828,113 @@ class Filter:
|
|||
desc_node.replace_with(new_desc)
|
||||
|
||||
def view_image(self, soup) -> BeautifulSoup:
|
||||
"""Replaces the soup with a new one that handles mobile results and
|
||||
adds the link of the image full res to the results.
|
||||
"""Parses image results from Google Images and rewrites them into the
|
||||
lightweight Whoogle image results template.
|
||||
|
||||
Args:
|
||||
soup: A BeautifulSoup object containing the image mobile results.
|
||||
|
||||
Returns:
|
||||
BeautifulSoup: The new BeautifulSoup object
|
||||
Google now serves image results via the modern udm=2 endpoint, where
|
||||
the raw HTML contains only placeholder thumbnails. The actual image
|
||||
URLs live inside serialized data blobs in script tags. We extract that
|
||||
data and pair it with the visible result cards.
|
||||
"""
|
||||
|
||||
def _decode_url(url: str) -> str:
|
||||
if not url:
|
||||
return ''
|
||||
# Decode common escaped characters found in the script blobs
|
||||
return html.unescape(
|
||||
url.replace('\\u003d', '=').replace('\\u0026', '&')
|
||||
)
|
||||
|
||||
def _extract_image_data(modern_soup: BeautifulSoup) -> dict:
|
||||
"""Extracts docid -> {img_url, img_tbn} from serialized scripts."""
|
||||
scripts_text = ' '.join(
|
||||
script.string for script in modern_soup.find_all('script')
|
||||
if script.string
|
||||
)
|
||||
pattern = re.compile(
|
||||
r'\[0,"(?P<docid>[^"]+)",\["(?P<thumb>https://encrypted-tbn[^"]+)"'
|
||||
r'(?:,\d+,\d+)?\],\["(?P<full>https?://[^"]+?)"'
|
||||
r'(?:,\d+,\d+)?\]',
|
||||
re.DOTALL
|
||||
)
|
||||
results_map = {}
|
||||
for match in pattern.finditer(scripts_text):
|
||||
docid = match.group('docid')
|
||||
thumb = _decode_url(match.group('thumb'))
|
||||
full = _decode_url(match.group('full'))
|
||||
results_map[docid] = {
|
||||
'img_tbn': thumb,
|
||||
'img_url': full
|
||||
}
|
||||
return results_map
|
||||
|
||||
def _parse_modern_results(modern_soup: BeautifulSoup) -> list:
|
||||
cards = modern_soup.find_all(
|
||||
'div',
|
||||
attrs={
|
||||
'data-attrid': 'images universal',
|
||||
'data-docid': True
|
||||
}
|
||||
)
|
||||
if not cards:
|
||||
return []
|
||||
|
||||
meta_map = _extract_image_data(modern_soup)
|
||||
parsed = []
|
||||
seen = set()
|
||||
|
||||
for card in cards:
|
||||
docid = card.get('data-docid')
|
||||
meta = meta_map.get(docid, {})
|
||||
img_url = meta.get('img_url')
|
||||
img_tbn = meta.get('img_tbn')
|
||||
|
||||
# Fall back to the inline src if we failed to map the docid
|
||||
if not img_tbn:
|
||||
img_tag = card.find('img')
|
||||
if img_tag:
|
||||
candidate_src = img_tag.get('src')
|
||||
if candidate_src and candidate_src.startswith('http'):
|
||||
img_tbn = candidate_src
|
||||
|
||||
web_page = card.get('data-lpage') or ''
|
||||
if not web_page:
|
||||
link = card.find('a', href=True)
|
||||
if link:
|
||||
web_page = link['href']
|
||||
|
||||
key = (img_url, img_tbn, web_page)
|
||||
if not any(key) or key in seen:
|
||||
continue
|
||||
seen.add(key)
|
||||
|
||||
parsed.append({
|
||||
'domain': urlparse.urlparse(web_page).netloc
|
||||
if web_page else '',
|
||||
'img_url': img_url or img_tbn or '',
|
||||
'web_page': web_page,
|
||||
'img_tbn': img_tbn or img_url or ''
|
||||
})
|
||||
return parsed
|
||||
|
||||
# Try parsing the modern (udm=2) layout first
|
||||
modern_results = _parse_modern_results(soup)
|
||||
if modern_results:
|
||||
# TODO: Implement proper image pagination. Google images uses
|
||||
# infinite scroll with `ijn` offsets; we need a clean,
|
||||
# de-duplicated pagination strategy before exposing a Next link.
|
||||
next_link = None
|
||||
return BeautifulSoup(
|
||||
render_template(
|
||||
'imageresults.html',
|
||||
length=len(modern_results),
|
||||
results=modern_results,
|
||||
view_label="View Image",
|
||||
next_link=next_link
|
||||
),
|
||||
features='html.parser'
|
||||
)
|
||||
|
||||
# get some tags that are unchanged between mobile and pc versions
|
||||
cor_suggested = soup.find_all('table', attrs={'class': "By0U9"})
|
||||
next_pages = soup.find('table', attrs={'class': "uZgmoc"})
|
||||
|
|
@ -888,7 +948,11 @@ class Filter:
|
|||
results_all = results_div.find_all('div', attrs={'class': "lIMUZd"})
|
||||
|
||||
for item in results_all:
|
||||
urls = item.find('a')['href'].split('&imgrefurl=')
|
||||
link = item.find('a', href=True)
|
||||
if not link:
|
||||
continue
|
||||
|
||||
urls = link['href'].split('&imgrefurl=')
|
||||
|
||||
# Skip urls that are not two-element lists
|
||||
if len(urls) != 2:
|
||||
|
|
@ -903,7 +967,16 @@ class Filter:
|
|||
except IndexError:
|
||||
web_page = urlparse.unquote(urls[1])
|
||||
|
||||
img_tbn = urlparse.unquote(item.find('a').find('img')['src'])
|
||||
img_tag = link.find('img')
|
||||
if not img_tag:
|
||||
continue
|
||||
|
||||
img_tbn = urlparse.unquote(
|
||||
img_tag.get('src') or img_tag.get('data-src', '')
|
||||
)
|
||||
|
||||
if not img_tbn:
|
||||
continue
|
||||
|
||||
results.append({
|
||||
'domain': urlparse.urlparse(web_page).netloc,
|
||||
|
|
@ -920,11 +993,18 @@ class Filter:
|
|||
|
||||
# replace correction suggested by google object if exists
|
||||
if len(cor_suggested):
|
||||
soup.find_all(
|
||||
suggested_tables = soup.find_all(
|
||||
'table',
|
||||
attrs={'class': "By0U9"}
|
||||
)[0].replaceWith(cor_suggested[0])
|
||||
# replace next page object at the bottom of the page
|
||||
soup.find_all('table',
|
||||
attrs={'class': "uZgmoc"})[0].replaceWith(next_pages)
|
||||
)
|
||||
if suggested_tables:
|
||||
suggested_tables[0].replaceWith(cor_suggested[0])
|
||||
|
||||
# replace next page object at the bottom of the page, when present
|
||||
next_page_tables = soup.find_all('table', attrs={'class': "uZgmoc"})
|
||||
if next_pages and next_page_tables:
|
||||
next_page_tables[0].replaceWith(next_pages)
|
||||
|
||||
# TODO: Reintroduce pagination for legacy image layout if needed.
|
||||
|
||||
return soup
|
||||
|
|
|
|||
|
|
@ -45,8 +45,11 @@ class Config:
|
|||
self.user_agent = kwargs.get('user_agent', default_ua_option)
|
||||
self.custom_user_agent = kwargs.get('custom_user_agent', '')
|
||||
self.use_custom_user_agent = kwargs.get('use_custom_user_agent', False)
|
||||
self.show_user_agent = read_config_bool('WHOOGLE_CONFIG_SHOW_USER_AGENT')
|
||||
|
||||
# Add user agent related keys to safe_keys
|
||||
# Note: CSE credentials (cse_api_key, cse_id) are intentionally NOT included
|
||||
# in safe_keys for security - they should not be shareable via URL
|
||||
self.safe_keys = [
|
||||
'lang_search',
|
||||
'lang_interface',
|
||||
|
|
@ -64,7 +67,7 @@ class Config:
|
|||
'user_agent',
|
||||
'custom_user_agent',
|
||||
'use_custom_user_agent',
|
||||
'use_leta'
|
||||
'show_user_agent'
|
||||
]
|
||||
|
||||
app_config = current_app.config
|
||||
|
|
@ -80,7 +83,6 @@ class Config:
|
|||
self.tbs = os.getenv('WHOOGLE_CONFIG_TIME_PERIOD', '')
|
||||
self.theme = os.getenv('WHOOGLE_CONFIG_THEME', 'system')
|
||||
self.safe = read_config_bool('WHOOGLE_CONFIG_SAFE')
|
||||
self.dark = read_config_bool('WHOOGLE_CONFIG_DARK') # deprecated
|
||||
self.alts = read_config_bool('WHOOGLE_CONFIG_ALTS')
|
||||
self.nojs = read_config_bool('WHOOGLE_CONFIG_NOJS')
|
||||
self.tor = read_config_bool('WHOOGLE_CONFIG_TOR')
|
||||
|
|
@ -91,7 +93,11 @@ class Config:
|
|||
self.anon_view = read_config_bool('WHOOGLE_CONFIG_ANON_VIEW')
|
||||
self.preferences_encrypted = read_config_bool('WHOOGLE_CONFIG_PREFERENCES_ENCRYPTED')
|
||||
self.preferences_key = os.getenv('WHOOGLE_CONFIG_PREFERENCES_KEY', '')
|
||||
self.use_leta = read_config_bool('WHOOGLE_CONFIG_USE_LETA', default=True)
|
||||
|
||||
# Google Custom Search Engine (CSE) BYOK settings
|
||||
self.cse_api_key = os.getenv('WHOOGLE_CSE_API_KEY', '')
|
||||
self.cse_id = os.getenv('WHOOGLE_CSE_ID', '')
|
||||
self.use_cse = read_config_bool('WHOOGLE_USE_CSE')
|
||||
|
||||
self.accept_language = False
|
||||
|
||||
|
|
@ -99,13 +105,13 @@ class Config:
|
|||
if kwargs:
|
||||
mutable_attrs = self.get_mutable_attrs()
|
||||
for attr in mutable_attrs:
|
||||
if attr in kwargs.keys():
|
||||
if attr == 'show_user_agent':
|
||||
# Handle show_user_agent as boolean
|
||||
self.show_user_agent = bool(kwargs.get(attr))
|
||||
elif attr in kwargs.keys():
|
||||
setattr(self, attr, kwargs[attr])
|
||||
elif attr not in kwargs.keys() and mutable_attrs[attr] == bool:
|
||||
# Only set to False if the attribute wasn't already set to True
|
||||
# by environment defaults (e.g., use_leta defaults to True)
|
||||
if not getattr(self, attr, False):
|
||||
setattr(self, attr, False)
|
||||
setattr(self, attr, False)
|
||||
|
||||
def __getitem__(self, name):
|
||||
return getattr(self, name)
|
||||
|
|
@ -248,9 +254,34 @@ class Config:
|
|||
return param_str
|
||||
|
||||
def _get_fernet_key(self, password: str) -> bytes:
|
||||
hash_object = hashlib.md5(password.encode())
|
||||
key = urlsafe_b64encode(hash_object.hexdigest().encode())
|
||||
return key
|
||||
"""Derive a Fernet-compatible key from a password using PBKDF2.
|
||||
|
||||
Note: This uses a static salt for simplicity. This is a breaking change
|
||||
from the previous MD5-based implementation. Existing encrypted preferences
|
||||
will need to be re-encrypted.
|
||||
|
||||
Args:
|
||||
password: The password to derive the key from
|
||||
|
||||
Returns:
|
||||
bytes: A URL-safe base64 encoded 32-byte key suitable for Fernet
|
||||
"""
|
||||
# Use a static, hard-coded salt (the same constant for every user)
|
||||
# In a production system, you'd want to store per-user salts
|
||||
salt = b'whoogle-preferences-salt-v2'
|
||||
|
||||
# Derive a 32-byte key using PBKDF2 with SHA256
|
||||
# 100,000 iterations is a reasonable balance of security and performance
|
||||
kdf_key = hashlib.pbkdf2_hmac(
|
||||
'sha256',
|
||||
password.encode('utf-8'),
|
||||
salt,
|
||||
100000,
|
||||
dklen=32
|
||||
)
|
||||
|
||||
# Fernet requires a URL-safe base64 encoded key
|
||||
return urlsafe_b64encode(kdf_key)
|
||||
|
||||
def _encode_preferences(self) -> str:
|
||||
preferences_json = json.dumps(self.get_attrs()).encode()
|
||||
|
|
|
|||
209
app/request.py
209
app/request.py
|
|
@ -1,9 +1,8 @@
|
|||
from app.models.config import Config
|
||||
from app.utils.misc import read_config_bool
|
||||
from app.services.provider import get_http_client
|
||||
from datetime import datetime
|
||||
from app.utils.ua_generator import load_ua_pool, get_random_ua, DEFAULT_FALLBACK_UA
|
||||
from defusedxml import ElementTree as ET
|
||||
import random
|
||||
import httpx
|
||||
import urllib.parse as urlparse
|
||||
import os
|
||||
|
|
@ -16,9 +15,6 @@ MAPS_URL = 'https://maps.google.com/maps'
|
|||
AUTOCOMPLETE_URL = ('https://suggestqueries.google.com/'
|
||||
'complete/search?client=toolbar&')
|
||||
|
||||
MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0'
|
||||
DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'
|
||||
|
||||
# Valid query params
|
||||
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source', 'nfpr']
|
||||
|
||||
|
|
@ -73,9 +69,6 @@ def send_tor_signal(signal: Signal) -> bool:
|
|||
|
||||
|
||||
def gen_user_agent(config, is_mobile) -> str:
|
||||
# Define the default PlayStation Portable user agent (replaces Lynx)
|
||||
DEFAULT_UA = 'Mozilla/4.0 (PSP (PlayStation Portable); 2.00)'
|
||||
|
||||
# If using custom user agent, return the custom string
|
||||
if config.user_agent == 'custom' and config.custom_user_agent:
|
||||
return config.custom_user_agent
|
||||
|
|
@ -90,92 +83,40 @@ def gen_user_agent(config, is_mobile) -> str:
|
|||
env_ua = os.getenv('WHOOGLE_USER_AGENT', '')
|
||||
if env_ua:
|
||||
return env_ua
|
||||
# If env vars are not set, fall back to default
|
||||
return DEFAULT_UA
|
||||
# If env vars are not set, fall back to Opera UA
|
||||
return DEFAULT_FALLBACK_UA
|
||||
|
||||
# If using default user agent
|
||||
# If using default user agent - use auto-generated Opera UA pool
|
||||
if config.user_agent == 'default':
|
||||
return DEFAULT_UA
|
||||
try:
|
||||
# Try to load UA pool from cache (lazy loading if not in app.config)
|
||||
# First check if we have access to Flask app context
|
||||
try:
|
||||
from flask import current_app
|
||||
if hasattr(current_app, 'config') and 'UA_POOL' in current_app.config:
|
||||
ua_pool = current_app.config['UA_POOL']
|
||||
else:
|
||||
# Fall back to loading from disk
|
||||
raise ImportError("UA_POOL not in app config")
|
||||
except (ImportError, RuntimeError):
|
||||
# No Flask context available or UA_POOL not in config, load from disk
|
||||
config_path = os.environ.get('CONFIG_VOLUME',
|
||||
os.path.join(os.path.dirname(os.path.abspath(__file__)),
|
||||
'static', 'config'))
|
||||
cache_path = os.path.join(config_path, 'ua_cache.json')
|
||||
ua_pool = load_ua_pool(cache_path, count=10)
|
||||
|
||||
return get_random_ua(ua_pool)
|
||||
except Exception as e:
|
||||
# If anything goes wrong, fall back to default Opera UA
|
||||
print(f"Warning: Could not load UA pool, using fallback Opera UA: {e}")
|
||||
return DEFAULT_FALLBACK_UA
|
||||
|
||||
# If no custom user agent is set, generate a random one (for backwards compatibility)
|
||||
firefox = random.choice(['Choir', 'Squier', 'Higher', 'Wire']) + 'fox'
|
||||
linux = random.choice(['Win', 'Sin', 'Gin', 'Fin', 'Kin']) + 'ux'
|
||||
|
||||
if is_mobile:
|
||||
return MOBILE_UA.format("Mozilla", firefox)
|
||||
|
||||
return DESKTOP_UA.format("Mozilla", linux, firefox)
|
||||
|
||||
|
||||
def gen_query_leta(query, args, config) -> str:
    """Builds a query string for Mullvad Leta backend

    Args:
        query: The search query string (raw, not yet percent-encoded)
        args: Request arguments; ``tbs`` and ``start`` are read if present
        config: User configuration; ``country``, ``lang_search`` and ``tbs``
            are read, and ``'tbs' in config`` must be supported

    Returns:
        str: A formatted query string for Leta
    """
    # Resolve the time filter from the RAW query. After percent-encoding the
    # colon in ":past" becomes "%3A", so the marker can no longer be found.
    period = ''
    if ':past' in query:
        time_range = query.split(':past', 1)[-1].strip().lower()
        for unit in ('day', 'week', 'month', 'year'):
            if time_range.startswith(unit):
                period = unit[0]
                break

    # Fall back to Google's tbs value (request args win over config) when the
    # query carries no recognised ":past" operator.
    if not period and ('tbs' in args or 'tbs' in config):
        result_tbs = args.get('tbs') if 'tbs' in args else config.tbs
        if result_tbs:
            for code in ('d', 'w', 'm', 'y'):
                if 'qdr:' + code in result_tbs:
                    period = code
                    break

    # Ensure search query is parsable, and always use Google as the engine
    # (Leta supports 'google' or 'brave').
    parts = ['q=' + urlparse.quote(query), 'engine=google']

    # Add country if configured
    if config.country:
        parts.append('country=' + config.country.lower())

    # Add language if configured, converting from Google's lang format
    # (lang_en) to Leta's format (en)
    if config.lang_search:
        parts.append('language=' + config.lang_search.replace('lang_', ''))

    if period:
        parts.append('lastUpdated=' + period)

    # Add pagination if present
    if 'start' in args:
        try:
            start = int(args.get('start', '0'))
        except (TypeError, ValueError):
            # A malformed offset should not fail the search; use page one.
            start = 0
        # Leta uses 1-indexed pages, Google uses result offset
        page = (start // 10) + 1
        if page > 1:
            parts.append('page=' + str(page))

    return '&'.join(parts)
|
||||
# Fallback for backwards compatibility (old configs or invalid user_agent values)
|
||||
return DEFAULT_FALLBACK_UA
|
||||
|
||||
|
||||
def gen_query(query, args, config) -> str:
|
||||
# If using Leta backend, build query differently
|
||||
if config.use_leta:
|
||||
return gen_query_leta(query, args, config)
|
||||
|
||||
param_dict = {key: '' for key in VALID_PARAMS}
|
||||
|
||||
# Use :past(hour/day/week/month/year) if available
|
||||
|
|
@ -206,6 +147,10 @@ def gen_query(query, args, config) -> str:
|
|||
# Pass along type of results (news, images, books, etc)
|
||||
if 'tbm' in args:
|
||||
param_dict['tbm'] = '&tbm=' + args.get('tbm')
|
||||
# Google Images now expects the modern udm=2 layout; force it when
|
||||
# requesting images to avoid redirects to the new AI/text layout.
|
||||
if args.get('tbm') == 'isch' and 'udm' not in args:
|
||||
param_dict['udm'] = '&udm=2'
|
||||
|
||||
# Get results page start value (10 per page, ie page 2 start val = 20)
|
||||
if 'start' in args:
|
||||
|
|
@ -271,15 +216,11 @@ class Request:
|
|||
"""
|
||||
|
||||
def __init__(self, normal_ua, root_path, config: Config, http_client=None):
|
||||
# Use Leta backend if configured, otherwise use Google
|
||||
if config.use_leta:
|
||||
self.search_url = 'https://leta.mullvad.net/search?'
|
||||
self.use_leta = True
|
||||
else:
|
||||
self.search_url = 'https://www.google.com/search?gbv=1&num=' + str(
|
||||
os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10)) + '&'
|
||||
self.use_leta = False
|
||||
|
||||
self.search_url = 'https://www.google.com/search?gbv=1&q='
|
||||
# Google Images rejects the lightweight gbv=1 interface. Use the
|
||||
# modern udm=2 entrypoint specifically for image searches to avoid the
|
||||
# "update your browser" interstitial.
|
||||
self.image_search_url = 'https://www.google.com/search?udm=2&q='
|
||||
# Optionally send heartbeat to Tor to determine availability
|
||||
# Only when Tor is enabled in config to avoid unnecessary socket usage
|
||||
if config.tor:
|
||||
|
|
@ -301,6 +242,13 @@ class Request:
|
|||
if not self.mobile:
|
||||
self.modified_user_agent_mobile = gen_user_agent(config, True)
|
||||
|
||||
# Dedicated modern UA to use when Google rejects legacy ones (e.g. Images)
|
||||
self.image_user_agent = (
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
|
||||
'AppleWebKit/537.36 (KHTML, like Gecko) '
|
||||
'Chrome/127.0.0.0 Safari/537.36'
|
||||
)
|
||||
|
||||
# Set up proxy configuration
|
||||
proxy_path = os.environ.get('WHOOGLE_PROXY_LOC', '')
|
||||
if proxy_path:
|
||||
|
|
@ -398,24 +346,54 @@ class Request:
|
|||
else:
|
||||
modified_user_agent = self.modified_user_agent
|
||||
|
||||
# Some Google endpoints (notably Images) now refuse legacy user agents.
|
||||
# If an image search is detected and the generated UA isn't Chromium-
|
||||
# like, retry with a modern Chrome string to avoid the "update your
|
||||
# browser" interstitial.
|
||||
if (('tbm=isch' in query) or ('udm=2' in query)) and 'Chrome' not in modified_user_agent:
|
||||
modified_user_agent = self.image_user_agent
|
||||
|
||||
headers = {
|
||||
'User-Agent': modified_user_agent
|
||||
'User-Agent': modified_user_agent,
|
||||
'Accept': ('text/html,application/xhtml+xml,application/xml;'
|
||||
'q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8'),
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Cache-Control': 'max-age=0',
|
||||
'Pragma': 'no-cache',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'Sec-Fetch-Site': 'none',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-User': '?1',
|
||||
'Sec-Fetch-Dest': 'document'
|
||||
}
|
||||
# Only attach client hints when using a Chromium-like user agent to
|
||||
# avoid sending conflicting information that can trigger unsupported
|
||||
# browser pages.
|
||||
if 'Chrome' in headers['User-Agent']:
|
||||
headers.update({
|
||||
'Sec-CH-UA': (
|
||||
'"Not/A)Brand";v="8", '
|
||||
'"Chromium";v="127", '
|
||||
'"Google Chrome";v="127"'
|
||||
),
|
||||
'Sec-CH-UA-Mobile': '?0',
|
||||
'Sec-CH-UA-Platform': '"Windows"'
|
||||
})
|
||||
|
||||
# Adding the Accept-Language to the Header if possible
|
||||
|
||||
# Add Accept-Language header tied to the current config if requested
|
||||
if self.lang_interface:
|
||||
headers.update({'Accept-Language':
|
||||
self.lang_interface.replace('lang_', '')
|
||||
+ ';q=1.0'})
|
||||
headers['Accept-Language'] = (
|
||||
self.lang_interface.replace('lang_', '') + ';q=1.0'
|
||||
)
|
||||
|
||||
# view is suppressed correctly
|
||||
now = datetime.now()
|
||||
consent_cookie = 'CONSENT=PENDING+987; SOCS=CAESHAgBEhIaAB'
|
||||
# Prefer header-based cookies to avoid httpx per-request cookies deprecation
|
||||
if 'Cookie' in headers:
|
||||
headers['Cookie'] += '; ' + consent_cookie
|
||||
else:
|
||||
headers['Cookie'] = consent_cookie
|
||||
# Consent cookies keep Google from showing the interstitial consent wall
|
||||
consent_cookies = {
|
||||
'CONSENT': 'PENDING+987',
|
||||
'SOCS': 'CAESHAgBEhIaAB'
|
||||
}
|
||||
|
||||
# Validate Tor conn and request new identity if the last one failed
|
||||
if self.tor and not send_tor_signal(
|
||||
|
|
@ -443,10 +421,15 @@ class Request:
|
|||
"Error raised during Tor connection validation",
|
||||
disable=True)
|
||||
|
||||
search_base = base_url or self.search_url
|
||||
if not base_url and ('tbm=isch' in query or 'udm=2' in query):
|
||||
search_base = self.image_search_url
|
||||
|
||||
try:
|
||||
response = self.http_client.get(
|
||||
(base_url or self.search_url) + query,
|
||||
headers=headers)
|
||||
search_base + query,
|
||||
headers=headers,
|
||||
cookies=consent_cookies)
|
||||
except httpx.HTTPError as e:
|
||||
raise
|
||||
|
||||
|
|
@ -455,6 +438,6 @@ class Request:
|
|||
attempt += 1
|
||||
if attempt > 10:
|
||||
raise TorError("Tor query failed -- max attempts exceeded 10")
|
||||
return self.send((base_url or self.search_url), query, attempt)
|
||||
return self.send(search_base, query, attempt)
|
||||
|
||||
return response
|
||||
|
|
|
|||
135
app/routes.py
135
app/routes.py
|
|
@ -3,7 +3,6 @@ import base64
|
|||
import io
|
||||
import json
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
import urllib.parse as urlparse
|
||||
import uuid
|
||||
|
|
@ -18,6 +17,7 @@ from app import app
|
|||
from app.models.config import Config
|
||||
from app.models.endpoint import Endpoint
|
||||
from app.request import Request, TorError
|
||||
from app.services.cse_client import CSEException
|
||||
from app.utils.bangs import suggest_bang, resolve_bang
|
||||
from app.utils.misc import empty_gif, placeholder_img, get_proxy_host_url, \
|
||||
fetch_favicon
|
||||
|
|
@ -102,9 +102,8 @@ def session_required(f):
|
|||
if os.path.getsize(file_path) > app.config['MAX_SESSION_SIZE']:
|
||||
continue
|
||||
|
||||
with open(file_path, 'rb') as session_file:
|
||||
_ = pickle.load(session_file)
|
||||
data = pickle.load(session_file)
|
||||
with open(file_path, 'r', encoding='utf-8') as session_file:
|
||||
data = json.load(session_file)
|
||||
if isinstance(data, dict) and 'valid' in data:
|
||||
continue
|
||||
invalid_sessions.append(file_path)
|
||||
|
|
@ -176,19 +175,28 @@ def after_request_func(resp):
|
|||
resp.headers['X-Content-Type-Options'] = 'nosniff'
|
||||
resp.headers['X-Frame-Options'] = 'DENY'
|
||||
resp.headers['Cache-Control'] = 'max-age=86400'
|
||||
|
||||
# Security headers
|
||||
resp.headers['Referrer-Policy'] = 'no-referrer'
|
||||
resp.headers['Permissions-Policy'] = 'geolocation=(), microphone=(), camera=()'
|
||||
|
||||
# Add HSTS header if HTTPS is enabled
|
||||
if os.environ.get('HTTPS_ONLY', False):
|
||||
resp.headers['Strict-Transport-Security'] = 'max-age=31536000; includeSubDomains'
|
||||
|
||||
if os.getenv('WHOOGLE_CSP', False):
|
||||
# Enable CSP by default (can be disabled via env var)
|
||||
if os.getenv('WHOOGLE_CSP', '1') != '0':
|
||||
resp.headers['Content-Security-Policy'] = app.config['CSP']
|
||||
if os.environ.get('HTTPS_ONLY', False):
|
||||
resp.headers['Content-Security-Policy'] += \
|
||||
'upgrade-insecure-requests'
|
||||
' upgrade-insecure-requests'
|
||||
|
||||
return resp
|
||||
|
||||
|
||||
@app.errorhandler(404)
def unknown_page(e):
    """Handle a 404 by logging it and redirecting to ``g.app_location``.

    Args:
        e: The error object passed in by the 404 error handler hook.

    Returns:
        The redirect response produced by ``redirect(g.app_location)``.
    """
    # Log exactly once, through Logger.warning(); Logger.warn() is a
    # deprecated alias and calling both duplicated every entry.
    app.logger.warning(e)
    return redirect(g.app_location)
|
||||
|
||||
|
||||
|
|
@ -217,9 +225,7 @@ def index():
|
|||
translation=app.config['TRANSLATIONS'][
|
||||
g.user_config.get_localization_lang()
|
||||
],
|
||||
logo=render_template(
|
||||
'logo.html',
|
||||
dark=g.user_config.dark),
|
||||
logo=render_template('logo.html'),
|
||||
config_disabled=(
|
||||
app.config['CONFIG_DISABLE'] or
|
||||
not valid_user_session(session)),
|
||||
|
|
@ -342,16 +348,6 @@ def search():
|
|||
if not query:
|
||||
return redirect(url_for('.index'))
|
||||
|
||||
# Check if using Leta with unsupported search type
|
||||
tbm_value = request.args.get('tbm', '').strip()
|
||||
if g.user_config.use_leta and tbm_value:
|
||||
session['error_message'] = (
|
||||
"Image, video, news, and map searches are not supported when using "
|
||||
"Mullvad Leta as the search backend. Please disable Leta in settings "
|
||||
"or perform a regular web search."
|
||||
)
|
||||
return redirect(url_for('.index'))
|
||||
|
||||
# Generate response and number of external elements from the page
|
||||
try:
|
||||
response = search_util.generate_response()
|
||||
|
|
@ -361,6 +357,30 @@ def search():
|
|||
session['config']['tor'] = False if e.disable else session['config'][
|
||||
'tor']
|
||||
return redirect(url_for('.index'))
|
||||
except CSEException as e:
|
||||
localization_lang = g.user_config.get_localization_lang()
|
||||
translation = app.config['TRANSLATIONS'][localization_lang]
|
||||
wants_json = (
|
||||
request.args.get('format') == 'json' or
|
||||
'application/json' in request.headers.get('Accept', '') or
|
||||
'application/*+json' in request.headers.get('Accept', '')
|
||||
)
|
||||
error_msg = f"Custom Search API Error: {e.message}"
|
||||
if e.is_quota_error:
|
||||
error_msg = ("Google Custom Search API quota exceeded. "
|
||||
"Free tier allows 100 queries/day. "
|
||||
"Wait until midnight PT or disable CSE in settings.")
|
||||
if wants_json:
|
||||
return jsonify({
|
||||
'error': True,
|
||||
'error_message': error_msg,
|
||||
'query': urlparse.unquote(query)
|
||||
}), e.code
|
||||
return render_template(
|
||||
'error.html',
|
||||
error_message=error_msg,
|
||||
translation=translation,
|
||||
config=g.user_config), e.code
|
||||
|
||||
wants_json = (
|
||||
request.args.get('format') == 'json' or
|
||||
|
|
@ -428,8 +448,17 @@ def search():
|
|||
full_query_val,
|
||||
search_util.search_type,
|
||||
g.user_config.preferences,
|
||||
translation,
|
||||
g.user_config.use_leta)
|
||||
translation)
|
||||
|
||||
# Filter out unsupported tabs when CSE is enabled
|
||||
# CSE only supports web (all) and image search, not videos/news
|
||||
use_cse = (
|
||||
g.user_config.use_cse and
|
||||
g.user_config.cse_api_key and
|
||||
g.user_config.cse_id
|
||||
)
|
||||
if use_cse:
|
||||
tabs = {k: v for k, v in tabs.items() if k in ['all', 'images', 'maps']}
|
||||
|
||||
# Feature to display currency_card
|
||||
# Since this is determined by more than just the
|
||||
|
|
@ -555,6 +584,13 @@ def search():
|
|||
'results': results
|
||||
})
|
||||
|
||||
# Get the user agent that was used for the search
|
||||
used_user_agent = ''
|
||||
if search_util.user_request:
|
||||
used_user_agent = search_util.user_request.modified_user_agent
|
||||
elif hasattr(g, 'user_request') and g.user_request:
|
||||
used_user_agent = g.user_request.modified_user_agent
|
||||
|
||||
return render_template(
|
||||
'display.html',
|
||||
has_update=app.config['HAS_UPDATE'],
|
||||
|
|
@ -576,6 +612,7 @@ def search():
|
|||
) and not search_util.search_type, # Standard search queries only
|
||||
response=cleanresponse,
|
||||
version_number=app.config['VERSION_NUMBER'],
|
||||
used_user_agent=used_user_agent,
|
||||
search_header=render_template(
|
||||
'header.html',
|
||||
home_url=home_url,
|
||||
|
|
@ -584,7 +621,7 @@ def search():
|
|||
languages=app.config['LANGUAGES'],
|
||||
countries=app.config['COUNTRIES'],
|
||||
time_periods=app.config['TIME_PERIODS'],
|
||||
logo=render_template('logo.html', dark=g.user_config.dark),
|
||||
logo=render_template('logo.html'),
|
||||
query=urlparse.unquote(query),
|
||||
search_type=search_util.search_type,
|
||||
mobile=g.user_request.mobile,
|
||||
|
|
@ -609,10 +646,11 @@ def config():
|
|||
return json.dumps(g.user_config.__dict__)
|
||||
elif request.method == 'PUT' and not config_disabled:
|
||||
if name:
|
||||
config_pkl = os.path.join(app.config['CONFIG_PATH'], name)
|
||||
session['config'] = (pickle.load(open(config_pkl, 'rb'))
|
||||
if os.path.exists(config_pkl)
|
||||
else session['config'])
|
||||
config_file = os.path.join(app.config['CONFIG_PATH'], name)
|
||||
if os.path.exists(config_file):
|
||||
with open(config_file, 'r', encoding='utf-8') as f:
|
||||
session['config'] = json.load(f)
|
||||
# else keep existing session['config']
|
||||
return json.dumps(session['config'])
|
||||
else:
|
||||
return json.dumps({})
|
||||
|
|
@ -628,7 +666,7 @@ def config():
|
|||
# Keep both the selection and the custom string
|
||||
if 'custom_user_agent' in config_data:
|
||||
config_data['custom_user_agent'] = config_data['custom_user_agent']
|
||||
print(f"Setting custom user agent to: {config_data['custom_user_agent']}") # Debug log
|
||||
app.logger.debug(f"Setting custom user agent to: {config_data['custom_user_agent']}")
|
||||
else:
|
||||
config_data['use_custom_user_agent'] = False
|
||||
# Only clear custom_user_agent if not using custom option
|
||||
|
|
@ -637,11 +675,9 @@ def config():
|
|||
|
||||
# Save config by name to allow a user to easily load later
|
||||
if name:
|
||||
pickle.dump(
|
||||
config_data,
|
||||
open(os.path.join(
|
||||
app.config['CONFIG_PATH'],
|
||||
name), 'wb'))
|
||||
config_file = os.path.join(app.config['CONFIG_PATH'], name)
|
||||
with open(config_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(config_data, f, indent=2)
|
||||
|
||||
session['config'] = config_data
|
||||
return redirect(config_data['url'])
|
||||
|
|
@ -803,8 +839,9 @@ def internal_error(e):
|
|||
|
||||
# Attempt to parse the query
|
||||
try:
|
||||
search_util = Search(request, g.user_config, g.session_key)
|
||||
query = search_util.new_search_query()
|
||||
if hasattr(g, 'user_config') and hasattr(g, 'session_key'):
|
||||
search_util = Search(request, g.user_config, g.session_key)
|
||||
query = search_util.new_search_query()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
|
@ -814,16 +851,26 @@ def internal_error(e):
|
|||
if (fallback_engine):
|
||||
return redirect(fallback_engine + (query or ''))
|
||||
|
||||
localization_lang = g.user_config.get_localization_lang()
|
||||
# Safely get localization language with fallback
|
||||
if hasattr(g, 'user_config'):
|
||||
localization_lang = g.user_config.get_localization_lang()
|
||||
else:
|
||||
localization_lang = 'lang_en'
|
||||
translation = app.config['TRANSLATIONS'][localization_lang]
|
||||
return render_template(
|
||||
'error.html',
|
||||
error_message='Internal server error (500)',
|
||||
translation=translation,
|
||||
farside='https://farside.link',
|
||||
config=g.user_config,
|
||||
query=urlparse.unquote(query or ''),
|
||||
params=g.user_config.to_params(keys=['preferences'])), 500
|
||||
# Build template context with safe defaults
|
||||
template_context = {
|
||||
'error_message': 'Internal server error (500)',
|
||||
'translation': translation,
|
||||
'farside': 'https://farside.link',
|
||||
'query': urlparse.unquote(query or '')
|
||||
}
|
||||
|
||||
# Add user config if available
|
||||
if hasattr(g, 'user_config'):
|
||||
template_context['config'] = g.user_config
|
||||
template_context['params'] = g.user_config.to_params(keys=['preferences'])
|
||||
|
||||
return render_template('error.html', **template_context), 500
|
||||
|
||||
|
||||
def run_app() -> None:
|
||||
|
|
|
|||
452
app/services/cse_client.py
Normal file
452
app/services/cse_client.py
Normal file
|
|
@ -0,0 +1,452 @@
|
|||
"""Google Custom Search Engine (CSE) API Client
|
||||
|
||||
This module provides a client for Google's Custom Search JSON API,
|
||||
allowing users to bring their own API key (BYOK) for search functionality.
|
||||
"""
|
||||
|
||||
import httpx
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from flask import render_template
|
||||
|
||||
|
||||
# Google Custom Search API endpoint
|
||||
CSE_API_URL = 'https://www.googleapis.com/customsearch/v1'
|
||||
|
||||
|
||||
class CSEException(Exception):
    """Exception raised for CSE API errors

    Attributes are set on the instance so handlers can build a response
    without parsing the exception text.
    """

    def __init__(self, message: str, code: int = 500, is_quota_error: bool = False):
        """Store the error details and initialise the base exception.

        Args:
            message: Human-readable description; also used as ``str(exc)``.
            code: Numeric status code associated with the failure.
            is_quota_error: Whether the failure was a quota problem.
        """
        self.message = message
        self.code = code
        self.is_quota_error = is_quota_error
        super().__init__(self.message)
|
||||
|
||||
|
||||
@dataclass
class CSEError:
    """Represents an error from the CSE API"""
    # Numeric error code (taken from the API error payload, or synthesised
    # by the client for timeouts and transport failures).
    code: int
    # Human-readable error text.
    message: str

    @property
    def is_quota_exceeded(self) -> bool:
        """True for code 429 or any message mentioning "quota" (any case)."""
        return self.code == 429 or 'quota' in self.message.lower()

    @property
    def is_invalid_key(self) -> bool:
        """True for code 400 or any message mentioning "invalid" (any case)."""
        return self.code == 400 or 'invalid' in self.message.lower()
|
||||
|
||||
|
||||
@dataclass
class CSEResult:
    """Represents a single search result from CSE API"""
    title: str
    link: str
    snippet: str
    display_link: str
    # Pre-formatted HTML variants of the title/snippet, when supplied.
    html_title: Optional[str] = None
    html_snippet: Optional[str] = None
    # Image-specific fields (populated for image search)
    image_url: Optional[str] = None
    thumbnail_url: Optional[str] = None
    image_width: Optional[int] = None
    image_height: Optional[int] = None
    context_link: Optional[str] = None  # Page where image was found
|
||||
|
||||
|
||||
@dataclass
class CSEResponse:
    """Represents a complete CSE API response"""
    results: list[CSEResult]
    # Kept as a string, exactly as the API reports it.
    total_results: str
    search_time: float
    query: str
    # 1-based index of the first result in this page.
    start_index: int
    is_image_search: bool = False
    # Set only when the request failed; ``results`` is then empty.
    error: Optional[CSEError] = None

    @property
    def has_error(self) -> bool:
        """True when an error object is attached to this response."""
        return self.error is not None

    @property
    def has_results(self) -> bool:
        """True when at least one result was returned."""
        return len(self.results) > 0
|
||||
|
||||
|
||||
class CSEClient:
    """Client for Google Custom Search Engine API

    Usage:
        client = CSEClient(api_key='your-key', cse_id='your-cse-id')
        response = client.search('python programming')

        if response.has_error:
            print(f"Error: {response.error.message}")
        else:
            for result in response.results:
                print(f"{result.title}: {result.link}")

    The client can also be used as a context manager, which closes the
    underlying HTTP client on exit.
    """

    def __init__(self, api_key: str, cse_id: str, timeout: float = 10.0):
        """Initialize CSE client

        Args:
            api_key: Google API key with Custom Search API enabled
            cse_id: Custom Search Engine ID (cx parameter)
            timeout: Request timeout in seconds
        """
        self.api_key = api_key
        self.cse_id = cse_id
        self.timeout = timeout
        self._client = httpx.Client(timeout=timeout)

    @staticmethod
    def _failure(query: str, start: int, code: int, message: str) -> CSEResponse:
        """Build the empty, error-carrying response used by every failure path."""
        return CSEResponse(
            results=[],
            total_results='0',
            search_time=0.0,
            query=query,
            start_index=start,
            error=CSEError(code=code, message=message)
        )

    @staticmethod
    def _parse_item(item: dict, is_image: bool) -> CSEResult:
        """Convert one entry of the API's ``items`` list into a CSEResult."""
        # Extract image-specific data if present
        image_data = item.get('image', {})
        return CSEResult(
            title=item.get('title', ''),
            link=item.get('link', ''),
            snippet=item.get('snippet', ''),
            display_link=item.get('displayLink', ''),
            html_title=item.get('htmlTitle'),
            html_snippet=item.get('htmlSnippet'),
            # Image fields: for image searches the item link IS the image.
            image_url=item.get('link') if is_image else None,
            thumbnail_url=image_data.get('thumbnailLink'),
            image_width=image_data.get('width'),
            image_height=image_data.get('height'),
            context_link=image_data.get('contextLink')
        )

    def search(
        self,
        query: str,
        start: int = 1,
        num: int = 10,
        safe: str = 'off',
        language: str = '',
        country: str = '',
        search_type: str = ''
    ) -> CSEResponse:
        """Execute a search query against the CSE API

        Failures never raise: API errors, timeouts, transport errors and any
        unexpected exception are all reported through ``CSEResponse.error``.

        Args:
            query: Search query string
            start: Starting result index (1-based, for pagination)
            num: Number of results to return (max 10)
            safe: Safe search setting ('off', 'medium', 'high')
            language: Language restriction (e.g., 'lang_en')
            country: Country restriction (e.g., 'countryUS')
            search_type: Type of search ('image' for image search, '' for web)

        Returns:
            CSEResponse with results or error information
        """
        params = {
            'key': self.api_key,
            'cx': self.cse_id,
            'q': query,
            'start': start,
            'num': min(num, 10),  # API max is 10
            'safe': safe,
        }

        # Add search type for image search
        if search_type == 'image':
            params['searchType'] = 'image'

        # Add optional parameters
        if language:
            # CSE uses 'lr' for language restrict
            params['lr'] = language
        if country:
            # CSE uses 'cr' for country restrict
            params['cr'] = country

        try:
            response = self._client.get(CSE_API_URL, params=params)
            data = response.json()

            # Check for API errors
            if 'error' in data:
                error_info = data['error']
                return self._failure(
                    query,
                    start,
                    error_info.get('code', 500),
                    error_info.get('message', 'Unknown error')
                )

            # Parse successful response
            search_info = data.get('searchInformation', {})
            is_image = search_type == 'image'
            results = [
                self._parse_item(item, is_image)
                for item in data.get('items', [])
            ]

            return CSEResponse(
                results=results,
                total_results=search_info.get('totalResults', '0'),
                search_time=float(search_info.get('searchTime', 0)),
                query=query,
                start_index=start,
                is_image_search=is_image
            )

        except httpx.TimeoutException:
            return self._failure(query, start, 408, 'Request timed out')
        except httpx.RequestError as e:
            return self._failure(query, start, 500, f'Request failed: {str(e)}')
        except Exception as e:
            # Deliberate catch-all: callers rely on getting a response object
            # (e.g. for a non-JSON body or an unexpected payload shape).
            return self._failure(query, start, 500, f'Unexpected error: {str(e)}')

    def close(self):
        """Close the HTTP client"""
        self._client.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
|
||||
|
||||
|
||||
def cse_results_to_html(response: CSEResponse, query: str) -> str:
    """Convert CSE API response to HTML matching Whoogle's result format

    This generates HTML that mimics the structure expected by Whoogle's
    existing filter and result processing pipeline.

    Args:
        response: CSEResponse from the API
        query: Original search query

    Returns:
        HTML string formatted like Google search results
    """
    # Function-scope import so this block does not depend on the module's
    # import list.
    from html import escape

    if response.has_error:
        error = response.error
        if error.is_quota_exceeded:
            return _error_html(
                'API Quota Exceeded',
                'Your Google Custom Search API quota has been exceeded. '
                'Free tier allows 100 queries/day. Wait until midnight PT '
                'or enable billing in Google Cloud Console.'
            )
        elif error.is_invalid_key:
            return _error_html(
                'Invalid API Key',
                'Your Google Custom Search API key is invalid. '
                'Please check your API key and CSE ID in settings.'
            )
        else:
            return _error_html('Search Error', error.message)

    if not response.has_results:
        return _no_results_html(query)

    # Use different HTML structure for image vs web results
    if response.is_image_search:
        return _image_results_html(response, query)

    # Build HTML results matching Whoogle's expected structure
    results_html = []

    for result in response.results:
        # Use HTML versions if available (they have bold tags for query
        # terms); otherwise escape the plain-text fields.
        # NOTE(review): html_title/html_snippet are inserted verbatim and are
        # presumed to be already-safe markup from the API -- confirm.
        title = result.html_title or _escape_html(result.title)
        snippet = result.html_snippet or _escape_html(result.snippet)
        display_link = _escape_html(result.display_link)
        # The URL lands inside a double-quoted attribute, so quotes and
        # ampersands must be escaped or a crafted URL could break out of it.
        link = escape(result.link or '', quote=True)

        # Match the structure used by Google/mock results
        result_html = f'''
        <div class="ZINbbc xpd O9g5cc uUPGi">
            <div class="kCrYT">
                <a href="{link}">
                    <h3 class="BNeawe vvjwJb AP7Wnd">{title}</h3>
                    <div class="BNeawe UPmit AP7Wnd luh4tb" style="color: var(--whoogle-result-url);">{display_link}</div>
                </a>
            </div>
            <div class="kCrYT">
                <div class="BNeawe s3v9rd AP7Wnd">
                    <span class="VwiC3b">{snippet}</span>
                </div>
            </div>
        </div>
        '''
        results_html.append(result_html)

    # Build pagination if needed. total_results arrives as a string; treat a
    # missing or malformed value as "no further pages" instead of raising.
    try:
        total = int(response.total_results)
    except (TypeError, ValueError):
        total = 0
    pagination_html = ''
    if total > 10:
        pagination_html = _pagination_html(response.start_index, response.query)

    results_markup = ''.join(results_html)

    # Wrap in expected structure
    # Add data-cse attribute to prevent collapse_sections from collapsing these results
    return f'''
    <html>
    <body>
    <div id="main" data-cse="true">
        <div id="cnt">
            <div id="rcnt">
                <div id="center_col">
                    <div id="res">
                        <div id="search">
                            <div id="rso">
                                {results_markup}
                            </div>
                        </div>
                    </div>
                    {pagination_html}
                </div>
            </div>
        </div>
    </div>
    </body>
    </html>
    '''
|
||||
|
||||
|
||||
def _escape_html(text: str) -> str:
|
||||
"""Escape HTML special characters"""
|
||||
if not text:
|
||||
return ''
|
||||
return (text
|
||||
.replace('&', '&')
|
||||
.replace('<', '<')
|
||||
.replace('>', '>')
|
||||
.replace('"', '"')
|
||||
.replace("'", '''))
|
||||
|
||||
|
||||
def _error_html(title: str, message: str) -> str:
    """Generate a minimal HTML page describing an error.

    Args:
        title: Heading text; HTML-escaped before insertion.
        message: Explanatory text; HTML-escaped before insertion.

    Returns:
        HTML document string with the error placed inside ``<div id="main">``.
    """
    return f'''
    <html>
    <body>
        <div id="main">
            <div style="padding: 20px; text-align: center;">
                <h2 style="color: #d93025;">{_escape_html(title)}</h2>
                <p>{_escape_html(message)}</p>
            </div>
        </div>
    </body>
    </html>
    '''
|
||||
|
||||
|
||||
def _no_results_html(query: str) -> str:
    """Generate the HTML page shown when a search returns nothing.

    Args:
        query: The search query; HTML-escaped before insertion.

    Returns:
        HTML document string with the message inside ``<div id="main">``.
    """
    return f'''
    <html>
    <body>
        <div id="main">
            <div style="padding: 20px;">
                <p>No results found for <b>{_escape_html(query)}</b></p>
            </div>
        </div>
    </body>
    </html>
    '''
|
||||
|
||||
|
||||
def _image_results_html(response: CSEResponse, query: str) -> str:
    """Generate HTML for image search results using the imageresults template

    Args:
        response: CSEResponse with image results
        query: Original search query

    Returns:
        HTML string formatted for image results display
    """
    from urllib.parse import quote

    # Convert CSE results to the format expected by imageresults.html template
    results = []
    for result in response.results:
        image_url = result.image_url or result.link
        thumbnail_url = result.thumbnail_url or image_url
        web_page = result.context_link or result.link
        domain = urlparse(web_page).netloc if web_page else result.display_link

        results.append({
            'domain': domain,
            'img_url': image_url,
            'web_page': web_page,
            'img_tbn': thumbnail_url
        })

    # A missing or non-numeric total is treated as "no further pages" rather
    # than letting int() raise while rendering.
    try:
        total_results = int(response.total_results)
    except (TypeError, ValueError):
        total_results = 0

    # Build pagination link if needed
    next_link = None
    if total_results > response.start_index + len(response.results) - 1:
        next_start = response.start_index + 10
        # Percent-encode the query so characters such as '&', '#' or spaces
        # cannot truncate or alter the generated URL.
        encoded_query = quote(query or '', safe='')
        next_link = f'search?q={encoded_query}&tbm=isch&start={next_start}'

    # Use the same template as regular image results
    return render_template(
        'imageresults.html',
        length=len(results),
        results=results,
        view_label="View Image",
        next_link=next_link
    )
|
||||
|
||||
|
||||
def _pagination_html(current_start: int, query: str) -> str:
|
||||
"""Generate pagination links"""
|
||||
# CSE API uses 1-based indexing, 10 results per page
|
||||
current_page = (current_start - 1) // 10 + 1
|
||||
|
||||
prev_link = ''
|
||||
next_link = ''
|
||||
|
||||
if current_page > 1:
|
||||
prev_start = (current_page - 2) * 10 + 1
|
||||
prev_link = f'<a href="search?q={query}&start={prev_start}">Previous</a>'
|
||||
|
||||
next_start = current_page * 10 + 1
|
||||
next_link = f'<a href="search?q={query}&start={next_start}">Next</a>'
|
||||
|
||||
return f'''
|
||||
<div id="foot" style="text-align: center; padding: 20px;">
|
||||
{prev_link}
|
||||
<span style="margin: 0 20px;">Page {current_page}</span>
|
||||
{next_link}
|
||||
</div>
|
||||
'''
|
||||
|
|
@ -193,10 +193,13 @@ const calc = () => {
|
|||
(statement.match(/\(/g) || []).length >
|
||||
(statement.match(/\)/g) || []).length
|
||||
) statement += ")"; else break;
|
||||
// evaluate the expression.
|
||||
// evaluate the expression via the Function constructor instead of a direct eval() call
|
||||
console.log("calculating [" + statement + "]");
|
||||
try {
|
||||
var result = eval(statement);
|
||||
// NOTE(review): new Function() is NOT a sandbox. Only `Math` is passed in explicitly,
// but the generated body can still reach every global (window, document, fetch, ...),
// so `statement` must be limited to digits, operators and Math.* calls before this line.
|
||||
const safeEval = new Function('Math', `'use strict'; return (${statement})`);
|
||||
var result = safeEval(Math);
|
||||
document.getElementById("prev-equation").innerHTML = mathtext.innerHTML + " = ";
|
||||
mathtext.innerHTML = result;
|
||||
mathtext.classList.remove("error-border");
|
||||
|
|
|
|||
|
|
@ -26,10 +26,12 @@
|
|||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url(config.theme + '-theme.css') }}"/>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url(('dark' if config.dark else 'light') + '-theme.css') }}"/>
|
||||
{% endif %}
|
||||
<style>{{ config.style }}</style>
|
||||
{% if config.style %}
|
||||
<style>
|
||||
{{ config.style }}
|
||||
</style>
|
||||
{% endif %}
|
||||
<title>{{ clean_query(query) }} - Whoogle Search</title>
|
||||
</head>
|
||||
<body>
|
||||
|
|
|
|||
|
|
@ -7,8 +7,6 @@
|
|||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url(config.theme + '-theme.css') }}"/>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url(('dark' if config.dark else 'light') + '-theme.css') }}"/>
|
||||
{% endif %}
|
||||
{% if bundle_static() %}
|
||||
<link rel="stylesheet" href="/{{ cb_url('bundle.css') }}">
|
||||
|
|
|
|||
|
|
@ -5,5 +5,8 @@
|
|||
{% if has_update %}
|
||||
|| <span class="update_available">Update Available 🟢</span>
|
||||
{% endif %}
|
||||
{% if config.show_user_agent and used_user_agent %}
|
||||
<br><span class="user-agent-display" style="font-size: 0.85em; color: #666;">User Agent: {{ used_user_agent }}</span>
|
||||
{% endif %}
|
||||
</p>
|
||||
</footer>
|
||||
|
|
|
|||
|
|
@ -10,9 +10,9 @@
|
|||
background-color: #fff;
|
||||
}
|
||||
body {
|
||||
padding: 0 8px;
|
||||
padding: 0 12px;
|
||||
margin: 0 auto;
|
||||
max-width: 736px;
|
||||
max-width: 1200px;
|
||||
}
|
||||
a {
|
||||
text-decoration: none;
|
||||
|
|
@ -167,6 +167,7 @@
|
|||
border-collapse: collapse;
|
||||
border-spacing: 0;
|
||||
width: 100%;
|
||||
table-layout: fixed;
|
||||
}
|
||||
.X6ZCif {
|
||||
color: #202124;
|
||||
|
|
@ -209,15 +210,20 @@
|
|||
text-align: center;
|
||||
}
|
||||
.RAyV4b {
|
||||
line-height: 140px;
|
||||
overflow: "hidden";
|
||||
height: 220px;
|
||||
line-height: 220px;
|
||||
overflow: hidden;
|
||||
text-align: center;
|
||||
}
|
||||
.t0fcAb {
|
||||
text-align: center;
|
||||
margin: auto;
|
||||
vertical-align: middle;
|
||||
object-fit: contain;
|
||||
object-fit: cover;
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
max-height: 220px;
|
||||
display: block;
|
||||
}
|
||||
.Tor4Ec {
|
||||
padding-top: 2px;
|
||||
|
|
@ -313,6 +319,24 @@
|
|||
a .CVA68e:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
.e3goi {
|
||||
width: 25%;
|
||||
padding: 10px;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
.svla5d {
|
||||
max-width: 100%;
|
||||
}
|
||||
@media (max-width: 900px) {
|
||||
.e3goi {
|
||||
width: 50%;
|
||||
}
|
||||
}
|
||||
@media (max-width: 600px) {
|
||||
.e3goi {
|
||||
width: 100%;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
<div>
|
||||
<div>
|
||||
|
|
|
|||
|
|
@ -41,8 +41,6 @@
|
|||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url(config.theme + '-theme.css') }}"/>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url(('dark' if config.dark else 'light') + '-theme.css') }}"/>
|
||||
{% endif %}
|
||||
{% if not bundle_static() %}
|
||||
<link rel="stylesheet" href="{{ cb_url('main.css') }}">
|
||||
|
|
@ -204,10 +202,6 @@
|
|||
</select>
|
||||
</div>
|
||||
<!-- DEPRECATED -->
|
||||
<!--<div class="config-div config-div-dark">-->
|
||||
<!--<label for="config-dark">{{ translation['config-dark'] }}: </label>-->
|
||||
<!--<input type="checkbox" name="dark" id="config-dark" {{ 'checked' if config.dark else '' }}>-->
|
||||
<!--</div>-->
|
||||
<div class="config-div config-div-safe">
|
||||
<label for="config-safe">{{ translation['config-safe'] }}: </label>
|
||||
<input type="checkbox" name="safe" id="config-safe" {{ 'checked' if config.safe else '' }}>
|
||||
|
|
@ -233,12 +227,6 @@
|
|||
<input type="checkbox" name="tor"
|
||||
id="config-tor" {{ '' if tor_available else 'hidden' }} {{ 'checked' if config.tor else '' }}>
|
||||
</div>
|
||||
<div class="config-div config-div-leta">
|
||||
<label class="tooltip" for="config-leta">Use Mullvad Leta Backend: </label>
|
||||
<input type="checkbox" name="use_leta"
|
||||
id="config-leta" {{ 'checked' if config.use_leta else '' }}>
|
||||
<div><span class="info-text"> — Uses Mullvad's privacy-focused search. Only supports regular web search (no images/videos/news/maps).</span></div>
|
||||
</div>
|
||||
<div class="config-div config-div-get-only">
|
||||
<label for="config-get-only">{{ translation['config-get-only'] }}: </label>
|
||||
<input type="checkbox" name="get_only"
|
||||
|
|
@ -264,6 +252,35 @@
|
|||
<input type="checkbox" name="accept_language"
|
||||
id="config-accept-language" {{ 'checked' if config.accept_language else '' }}>
|
||||
</div>
|
||||
<div class="config-div config-div-show-user-agent">
|
||||
<label for="config-show-user-agent">Show User Agent in Footer: </label>
|
||||
<input type="checkbox" name="show_user_agent"
|
||||
id="config-show-user-agent" {{ 'checked' if config.show_user_agent else '' }}>
|
||||
</div>
|
||||
<!-- Google Custom Search Engine (BYOK) Settings -->
|
||||
<div class="config-div config-div-cse-header" style="margin-top: 20px; border-top: 1px solid var(--result-bg); padding-top: 15px;">
|
||||
<strong>Google Custom Search (BYOK)</strong>
|
||||
<div><span class="info-text"> — <a href="https://github.com/benbusby/whoogle-search#google-custom-search-byok">Setup Guide</a></span></div>
|
||||
</div>
|
||||
<div class="config-div config-div-use-cse">
|
||||
<label for="config-use-cse">Use Custom Search API: </label>
|
||||
<input type="checkbox" name="use_cse" id="config-use-cse" {{ 'checked' if config.use_cse else '' }}>
|
||||
<div><span class="info-text"> — Enable to use your own Google API key (100 free queries/day)</span></div>
|
||||
</div>
|
||||
<div class="config-div config-div-cse-api-key">
|
||||
<label for="config-cse-api-key">CSE API Key: </label>
|
||||
<input type="password" name="cse_api_key" id="config-cse-api-key"
|
||||
value="{{ config.cse_api_key }}"
|
||||
placeholder="AIza..."
|
||||
autocomplete="off">
|
||||
</div>
|
||||
<div class="config-div config-div-cse-id">
|
||||
<label for="config-cse-id">CSE ID: </label>
|
||||
<input type="text" name="cse_id" id="config-cse-id"
|
||||
value="{{ config.cse_id }}"
|
||||
placeholder="abc123..."
|
||||
autocomplete="off">
|
||||
</div>
|
||||
<div class="config-div config-div-root-url">
|
||||
<label for="config-url">{{ translation['config-url'] }}: </label>
|
||||
<input type="text" name="url" id="config-url" value="{{ config.url }}">
|
||||
|
|
|
|||
|
|
@ -36,18 +36,14 @@ def fetch_favicon(url: str) -> bytes:
|
|||
bytes - the favicon bytes, or a placeholder image if one
|
||||
was not returned
|
||||
"""
|
||||
try:
|
||||
response = httpx.get(f'{ddg_favicon_site}/{urlparse(url).netloc}.ico', timeout=2.0)
|
||||
response = httpx.get(f'{ddg_favicon_site}/{urlparse(url).netloc}.ico')
|
||||
|
||||
if response.status_code == 200 and len(response.content) > 0:
|
||||
tmp_mem = io.BytesIO()
|
||||
tmp_mem.write(response.content)
|
||||
tmp_mem.seek(0)
|
||||
if response.status_code == 200 and len(response.content) > 0:
|
||||
tmp_mem = io.BytesIO()
|
||||
tmp_mem.write(response.content)
|
||||
tmp_mem.seek(0)
|
||||
|
||||
return tmp_mem.read()
|
||||
except Exception:
|
||||
# If favicon fetch fails, return placeholder
|
||||
pass
|
||||
return tmp_mem.read()
|
||||
return placeholder_img
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -420,8 +420,7 @@ def get_tabs_content(tabs: dict,
|
|||
full_query: str,
|
||||
search_type: str,
|
||||
preferences: str,
|
||||
translation: dict,
|
||||
use_leta: bool = False) -> dict:
|
||||
translation: dict) -> dict:
|
||||
"""Takes the default tabs content and updates it according to the query.
|
||||
|
||||
Args:
|
||||
|
|
@ -429,7 +428,6 @@ def get_tabs_content(tabs: dict,
|
|||
full_query: The original search query
|
||||
search_type: The current search_type
|
||||
translation: The translation to get the names of the tabs
|
||||
use_leta: Whether Mullvad Leta backend is being used
|
||||
|
||||
Returns:
|
||||
dict: contains the name, the href and if the tab is selected or not
|
||||
|
|
@ -439,11 +437,6 @@ def get_tabs_content(tabs: dict,
|
|||
block_idx = full_query.index('-site:')
|
||||
map_query = map_query[:block_idx]
|
||||
tabs = copy.deepcopy(tabs)
|
||||
|
||||
# If using Leta, remove unsupported tabs (images, videos, news, maps)
|
||||
if use_leta:
|
||||
tabs = {k: v for k, v in tabs.items() if k == 'all'}
|
||||
|
||||
for tab_id, tab_content in tabs.items():
|
||||
# update name to desired language
|
||||
if tab_id in translation:
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ from app.filter import Filter
|
|||
from app.request import gen_query
|
||||
from app.utils.misc import get_proxy_host_url
|
||||
from app.utils.results import get_first_link
|
||||
from app.services.cse_client import CSEClient, cse_results_to_html
|
||||
from bs4 import BeautifulSoup as bsoup
|
||||
from cryptography.fernet import Fernet, InvalidToken
|
||||
from flask import g
|
||||
|
|
@ -140,7 +141,91 @@ class Search:
|
|||
root_url=root_url,
|
||||
mobile=mobile,
|
||||
config=self.config,
|
||||
query=self.query)
|
||||
query=self.query,
|
||||
page_url=self.request.url)
|
||||
|
||||
# Check if CSE (Custom Search Engine) should be used
|
||||
use_cse = (
|
||||
self.config.use_cse and
|
||||
self.config.cse_api_key and
|
||||
self.config.cse_id
|
||||
)
|
||||
|
||||
if use_cse:
|
||||
# Use Google Custom Search API
|
||||
return self._generate_cse_response(content_filter, root_url, mobile)
|
||||
|
||||
# Default: Use traditional scraping method
|
||||
return self._generate_scrape_response(content_filter, root_url, mobile)
|
||||
|
||||
def _generate_cse_response(self, content_filter: Filter, root_url: str, mobile: bool) -> str:
    """Generate response using Google Custom Search API

    Args:
        content_filter: Filter instance for processing results
        root_url: Root URL of the instance (not used by this code path)
        mobile: Whether this is a mobile request (not used by this code path)

    Returns:
        str: HTML response string, or the link of the first result when
            "feeling lucky" is active and the search returned results
    """
    # Get pagination start index from request params. The value is supplied
    # by the client, so a missing, empty or non-numeric ``start`` falls back
    # to the first result instead of raising; values below 1 are clamped
    # because this path uses 1-based result indexes.
    try:
        start = max(int(self.request_params.get('start', 1)), 1)
    except (TypeError, ValueError):
        start = 1

    # Determine safe search setting
    safe = 'high' if self.config.safe else 'off'

    # Determine search type (web or image)
    # tbm=isch or udm=2 indicates image search
    is_image_search = (self.search_type == 'isch'
                       or self.request_params.get('udm') == '2')
    search_type = 'image' if is_image_search else ''

    # Create CSE client and perform search
    with CSEClient(
        api_key=self.config.cse_api_key,
        cse_id=self.config.cse_id
    ) as client:
        response = client.search(
            query=self.query,
            start=start,
            safe=safe,
            language=self.config.lang_search,
            country=self.config.country,
            search_type=search_type
        )

    # Store full query for tabs
    self.full_query = self.query

    # Handle feeling lucky before building any HTML: when a result exists
    # only its link is returned, so rendering and parsing would be wasted.
    if self.feeling_lucky:
        if response.has_results and response.results:
            return response.results[0].link
        self.feeling_lucky = False

    # Convert CSE response to HTML, then parse it for filtering
    html_content = cse_results_to_html(response, self.query)
    html_soup = bsoup(html_content, 'html.parser')

    # Apply content filter (encrypts links, applies CSS, etc.)
    formatted_results = content_filter.clean(html_soup)

    return str(formatted_results)
|
||||
|
||||
def _generate_scrape_response(self, content_filter: Filter, root_url: str, mobile: bool) -> str:
|
||||
"""Generate response using traditional HTML scraping
|
||||
|
||||
Args:
|
||||
content_filter: Filter instance for processing results
|
||||
root_url: Root URL of the instance
|
||||
mobile: Whether this is a mobile request
|
||||
|
||||
Returns:
|
||||
str: HTML response string
|
||||
"""
|
||||
full_query = gen_query(self.query,
|
||||
self.request_params,
|
||||
self.config)
|
||||
|
|
@ -148,8 +233,10 @@ class Search:
|
|||
|
||||
# force mobile search when view image is true and
|
||||
# the request is not already made by a mobile
|
||||
view_image = ('tbm=isch' in full_query
|
||||
and self.config.view_image)
|
||||
is_image_query = ('tbm=isch' in full_query) or ('udm=2' in full_query)
|
||||
# Always parse image results when hitting the images endpoint (udm=2)
|
||||
# to avoid Google returning only text/AI blocks.
|
||||
view_image = is_image_query
|
||||
|
||||
client = self.user_request or g.user_request
|
||||
get_body = client.send(query=full_query,
|
||||
|
|
@ -194,4 +281,3 @@ class Search:
|
|||
link['href'] += param_str
|
||||
|
||||
return str(formatted_results)
|
||||
|
||||
|
|
|
|||
336
app/utils/ua_generator.py
Normal file
336
app/utils/ua_generator.py
Normal file
|
|
@ -0,0 +1,336 @@
|
|||
"""
|
||||
User Agent Generator for Opera-based UA strings.
|
||||
|
||||
This module generates realistic Opera User Agent strings based on patterns
|
||||
found in working UA strings that successfully bypass Google's restrictions.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
# Default fallback UA if generation fails
|
||||
DEFAULT_FALLBACK_UA = "Opera/9.80 (iPad; Opera Mini/5.0.17381/503; U; eu) Presto/2.6.35 Version/11.10)"
|
||||
|
||||
# Opera UA Pattern Templates
|
||||
OPERA_PATTERNS = [
|
||||
# Opera Mini (J2ME/MIDP)
|
||||
"Opera/9.80 (J2ME/MIDP; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}",
|
||||
|
||||
# Opera Mobile (Android)
|
||||
"Opera/9.80 (Android; Linux; Opera Mobi/{build}; U; {lang}) Presto/{presto} Version/{final}",
|
||||
|
||||
# Opera Mobile (iPhone)
|
||||
"Opera/9.80 (iPhone; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}",
|
||||
|
||||
# Opera Mobile (iPad)
|
||||
"Opera/9.80 (iPad; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}",
|
||||
]
|
||||
|
||||
# Randomization pools based on working UAs
|
||||
OPERA_MINI_VERSIONS = [
|
||||
"4.0", "4.1.11321", "4.1.12965", "4.1.13573", "4.1.13907", "4.1.14287",
|
||||
"4.1.15082", "4.2.13057", "4.2.13221", "4.2.13265", "4.2.13337",
|
||||
"4.2.13400", "4.2.13918", "4.2.13943", "4.2.14320", "4.2.14409",
|
||||
"4.2.14753", "4.2.14881", "4.2.14885", "4.2.14912", "4.2.15066",
|
||||
"4.2.15410", "4.2.16007", "4.2.16320", "4.2.18887", "4.2.19634",
|
||||
"4.2.21465", "4.2.22228", "4.2.23453", "4.2.24721", "4.3.13337",
|
||||
"4.3.24214", "4.4.26736", "4.4.29476", "4.5.33867", "4.5.40312",
|
||||
"5.0.15650", "5.0.16823", "5.0.17381", "5.0.17443", "5.0.18635",
|
||||
"5.0.18741", "5.0.19683", "5.0.19693", "5.0.20873", "5.0.22349",
|
||||
"5.1.21051", "5.1.21126", "5.1.21214", "5.1.21415", "5.1.21594",
|
||||
"5.1.21595", "5.1.22296", "5.1.22303", "5.1.22396", "5.1.22460",
|
||||
"5.1.22783", "5.1.22784", "6.0.24095", "6.0.24212", "6.0.24455",
|
||||
"6.1.25375", "6.1.25378", "6.1.25759", "6.24093", "6.24096",
|
||||
"6.24209", "6.24288", "6.5.26955", "6.5.29702", "7.0.29952",
|
||||
"7.1.32052", "7.1.32444", "7.1.32694", "7.29530", "7.5.33361",
|
||||
"7.6.35766", "9.80", "36.2.2254"
|
||||
]
|
||||
|
||||
OPERA_MOBI_BUILDS = [
|
||||
"27", "49", "447", "498", "1181", "1209", "3730",
|
||||
"ADR-1011151731", "ADR-1012211514", "ADR-1012221546", "ADR-1012272315",
|
||||
"SYB-1103211396", "SYB-1104061449", "SYB-1107071606",
|
||||
"ADR-1111101157"
|
||||
]
|
||||
|
||||
BUILD_NUMBERS = [
|
||||
"18.678", "18.684", "18.738", "18.794", "19.892", "19.916",
|
||||
"20.2477", "20.2479", "20.2485", "20.2489", "21.529", "22.387",
|
||||
"22.394", "22.401", "22.414", "22.453", "22.478", "23.317",
|
||||
"23.333", "23.334", "23.377", "23.390", "24.741", "24.743",
|
||||
"24.746", "24.783", "24.838", "24.871", "24.899", "25.657",
|
||||
"25.677", "25.729", "25.872", "26.1305", "27.1366", "27.1407",
|
||||
"27.1573", "28.2075", "28.2555", "28.2647", "28.2766", "29.3594",
|
||||
"30.3316", "31.1350", "35.2883", "35.5706", "37.6584", "119.132",
|
||||
"170.51", "170.54", "764", "870", "886", "490", "503"
|
||||
]
|
||||
|
||||
PRESTO_VERSIONS = [
|
||||
"2.2.0", "2.4.15", "2.4.154.15", "2.4.18", "2.5.25", "2.5.28",
|
||||
"2.6.35", "2.7.60", "2.7.81", "2.8.119", "2.8.149", "2.8.191",
|
||||
"2.9.201", "2.12.423"
|
||||
]
|
||||
|
||||
FINAL_VERSIONS = [
|
||||
"10.00", "10.1", "10.5", "10.54", "10.5454", "11.00", "11.10",
|
||||
"12.02", "12.16", "13.00"
|
||||
]
|
||||
|
||||
LANGUAGES = [
|
||||
# English variants
|
||||
"en", "en-US", "en-GB", "en-CA", "en-AU", "en-NZ", "en-ZA", "en-IN", "en-SG",
|
||||
# Western European
|
||||
"de", "de-DE", "de-AT", "de-CH",
|
||||
"fr", "fr-FR", "fr-CA", "fr-BE", "fr-CH", "fr-LU",
|
||||
"es", "es-ES", "es-MX", "es-AR", "es-CO", "es-CL", "es-PE", "es-VE", "es-LA",
|
||||
"it", "it-IT", "it-CH",
|
||||
"pt", "pt-PT", "pt-BR",
|
||||
"nl", "nl-NL", "nl-BE",
|
||||
# Nordic languages
|
||||
"da", "da-DK",
|
||||
"sv", "sv-SE",
|
||||
"no", "no-NO", "nb", "nn",
|
||||
"fi", "fi-FI",
|
||||
"is", "is-IS",
|
||||
# Eastern European
|
||||
"pl", "pl-PL",
|
||||
"cs", "cs-CZ",
|
||||
"sk", "sk-SK",
|
||||
"hu", "hu-HU",
|
||||
"ro", "ro-RO",
|
||||
"bg", "bg-BG",
|
||||
"hr", "hr-HR",
|
||||
"sr", "sr-RS",
|
||||
"sl", "sl-SI",
|
||||
"uk", "uk-UA",
|
||||
"ru", "ru-RU",
|
||||
# Asian languages
|
||||
"zh", "zh-CN", "zh-TW", "zh-HK",
|
||||
"ja", "ja-JP",
|
||||
"ko", "ko-KR",
|
||||
"th", "th-TH",
|
||||
"vi", "vi-VN",
|
||||
"id", "id-ID",
|
||||
"ms", "ms-MY",
|
||||
"fil", "tl",
|
||||
# Middle Eastern
|
||||
"tr", "tr-TR",
|
||||
"ar", "ar-SA", "ar-AE", "ar-EG",
|
||||
"he", "he-IL",
|
||||
"fa", "fa-IR",
|
||||
# Other
|
||||
"hi", "hi-IN",
|
||||
"bn", "bn-IN",
|
||||
"ta", "ta-IN",
|
||||
"te", "te-IN",
|
||||
"mr", "mr-IN",
|
||||
"el", "el-GR",
|
||||
"ca", "ca-ES",
|
||||
"eu", "eu-ES"
|
||||
]
|
||||
|
||||
|
||||
|
||||
def generate_opera_ua() -> str:
    """
    Generate a single random Opera User Agent string.

    A template is drawn from OPERA_PATTERNS and each placeholder it contains
    is filled with a random entry from the matching pool.

    Returns:
        str: A randomly generated Opera UA string
    """
    pattern = random.choice(OPERA_PATTERNS)

    # Only draw values for the placeholders this pattern actually contains
    params = {
        'lang': random.choice(LANGUAGES)
    }

    if '{version}' in pattern:
        params['version'] = random.choice(OPERA_MINI_VERSIONS)

    if '{build}' in pattern:
        # Use MOBI build for "Opera Mobi", regular build for "Opera Mini"
        if "Opera Mobi" in pattern:
            params['build'] = random.choice(OPERA_MOBI_BUILDS)
        else:
            params['build'] = random.choice(BUILD_NUMBERS)

    if '{presto}' in pattern:
        params['presto'] = random.choice(PRESTO_VERSIONS)

    if '{final}' in pattern:
        params['final'] = random.choice(FINAL_VERSIONS)

    return pattern.format(**params)
|
||||
|
||||
|
||||
def generate_ua_pool(count: int = 10) -> List[str]:
    """
    Generate a pool of Opera User Agent strings.

    Args:
        count: Number of UA strings to generate (default: 10)

    Returns:
        List[str]: Generated UA strings (distinct from one another), padded
            with DEFAULT_FALLBACK_UA if fewer than ``count`` could be
            produced. If generation fails before any UA was produced, a
            single-element list holding DEFAULT_FALLBACK_UA is returned.
    """
    ua_pool = set()

    # Keep generating until we have enough unique UAs; the attempt cap
    # prevents an infinite loop when duplicates keep being drawn.
    max_attempts = count * 100
    attempts = 0

    try:
        while len(ua_pool) < count and attempts < max_attempts:
            ua_pool.add(generate_opera_ua())
            attempts += 1
    except Exception:
        # If generation fails entirely, return at least the default fallback
        if not ua_pool:
            return [DEFAULT_FALLBACK_UA]

    # If we couldn't generate enough, fill remaining with default
    result = list(ua_pool)
    result.extend([DEFAULT_FALLBACK_UA] * (count - len(result)))

    return result
|
||||
|
||||
|
||||
def save_ua_pool(uas: List[str], cache_path: str) -> None:
    """
    Save UA pool to cache file.

    The file is JSON with two keys: ``generated_at`` (ISO timestamp of the
    write) and ``user_agents`` (the list passed in).

    Args:
        uas: List of UA strings to save
        cache_path: Path to cache file

    Raises:
        OSError: If the directory cannot be created or the file written
    """
    cache_data = {
        'generated_at': datetime.now().isoformat(),
        'user_agents': uas
    }

    # Ensure directory exists. exist_ok=True already tolerates an existing
    # directory, so no separate (race-prone) existence check is needed.
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    with open(cache_path, 'w', encoding='utf-8') as f:
        json.dump(cache_data, f, indent=2)
|
||||
|
||||
|
||||
def load_custom_ua_list(file_path: str) -> List[str]:
    """
    Load custom UA list from a text file.

    Lines are stripped of surrounding whitespace and blank lines are skipped.

    Args:
        file_path: Path to text file containing UA strings (one per line)

    Returns:
        List[str]: List of UA strings, or empty list if the file cannot be
            read, cannot be decoded as UTF-8, or holds no UA strings
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeDecodeError):
        # OSError covers missing files and permission problems as well as
        # e.g. a directory being passed instead of a file.
        return []
|
||||
|
||||
|
||||
def load_ua_pool(cache_path: str, count: int = 10) -> List[str]:
    """
    Load UA pool from custom list file, cache, or generate new one.

    Priority order:
    1. Custom UA list file (if WHOOGLE_UA_LIST_FILE is set)
    2. Cached auto-generated UAs
    3. Newly generated UAs

    Environment variables read:
        WHOOGLE_UA_LIST_FILE: path of a custom UA list (one UA per line)
        WHOOGLE_UA_CACHE_PERSISTENT: '1' (default) to reuse the cache file;
            any other value regenerates the pool on every call
        WHOOGLE_UA_CACHE_REFRESH_DAYS: cache lifetime in days; 0 (default)
            or a non-numeric value means the cache never expires

    Args:
        cache_path: Path to cache file
        count: Number of UAs to generate if cache is invalid (default: 10)

    Returns:
        List[str]: List of UA strings
    """
    def _regenerate() -> List[str]:
        # Build a fresh pool and write it to the cache file.
        fresh = generate_ua_pool(count)
        save_ua_pool(fresh, cache_path)
        return fresh

    # Check for custom UA list file first (highest priority)
    custom_ua_file = os.environ.get('WHOOGLE_UA_LIST_FILE', '').strip()
    if custom_ua_file:
        custom_uas = load_custom_ua_list(custom_ua_file)
        if custom_uas:
            # Custom list loaded successfully
            return custom_uas
        # Custom file specified but invalid, log warning and fall back
        print(f"Warning: Custom UA list file '{custom_ua_file}' not found or invalid, falling back to auto-generated UAs")

    # Check if we should use cache
    use_cache = os.environ.get('WHOOGLE_UA_CACHE_PERSISTENT', '1') == '1'
    try:
        refresh_days = int(os.environ.get('WHOOGLE_UA_CACHE_REFRESH_DAYS', '0'))
    except ValueError:
        # A malformed setting must not crash the caller; treat as "never expire"
        refresh_days = 0

    # If cache disabled, always generate new
    if not use_cache:
        return _regenerate()

    # Try to load from cache. Any unreadable, corrupted or unexpectedly
    # shaped cache file is treated the same as a missing one.
    cached_uas = None
    try:
        with open(cache_path, 'r', encoding='utf-8') as f:
            cache_data = json.load(f)

        # Check if cache is expired (if refresh_days > 0)
        expired = False
        if refresh_days > 0:
            generated_at = datetime.fromisoformat(cache_data['generated_at'])
            expired = (datetime.now() - generated_at).days >= refresh_days

        if not expired:
            cached_uas = cache_data['user_agents']
    except (OSError, json.JSONDecodeError, KeyError, TypeError, ValueError):
        cached_uas = None

    # Only a non-empty list counts as a usable pool
    if isinstance(cached_uas, list) and cached_uas:
        return cached_uas

    # No valid cache (missing, corrupted, empty or expired), generate new
    return _regenerate()
|
||||
|
||||
|
||||
def get_random_ua(ua_pool: List[str]) -> str:
    """
    Get a random UA from the pool.

    Args:
        ua_pool: List of UA strings

    Returns:
        str: Random UA string from the pool. For an empty pool a freshly
            generated UA is returned, or DEFAULT_FALLBACK_UA if generation
            fails.
    """
    if ua_pool:
        return random.choice(ua_pool)

    # Fallback to generating one if pool is empty
    try:
        return generate_opera_ua()
    except Exception:
        # If generation fails, use default fallback
        return DEFAULT_FALLBACK_UA
|
||||
|
||||
|
|
@ -4,4 +4,5 @@ optional_dev_tag = ''
|
|||
if os.getenv('DEV_BUILD'):
|
||||
optional_dev_tag = '.dev' + os.getenv('DEV_BUILD')
|
||||
|
||||
__version__ = '1.1.0' + optional_dev_tag
|
||||
__version__ = '1.2.2' + optional_dev_tag
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# can't use mem_limit in a 3.x docker-compose file in non swarm mode
|
||||
# see https://github.com/docker/compose/issues/4513
|
||||
version: "2.4"
|
||||
# Modern docker-compose format (v2+) does not require version specification
|
||||
# Memory limits are supported in Compose v2+ without version field
|
||||
|
||||
services:
|
||||
whoogle-search:
|
||||
|
|
|
|||
363
misc/check_google_user_agents.py
Executable file
363
misc/check_google_user_agents.py
Executable file
|
|
@ -0,0 +1,363 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test User Agent strings against Google to find which ones return actual search results
|
||||
instead of JavaScript pages or upgrade browser messages.
|
||||
|
||||
Usage:
|
||||
python check_google_user_agents.py <user_agent_file> [--output <output_file>] [--query <search_query>]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
from typing import List, Tuple
|
||||
import requests
|
||||
|
||||
# Common search queries to cycle through for more realistic testing
|
||||
DEFAULT_SEARCH_QUERIES = [
|
||||
"python programming",
|
||||
"weather today",
|
||||
"news",
|
||||
"how to cook pasta",
|
||||
"best movies 2025",
|
||||
"restaurants near me",
|
||||
"translate hello",
|
||||
"calculator",
|
||||
"time",
|
||||
"maps",
|
||||
"images",
|
||||
"videos",
|
||||
"shopping",
|
||||
"travel",
|
||||
"sports scores",
|
||||
"stock market",
|
||||
"recipes",
|
||||
"music",
|
||||
"books",
|
||||
"technology",
|
||||
"AI",
|
||||
"AI programming",
|
||||
"Why does google hate users?"
|
||||
]
|
||||
|
||||
# Markers that indicate blocked/JS pages
|
||||
BLOCK_MARKERS = [
|
||||
"unusual traffic",
|
||||
"sorry but your computer",
|
||||
"solve the captcha",
|
||||
"request looks automated",
|
||||
"g-recaptcha",
|
||||
"upgrade your browser",
|
||||
"browser is not supported",
|
||||
"please upgrade",
|
||||
"isn't supported",
|
||||
"isn\"t supported", # With escaped quote
|
||||
"upgrade to a recent version",
|
||||
"update your browser",
|
||||
"your browser isn't supported",
|
||||
]
|
||||
|
||||
# Markers that indicate actual search results
|
||||
SUCCESS_MARKERS = [
|
||||
'<div class="g"', # Google search result container
|
||||
'<div id="search"', # Search results container
|
||||
'<div class="rc"', # Result container
|
||||
'class="yuRUbf"', # Result link container
|
||||
'class="LC20lb"', # Result title
|
||||
'- Google Search</title>', # Page title indicator
|
||||
'id="rso"', # Results container
|
||||
'class="g"', # Result class (without div tag)
|
||||
]
|
||||
|
||||
|
||||
def read_user_agents(file_path: str) -> List[str]:
    """Read user agent strings from a file, one per line.

    Blank lines are skipped and surrounding whitespace is stripped. On any
    read failure an error is printed to stderr and the process exits with
    status 1.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.", file=sys.stderr)
        sys.exit(1)
    except (OSError, UnicodeError) as e:
        print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(1)
|
||||
|
||||
|
||||
def test_user_agent(user_agent: str, query: str = "test", timeout: float = 10.0) -> Tuple[bool, str]:
    """
    Test a user agent against Google search.

    Sends a single GET request to https://www.google.com/search (with
    ``gbv=1`` and ``num=10``) using the given User-Agent header and
    classifies the response by status code and body content.

    Args:
        user_agent: Value sent in the ``User-Agent`` header.
        query: Search term sent as the ``q`` parameter.
        timeout: Request timeout in seconds.

    Returns:
        Tuple of (is_working: bool, reason: str)

    Raises:
        Exception: With a message starting with "Rate limited (429)" when the
            response status is 429, so the caller can stop instead of
            recording an ordinary failure.
    """
    url = "https://www.google.com/search"
    params = {"q": query, "gbv": "1", "num": "10"}

    headers = {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)

        # Check HTTP status
        if response.status_code == 429:
            # Rate limited - raise this so we can handle it specially.
            # A plain Exception is not matched by the requests-specific
            # handlers below, so it propagates to the caller.
            raise Exception("Rate limited (429)")
        if response.status_code >= 500:
            return False, f"Server error ({response.status_code})"
        if response.status_code == 403:
            return False, f"Blocked ({response.status_code})"
        if response.status_code >= 400:
            return False, f"HTTP {response.status_code}"

        body_lower = response.text.lower()

        # Check for block markers
        for marker in BLOCK_MARKERS:
            if marker.lower() in body_lower:
                return False, f"Blocked: {marker}"

        # Check for redirect indicators - these indicate non-working responses
        has_redirect = ("window.location" in body_lower or "location.href" in body_lower) and "google.com" not in body_lower
        if has_redirect:
            return False, "JavaScript redirect detected"

        # Check for noscript redirect (another indicator of JS-only page)
        if 'noscript' in body_lower and 'http-equiv="refresh"' in body_lower:
            return False, "NoScript redirect page"

        # Check for success markers (actual search results)
        # We need at least one strong indicator of search results
        has_results = any(marker in response.text for marker in SUCCESS_MARKERS)
        if has_results:
            return True, "OK - Has search results"

        # Check for very short responses (likely error pages)
        if len(response.text) < 1000:
            return False, "Response too short (likely error page)"
        # If we don't have success markers, it's not a working response
        # Even if it's substantial and doesn't have block markers, it might be a JS-only page
        return False, "No search results found"

    except requests.Timeout:
        return False, "Request timeout"
    except requests.HTTPError as e:
        # Compare against None explicitly: a requests Response is falsy for
        # 4xx/5xx statuses, so a bare truthiness check could never see a 429.
        if e.response is not None and e.response.status_code == 429:
            # Rate limited - raise this so we can handle it specially
            raise Exception(f"Rate limited (429) - {str(e)}") from e
        return False, f"HTTP error: {str(e)}"
    except requests.RequestException as e:
        # Check if it's a 429 in the response (same None-vs-falsy caveat)
        error_response = getattr(e, 'response', None)
        if error_response is not None and error_response.status_code == 429:
            raise Exception(f"Rate limited (429) - {str(e)}") from e
        return False, f"Request error: {str(e)}"
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: test every user agent in a file against Google.

    Working user agents are appended to ``--output`` as they are found (or
    printed to stdout when no output file is given); progress and the final
    summary go to stderr. User agents already present in the output file are
    skipped.

    Returns:
        0 if at least one working user agent was found and the run was not
        cut short by rate limiting, otherwise 1.
    """
    parser = argparse.ArgumentParser(
        description="Test User Agent strings against Google to find working ones.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python test_google_user_agents.py UAs.txt
  python test_google_user_agents.py UAs.txt --output working_uas.txt
  python test_google_user_agents.py UAs.txt --query "python programming"
        """
    )
    parser.add_argument(
        "user_agent_file",
        help="Path to file containing user agent strings (one per line)"
    )
    parser.add_argument(
        "--output", "-o",
        help="Output file to write working user agents (default: stdout)"
    )
    parser.add_argument(
        "--query", "-q",
        default=None,
        help="Search query to use for testing (default: cycles through random queries)"
    )
    parser.add_argument(
        "--random-queries", "-r",
        action="store_true",
        help="Use random queries from a predefined list (default: True if --query not specified)"
    )
    parser.add_argument(
        "--timeout", "-t",
        type=float,
        default=10.0,
        help="Request timeout in seconds (default: 10.0)"
    )
    parser.add_argument(
        "--delay", "-d",
        type=float,
        default=0.5,
        help="Delay between requests in seconds (default: 0.5)"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Show detailed results for each user agent"
    )

    args = parser.parse_args()

    # Determine query strategy
    use_random_queries = args.random_queries or (args.query is None)
    if use_random_queries:
        search_queries = DEFAULT_SEARCH_QUERIES.copy()
        random.shuffle(search_queries)  # Shuffle for variety
        current_query_idx = 0
        query_display = f"cycling through {len(search_queries)} random queries"
    else:
        search_queries = [args.query]
        query_display = f"'{args.query}'"

    # Read user agents
    user_agents = read_user_agents(args.user_agent_file)
    if not user_agents:
        print("No user agents found in file.", file=sys.stderr)
        sys.exit(1)
    total = len(user_agents)

    print(f"Testing {total} user agents against Google...", file=sys.stderr)
    print(f"Query: {query_display}", file=sys.stderr)
    if args.output:
        print(f"Output file: {args.output} (appending results incrementally)", file=sys.stderr)
    print(file=sys.stderr)

    # Load existing working user agents from output file to avoid duplicates
    existing_working = set()
    if args.output:
        try:
            with open(args.output, 'r', encoding='utf-8') as f:
                existing_working = {line.strip() for line in f if line.strip()}
            if existing_working:
                print(f"Found {len(existing_working)} existing user agents in output file", file=sys.stderr)
        except FileNotFoundError:
            # File doesn't exist yet, that's fine
            pass
        except Exception as e:
            print(f"Warning: Could not read existing output file: {e}", file=sys.stderr)

    # Open output file for incremental writing if specified (append mode)
    output_file = None
    if args.output:
        try:
            output_file = open(args.output, 'a', encoding='utf-8')
        except Exception as e:
            print(f"Error opening output file: {e}", file=sys.stderr)
            sys.exit(1)

    working_agents = []
    failed_count = 0
    skipped_count = 0
    last_successful_idx = 0
    rate_limited = False

    try:
        for idx, ua in enumerate(user_agents, 1):
            # Skip testing if this UA is already in the working file
            if args.output and ua in existing_working:
                skipped_count += 1
                if args.verbose:
                    print(f"[{idx}/{total}] ⊘ SKIPPED - Already in working file", file=sys.stderr)
                last_successful_idx = idx
                continue

            try:
                # Get the next query (cycle through if using random queries)
                if use_random_queries:
                    query = search_queries[current_query_idx % len(search_queries)]
                    current_query_idx += 1
                else:
                    query = args.query

                is_working, reason = test_user_agent(ua, query, args.timeout)

                if is_working:
                    working_agents.append(ua)
                    status = "✓"
                    if output_file:
                        # Write immediately so results survive an interruption.
                        # The skip check above guarantees this UA is not yet
                        # in the file, so no duplicate test is needed here.
                        output_file.write(ua + '\n')
                        output_file.flush()  # Ensure it's written to disk
                        existing_working.add(ua)  # Later repeats get skipped
                    else:
                        # Print to stdout if no output file
                        print(ua)

                    if args.verbose:
                        print(f"[{idx}/{total}] {status} WORKING - {reason}", file=sys.stderr)
                else:
                    failed_count += 1
                    status = "✗"
                    if args.verbose:
                        print(f"[{idx}/{total}] {status} FAILED - {reason}", file=sys.stderr)

                last_successful_idx = idx

                # Progress indicator for non-verbose mode
                if not args.verbose and idx % 10 == 0:
                    print(f"Progress: {idx}/{total} tested ({len(working_agents)} working, {failed_count} failed)", file=sys.stderr)

                # Delay between requests to avoid rate limiting
                if idx < total:
                    time.sleep(args.delay)

            except KeyboardInterrupt:
                print(file=sys.stderr)
                print(f"\nInterrupted by user at index {idx}/{total}", file=sys.stderr)
                print(f"Last successful test: {last_successful_idx}/{total}", file=sys.stderr)
                break
            except Exception as e:
                # Handle unexpected errors (like network issues or rate limits)
                error_msg = str(e)
                if "429" in error_msg or "Rate limited" in error_msg:
                    print(file=sys.stderr)
                    print(f"\n⚠️ RATE LIMIT DETECTED at index {idx}/{total}", file=sys.stderr)
                    print(f"Last successful test: {last_successful_idx}/{total}", file=sys.stderr)
                    print(f"Working user agents found so far: {len(working_agents)}", file=sys.stderr)
                    print(f"\nTo resume later, you can skip the first {last_successful_idx} user agents.", file=sys.stderr)
                    # Stop testing but fall through to the summary instead of
                    # letting the exception escape main() as a traceback.
                    rate_limited = True
                    break
                else:
                    print(f"[{idx}/{total}] ERROR - {error_msg}", file=sys.stderr)
                    failed_count += 1
                    last_successful_idx = idx
                    if idx < total:
                        time.sleep(args.delay)
                    continue

    finally:
        # Close output file if opened
        if output_file:
            output_file.close()

    # Summary
    print(file=sys.stderr)
    print(f"Summary: {len(working_agents)} working, {failed_count} failed, {skipped_count} skipped out of {last_successful_idx} processed (of {total} total)", file=sys.stderr)
    if last_successful_idx < total:
        print(f"Note: Processing stopped at index {last_successful_idx}. {total - last_successful_idx} user agents not processed.", file=sys.stderr)
    if args.output:
        print(f"Results saved to: {args.output}", file=sys.stderr)

    # A rate-limited run keeps a non-zero exit status, as it had when the
    # exception used to propagate out of main().
    return 0 if working_agents and not rate_limited else 1


if __name__ == "__main__":
    sys.exit(main())
|
||||
|
||||
198
misc/generate_uas.py
Executable file
198
misc/generate_uas.py
Executable file
|
|
@ -0,0 +1,198 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Standalone Opera User Agent String Generator
|
||||
|
||||
This tool generates Opera-based User Agent strings that can be used with Whoogle.
|
||||
It can be run independently to generate and display UA strings on demand.
|
||||
|
||||
Usage:
|
||||
python misc/generate_uas.py [count]
|
||||
|
||||
Arguments:
|
||||
count: Number of UA strings to generate (default: 10)
|
||||
|
||||
Examples:
|
||||
python misc/generate_uas.py # Generate 10 UAs
|
||||
python misc/generate_uas.py 20 # Generate 20 UAs
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# UA handed back when nothing at all could be generated
DEFAULT_FALLBACK_UA = "Opera/9.30 (Nintendo Wii; U; ; 3642; en)"

# Prefer the application's own generator; otherwise define a bundled copy.
try:
    # Put this file's parent directory on the import path so ``app`` resolves
    sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
    from app.utils.ua_generator import generate_ua_pool
    USE_APP_MODULE = True
except ImportError:
    USE_APP_MODULE = False
    # Everything below is the self-contained generator used without ``app``
    import random

    # Templates; the {placeholders} are filled in by generate_opera_ua()
    OPERA_PATTERNS = [
        "Opera/9.80 (J2ME/MIDP; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}",
        "Opera/9.80 (Android; Linux; Opera Mobi/{build}; U; {lang}) Presto/{presto} Version/{final}",
        "Opera/9.80 (iPhone; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}",
        "Opera/9.80 (iPad; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}",
    ]

    # Values for {version}
    OPERA_MINI_VERSIONS = [
        "4.0", "4.1.11321", "4.2.13337", "4.2.14912", "4.2.15410", "4.3.24214",
        "5.0.18741", "5.1.22296", "5.1.22783", "6.0.24095", "6.24093", "7.1.32444",
        "7.6.35766", "36.2.2254",
    ]

    # Values for {build} in the "Opera Mobi" template
    OPERA_MOBI_BUILDS = [
        "27", "49", "447", "1209", "3730", "ADR-1012221546", "SYB-1107071606",
    ]

    # Values for {build} in every other template
    BUILD_NUMBERS = [
        "22.387", "22.478", "23.334", "23.377", "24.746", "24.783", "25.657",
        "27.1407", "28.2647", "35.5706", "119.132", "870", "886",
    ]

    # Values for {presto}
    PRESTO_VERSIONS = [
        "2.4.15", "2.4.18", "2.5.25", "2.8.119", "2.12.423",
    ]

    # Values for {final}
    FINAL_VERSIONS = [
        "10.00", "10.1", "10.54", "11.10", "12.16", "13.00",
    ]

    # Values for {lang}
    LANGUAGES = [
        # English variants
        "en", "en-US", "en-GB", "en-CA", "en-AU", "en-NZ", "en-ZA", "en-IN", "en-SG",
        # Western European
        "de", "de-DE", "de-AT", "de-CH",
        "fr", "fr-FR", "fr-CA", "fr-BE", "fr-CH", "fr-LU",
        "es", "es-ES", "es-MX", "es-AR", "es-CO", "es-CL", "es-PE", "es-VE", "es-LA",
        "it", "it-IT", "it-CH",
        "pt", "pt-PT", "pt-BR",
        "nl", "nl-NL", "nl-BE",
        # Nordic languages
        "da", "da-DK",
        "sv", "sv-SE",
        "no", "no-NO", "nb", "nn",
        "fi", "fi-FI",
        "is", "is-IS",
        # Eastern European
        "pl", "pl-PL",
        "cs", "cs-CZ",
        "sk", "sk-SK",
        "hu", "hu-HU",
        "ro", "ro-RO",
        "bg", "bg-BG",
        "hr", "hr-HR",
        "sr", "sr-RS",
        "sl", "sl-SI",
        "uk", "uk-UA",
        "ru", "ru-RU",
        # Asian languages
        "zh", "zh-CN", "zh-TW", "zh-HK",
        "ja", "ja-JP",
        "ko", "ko-KR",
        "th", "th-TH",
        "vi", "vi-VN",
        "id", "id-ID",
        "ms", "ms-MY",
        "fil", "tl",
        # Middle Eastern
        "tr", "tr-TR",
        "ar", "ar-SA", "ar-AE", "ar-EG",
        "he", "he-IL",
        "fa", "fa-IR",
        # Other
        "hi", "hi-IN",
        "bn", "bn-IN",
        "ta", "ta-IN",
        "te", "te-IN",
        "mr", "mr-IN",
        "el", "el-GR",
        "ca", "ca-ES",
        "eu", "eu-ES",
    ]

    def generate_opera_ua():
        """Return one Opera UA string built from a randomly picked template."""
        template = random.choice(OPERA_PATTERNS)
        fields = {'lang': random.choice(LANGUAGES)}

        if '{version}' in template:
            fields['version'] = random.choice(OPERA_MINI_VERSIONS)
        if '{build}' in template:
            # The Opera Mobi template draws from its own build list
            build_choices = OPERA_MOBI_BUILDS if "Opera Mobi" in template else BUILD_NUMBERS
            fields['build'] = random.choice(build_choices)
        if '{presto}' in template:
            fields['presto'] = random.choice(PRESTO_VERSIONS)
        if '{final}' in template:
            fields['final'] = random.choice(FINAL_VERSIONS)

        return template.format(**fields)

    def generate_ua_pool(count=10):
        """Return a list of ``count`` Opera UA strings.

        Distinct strings are generated until ``count`` are collected or
        ``count * 100`` attempts have been made. Any shortfall is padded
        with DEFAULT_FALLBACK_UA.
        """
        unique_uas = set()
        attempts_left = count * 100

        try:
            while attempts_left > 0 and len(unique_uas) < count:
                unique_uas.add(generate_opera_ua())
                attempts_left -= 1
        except Exception:
            # Nothing generated at all: hand back just the fallback
            if not unique_uas:
                return [DEFAULT_FALLBACK_UA]

        # Pad with the fallback when too few strings were produced
        pool = list(unique_uas)
        while len(pool) < count:
            pool.append(DEFAULT_FALLBACK_UA)

        return pool
|
||||
|
||||
|
||||
def main():
    """Generate Opera UA strings and print them to stdout, one per line.

    The optional first command-line argument is the number of strings to
    generate (default 10). Status messages go to stderr so stdout can be
    piped; an invalid or non-positive count exits with status 1.
    """
    requested = 10  # Default
    cli_args = sys.argv[1:]
    if cli_args:
        raw_count = cli_args[0]
        try:
            requested = int(raw_count)
        except ValueError:
            print(f"Error: Invalid count '{raw_count}'. Must be an integer.", file=sys.stderr)
            sys.exit(1)
        if requested < 1:
            print("Error: Count must be a positive integer", file=sys.stderr)
            sys.exit(1)

    # Report which generator is in use (stderr keeps stdout clean)
    if USE_APP_MODULE:
        mode_note = "# Using app.utils.ua_generator module"
    else:
        mode_note = "# Using standalone generator (app module not available)"
    print(mode_note, file=sys.stderr)

    print(f"# Generating {requested} Opera User Agent strings...\n", file=sys.stderr)

    generated = generate_ua_pool(requested)

    # One UA per line, no numbering
    for agent in generated:
        print(agent)

    # Summary also goes to stderr so it doesn't interfere with piping
    print(f"\n# Generated {len(generated)} unique User Agent strings", file=sys.stderr)


if __name__ == '__main__':
    main()
|
||||
|
||||
|
|
@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|||
|
||||
[tool.ruff]
|
||||
line-length = 100
|
||||
target-version = "py311"
|
||||
target-version = "py312"
|
||||
lint.select = [
|
||||
"E", "F", "W", # pycodestyle/pyflakes
|
||||
"I", # isort
|
||||
|
|
@ -13,4 +13,4 @@ lint.ignore = []
|
|||
|
||||
[tool.black]
|
||||
line-length = 100
|
||||
target-version = ['py311']
|
||||
target-version = ['py312']
|
||||
|
|
|
|||
|
|
@ -1,16 +1,15 @@
|
|||
attrs==25.3.0
|
||||
beautifulsoup4==4.13.5
|
||||
brotli==1.1.0
|
||||
brotli==1.2.0
|
||||
certifi==2025.8.3
|
||||
cffi==2.0.0
|
||||
click==8.3.0
|
||||
cryptography==3.3.2; platform_machine == 'armv7l'
|
||||
cryptography==46.0.1; platform_machine != 'armv7l'
|
||||
cryptography==46.0.1
|
||||
cssutils==2.11.1
|
||||
defusedxml==0.7.1
|
||||
Flask==2.3.2
|
||||
Flask==3.1.2
|
||||
idna==3.10
|
||||
itsdangerous==2.1.2
|
||||
itsdangerous==2.2.0
|
||||
Jinja2==3.1.6
|
||||
MarkupSafe==3.0.2
|
||||
more-itertools==10.8.0
|
||||
|
|
@ -18,10 +17,9 @@ packaging==25.0
|
|||
pluggy==1.6.0
|
||||
pycodestyle==2.14.0
|
||||
pycparser==2.22
|
||||
pyOpenSSL==19.1.0; platform_machine == 'armv7l'
|
||||
pyOpenSSL==25.3.0; platform_machine != 'armv7l'
|
||||
pyOpenSSL==25.3.0
|
||||
pyparsing==3.2.5
|
||||
pytest==7.2.1
|
||||
pytest==8.3.3
|
||||
python-dateutil==2.9.0.post0
|
||||
httpx[http2,socks]==0.28.1
|
||||
cachetools==6.2.0
|
||||
|
|
@ -32,5 +30,5 @@ h11>=0.16.0
|
|||
validators==0.35.0
|
||||
waitress==3.0.2
|
||||
wcwidth==0.2.14
|
||||
Werkzeug==3.0.6
|
||||
Werkzeug==3.1.4
|
||||
python-dotenv==1.1.1
|
||||
|
|
|
|||
|
|
@ -1,11 +1,13 @@
|
|||
from app import app
|
||||
from app.request import Request
|
||||
from app.utils.session import generate_key
|
||||
from test.mock_google import build_mock_response
|
||||
import httpx
|
||||
import pytest
|
||||
import random
|
||||
|
||||
demo_config = {
|
||||
'near': random.choice(['Seattle', 'New York', 'San Francisco']),
|
||||
'dark': str(random.getrandbits(1)),
|
||||
'nojs': str(random.getrandbits(1)),
|
||||
'lang_interface': random.choice(app.config['LANGUAGES'])['value'],
|
||||
'lang_search': random.choice(app.config['LANGUAGES'])['value'],
|
||||
|
|
@ -13,6 +15,38 @@ demo_config = {
|
|||
}
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def mock_google(monkeypatch):
    """Patch ``Request.send`` and ``Request.autocomplete`` with local fakes.

    Calls with no base URL, or with a base URL containing
    ``google.com/search``, receive a generated mock results page; every
    other call is forwarded to the original ``Request.send``.
    """
    real_send = Request.send

    def fake_send(self, base_url='', query='', attempt=0,
                  force_mobile=False, user_agent=''):
        targets_google = not base_url or 'google.com/search' in base_url
        if targets_google:
            body = build_mock_response(
                query,
                getattr(self, 'language', ''),
                getattr(self, 'country', ''),
            )
            mocked_request = httpx.Request('GET', (base_url or self.search_url) + query)
            return httpx.Response(200, request=mocked_request, text=body)

        # Anything that is not a Google search goes through untouched
        return real_send(self, base_url, query, attempt,
                         force_mobile, user_agent)

    def fake_autocomplete(self, q):
        needle = q.replace('+', ' ').lower()
        canned = (
            ('green eggs and', 'green eggs and ham'),
            ('the cat in the', 'the cat in the hat'),
        )
        matches = [completion for fragment, completion in canned if fragment in needle]
        if needle.startswith('who'):
            matches += ['whoogle', 'whoogle search']
        return matches

    monkeypatch.setattr(Request, 'send', fake_send)
    monkeypatch.setattr(Request, 'autocomplete', fake_autocomplete)
    yield
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def client():
|
||||
with app.test_client() as client:
|
||||
|
|
|
|||
136
test/mock_google.py
Normal file
136
test/mock_google.py
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
from urllib.parse import parse_qs, unquote, quote
|
||||
|
||||
from app.models.config import Config
|
||||
|
||||
# (title, url template, snippet template) entries appended to every mock
# result page; the {slug}/{term}/{title} placeholders are filled per query.
DEFAULT_RESULTS = [
    (
        'Example Domain',
        'https://example.com/{slug}',
        'Example information about {term}.',
    ),
    (
        'Whoogle Search',
        'https://github.com/benbusby/whoogle-search',
        'Private self-hosted Google proxy',
    ),
    (
        'Wikipedia',
        'https://en.wikipedia.org/wiki/{title}',
        '{title} – encyclopedia entry.',
    ),
]
|
||||
|
||||
|
||||
def _result_block(title, href, snippet):
|
||||
encoded_href = quote(href, safe=':/')
|
||||
return (
|
||||
f'<div class="ZINbbc xpd O9g5cc uUPGi">'
|
||||
f'<div class="kCrYT">'
|
||||
f'<a href="/url?q={encoded_href}&sa=U&ved=2ahUKE">'
|
||||
f'<h3 class="BNeawe vvjwJb AP7Wnd">{title}</h3>'
|
||||
f'<span class="CVA68e">{title}</span>'
|
||||
f'</a>'
|
||||
f'<div class="VwiC3b">{snippet}</div>'
|
||||
f'</div>'
|
||||
f'</div>'
|
||||
)
|
||||
|
||||
|
||||
def _main_results(query, params, language='', country=''):
    """Return the concatenated result-card HTML for a mock search.

    Keyword-specific entries (wikipedia, pinterest, whoogle, github) are
    added when the lowercased query contains the keyword, followed by the
    DEFAULT_RESULTS entries. Entries are de-duplicated by URL, keeping the
    first occurrence. For "wikipedia" queries a Japanese entry is used when
    any of ``hl``/``gl``/``lr``, the ``language``/``country`` arguments or
    the decoded ``preferences`` parameter point at Japanese/Japan.
    """
    term = query.lower()
    slug = query.replace(' ', '-')

    # Language/country hints carried inside an encoded preferences blob
    pref_lang = pref_country = ''
    if 'preferences' in params:
        try:
            decoded = Config(**{})._decode_preferences(params['preferences'][0])
            pref_lang = str(decoded.get('lang_interface', '') or '').lower()
            pref_country = str(decoded.get('country', '') or '').lower()
        except Exception:
            # Undecodable preferences are treated as absent
            pref_lang = pref_country = ''

    results = []

    if 'wikipedia' in term:
        hl = str(params.get('hl', [''])[0] or '').lower()
        gl = str(params.get('gl', [''])[0] or '').lower()
        lr = str(params.get('lr', [''])[0] or '').lower()
        language_code = str(language or '').lower()
        country_code = str(country or '').lower()
        japanese_signals = (
            hl.startswith('ja'),
            gl.startswith('jp'),
            lr.endswith('lang_ja'),
            language_code.endswith('lang_ja'),
            country_code.startswith('jp'),
            pref_lang.endswith('lang_ja'),
            pref_country.startswith('jp'),
        )
        if any(japanese_signals):
            wiki_entry = (
                'ウィキペディア',
                'https://ja.wikipedia.org/wiki/ウィキペディア',
                '日本語版ウィキペディアの記事です。',
            )
        else:
            wiki_entry = (
                'Wikipedia',
                'https://www.wikipedia.org/wiki/Wikipedia',
                'Wikipedia is a free online encyclopedia.',
            )
        results.append(wiki_entry)

    # Remaining keyword-triggered entries, in their fixed order
    keyword_entries = (
        ('pinterest', (
            'Pinterest',
            'https://www.pinterest.com/ideas/',
            'Discover recipes, home ideas, style inspiration and other ideas.',
        )),
        ('whoogle', (
            'Whoogle Search GitHub',
            'https://github.com/benbusby/whoogle-search',
            'Source code for Whoogle Search.',
        )),
        ('github', (
            'GitHub',
            f'https://github.com/search?q={slug}',
            'GitHub is a development platform to host and review code.',
        )),
    )
    results.extend(entry for keyword, entry in keyword_entries if keyword in term)

    for title, url, snippet in DEFAULT_RESULTS:
        results.append((
            title,
            url.format(slug=slug, term=term, title=title.replace(' ', '_')),
            snippet.format(term=query, title=title),
        ))

    # Keep only the first entry for each URL, preserving order
    seen_urls = set()
    blocks = []
    for entry in results:
        link = entry[1]
        if link not in seen_urls:
            seen_urls.add(link)
            blocks.append(_result_block(*entry))

    return ''.join(blocks)
|
||||
|
||||
|
||||
def build_mock_response(raw_query, language='', country=''):
    """Return a complete mock results page for ``raw_query``.

    ``raw_query`` is split at the first ``&``: the part before it is the
    URL-encoded search term, the rest is parsed as extra query parameters.
    The page contains the result cards, a search form pre-filled with the
    term (double quotes removed) and a footer with two pagination links.
    """
    q_part, _, extra = raw_query.partition('&')

    query = unquote(q_part)
    params = parse_qs(extra)

    results_html = _main_results(query, params, language, country)
    safe_query = query.replace('"', '')
    pagination = ''.join(
        f'<a href="/search?q={q_part}&start={offset}">{label}</a>'
        for offset, label in ((10, 'Next'), (20, 'More'))
    )

    document = [
        '<html>',
        '<head><title>Mock Google Results</title></head>',
        '<body>',
        f'<div id="main">{results_html}</div>',
        '<form action="/search" method="GET">',
        f'<input name="q" value="{safe_query}">',
        '</form>',
        f'<footer class="TuS8Ad">{pagination}</footer>',
        '</body>',
        '</html>',
    ]
    return ''.join(document)
|
||||
|
|
@ -66,16 +66,5 @@ def test_prefs_url(client):
|
|||
|
||||
rv = client.get(f'{base_url}&preferences={JAPAN_PREFS}')
|
||||
assert rv._status_code == 200
|
||||
# Leta may format results differently than Google, so check for either:
|
||||
# 1. Japanese Wikipedia URL (Google's format)
|
||||
# 2. Japanese language results (indicated by Japanese characters or lang param)
|
||||
# 3. Any Wikipedia result (Leta may not localize URLs the same way)
|
||||
has_ja_wiki = b'ja.wikipedia.org' in rv.data
|
||||
has_japanese_content = b'\xe3\x82' in rv.data or b'\xe3\x83' in rv.data # Japanese characters
|
||||
has_wiki_result = b'wikipedia.org' in rv.data
|
||||
|
||||
# Test passes if we get Japanese Wikipedia, Japanese content, or any Wikipedia result
|
||||
# (Leta backend may handle language preferences differently)
|
||||
assert has_ja_wiki or has_japanese_content or has_wiki_result, \
|
||||
"Expected Japanese Wikipedia results or Japanese content in response"
|
||||
assert b'ja.wikipedia.org' in rv.data
|
||||
|
||||
|
|
|
|||
|
|
@ -75,14 +75,14 @@ def test_config(client):
|
|||
|
||||
# Test disabling changing config from client
|
||||
app.config['CONFIG_DISABLE'] = 1
|
||||
dark_mod = not demo_config['dark']
|
||||
demo_config['dark'] = dark_mod
|
||||
nojs_mod = not bool(int(demo_config['nojs']))
|
||||
demo_config['nojs'] = str(int(nojs_mod))
|
||||
rv = client.post(f'/{Endpoint.config}', data=demo_config)
|
||||
assert rv._status_code == 403
|
||||
|
||||
rv = client.get(f'/{Endpoint.config}')
|
||||
config = json.loads(rv.data)
|
||||
assert config['dark'] != dark_mod
|
||||
assert config['nojs'] != nojs_mod
|
||||
|
||||
|
||||
def test_opensearch(client):
|
||||
|
|
|
|||
|
|
@ -72,9 +72,6 @@
|
|||
# Remove everything except basic result cards from all search queries
|
||||
#WHOOGLE_MINIMAL=0
|
||||
|
||||
# Set the number of results per page
|
||||
#WHOOGLE_RESULTS_PER_PAGE=10
|
||||
|
||||
# Controls visibility of autocomplete/search suggestions
|
||||
#WHOOGLE_AUTOCOMPLETE=1
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue