mirror of
https://github.com/benbusby/whoogle-search.git
synced 2026-03-11 08:54:34 +00:00
Compare commits
65 commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2949510d68 | ||
|
|
255f1a2c12 | ||
|
|
4852e5b64f | ||
|
|
9c5b3150aa | ||
|
|
6c7ca7c082 | ||
|
|
ff3a44b91e | ||
|
|
b3c09ade5c | ||
|
|
a2ec4e9f22 | ||
|
|
db6d031e86 | ||
|
|
c96f5ada2e | ||
|
|
ccdeb60fc0 | ||
|
|
20ed493671 | ||
|
|
20753224f3 | ||
|
|
71a2c10e58 | ||
|
|
9ff2d2f90a | ||
|
|
0f000a676b | ||
|
|
7b56aa053b | ||
|
|
f9f54115e3 | ||
|
|
c008090d83 | ||
|
|
6bcde23501 | ||
|
|
3698d9065e | ||
|
|
cffef7aa15 | ||
|
|
178d67a73f | ||
|
|
65326e37b4 | ||
|
|
490fc6c4f9 | ||
|
|
9b3a6ce550 | ||
|
|
5f17b82735 | ||
|
|
00d8aec2fb | ||
|
|
c46ec6f937 | ||
|
|
65c0c99dad | ||
|
|
20111a8f88 | ||
|
|
bb3347f7ff | ||
|
|
e0a4a5f2cb | ||
|
|
457725ee5a | ||
|
|
442060b2ef | ||
|
|
ca214cb563 | ||
|
|
33cdaf390d | ||
|
|
9dd33de91a | ||
|
|
0fe29daaf1 | ||
|
|
579d983db8 | ||
|
|
be83605c77 | ||
|
|
ffdeeb5f44 | ||
|
|
99c7c7b00d | ||
|
|
7f80eb1e51 | ||
|
|
418d9df89c | ||
|
|
3733d87546 | ||
|
|
6782413560 | ||
|
|
1c1dcfc270 | ||
|
|
ba757b64e8 | ||
|
|
3476367ee1 | ||
|
|
7ce8c0b216 | ||
|
|
e24f2d751c | ||
|
|
ec9e7877b6 | ||
|
|
c70497d532 | ||
|
|
69d1ddae0c | ||
|
|
339eb61cea | ||
|
|
e4cabe3e5b | ||
|
|
f25611cbcb | ||
|
|
97502de606 | ||
|
|
1339c49dc5 | ||
|
|
dda91ad155 | ||
|
|
d6c8d73147 | ||
|
|
123a00669c | ||
|
|
5218d97f02 | ||
|
|
fb19bded0d |
57 changed files with 3945 additions and 477 deletions
75
.github/workflows/buildx.yml
vendored
75
.github/workflows/buildx.yml
vendored
|
|
@ -3,12 +3,15 @@ name: buildx
|
|||
on:
|
||||
workflow_run:
|
||||
workflows: ["docker_main"]
|
||||
branches: [main]
|
||||
branches: [main, updates]
|
||||
types:
|
||||
- completed
|
||||
push:
|
||||
tags:
|
||||
- '*'
|
||||
release:
|
||||
types:
|
||||
- published
|
||||
|
||||
jobs:
|
||||
on-success:
|
||||
|
|
@ -17,43 +20,73 @@ jobs:
|
|||
- name: Wait for tests to succeed
|
||||
if: ${{ github.event.workflow_run.conclusion != 'success' && startsWith(github.ref, 'refs/tags') != true }}
|
||||
run: exit 1
|
||||
- name: Debug workflow context
|
||||
run: |
|
||||
echo "Event name: ${{ github.event_name }}"
|
||||
echo "Ref: ${{ github.ref }}"
|
||||
echo "Actor: ${{ github.actor }}"
|
||||
echo "Branch: ${{ github.event.workflow_run.head_branch }}"
|
||||
echo "Conclusion: ${{ github.event.workflow_run.conclusion }}"
|
||||
- name: checkout code
|
||||
uses: actions/checkout@v2
|
||||
- name: install buildx
|
||||
id: buildx
|
||||
uses: crazy-max/ghaction-docker-buildx@v1
|
||||
with:
|
||||
version: latest
|
||||
uses: actions/checkout@v4
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v1
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
- name: Login to ghcr.io
|
||||
uses: docker/login-action@v1
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
- name: build and push the image
|
||||
if: startsWith(github.ref, 'refs/heads/main') && github.actor == 'benbusby'
|
||||
# Disabled: only build on release events now
|
||||
# - name: build and push the image
|
||||
# if: startsWith(github.ref, 'refs/heads/main') && (github.actor == 'benbusby' || github.actor == 'Don-Swanson')
|
||||
# run: |
|
||||
# docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
|
||||
# docker buildx ls
|
||||
# docker buildx build --push \
|
||||
# --tag benbusby/whoogle-search:latest \
|
||||
# --platform linux/amd64,linux/arm64 .
|
||||
# docker buildx build --push \
|
||||
# --tag ghcr.io/benbusby/whoogle-search:latest \
|
||||
# --platform linux/amd64,linux/arm64 .
|
||||
- name: build and push updates branch (update-testing tag)
|
||||
if: github.event_name == 'workflow_run' && github.event.workflow_run.head_branch == 'updates' && github.event.workflow_run.conclusion == 'success' && (github.event.workflow_run.actor.login == 'benbusby' || github.event.workflow_run.actor.login == 'Don-Swanson')
|
||||
run: |
|
||||
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
|
||||
docker buildx ls
|
||||
docker buildx build --push \
|
||||
--tag benbusby/whoogle-search:latest \
|
||||
--tag benbusby/whoogle-search:update-testing \
|
||||
--tag ghcr.io/benbusby/whoogle-search:update-testing \
|
||||
--platform linux/amd64,linux/arm64 .
|
||||
- name: build and push release (version + latest)
|
||||
if: github.event_name == 'release' && github.event.release.prerelease == false && (github.actor == 'benbusby' || github.actor == 'Don-Swanson')
|
||||
run: |
|
||||
TAG="${{ github.event.release.tag_name }}"
|
||||
VERSION="${TAG#v}"
|
||||
docker buildx build --push \
|
||||
--tag benbusby/whoogle-search:${VERSION} \
|
||||
--tag benbusby/whoogle-search:latest \
|
||||
--tag ghcr.io/benbusby/whoogle-search:${VERSION} \
|
||||
--tag ghcr.io/benbusby/whoogle-search:latest \
|
||||
--platform linux/amd64,linux/arm64 .
|
||||
- name: build and push pre-release (version only)
|
||||
if: github.event_name == 'release' && github.event.release.prerelease == true && (github.actor == 'benbusby' || github.actor == 'Don-Swanson')
|
||||
run: |
|
||||
TAG="${{ github.event.release.tag_name }}"
|
||||
VERSION="${TAG#v}"
|
||||
docker buildx build --push \
|
||||
--tag benbusby/whoogle-search:${VERSION} \
|
||||
--tag ghcr.io/benbusby/whoogle-search:${VERSION} \
|
||||
--platform linux/amd64,linux/arm64 .
|
||||
- name: build and push tag
|
||||
if: startsWith(github.ref, 'refs/tags')
|
||||
run: |
|
||||
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
|
||||
docker buildx ls
|
||||
docker buildx build --push \
|
||||
--tag benbusby/whoogle-search:${GITHUB_REF#refs/*/v}\
|
||||
--platform linux/amd64,linux/arm/v7,linux/arm64 .
|
||||
docker buildx build --push \
|
||||
--tag ghcr.io/benbusby/whoogle-search:${GITHUB_REF#refs/*/v}\
|
||||
--platform linux/amd64,linux/arm/v7,linux/arm64 .
|
||||
--tag benbusby/whoogle-search:${GITHUB_REF#refs/*/v} \
|
||||
--tag ghcr.io/benbusby/whoogle-search:${GITHUB_REF#refs/*/v} \
|
||||
--platform linux/amd64,linux/arm64 .
|
||||
|
|
|
|||
5
.github/workflows/docker_main.yml
vendored
5
.github/workflows/docker_main.yml
vendored
|
|
@ -3,7 +3,7 @@ name: docker_main
|
|||
on:
|
||||
workflow_run:
|
||||
workflows: ["tests"]
|
||||
branches: [main]
|
||||
branches: [main, updates]
|
||||
types:
|
||||
- completed
|
||||
|
||||
|
|
@ -11,9 +11,10 @@ on:
|
|||
jobs:
|
||||
on-success:
|
||||
runs-on: ubuntu-latest
|
||||
if: ${{ github.event.workflow_run.conclusion == 'success' }}
|
||||
steps:
|
||||
- name: checkout code
|
||||
uses: actions/checkout@v2
|
||||
uses: actions/checkout@v4
|
||||
- name: build and test (docker)
|
||||
run: |
|
||||
docker build --tag whoogle-search:test .
|
||||
|
|
|
|||
20
.github/workflows/pypi.yml
vendored
20
.github/workflows/pypi.yml
vendored
|
|
@ -38,21 +38,37 @@ jobs:
|
|||
password: ${{ secrets.TEST_PYPI_API_TOKEN }}
|
||||
repository_url: https://test.pypi.org/legacy/
|
||||
publish:
|
||||
# Gate real PyPI publishing to stable SemVer tags only
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
name: Build and publish to PyPI
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Check if stable release
|
||||
id: check_tag
|
||||
run: |
|
||||
TAG="${{ github.ref_name }}"
|
||||
if echo "$TAG" | grep -qE '^v?[0-9]+\.[0-9]+\.[0-9]+$'; then
|
||||
echo "is_stable=true" >> $GITHUB_OUTPUT
|
||||
echo "Tag '$TAG' is a stable release. Will publish to PyPI."
|
||||
else
|
||||
echo "is_stable=false" >> $GITHUB_OUTPUT
|
||||
echo "Tag '$TAG' is not a stable release (contains pre-release suffix). Skipping PyPI publish."
|
||||
fi
|
||||
- name: Set up Python 3.9
|
||||
if: steps.check_tag.outputs.is_stable == 'true'
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: 3.9
|
||||
- name: Install pypa/build
|
||||
if: steps.check_tag.outputs.is_stable == 'true'
|
||||
run: >-
|
||||
python -m
|
||||
pip install
|
||||
build
|
||||
--user
|
||||
- name: Build binary wheel and source tarball
|
||||
if: steps.check_tag.outputs.is_stable == 'true'
|
||||
run: >-
|
||||
python -m
|
||||
build
|
||||
|
|
@ -61,7 +77,7 @@ jobs:
|
|||
--outdir dist/
|
||||
.
|
||||
- name: Publish distribution to PyPI
|
||||
if: startsWith(github.ref, 'refs/tags')
|
||||
if: steps.check_tag.outputs.is_stable == 'true'
|
||||
uses: pypa/gh-action-pypi-publish@master
|
||||
with:
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
33
.github/workflows/stale.yml
vendored
Normal file
33
.github/workflows/stale.yml
vendored
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
# This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time.
|
||||
#
|
||||
# You can adjust the behavior by modifying this file.
|
||||
# For more information, see:
|
||||
# https://github.com/actions/stale
|
||||
name: Mark stale issues and pull requests
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: '35 10 * * *'
|
||||
|
||||
jobs:
|
||||
stale:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
|
||||
steps:
|
||||
- uses: actions/stale@v10
|
||||
with:
|
||||
days-before-stale: 90
|
||||
days-before-close: 7
|
||||
stale-issue-message: 'This issue has been automatically marked as stale due to inactivity. If it is still valid please comment within 7 days or it will be auto-closed.'
|
||||
close-issue-message: 'Closing this issue due to prolonged inactivity.'
|
||||
# Disabled PR Closing for now, but pre-staged the settings
|
||||
days-before-pr-stale: -1
|
||||
days-before-pr-close: -1
|
||||
operations-per-run: 100
|
||||
stale-pr-message: "This PR appears to be stale. If it is still valid please comment within 14 days or it will be auto-closed."
|
||||
close-pr-message: "This PR was closed as stale."
|
||||
exempt-issue-labels: 'keep-open,enhancement,critical,dependencies,documentation'
|
||||
13
.pre-commit-config.yaml
Normal file
13
.pre-commit-config.yaml
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
repos:
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.6.9
|
||||
hooks:
|
||||
- id: ruff
|
||||
args: [--fix]
|
||||
- id: ruff-format
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 24.8.0
|
||||
hooks:
|
||||
- id: black
|
||||
args: [--quiet]
|
||||
|
||||
25
Dockerfile
25
Dockerfile
|
|
@ -1,4 +1,14 @@
|
|||
FROM python:3.12.6-alpine3.20 AS builder
|
||||
# NOTE: ARMv7 support has been dropped due to lack of pre-built cryptography wheels for Alpine/musl.
|
||||
# To restore ARMv7 support for local builds:
|
||||
# 1. Change requirements.txt:
|
||||
# cryptography==3.3.2; platform_machine == 'armv7l'
|
||||
# cryptography==46.0.1; platform_machine != 'armv7l'
|
||||
# pyOpenSSL==19.1.0; platform_machine == 'armv7l'
|
||||
# pyOpenSSL==25.3.0; platform_machine != 'armv7l'
|
||||
# 2. Add linux/arm/v7 to --platform flag when building:
|
||||
# docker buildx build --platform linux/amd64,linux/arm/v7,linux/arm64 .
|
||||
|
||||
FROM python:3.12-alpine3.22 AS builder
|
||||
|
||||
RUN apk --no-cache add \
|
||||
build-base \
|
||||
|
|
@ -12,13 +22,16 @@ COPY requirements.txt .
|
|||
RUN pip install --upgrade pip
|
||||
RUN pip install --prefix /install --no-warn-script-location --no-cache-dir -r requirements.txt
|
||||
|
||||
FROM python:3.12.6-alpine3.20
|
||||
FROM python:3.12-alpine3.22
|
||||
|
||||
RUN apk add --no-cache tor curl openrc libstdc++
|
||||
# Remove bridge package to avoid CVEs (not needed for Docker containers)
|
||||
RUN apk add --no-cache --no-scripts tor curl openrc libstdc++ && \
|
||||
apk del --no-cache bridge || true
|
||||
# git go //for obfs4proxy
|
||||
# libcurl4-openssl-dev
|
||||
|
||||
RUN apk --no-cache upgrade
|
||||
RUN pip install --upgrade pip
|
||||
RUN apk --no-cache upgrade && \
|
||||
apk del --no-cache --rdepends bridge || true
|
||||
|
||||
# uncomment to build obfs4proxy
|
||||
# RUN git clone https://gitlab.com/yawning/obfs4.git
|
||||
|
|
@ -100,4 +113,4 @@ EXPOSE $EXPOSE_PORT
|
|||
HEALTHCHECK --interval=30s --timeout=5s \
|
||||
CMD curl -f http://localhost:${EXPOSE_PORT}/healthz || exit 1
|
||||
|
||||
CMD misc/tor/start-tor.sh & ./run
|
||||
CMD ["/bin/sh", "-c", "misc/tor/start-tor.sh & ./run"]
|
||||
|
|
|
|||
351
README.md
351
README.md
|
|
@ -1,10 +1,10 @@
|
|||
>[!WARNING]
|
||||
>
|
||||
>As of 16 January, 2025, Google seemingly no longer supports performing search queries without JavaScript enabled. This is a fundamental part of how Whoogle
|
||||
>Since 16 January, 2025, Google has been attacking the ability to perform search queries without JavaScript enabled. This is a fundamental part of how Whoogle
|
||||
>works -- Whoogle requests the JavaScript-free search results, then filters out garbage from the results page and proxies all external content for the user.
|
||||
>
|
||||
>This is possibly a breaking change that will mean the end for Whoogle. I'll continue monitoring the status of their JS-free results and looking into workarounds,
|
||||
>and will make another post if a solution is found (or not).
|
||||
>This is possibly a breaking change that may mean the end for Whoogle. We'll continue fighting back and releasing workarounds until all workarounds are
|
||||
>exhausted or a better method is found. If you know of a better way, please review and comment in our Way Forward Discussion.
|
||||
|
||||
___
|
||||
|
||||
|
|
@ -14,7 +14,6 @@ ___
|
|||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://github.com/benbusby/whoogle-search/actions/workflows/tests.yml)
|
||||
[](https://github.com/benbusby/whoogle-search/actions/workflows/buildx.yml)
|
||||
[](https://codebeat.co/projects/github-com-benbusby-shoogle-master)
|
||||
[](https://hub.docker.com/r/benbusby/whoogle-search)
|
||||
|
||||
<table>
|
||||
|
|
@ -41,8 +40,9 @@ Contents
|
|||
1. [Arch/AUR](#arch-linux--arch-based-distributions)
|
||||
1. [Helm/Kubernetes](#helm-chart-for-kubernetes)
|
||||
4. [Environment Variables and Configuration](#environment-variables)
|
||||
5. [Usage](#usage)
|
||||
6. [Extra Steps](#extra-steps)
|
||||
5. [Google Custom Search (BYOK)](#google-custom-search-byok)
|
||||
6. [Usage](#usage)
|
||||
7. [Extra Steps](#extra-steps)
|
||||
1. [Set Primary Search Engine](#set-whoogle-as-your-primary-search-engine)
|
||||
2. [Custom Redirecting](#custom-redirecting)
|
||||
2. [Custom Bangs](#custom-bangs)
|
||||
|
|
@ -51,10 +51,10 @@ Contents
|
|||
5. [Using with Firefox Containers](#using-with-firefox-containers)
|
||||
6. [Reverse Proxying](#reverse-proxying)
|
||||
1. [Nginx](#nginx)
|
||||
7. [Contributing](#contributing)
|
||||
8. [FAQ](#faq)
|
||||
9. [Public Instances](#public-instances)
|
||||
10. [Screenshots](#screenshots)
|
||||
8. [Contributing](#contributing)
|
||||
9. [FAQ](#faq)
|
||||
10. [Public Instances](#public-instances)
|
||||
11. [Screenshots](#screenshots)
|
||||
|
||||
## Features
|
||||
- No ads or sponsored content
|
||||
|
|
@ -69,12 +69,18 @@ Contents
|
|||
- POST request search and suggestion queries (when possible)
|
||||
- View images at full res without site redirect (currently mobile only)
|
||||
- Light/Dark/System theme modes (with support for [custom CSS theming](https://github.com/benbusby/whoogle-search/wiki/User-Contributed-CSS-Themes))
|
||||
- Randomly generated User Agent
|
||||
- Auto-generated Opera User Agents with random rotation
|
||||
- 10 unique Opera-based UAs generated on startup from 115 language variants
|
||||
- Randomly rotated for each search request to avoid detection patterns
|
||||
- Cached across restarts with configurable refresh options
|
||||
- Fallback to safe default UA if generation fails
|
||||
- Optional display of current UA in search results footer
|
||||
- Easy to install/deploy
|
||||
- DDG-style bang (i.e. `!<tag> <query>`) searches
|
||||
- User-defined [custom bangs](#custom-bangs)
|
||||
- Optional location-based searching (i.e. results near \<city\>)
|
||||
- Optional NoJS mode to view search results in a separate window with JavaScript blocked
|
||||
- JSON output for results via content negotiation (see "JSON results (API)")
|
||||
|
||||
<sup>*No third party JavaScript. Whoogle can be used with JavaScript disabled, but if enabled, uses JavaScript for things like presenting search suggestions.</sup>
|
||||
|
||||
|
|
@ -83,6 +89,17 @@ Contents
|
|||
<sup>***If deployed to a remote server, or configured to send requests through a VPN, Tor, proxy, etc.</sup>
|
||||
|
||||
## Install
|
||||
|
||||
### Supported Platforms
|
||||
Official Docker images are built for:
|
||||
- **linux/amd64** (x86_64)
|
||||
- **linux/arm64** (ARM 64-bit, Raspberry Pi 3/4/5, Apple Silicon)
|
||||
|
||||
**Note**: ARMv7 support (32-bit ARM, Raspberry Pi 2) was dropped in v1.2.0 due to incompatibility with modern security libraries on Alpine Linux. Users with ARMv7 devices can either:
|
||||
- Use an older version (v1.1.x or earlier)
|
||||
- Build locally with pinned dependencies (see notes in Dockerfile)
|
||||
- Upgrade to a 64-bit OS if hardware supports it (Raspberry Pi 3+)
|
||||
|
||||
There are a few different ways to begin using the app, depending on your preferences:
|
||||
|
||||
___
|
||||
|
|
@ -155,6 +172,13 @@ Use one of the following guides to install Whoogle on Koyeb:
|
|||
|
||||
___
|
||||
|
||||
### [RepoCloud](https://repocloud.io)
|
||||
[](https://repocloud.io/details/?app_id=309)
|
||||
|
||||
1. Sign up for a free [RepoCloud account](https://repocloud.io) and receive free credits to get started.
|
||||
2. Click "Deploy" to launch the app and access it instantly via your RepoCloud URL.
|
||||
___
|
||||
|
||||
### [pipx](https://github.com/pipxproject/pipx#install-pipx)
|
||||
Persistent install:
|
||||
|
||||
|
|
@ -399,7 +423,7 @@ To use the Kubernetes Helm Chart:
|
|||
1. Ensure you have [Helm](https://helm.sh/docs/intro/install/) `>=3.0.0` installed
|
||||
2. Clone this repository
|
||||
3. Update [charts/whoogle/values.yaml](./charts/whoogle/values.yaml) as desired
|
||||
4. Run `helm install whoogle ./charts/whoogle`
|
||||
4. Run `helm upgrade --install whoogle ./charts/whoogle`
|
||||
|
||||
___
|
||||
|
||||
|
|
@ -430,9 +454,12 @@ There are a few optional environment variables available for customizing a Whoog
|
|||
| WHOOGLE_PROXY_PASS | The password of the proxy server. |
|
||||
| WHOOGLE_PROXY_TYPE | The type of the proxy server. Can be "socks5", "socks4", or "http". |
|
||||
| WHOOGLE_PROXY_LOC | The location of the proxy server (host or ip). |
|
||||
| WHOOGLE_USER_AGENT | The desktop user agent to use. Defaults to a randomly generated one. |
|
||||
| WHOOGLE_USER_AGENT_MOBILE | The mobile user agent to use. Defaults to a randomly generated one. |
|
||||
| WHOOGLE_USER_AGENT | The desktop user agent to use when using the 'env_conf' option. Leave empty to use auto-generated Opera UAs. |
|
||||
| WHOOGLE_USER_AGENT_MOBILE | The mobile user agent to use when using the 'env_conf' option. Leave empty to use auto-generated Opera UAs. |
|
||||
| WHOOGLE_USE_CLIENT_USER_AGENT | Enable to use your own user agent for all requests. Defaults to false. |
|
||||
| WHOOGLE_UA_CACHE_PERSISTENT | Whether to persist auto-generated UAs across restarts. Set to '0' to regenerate on each startup. Default '1'. |
|
||||
| WHOOGLE_UA_CACHE_REFRESH_DAYS | Auto-refresh UA cache after N days. Set to '0' to never refresh (cache persists indefinitely). Default '0'. |
|
||||
| WHOOGLE_UA_LIST_FILE | Path to text file containing custom UA strings (one per line). When set, uses these instead of auto-generated UAs. |
|
||||
| WHOOGLE_REDIRECTS | Specify sites that should be redirected elsewhere. See [custom redirecting](#custom-redirecting). |
|
||||
| EXPOSE_PORT | The port where Whoogle will be exposed. |
|
||||
| HTTPS_ONLY | Enforce HTTPS. (See [here](https://github.com/benbusby/whoogle-search#https-enforcement)) |
|
||||
|
|
@ -449,13 +476,14 @@ There are a few optional environment variables available for customizing a Whoog
|
|||
| WHOOGLE_AUTOCOMPLETE | Controls visibility of autocomplete/search suggestions. Default on -- use '0' to disable. |
|
||||
| WHOOGLE_MINIMAL | Remove everything except basic result cards from all search queries. |
|
||||
| WHOOGLE_CSP | Sets a default set of 'Content-Security-Policy' headers |
|
||||
| WHOOGLE_RESULTS_PER_PAGE | Set the number of results per page |
|
||||
| WHOOGLE_TOR_SERVICE | Enable/disable the Tor service on startup. Default on -- use '0' to disable. |
|
||||
| WHOOGLE_TOR_USE_PASS | Use password authentication for tor control port. |
|
||||
| WHOOGLE_TOR_CONF | The absolute path to the config file containing the password for the tor control port. Default: ./misc/tor/control.conf. WHOOGLE_TOR_USE_PASS must be 1 for this to work. |
|
||||
| WHOOGLE_SHOW_FAVICONS | Show/hide favicons next to search result URLs. Default on. |
|
||||
| WHOOGLE_UPDATE_CHECK | Enable/disable the automatic daily check for new versions of Whoogle. Default on. |
|
||||
| WHOOGLE_FALLBACK_ENGINE_URL | Set a fallback search engine URL to use when there is an internal server error or the instance is rate-limited. The search query is appended to the end of the URL (e.g. https://duckduckgo.com/?k1=-1&q=). |
|
||||
| WHOOGLE_BUNDLE_STATIC | When set to 1, serve a single bundled CSS and JS file generated at startup to reduce requests. Default off. |
|
||||
| WHOOGLE_HTTP2 | Enable HTTP/2 for upstream requests (via httpx). Default on — set to 0 to force HTTP/1.1. |
|
||||
|
||||
### Config Environment Variables
|
||||
These environment variables allow setting default config values, but can be overwritten manually by using the home page config menu. These allow a shortcut for destroying/rebuilding an instance to the same config state every time.
|
||||
|
|
@ -482,12 +510,132 @@ These environment variables allow setting default config values, but can be over
|
|||
| WHOOGLE_CONFIG_PREFERENCES_ENCRYPTED | Encrypt preferences token, requires preferences key |
|
||||
| WHOOGLE_CONFIG_PREFERENCES_KEY | Key to encrypt preferences in URL (REQUIRED to show url) |
|
||||
| WHOOGLE_CONFIG_ANON_VIEW | Include the "anonymous view" option for each search result |
|
||||
| WHOOGLE_CONFIG_SHOW_USER_AGENT | Display the User Agent string used for search in results footer |
|
||||
|
||||
### Google Custom Search (BYOK) Environment Variables
|
||||
|
||||
These environment variables configure the "Bring Your Own Key" feature for Google Custom Search API:
|
||||
|
||||
| Variable | Description |
|
||||
| -------------------- | ----------------------------------------------------------------------------------------- |
|
||||
| WHOOGLE_CSE_API_KEY | Your Google API key with Custom Search API enabled |
|
||||
| WHOOGLE_CSE_ID | Your Custom Search Engine ID (cx parameter) |
|
||||
| WHOOGLE_USE_CSE | Enable Custom Search API by default (set to '1' to enable) |
|
||||
|
||||
## Google Custom Search (BYOK)
|
||||
|
||||
If Google blocks traditional search scraping (captchas, IP bans), you can use your own Google Custom Search Engine credentials as a fallback. This uses Google's official API with your own quota.
|
||||
|
||||
### Why Use This?
|
||||
|
||||
- **Reliability**: Official API never gets blocked or rate-limited (within quota)
|
||||
- **Speed**: Direct JSON responses are faster than HTML scraping
|
||||
- **Fallback**: Works when all scraping workarounds fail
|
||||
- **Privacy**: Your searches still don't go through third parties—they go directly to Google with your own API key
|
||||
|
||||
### Limitations vs Standard Whoogle
|
||||
|
||||
| Feature | Standard Scraping | CSE API |
|
||||
|------------------|--------------------------|---------------------|
|
||||
| Daily limit | None (until blocked) | 100 free, then paid |
|
||||
| Image search | ✅ Full support | ✅ Supported |
|
||||
| News/Videos tabs | ✅ | ❌ Web results only |
|
||||
| Speed | Slower (HTML parsing) | Faster (JSON) |
|
||||
| Reliability | Can be blocked | Always works |
|
||||
|
||||
### Setup Steps
|
||||
|
||||
#### 1. Create a Custom Search Engine
|
||||
1. Go to [Programmable Search Engine](https://programmablesearchengine.google.com/controlpanel/all)
|
||||
2. Click **"Add"** to create a new search engine
|
||||
3. Under "What to search?", select **"Search the entire web"**
|
||||
4. Give it a name (e.g., "My Whoogle CSE")
|
||||
5. Click **"Create"**
|
||||
6. Copy your **Search Engine ID**
|
||||
|
||||
#### 2. Get an API Key
|
||||
1. Go to [Google Cloud Console](https://console.cloud.google.com/)
|
||||
2. Create a new project or select an existing one
|
||||
3. Go to **APIs & Services** → **Library**
|
||||
4. Search for **"Custom Search API"** and click **Enable**
|
||||
5. Go to **APIs & Services** → **Credentials**
|
||||
6. Click **"Create Credentials"** → **"API Key"**
|
||||
7. Copy your API key (looks like `AIza...`)
|
||||
|
||||
#### 3. (Recommended) Restrict Your API Key
|
||||
To prevent misuse if your key is exposed:
|
||||
1. Click on your API key in Credentials
|
||||
2. Under **"API restrictions"**, select **"Restrict key"**
|
||||
3. Choose only **"Custom Search API"**
|
||||
4. Under **"Application restrictions"**, consider adding IP restrictions if using on a server
|
||||
5. Click **Save**
|
||||
|
||||
#### 4. Configure Whoogle
|
||||
|
||||
**Option A: Via Settings UI**
|
||||
1. Open your Whoogle instance
|
||||
2. Click the **Config** button
|
||||
3. Scroll to "Google Custom Search (BYOK)" section
|
||||
4. Enter your API Key and CSE ID
|
||||
5. Check "Use Custom Search API"
|
||||
6. Click **Apply**
|
||||
|
||||
**Option B: Via Environment Variables**
|
||||
```bash
|
||||
WHOOGLE_CSE_API_KEY=AIza...
|
||||
WHOOGLE_CSE_ID=23f...
|
||||
WHOOGLE_USE_CSE=1
|
||||
```
|
||||
|
||||
### Pricing & Avoiding Charges
|
||||
|
||||
| Tier | Queries | Cost |
|
||||
|------|------------------|-----------------------|
|
||||
| Free | 100/day | $0 |
|
||||
| Paid | Up to 10,000/day | $5 per 1,000 queries |
|
||||
|
||||
**⚠️ To avoid unexpected charges:**
|
||||
|
||||
1. **Don't add a payment method** to Google Cloud (safest option—API stops at 100/day)
|
||||
2. **Set a billing budget alert**: [Billing → Budgets & Alerts](https://console.cloud.google.com/billing/budgets)
|
||||
3. **Cap API usage**: APIs & Services → Custom Search API → Quotas → Set "Queries per day" to 100
|
||||
4. **Monitor usage**: APIs & Services → Custom Search API → Metrics
|
||||
|
||||
### Troubleshooting
|
||||
|
||||
| Error | Cause | Solution |
|
||||
|---------------------|---------------------------|-----------------------------------------------------------------|
|
||||
| "API key not valid" | Invalid or restricted key | Check key in Cloud Console, ensure Custom Search API is enabled |
|
||||
| "Quota exceeded" | Hit 100/day limit | Wait until midnight PT, or enable billing |
|
||||
| "Invalid CSE ID" | Wrong cx parameter | Copy ID from Programmable Search Engine control panel |
|
||||
|
||||
## Usage
|
||||
Same as most search engines, with the exception of filtering by time range.
|
||||
|
||||
To filter by a range of time, append ":past <time>" to the end of your search, where <time> can be `hour`, `day`, `month`, or `year`. Example: `coronavirus updates :past hour`
|
||||
|
||||
### JSON results (API)
|
||||
Whoogle can return filtered results as JSON using the same sanitization rules as the HTML view.
|
||||
|
||||
- Send `Accept: application/json` or append `format=json` to the search URL.
|
||||
- Example: `/search?q=whoogle` with `Accept: application/json`, or `/search?q=whoogle&format=json`.
|
||||
- Response shape:
|
||||
|
||||
```
|
||||
{
|
||||
"query": "whoogle",
|
||||
"search_type": "",
|
||||
"results": [
|
||||
{"href": "https://example.com/page", "text": "Example Page"},
|
||||
...
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Special cases:
|
||||
- Feeling Lucky returns HTTP 303 with body `{ "redirect": "<url>" }`.
|
||||
- Temporary blocks (captcha) return HTTP 503 with `{ "blocked": true, "error_message": "...", "query": "..." }`.
|
||||
|
||||
## Extra Steps
|
||||
|
||||
### Set Whoogle as your primary search engine
|
||||
|
|
@ -623,6 +771,148 @@ server {
|
|||
|
||||
You can then add SSL support using LetsEncrypt by following a guide such as [this one](https://www.nginx.com/blog/using-free-ssltls-certificates-from-lets-encrypt-with-nginx/).
|
||||
|
||||
### Static asset bundling (optional)
|
||||
Whoogle can optionally serve one bundled CSS file and one bundled JS file to reduce the number of HTTP requests.
|
||||
|
||||
- Enable by setting `WHOOGLE_BUNDLE_STATIC=1` and restarting the app.
|
||||
- On startup, Whoogle concatenates local CSS/JS into hashed files under `app/static/build/` and templates will prefer those bundles.
|
||||
- When disabled (default), templates load individual CSS/JS files for easier development.
|
||||
- Note: Theme CSS files (`*-theme.css`) are still loaded separately to honor user theme selection.
|
||||
|
||||
## User Agent Generator Tool
|
||||
|
||||
A standalone command-line tool is available for generating Opera User Agent strings on demand:
|
||||
|
||||
```bash
|
||||
# Generate 10 User Agent strings (default)
|
||||
python misc/generate_uas.py
|
||||
|
||||
# Generate custom number of UAs
|
||||
python misc/generate_uas.py 20
|
||||
```
|
||||
|
||||
This tool is useful for:
|
||||
- Testing different UA strings
|
||||
- Generating UAs for other projects
|
||||
- Verifying UA generation patterns
|
||||
- Debugging UA-related issues
|
||||
|
||||
## Using Custom User Agent Lists
|
||||
|
||||
Instead of using auto-generated Opera UA strings, you can provide your own list of User Agent strings for Whoogle to use.
|
||||
|
||||
### Setup
|
||||
|
||||
1. Create a text file with your preferred UA strings (one per line):
|
||||
|
||||
```
|
||||
Opera/9.80 (J2ME/MIDP; Opera Mini/4.2.13337/22.478; U; en) Presto/2.4.15 Version/10.00
|
||||
Opera/9.80 (Android; Linux; Opera Mobi/498; U; en) Presto/2.12.423 Version/10.1
|
||||
```
|
||||
|
||||
2. Set the `WHOOGLE_UA_LIST_FILE` environment variable to point to your file:
|
||||
|
||||
```bash
|
||||
# Docker
|
||||
docker run -e WHOOGLE_UA_LIST_FILE=/config/my_user_agents.txt ...
|
||||
|
||||
# Docker Compose
|
||||
environment:
|
||||
- WHOOGLE_UA_LIST_FILE=/config/my_user_agents.txt
|
||||
|
||||
# Manual/systemd
|
||||
export WHOOGLE_UA_LIST_FILE=/path/to/my_user_agents.txt
|
||||
```
|
||||
|
||||
### Priority Order
|
||||
|
||||
Whoogle uses the following priority when loading User Agent strings:
|
||||
|
||||
1. **Custom UA list file** (if `WHOOGLE_UA_LIST_FILE` is set and valid)
|
||||
2. **Cached auto-generated UAs** (if cache exists and is valid)
|
||||
3. **Newly generated UAs** (if no cache or cache expired)
|
||||
|
||||
### Tips
|
||||
|
||||
- You can use the output from `misc/check_google_user_agents.py` as your custom UA list
|
||||
- Generate a list with `python misc/generate_uas.py 50 2>/dev/null > my_uas.txt`
|
||||
- Mix different UA types (Opera, Firefox, Chrome) for more variety
|
||||
- Keep the file readable by Whoogle (proper permissions)
|
||||
- One UA string per line, blank lines are ignored
|
||||
|
||||
### Example Workflow
|
||||
|
||||
```bash
|
||||
# Generate and test UAs, save working ones
|
||||
python misc/generate_uas.py 100 2>/dev/null > candidate_uas.txt
|
||||
python misc/check_google_user_agents.py candidate_uas.txt --output working_uas.txt
|
||||
|
||||
# Use the working UAs with Whoogle
|
||||
export WHOOGLE_UA_LIST_FILE=./working_uas.txt
|
||||
./run
|
||||
```
|
||||
|
||||
## User Agent Testing Tool
|
||||
|
||||
Whoogle now includes a comprehensive testing tool (`misc/check_google_user_agents.py`) to verify which User Agent strings successfully return Google search results without triggering blocks, JavaScript-only pages, or browser upgrade prompts.
|
||||
|
||||
### Usage
|
||||
|
||||
```bash
|
||||
# Test all UAs from a file
|
||||
python misc/check_google_user_agents.py UAs.txt
|
||||
|
||||
# Save working UAs to a file (appends incrementally)
|
||||
python misc/check_google_user_agents.py UAs.txt --output working_uas.txt
|
||||
|
||||
# Use a specific search query
|
||||
python misc/check_google_user_agents.py UAs.txt --query "python programming"
|
||||
|
||||
# Verbose mode to see detailed results
|
||||
python misc/check_google_user_agents.py UAs.txt --output working.txt --verbose
|
||||
|
||||
# Adjust delay between requests (default: 0.5 seconds)
|
||||
python misc/check_google_user_agents.py UAs.txt --delay 1.0
|
||||
|
||||
# Set request timeout (default: 10 seconds)
|
||||
python misc/check_google_user_agents.py UAs.txt --timeout 15.0
|
||||
```
|
||||
|
||||
### Features
|
||||
|
||||
- **Incremental Results**: Working UAs are saved immediately to the output file (append mode), so progress is preserved even if interrupted
|
||||
- **Duplicate Detection**: Automatically skips UAs already in the output file when resuming
|
||||
- **Random Query Cycling**: By default, cycles through diverse search queries to simulate realistic usage patterns
|
||||
- **Rate Limit Detection**: Detects and reports Google rate limiting with recovery instructions
|
||||
- **Comprehensive Validation**: Checks for:
|
||||
- HTTP status codes (blocks, server errors, rate limits)
|
||||
- Block markers (unusual traffic, upgrade browser messages)
|
||||
- Success markers (actual search result HTML elements)
|
||||
- JavaScript-only pages and redirects
|
||||
- Response size validation
|
||||
|
||||
### Testing Methodology
|
||||
|
||||
The tool evaluates UAs against multiple criteria:
|
||||
|
||||
1. **HTTP Status**: Rejects 4xx/5xx errors, detects 429 rate limits
|
||||
2. **Block Detection**: Searches for Google's block messages (CAPTCHA, unusual traffic, etc.)
|
||||
3. **JavaScript Detection**: Identifies JS-only pages and noscript redirects
|
||||
4. **Result Validation**: Confirms presence of actual search result HTML elements
|
||||
5. **Content Analysis**: Validates response size and structure
|
||||
|
||||
This tool was used to discover and validate the working Opera UA patterns that power Whoogle's auto-generation feature.
|
||||
|
||||
## Known Issues
|
||||
|
||||
### User Agent Strings and Image Search
|
||||
|
||||
**Issue**: Most, if not all, of the auto-generated Opera User Agent strings may fail when performing **image searches** on Google. This appears to be a limitation with how Google's image search validates User Agent strings.
|
||||
|
||||
**Impact**:
|
||||
- Regular web searches work correctly with generated UAs
|
||||
- Image search may return errors or no results
|
||||
|
||||
## Contributing
|
||||
|
||||
Under the hood, Whoogle is a basic Flask app with the following structure:
|
||||
|
|
@ -636,6 +926,7 @@ Under the hood, Whoogle is a basic Flask app with the following structure:
|
|||
- `results.py`: Utility functions for interpreting/modifying individual search results
|
||||
- `search.py`: Creates and handles new search queries
|
||||
- `session.py`: Miscellaneous methods related to user sessions
|
||||
- `ua_generator.py`: Auto-generates Opera User Agent strings with pattern-based randomization
|
||||
- `templates/`
|
||||
- `index.html`: The home page template
|
||||
- `display.html`: The search results template
|
||||
|
|
@ -692,27 +983,9 @@ A lot of the app currently piggybacks on Google's existing support for fetching
|
|||
|
||||
| Website | Country | Language | Cloudflare |
|
||||
|-|-|-|-|
|
||||
| [https://search.albony.xyz](https://search.albony.xyz/) | 🇮🇳 IN | Multi-choice | |
|
||||
| [https://search.garudalinux.org](https://search.garudalinux.org) | 🇫🇮 FI | Multi-choice | ✅ |
|
||||
| [https://search.dr460nf1r3.org](https://search.dr460nf1r3.org) | 🇩🇪 DE | Multi-choice | ✅ |
|
||||
| [https://s.tokhmi.xyz](https://s.tokhmi.xyz) | 🇺🇸 US | Multi-choice | ✅ |
|
||||
| [https://search.sethforprivacy.com](https://search.sethforprivacy.com) | 🇩🇪 DE | English | |
|
||||
| [https://whoogle.dcs0.hu](https://whoogle.dcs0.hu) | 🇭🇺 HU | Multi-choice | |
|
||||
| [https://gowogle.voring.me](https://gowogle.voring.me) | 🇺🇸 US | Multi-choice | |
|
||||
| [https://whoogle.privacydev.net](https://whoogle.privacydev.net) | 🇫🇷 FR | English | |
|
||||
| [https://wg.vern.cc](https://wg.vern.cc) | 🇺🇸 US | English | |
|
||||
| [https://whoogle.hxvy0.gq](https://whoogle.hxvy0.gq) | 🇨🇦 CA | Turkish Only | ✅ |
|
||||
| [https://whoogle.hostux.net](https://whoogle.hostux.net) | 🇫🇷 FR | Multi-choice | |
|
||||
| [https://whoogle.lunar.icu](https://whoogle.lunar.icu) | 🇩🇪 DE | Multi-choice | ✅ |
|
||||
| [https://wgl.frail.duckdns.org](https://wgl.frail.duckdns.org) | 🇧🇷 BR | Multi-choice | |
|
||||
| [https://whoogle.no-logs.com](https://whoogle.no-logs.com/) | 🇸🇪 SE | Multi-choice | |
|
||||
| [https://whoogle.ftw.lol](https://whoogle.ftw.lol) | 🇩🇪 DE | Multi-choice | |
|
||||
| [https://whoogle-search--replitcomreside.repl.co](https://whoogle-search--replitcomreside.repl.co) | 🇺🇸 US | English | |
|
||||
| [https://search.notrustverify.ch](https://search.notrustverify.ch) | 🇨🇭 CH | Multi-choice | |
|
||||
| [https://whoogle.datura.network](https://whoogle.datura.network) | 🇩🇪 DE | Multi-choice | |
|
||||
| [https://whoogle.yepserver.xyz](https://whoogle.yepserver.xyz) | 🇺🇦 UA | Multi-choice | |
|
||||
| [https://search.nezumi.party](https://search.nezumi.party) | 🇮🇹 IT | Multi-choice | |
|
||||
| [https://search.snine.nl](https://search.snine.nl) | 🇳🇱 NL | Multi-choice | ✅ |
|
||||
|
||||
|
||||
* A checkmark in the "Cloudflare" column indicates that the site is served through the [Cloudflare](https://cloudflare.com) reverse proxy. A site that only uses Cloudflare DNS does not get a checkmark; the checkmark is reserved for sites using the proxying service, which grants Cloudflare the ability to monitor traffic to the website.
|
||||
|
|
@ -721,17 +994,7 @@ A lot of the app currently piggybacks on Google's existing support for fetching
|
|||
|
||||
| Website | Country | Language |
|
||||
|-|-|-|
|
||||
| [http://whoglqjdkgt2an4tdepberwqz3hk7tjo4kqgdnuj77rt7nshw2xqhqad.onion](http://whoglqjdkgt2an4tdepberwqz3hk7tjo4kqgdnuj77rt7nshw2xqhqad.onion) | 🇺🇸 US | Multi-choice |
|
||||
| [http://nuifgsnbb2mcyza74o7illtqmuaqbwu4flam3cdmsrnudwcmkqur37qd.onion](http://nuifgsnbb2mcyza74o7illtqmuaqbwu4flam3cdmsrnudwcmkqur37qd.onion) | 🇩🇪 DE | English |
|
||||
| [http://whoogle.vernccvbvyi5qhfzyqengccj7lkove6bjot2xhh5kajhwvidqafczrad.onion](http://whoogle.vernccvbvyi5qhfzyqengccj7lkove6bjot2xhh5kajhwvidqafczrad.onion/) | 🇺🇸 US | English |
|
||||
| [http://whoogle.g4c3eya4clenolymqbpgwz3q3tawoxw56yhzk4vugqrl6dtu3ejvhjid.onion](http://whoogle.g4c3eya4clenolymqbpgwz3q3tawoxw56yhzk4vugqrl6dtu3ejvhjid.onion/) | 🇫🇷 FR | English |
|
||||
| [http://whoogle.daturab6drmkhyeia4ch5gvfc2f3wgo6bhjrv3pz6n7kxmvoznlkq4yd.onion](http://whoogle.daturab6drmkhyeia4ch5gvfc2f3wgo6bhjrv3pz6n7kxmvoznlkq4yd.onion/) | 🇩🇪 DE | Multi-choice |
|
||||
|
||||
#### I2P Instances
|
||||
|
||||
| Website | Country | Language |
|
||||
|-|-|-|
|
||||
| [http://verneks7rfjptpz5fpii7n7nrxilsidi2qxepeuuf66c3tsf4nhq.b32.i2p](http://verneks7rfjptpz5fpii7n7nrxilsidi2qxepeuuf66c3tsf4nhq.b32.i2p) | 🇺🇸 US | English |
|
||||
None of the existing Onion-accessible sites appear to be live anymore.
|
||||
|
||||
## Screenshots
|
||||
#### Desktop
|
||||
|
|
|
|||
174
app/__init__.py
174
app/__init__.py
|
|
@ -3,6 +3,7 @@ from app.request import send_tor_signal
|
|||
from app.utils.session import generate_key
|
||||
from app.utils.bangs import gen_bangs_json, load_all_bangs
|
||||
from app.utils.misc import gen_file_hash, read_config_bool
|
||||
from app.utils.ua_generator import load_ua_pool
|
||||
from base64 import b64encode
|
||||
from bs4 import MarkupResemblesLocatorWarning
|
||||
from datetime import datetime, timedelta
|
||||
|
|
@ -11,17 +12,19 @@ from flask import Flask
|
|||
import json
|
||||
import logging.config
|
||||
import os
|
||||
import sys
|
||||
from stem import Signal
|
||||
import threading
|
||||
import warnings
|
||||
|
||||
from werkzeug.middleware.proxy_fix import ProxyFix
|
||||
|
||||
from app.utils.misc import read_config_bool
|
||||
from app.services.http_client import HttpxClient
|
||||
from app.services.provider import close_all_clients
|
||||
from app.version import __version__
|
||||
|
||||
app = Flask(__name__, static_folder=os.path.dirname(
|
||||
os.path.abspath(__file__)) + '/static')
|
||||
app = Flask(__name__, static_folder=os.path.join(
|
||||
os.path.dirname(os.path.abspath(__file__)), 'static'))
|
||||
|
||||
app.wsgi_app = ProxyFix(app.wsgi_app)
|
||||
|
||||
|
|
@ -50,24 +53,19 @@ app.config['STATIC_FOLDER'] = os.getenv(
|
|||
app.config['BUILD_FOLDER'] = os.path.join(
|
||||
app.config['STATIC_FOLDER'], 'build')
|
||||
app.config['CACHE_BUSTING_MAP'] = {}
|
||||
app.config['LANGUAGES'] = json.load(open(
|
||||
os.path.join(app.config['STATIC_FOLDER'], 'settings/languages.json'),
|
||||
encoding='utf-8'))
|
||||
app.config['COUNTRIES'] = json.load(open(
|
||||
os.path.join(app.config['STATIC_FOLDER'], 'settings/countries.json'),
|
||||
encoding='utf-8'))
|
||||
app.config['TIME_PERIODS'] = json.load(open(
|
||||
os.path.join(app.config['STATIC_FOLDER'], 'settings/time_periods.json'),
|
||||
encoding='utf-8'))
|
||||
app.config['TRANSLATIONS'] = json.load(open(
|
||||
os.path.join(app.config['STATIC_FOLDER'], 'settings/translations.json'),
|
||||
encoding='utf-8'))
|
||||
app.config['THEMES'] = json.load(open(
|
||||
os.path.join(app.config['STATIC_FOLDER'], 'settings/themes.json'),
|
||||
encoding='utf-8'))
|
||||
app.config['HEADER_TABS'] = json.load(open(
|
||||
os.path.join(app.config['STATIC_FOLDER'], 'settings/header_tabs.json'),
|
||||
encoding='utf-8'))
|
||||
app.config['BUNDLE_STATIC'] = read_config_bool('WHOOGLE_BUNDLE_STATIC')
|
||||
with open(os.path.join(app.config['STATIC_FOLDER'], 'settings/languages.json'), 'r', encoding='utf-8') as f:
|
||||
app.config['LANGUAGES'] = json.load(f)
|
||||
with open(os.path.join(app.config['STATIC_FOLDER'], 'settings/countries.json'), 'r', encoding='utf-8') as f:
|
||||
app.config['COUNTRIES'] = json.load(f)
|
||||
with open(os.path.join(app.config['STATIC_FOLDER'], 'settings/time_periods.json'), 'r', encoding='utf-8') as f:
|
||||
app.config['TIME_PERIODS'] = json.load(f)
|
||||
with open(os.path.join(app.config['STATIC_FOLDER'], 'settings/translations.json'), 'r', encoding='utf-8') as f:
|
||||
app.config['TRANSLATIONS'] = json.load(f)
|
||||
with open(os.path.join(app.config['STATIC_FOLDER'], 'settings/themes.json'), 'r', encoding='utf-8') as f:
|
||||
app.config['THEMES'] = json.load(f)
|
||||
with open(os.path.join(app.config['STATIC_FOLDER'], 'settings/header_tabs.json'), 'r', encoding='utf-8') as f:
|
||||
app.config['HEADER_TABS'] = json.load(f)
|
||||
app.config['CONFIG_PATH'] = os.getenv(
|
||||
'CONFIG_VOLUME',
|
||||
os.path.join(app.config['STATIC_FOLDER'], 'config'))
|
||||
|
|
@ -78,7 +76,10 @@ app.config['CONFIG_DISABLE'] = read_config_bool('WHOOGLE_CONFIG_DISABLE')
|
|||
app.config['SESSION_FILE_DIR'] = os.path.join(
|
||||
app.config['CONFIG_PATH'],
|
||||
'session')
|
||||
app.config['MAX_SESSION_SIZE'] = 4000 # Sessions won't exceed 4KB
|
||||
# Maximum session file size in bytes (4KB limit to prevent abuse and disk exhaustion)
|
||||
# Session files larger than this are ignored during cleanup to avoid processing
|
||||
# potentially malicious or corrupted files
|
||||
app.config['MAX_SESSION_SIZE'] = 4000
|
||||
app.config['BANG_PATH'] = os.getenv(
|
||||
'CONFIG_VOLUME',
|
||||
os.path.join(app.config['STATIC_FOLDER'], 'bangs'))
|
||||
|
|
@ -86,6 +87,17 @@ app.config['BANG_FILE'] = os.path.join(
|
|||
app.config['BANG_PATH'],
|
||||
'bangs.json')
|
||||
|
||||
# Global services registry (simple DI)
|
||||
app.services = {}
|
||||
|
||||
|
||||
@app.teardown_appcontext
def _teardown_clients(exception):
    """Call close_all_clients() whenever a Flask application context ends.

    Args:
        exception: The argument Flask passes to teardown callbacks (the
            unhandled exception, if any). It is not used here.
    """
    try:
        close_all_clients()
    except Exception as e:
        # A teardown callback must never raise, so the failure is still
        # suppressed -- but it is logged instead of vanishing silently.
        app.logger.debug('Failed to close HTTP clients: %s', e)
|
||||
|
||||
# Ensure all necessary directories exist
|
||||
if not os.path.exists(app.config['CONFIG_PATH']):
|
||||
os.makedirs(app.config['CONFIG_PATH'])
|
||||
|
|
@ -99,18 +111,63 @@ if not os.path.exists(app.config['BANG_PATH']):
|
|||
if not os.path.exists(app.config['BUILD_FOLDER']):
|
||||
os.makedirs(app.config['BUILD_FOLDER'])
|
||||
|
||||
# Session values
|
||||
app_key_path = os.path.join(app.config['CONFIG_PATH'], 'whoogle.key')
|
||||
if os.path.exists(app_key_path):
|
||||
# Initialize User Agent pool
|
||||
app.config['UA_CACHE_PATH'] = os.path.join(app.config['CONFIG_PATH'], 'ua_cache.json')
|
||||
try:
|
||||
app.config['UA_POOL'] = load_ua_pool(app.config['UA_CACHE_PATH'], count=10)
|
||||
except Exception as e:
|
||||
# If UA pool loading fails, log warning and set empty pool
|
||||
# The gen_user_agent function will handle the fallback
|
||||
print(f"Warning: Could not initialize UA pool: {e}")
|
||||
app.config['UA_POOL'] = []
|
||||
|
||||
# Session values - Secret key management
|
||||
# Priority: environment variable → file → generate new
|
||||
def get_secret_key():
    """Load or generate the Flask secret key, validating its length.

    Priority order:
    1. WHOOGLE_SECRET_KEY environment variable
    2. Existing key file (``whoogle.key`` inside ``CONFIG_PATH``)
    3. Generate a new key and try to save it to the key file

    A key shorter than 32 characters is rejected with a warning on stderr
    and the next source in the list is tried. Failing to read or write the
    key file is reported on stderr but never raises.

    Returns:
        str: Valid secret key for Flask sessions
    """
    min_key_length = 32

    # Check environment variable first
    env_key = os.getenv('WHOOGLE_SECRET_KEY', '').strip()
    if env_key:
        if len(env_key) >= min_key_length:
            return env_key
        print(
            f"Warning: WHOOGLE_SECRET_KEY too short ({len(env_key)} chars, "
            "need 32+). Using file/generated key instead.",
            file=sys.stderr)

    # Check file-based key
    app_key_path = os.path.join(app.config['CONFIG_PATH'], 'whoogle.key')
    if os.path.exists(app_key_path):
        try:
            with open(app_key_path, 'r', encoding='utf-8') as f:
                key = f.read().strip()
        except (OSError, UnicodeDecodeError) as e:
            # An unreadable or corrupt key file falls through to regeneration
            print(f"Warning: Could not read key file: {e}", file=sys.stderr)
        else:
            if len(key) >= min_key_length:
                return key
            print("Warning: Key file too short, regenerating",
                  file=sys.stderr)

    # Generate new key. Decode the base64 bytes rather than calling str() on
    # them, which would store the bytes repr ("b'...'") as the key.
    new_key = b64encode(os.urandom(32)).decode('ascii')
    try:
        # Create the file owner-read/write only, since it holds a secret.
        # The mode only applies when the file is newly created.
        fd = os.open(
            app_key_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, 'w', encoding='utf-8') as key_file:
            key_file.write(new_key)
    except OSError as e:
        print(
            f"Warning: Could not save key file: {e}. "
            "Key will not persist across restarts.",
            file=sys.stderr)

    return new_key
|
||||
|
||||
app.config['SECRET_KEY'] = get_secret_key()
|
||||
app.config['PERMANENT_SESSION_LIFETIME'] = timedelta(days=365)
|
||||
|
||||
# NOTE: SESSION_COOKIE_SAMESITE must be set to 'lax' to allow the user's
|
||||
|
|
@ -146,7 +203,8 @@ app.config['CSP'] = 'default-src \'none\';' \
|
|||
generating_bangs = False
|
||||
if not os.path.exists(app.config['BANG_FILE']):
|
||||
generating_bangs = True
|
||||
json.dump({}, open(app.config['BANG_FILE'], 'w'))
|
||||
with open(app.config['BANG_FILE'], 'w', encoding='utf-8') as f:
|
||||
json.dump({}, f)
|
||||
bangs_thread = threading.Thread(
|
||||
target=gen_bangs_json,
|
||||
args=(app.config['BANG_FILE'],))
|
||||
|
|
@ -174,10 +232,58 @@ for cb_dir in cache_busting_dirs:
|
|||
map_path = map_path[1:]
|
||||
app.config['CACHE_BUSTING_MAP'][cb_file] = map_path
|
||||
|
||||
# Optionally create simple bundled assets (opt-in via WHOOGLE_BUNDLE_STATIC=1)
|
||||
if app.config['BUNDLE_STATIC']:
|
||||
# CSS bundle: include all css except theme files (end with -theme.css)
|
||||
css_dir = os.path.join(app.config['STATIC_FOLDER'], 'css')
|
||||
css_parts = []
|
||||
for name in sorted(os.listdir(css_dir)):
|
||||
if not name.endswith('.css'):
|
||||
continue
|
||||
if name.endswith('-theme.css'):
|
||||
continue
|
||||
try:
|
||||
with open(os.path.join(css_dir, name), 'r', encoding='utf-8') as f:
|
||||
css_parts.append(f.read())
|
||||
except Exception:
|
||||
pass
|
||||
css_bundle = '\n'.join(css_parts)
|
||||
if css_bundle:
|
||||
css_tmp = os.path.join(app.config['BUILD_FOLDER'], 'app.css')
|
||||
with open(css_tmp, 'w', encoding='utf-8') as f:
|
||||
f.write(css_bundle)
|
||||
css_hashed = gen_file_hash(app.config['BUILD_FOLDER'], 'app.css')
|
||||
os.replace(css_tmp, os.path.join(app.config['BUILD_FOLDER'], css_hashed))
|
||||
map_path = os.path.join('app/static/build', css_hashed)
|
||||
app.config['CACHE_BUSTING_MAP']['bundle.css'] = map_path
|
||||
|
||||
# JS bundle: include all js files
|
||||
js_dir = os.path.join(app.config['STATIC_FOLDER'], 'js')
|
||||
js_parts = []
|
||||
for name in sorted(os.listdir(js_dir)):
|
||||
if not name.endswith('.js'):
|
||||
continue
|
||||
try:
|
||||
with open(os.path.join(js_dir, name), 'r', encoding='utf-8') as f:
|
||||
js_parts.append(f.read())
|
||||
except Exception:
|
||||
pass
|
||||
js_bundle = '\n;'.join(js_parts)
|
||||
if js_bundle:
|
||||
js_tmp = os.path.join(app.config['BUILD_FOLDER'], 'app.js')
|
||||
with open(js_tmp, 'w', encoding='utf-8') as f:
|
||||
f.write(js_bundle)
|
||||
js_hashed = gen_file_hash(app.config['BUILD_FOLDER'], 'app.js')
|
||||
os.replace(js_tmp, os.path.join(app.config['BUILD_FOLDER'], js_hashed))
|
||||
map_path = os.path.join('app/static/build', js_hashed)
|
||||
app.config['CACHE_BUSTING_MAP']['bundle.js'] = map_path
|
||||
|
||||
# Templating functions
|
||||
app.jinja_env.globals.update(clean_query=clean_query)
|
||||
app.jinja_env.globals.update(
|
||||
cb_url=lambda f: app.config['CACHE_BUSTING_MAP'][f.lower()])
|
||||
app.jinja_env.globals.update(
|
||||
bundle_static=lambda: app.config.get('BUNDLE_STATIC', False))
|
||||
|
||||
# Attempt to acquire tor identity, to determine if Tor config is available
|
||||
send_tor_signal(Signal.HEARTBEAT)
|
||||
|
|
|
|||
384
app/filter.py
384
app/filter.py
|
|
@ -5,7 +5,8 @@ from cryptography.fernet import Fernet
|
|||
from flask import render_template
|
||||
import html
|
||||
import urllib.parse as urlparse
|
||||
from urllib.parse import parse_qs
|
||||
import os
|
||||
from urllib.parse import parse_qs, urlencode, urlunparse
|
||||
import re
|
||||
|
||||
from app.models.g_classes import GClasses
|
||||
|
|
@ -111,8 +112,10 @@ def clean_css(css: str, page_url: str) -> str:
|
|||
|
||||
|
||||
class Filter:
|
||||
# Limit used for determining if a result is a "regular" result or a list
|
||||
# type result (such as "people also asked", "related searches", etc)
|
||||
# Minimum number of child div elements that indicates a collapsible section
|
||||
# Regular search results typically have fewer child divs (< 7)
|
||||
# Special sections like "People also ask", "Related searches" have more (>= 7)
|
||||
# This threshold helps identify and collapse these extended result sections
|
||||
RESULT_CHILD_LIMIT = 7
|
||||
|
||||
def __init__(
|
||||
|
|
@ -157,6 +160,7 @@ class Filter:
|
|||
self.soup = soup
|
||||
self.main_divs = self.soup.find('div', {'id': 'main'})
|
||||
self.remove_ads()
|
||||
self.remove_ai_overview()
|
||||
self.remove_block_titles()
|
||||
self.remove_block_url()
|
||||
self.collapse_sections()
|
||||
|
|
@ -206,6 +210,9 @@ class Filter:
|
|||
header = self.soup.find('header')
|
||||
if header:
|
||||
header.decompose()
|
||||
# Remove broken "Dark theme" toggle snippets that occasionally slip
|
||||
# into the footer.
|
||||
self.remove_dark_theme_toggle(self.soup)
|
||||
self.remove_site_blocks(self.soup)
|
||||
return self.soup
|
||||
|
||||
|
|
@ -215,11 +222,11 @@ class Filter:
|
|||
Returns:
|
||||
None (The soup object is modified directly)
|
||||
"""
|
||||
if not div:
|
||||
if not div or not isinstance(div, Tag):
|
||||
return
|
||||
|
||||
for d in div.find_all('div', recursive=True):
|
||||
d_text = d.find(text=True, recursive=False)
|
||||
d_text = d.find(string=True, recursive=False)
|
||||
|
||||
# Ensure we're working with tags that contain text content
|
||||
if not d_text or not d.string:
|
||||
|
|
@ -290,17 +297,75 @@ class Filter:
|
|||
if GClasses.result_class_a in p_cls:
|
||||
break
|
||||
|
||||
def remove_dark_theme_toggle(self, soup: BeautifulSoup) -> None:
    """Drop elements whose text mentions "Dark theme" (case-insensitive).

    For every matching text node, the nearest enclosing div/span/p/a/li/
    section is decomposed. If there is none, the node's direct parent is
    decomposed instead, and a parentless node is simply extracted.

    Args:
        soup: The parsed page to clean up (modified in place).
    """
    container_tags = ['div', 'span', 'p', 'a', 'li', 'section']
    dark_theme_re = re.compile(r'Dark theme', re.I)

    for text_node in soup.find_all(string=dark_theme_re):
        try:
            wrapper = text_node.find_parent(
                lambda tag: tag.name in container_tags)
            doomed = wrapper or text_node.parent
            if not doomed:
                text_node.extract()
                continue
            doomed.decompose()
        except Exception:
            # Best effort: skip any node that cannot be processed
            continue
|
||||
|
||||
def remove_site_blocks(self, soup) -> None:
    """Strip the "-site:" exclusion terms for blocked sites from page text.

    The exclusion string is built from the comma-separated
    ``self.config.block`` value ("-site:a -site:b ..."). Every text node in
    the page body that contains it has that string removed.
    NOTE(review): presumably these terms are appended to the outgoing query
    elsewhere and echoed back in the results page -- confirm against caller.

    Args:
        soup: The parsed page to clean up (modified in place).

    Returns:
        None (The soup object is modified directly)
    """
    if not self.config.block or not soup.body:
        return
    search_string = ' '.join(['-site:' +
                              _ for _ in self.config.block.split(',')])
    # The string is a literal, not a pattern: escape it so that "." in
    # domain names matches only a dot and regex metacharacters in a block
    # entry cannot raise re.error.
    selected = soup.body.find_all(
        string=re.compile(re.escape(search_string)))

    for result in selected:
        result.string.replace_with(result.string.replace(
            search_string, ''))
|
||||
|
||||
def remove_ai_overview(self) -> None:
    """Strip Google's AI Overview/SGE blocks out of the search results.

    Every div under the main container whose text contains one of the AI
    Overview marker phrases is traced upwards (starting with the div itself)
    to the first element carrying a result-container class, and that element
    is removed.

    Returns:
        None (The soup object is modified directly)
    """
    if not self.main_divs:
        return

    # Phrases that mark an AI Overview section
    markers = (
        'AI Overview',
        'AI responses may include mistakes',
    )

    # Result container classes: the original Google class plus any mapped
    # equivalents, since this runs before CSS class replacement
    wrapper_classes = [
        GClasses.result_class_a,
        *GClasses.result_classes.get(GClasses.result_class_a, []),
    ]

    # Gather first, delete afterwards, so the tree isn't mutated mid-walk
    doomed = []
    for candidate in self.main_divs.find_all('div', recursive=True):
        text = candidate.get_text()
        if not any(marker in text for marker in markers):
            continue

        # Climb until an element with a result container class is found
        ancestor = candidate
        while ancestor:
            classes = ancestor.attrs.get('class') or []
            if any(cls in classes for cls in wrapper_classes):
                if ancestor not in doomed:
                    doomed.append(ancestor)
                break
            ancestor = ancestor.parent

    for block in doomed:
        block.decompose()
|
||||
|
||||
def remove_ads(self) -> None:
|
||||
"""Removes ads found in the list of search result divs
|
||||
|
||||
|
|
@ -362,16 +427,21 @@ class Filter:
|
|||
|
||||
def pull_child_divs(result_div: BeautifulSoup):
|
||||
try:
|
||||
return result_div.findChildren(
|
||||
'div', recursive=False
|
||||
)[0].findChildren(
|
||||
'div', recursive=False)
|
||||
except IndexError:
|
||||
top_level_divs = result_div.find_all('div', recursive=False)
|
||||
if not top_level_divs:
|
||||
return []
|
||||
return top_level_divs[0].find_all('div', recursive=False)
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
if not self.main_divs:
|
||||
return
|
||||
|
||||
# Skip collapsing for CSE (Custom Search Engine) results
|
||||
# CSE results have a data-cse attribute on the main container
|
||||
if self.soup.find(attrs={'data-cse': 'true'}):
|
||||
return
|
||||
|
||||
# Loop through results and check for the number of child divs in each
|
||||
for result in self.main_divs.find_all():
|
||||
result_children = pull_child_divs(result)
|
||||
|
|
@ -529,10 +599,32 @@ class Filter:
|
|||
)
|
||||
css = f"{css_html_tag}{css}"
|
||||
css = re.sub('body{(.*?)}',
|
||||
'body{padding:0 8px;margin:0 auto;max-width:736px;}',
|
||||
'body{padding:0 12px;margin:0 auto;max-width:1200px;}',
|
||||
css)
|
||||
style.string = css
|
||||
|
||||
# Normalize the max width between result types so the page doesn't
|
||||
# jump in size when switching tabs.
|
||||
if not self.mobile:
|
||||
max_width_css = (
|
||||
'body, #cnt, #center_col, .main, .e9EfHf, #searchform, '
|
||||
'.GyAeWb, .s6JM6d {'
|
||||
'max-width:1200px;'
|
||||
'margin:0 auto;'
|
||||
'padding-left:12px;'
|
||||
'padding-right:12px;'
|
||||
'}'
|
||||
)
|
||||
# Build the style tag using a fresh soup to avoid cases where the
|
||||
# current soup lacks the helper methods (e.g., non-root elements).
|
||||
factory_soup = BeautifulSoup('', 'html.parser')
|
||||
extra_style = factory_soup.new_tag('style')
|
||||
extra_style.string = max_width_css
|
||||
if self.soup.head:
|
||||
self.soup.head.append(extra_style)
|
||||
else:
|
||||
self.soup.insert(0, extra_style)
|
||||
|
||||
def update_link(self, link: Tag) -> None:
|
||||
"""Update internal link paths with encrypted path, otherwise remove
|
||||
unnecessary redirects and/or marketing params from the url
|
||||
|
|
@ -552,9 +644,6 @@ class Filter:
|
|||
|
||||
# Remove any elements that direct to unsupported Google pages
|
||||
if any(url in link_netloc for url in unsupported_g_pages):
|
||||
# FIXME: The "Shopping" tab requires further filtering (see #136)
|
||||
# Temporarily removing all links to that tab for now.
|
||||
|
||||
# Replaces the /url google unsupported link to the direct url
|
||||
link['href'] = link_netloc
|
||||
parent = link.parent
|
||||
|
|
@ -649,62 +738,203 @@ class Filter:
|
|||
"""Replaces link locations and page elements if "alts" config
|
||||
is enabled
|
||||
"""
|
||||
for site, alt in SITE_ALTS.items():
|
||||
if site != "medium.com" and alt != "":
|
||||
# Ignore medium.com replacements since these are handled
|
||||
# specifically in the link description replacement, and medium
|
||||
# results are never given their own "card" result where this
|
||||
# replacement would make sense.
|
||||
# Also ignore if the alt is empty, since this is used to indicate
|
||||
# that the alt is not enabled.
|
||||
for div in self.soup.find_all('div', text=re.compile(site)):
|
||||
# Use the number of words in the div string to determine if the
|
||||
# string is a result description (shouldn't replace domains used
|
||||
# in desc text).
|
||||
if len(div.string.split(' ')) == 1:
|
||||
div.string = div.string.replace(site, alt)
|
||||
# Precompute regex for sites (escape dots) and common prefixes
|
||||
site_keys = list(SITE_ALTS.keys())
|
||||
if not site_keys:
|
||||
return
|
||||
sites_pattern = re.compile('|'.join([re.escape(k) for k in site_keys]))
|
||||
prefix_pattern = re.compile(r'^(?:https?:\/\/)?(?:(?:www|mobile|m)\.)?')
|
||||
|
||||
for link in self.soup.find_all('a', href=True):
|
||||
# Search and replace all link descriptions
|
||||
# with alternative location
|
||||
link['href'] = get_site_alt(link['href'])
|
||||
link_desc = link.find_all(
|
||||
text=re.compile('|'.join(SITE_ALTS.keys())))
|
||||
if len(link_desc) == 0:
|
||||
continue
|
||||
# 1) Replace bare domain divs (single token) once, avoiding duplicates
|
||||
for div in self.soup.find_all('div', string=sites_pattern):
|
||||
if not div or not div.string:
|
||||
continue
|
||||
if len(div.string.split(' ')) != 1:
|
||||
continue
|
||||
match = sites_pattern.search(div.string)
|
||||
if not match:
|
||||
continue
|
||||
site = match.group(0)
|
||||
alt = SITE_ALTS.get(site, '')
|
||||
if not alt:
|
||||
continue
|
||||
# Skip if already contains the alt to avoid old.old.* repetition
|
||||
if alt in div.string:
|
||||
continue
|
||||
div.string = div.string.replace(site, alt)
|
||||
|
||||
# Replace link description
|
||||
link_desc = link_desc[0]
|
||||
if site not in link_desc or not alt:
|
||||
continue
|
||||
# 2) Update link hrefs and descriptions in a single pass
|
||||
for link in self.soup.find_all('a', href=True):
|
||||
link['href'] = get_site_alt(link['href'])
|
||||
|
||||
new_desc = BeautifulSoup(features='html.parser').new_tag('div')
|
||||
link_str = str(link_desc)
|
||||
# Find a description text node matching a known site
|
||||
desc_nodes = link.find_all(string=sites_pattern)
|
||||
if not desc_nodes:
|
||||
continue
|
||||
desc_node = desc_nodes[0]
|
||||
link_str = str(desc_node)
|
||||
|
||||
# Medium links should be handled differently, since 'medium.com'
|
||||
# is a common substring of domain names, but shouldn't be
|
||||
# replaced (i.e. 'philomedium.com' should stay as it is).
|
||||
if 'medium.com' in link_str:
|
||||
if link_str.startswith('medium.com') or '.medium.com' in link_str:
|
||||
link_str = SITE_ALTS['medium.com'] + link_str[
|
||||
link_str.find('medium.com') + len('medium.com'):]
|
||||
new_desc.string = link_str
|
||||
# Determine which site key is present in the description
|
||||
site_match = sites_pattern.search(link_str)
|
||||
if not site_match:
|
||||
continue
|
||||
site = site_match.group(0)
|
||||
alt = SITE_ALTS.get(site, '')
|
||||
if not alt:
|
||||
continue
|
||||
|
||||
# Avoid duplication if alt already present
|
||||
if alt in link_str:
|
||||
continue
|
||||
|
||||
# Medium-specific handling remains to avoid matching substrings
|
||||
if 'medium.com' in link_str:
|
||||
if link_str.startswith('medium.com') or '.medium.com' in link_str:
|
||||
replaced = SITE_ALTS['medium.com'] + link_str[
|
||||
link_str.find('medium.com') + len('medium.com'):
|
||||
]
|
||||
else:
|
||||
new_desc.string = link_str.replace(site, alt)
|
||||
replaced = link_str
|
||||
else:
|
||||
# If the description looks like a URL with scheme, replace only the host
|
||||
if '://' in link_str:
|
||||
scheme, rest = link_str.split('://', 1)
|
||||
host, sep, path = rest.partition('/')
|
||||
# Drop common prefixes from host when swapping to a fully-qualified alt
|
||||
alt_parsed = urlparse.urlparse(alt)
|
||||
alt_host = alt_parsed.netloc if alt_parsed.netloc else alt.replace('https://', '').replace('http://', '')
|
||||
# If alt includes a scheme, prefer its host; otherwise use alt as host
|
||||
if alt_parsed.scheme:
|
||||
new_host = alt_host
|
||||
else:
|
||||
# When alt has no scheme, still replace entire host
|
||||
new_host = alt
|
||||
# Prevent replacing if host already equals target
|
||||
if host == new_host:
|
||||
replaced = link_str
|
||||
else:
|
||||
replaced = f"{scheme}://{new_host}{sep}{path}"
|
||||
else:
|
||||
# No scheme in the text; include optional prefixes in replacement
|
||||
# Replace any leading www./m./mobile. + site with alt host (no scheme)
|
||||
alt_parsed = urlparse.urlparse(alt)
|
||||
alt_host = alt_parsed.netloc if alt_parsed.netloc else alt.replace('https://', '').replace('http://', '')
|
||||
# Build a pattern that includes optional prefixes for the specific site
|
||||
site_with_prefix = re.compile(rf'(?:(?:www|mobile|m)\.)?{re.escape(site)}')
|
||||
replaced = site_with_prefix.sub(alt_host, link_str, count=1)
|
||||
|
||||
link_desc.replace_with(new_desc)
|
||||
new_desc = BeautifulSoup(features='html.parser').new_tag('div')
|
||||
new_desc.string = replaced
|
||||
desc_node.replace_with(new_desc)
|
||||
|
||||
def view_image(self, soup) -> BeautifulSoup:
|
||||
"""Replaces the soup with a new one that handles mobile results and
|
||||
adds the link of the image full res to the results.
|
||||
"""Parses image results from Google Images and rewrites them into the
|
||||
lightweight Whoogle image results template.
|
||||
|
||||
Args:
|
||||
soup: A BeautifulSoup object containing the image mobile results.
|
||||
|
||||
Returns:
|
||||
BeautifulSoup: The new BeautifulSoup object
|
||||
Google now serves image results via the modern udm=2 endpoint, where
|
||||
the raw HTML contains only placeholder thumbnails. The actual image
|
||||
URLs live inside serialized data blobs in script tags. We extract that
|
||||
data and pair it with the visible result cards.
|
||||
"""
|
||||
|
||||
def _decode_url(url: str) -> str:
    """Decode a URL lifted from one of Google's serialized script blobs.

    The blobs escape URL characters JSON-style (``\\u003d`` for ``=``,
    ``\\u0026`` for ``&``, ...) and may additionally carry HTML entities.
    Every ``\\uXXXX`` escape is decoded (hex digits in either case), not
    only the two that were originally special-cased, and HTML entities are
    unescaped afterwards.

    Args:
        url: The raw, possibly escaped URL text. May be empty or None.

    Returns:
        str: The decoded URL, or '' when ``url`` is empty/None.
    """
    if not url:
        return ''

    def _unescape(match):
        code_point = int(match.group(1), 16)
        # A lone UTF-16 surrogate half is not a valid character on its own
        # and would break later encoding of the URL; keep it as written.
        if 0xD800 <= code_point <= 0xDFFF:
            return match.group(0)
        return chr(code_point)

    # Decode the script-level escapes first so that an escaped ampersand
    # that starts an HTML entity (e.g. "\u0026amp;") is unescaped as well.
    return html.unescape(re.sub(r'\\u([0-9a-fA-F]{4})', _unescape, url))
|
||||
|
||||
def _extract_image_data(modern_soup: BeautifulSoup) -> dict:
    """Extracts docid -> {img_url, img_tbn} from serialized scripts.

    The text of every <script> tag that has a single string child is joined
    and scanned for entries of the form
    ``[0,"<docid>",["<thumbnail url>",h,w],["<full url>",h,w]`` (the
    dimension pairs are optional).

    Args:
        modern_soup: The parsed image results page.

    Returns:
        dict: Maps each docid to ``{'img_tbn': ..., 'img_url': ...}`` with
        both URLs passed through ``_decode_url``. If a docid occurs more
        than once, the last occurrence wins.
    """
    scripts_text = ' '.join(
        script.string for script in modern_soup.find_all('script')
        if script.string
    )
    pattern = re.compile(
        r'\[0,"(?P<docid>[^"]+)",\["(?P<thumb>https://encrypted-tbn[^"]+)"'
        r'(?:,\d+,\d+)?\],\["(?P<full>https?://[^"]+?)"'
        r'(?:,\d+,\d+)?\]',
        re.DOTALL
    )
    results_map = {}
    for match in pattern.finditer(scripts_text):
        results_map[match.group('docid')] = {
            'img_tbn': _decode_url(match.group('thumb')),
            'img_url': _decode_url(match.group('full'))
        }
    return results_map
|
||||
|
||||
def _parse_modern_results(modern_soup: BeautifulSoup) -> list:
    """Builds the image result list from the result cards on the page.

    A card is any <div> with ``data-attrid="images universal"`` and a
    ``data-docid`` attribute. Image URLs are looked up by docid in the map
    returned by ``_extract_image_data``; when no thumbnail was mapped, the
    card's inline <img src> is used if it is an http(s) URL. The source
    page comes from ``data-lpage`` or, failing that, the card's first link.

    Args:
        modern_soup: The parsed image results page.

    Returns:
        list: One dict per distinct card with the keys ``domain``,
        ``img_url``, ``web_page`` and ``img_tbn``. Empty when the page has
        no matching cards.
    """
    cards = modern_soup.find_all(
        'div',
        attrs={
            'data-attrid': 'images universal',
            'data-docid': True
        }
    )
    if not cards:
        return []

    meta_map = _extract_image_data(modern_soup)
    parsed = []
    seen = set()

    for card in cards:
        docid = card.get('data-docid')
        meta = meta_map.get(docid, {})
        img_url = meta.get('img_url')
        img_tbn = meta.get('img_tbn')

        # Fall back to the inline src if we failed to map the docid
        if not img_tbn:
            img_tag = card.find('img')
            if img_tag:
                candidate_src = img_tag.get('src')
                if candidate_src and candidate_src.startswith('http'):
                    img_tbn = candidate_src

        web_page = card.get('data-lpage') or ''
        if not web_page:
            link = card.find('a', href=True)
            if link:
                web_page = link['href']

        # Drop cards that carry no usable data at all, and exact repeats
        key = (img_url, img_tbn, web_page)
        if not any(key) or key in seen:
            continue
        seen.add(key)

        parsed.append({
            'domain': urlparse.urlparse(web_page).netloc
            if web_page else '',
            # Each image field falls back to the other so that neither is
            # empty when at least one URL is known
            'img_url': img_url or img_tbn or '',
            'web_page': web_page,
            'img_tbn': img_tbn or img_url or ''
        })
    return parsed
|
||||
|
||||
# Try parsing the modern (udm=2) layout first
|
||||
modern_results = _parse_modern_results(soup)
|
||||
if modern_results:
|
||||
# TODO: Implement proper image pagination. Google images uses
|
||||
# infinite scroll with `ijn` offsets; we need a clean,
|
||||
# de-duplicated pagination strategy before exposing a Next link.
|
||||
next_link = None
|
||||
return BeautifulSoup(
|
||||
render_template(
|
||||
'imageresults.html',
|
||||
length=len(modern_results),
|
||||
results=modern_results,
|
||||
view_label="View Image",
|
||||
next_link=next_link
|
||||
),
|
||||
features='html.parser'
|
||||
)
|
||||
|
||||
# get some tags that are unchanged between mobile and pc versions
|
||||
cor_suggested = soup.find_all('table', attrs={'class': "By0U9"})
|
||||
next_pages = soup.find('table', attrs={'class': "uZgmoc"})
|
||||
|
|
@ -718,7 +948,11 @@ class Filter:
|
|||
results_all = results_div.find_all('div', attrs={'class': "lIMUZd"})
|
||||
|
||||
for item in results_all:
|
||||
urls = item.find('a')['href'].split('&imgrefurl=')
|
||||
link = item.find('a', href=True)
|
||||
if not link:
|
||||
continue
|
||||
|
||||
urls = link['href'].split('&imgrefurl=')
|
||||
|
||||
# Skip urls that are not two-element lists
|
||||
if len(urls) != 2:
|
||||
|
|
@ -733,7 +967,16 @@ class Filter:
|
|||
except IndexError:
|
||||
web_page = urlparse.unquote(urls[1])
|
||||
|
||||
img_tbn = urlparse.unquote(item.find('a').find('img')['src'])
|
||||
img_tag = link.find('img')
|
||||
if not img_tag:
|
||||
continue
|
||||
|
||||
img_tbn = urlparse.unquote(
|
||||
img_tag.get('src') or img_tag.get('data-src', '')
|
||||
)
|
||||
|
||||
if not img_tbn:
|
||||
continue
|
||||
|
||||
results.append({
|
||||
'domain': urlparse.urlparse(web_page).netloc,
|
||||
|
|
@ -750,11 +993,18 @@ class Filter:
|
|||
|
||||
# replace correction suggested by google object if exists
|
||||
if len(cor_suggested):
|
||||
soup.find_all(
|
||||
suggested_tables = soup.find_all(
|
||||
'table',
|
||||
attrs={'class': "By0U9"}
|
||||
)[0].replaceWith(cor_suggested[0])
|
||||
# replace next page object at the bottom of the page
|
||||
soup.find_all('table',
|
||||
attrs={'class': "uZgmoc"})[0].replaceWith(next_pages)
|
||||
)
|
||||
if suggested_tables:
|
||||
suggested_tables[0].replaceWith(cor_suggested[0])
|
||||
|
||||
# replace next page object at the bottom of the page, when present
|
||||
next_page_tables = soup.find_all('table', attrs={'class': "uZgmoc"})
|
||||
if next_pages and next_page_tables:
|
||||
next_page_tables[0].replaceWith(next_pages)
|
||||
|
||||
# TODO: Reintroduce pagination for legacy image layout if needed.
|
||||
|
||||
return soup
|
||||
|
|
|
|||
|
|
@ -37,12 +37,19 @@ def get_rule_for_selector(stylesheet: CSSStyleSheet,
|
|||
|
||||
class Config:
|
||||
def __init__(self, **kwargs):
|
||||
# User agent configuration
|
||||
self.user_agent = kwargs.get('user_agent', 'LYNX_UA')
|
||||
# User agent configuration - default to env_conf if environment variables exist, otherwise default
|
||||
env_user_agent = os.getenv('WHOOGLE_USER_AGENT', '')
|
||||
env_mobile_agent = os.getenv('WHOOGLE_USER_AGENT_MOBILE', '')
|
||||
default_ua_option = 'env_conf' if (env_user_agent or env_mobile_agent) else 'default'
|
||||
|
||||
self.user_agent = kwargs.get('user_agent', default_ua_option)
|
||||
self.custom_user_agent = kwargs.get('custom_user_agent', '')
|
||||
self.use_custom_user_agent = kwargs.get('use_custom_user_agent', False)
|
||||
self.show_user_agent = read_config_bool('WHOOGLE_CONFIG_SHOW_USER_AGENT')
|
||||
|
||||
# Add user agent related keys to safe_keys
|
||||
# Note: CSE credentials (cse_api_key, cse_id) are intentionally NOT included
|
||||
# in safe_keys for security - they should not be shareable via URL
|
||||
self.safe_keys = [
|
||||
'lang_search',
|
||||
'lang_interface',
|
||||
|
|
@ -59,7 +66,8 @@ class Config:
|
|||
'tbs',
|
||||
'user_agent',
|
||||
'custom_user_agent',
|
||||
'use_custom_user_agent'
|
||||
'use_custom_user_agent',
|
||||
'show_user_agent'
|
||||
]
|
||||
|
||||
app_config = current_app.config
|
||||
|
|
@ -75,7 +83,6 @@ class Config:
|
|||
self.tbs = os.getenv('WHOOGLE_CONFIG_TIME_PERIOD', '')
|
||||
self.theme = os.getenv('WHOOGLE_CONFIG_THEME', 'system')
|
||||
self.safe = read_config_bool('WHOOGLE_CONFIG_SAFE')
|
||||
self.dark = read_config_bool('WHOOGLE_CONFIG_DARK') # deprecated
|
||||
self.alts = read_config_bool('WHOOGLE_CONFIG_ALTS')
|
||||
self.nojs = read_config_bool('WHOOGLE_CONFIG_NOJS')
|
||||
self.tor = read_config_bool('WHOOGLE_CONFIG_TOR')
|
||||
|
|
@ -87,13 +94,21 @@ class Config:
|
|||
self.preferences_encrypted = read_config_bool('WHOOGLE_CONFIG_PREFERENCES_ENCRYPTED')
|
||||
self.preferences_key = os.getenv('WHOOGLE_CONFIG_PREFERENCES_KEY', '')
|
||||
|
||||
# Google Custom Search Engine (CSE) BYOK settings
|
||||
self.cse_api_key = os.getenv('WHOOGLE_CSE_API_KEY', '')
|
||||
self.cse_id = os.getenv('WHOOGLE_CSE_ID', '')
|
||||
self.use_cse = read_config_bool('WHOOGLE_USE_CSE')
|
||||
|
||||
self.accept_language = False
|
||||
|
||||
# Skip setting custom config if there isn't one
|
||||
if kwargs:
|
||||
mutable_attrs = self.get_mutable_attrs()
|
||||
for attr in mutable_attrs:
|
||||
if attr in kwargs.keys():
|
||||
if attr == 'show_user_agent':
|
||||
# Handle show_user_agent as boolean
|
||||
self.show_user_agent = bool(kwargs.get(attr))
|
||||
elif attr in kwargs.keys():
|
||||
setattr(self, attr, kwargs[attr])
|
||||
elif attr not in kwargs.keys() and mutable_attrs[attr] == bool:
|
||||
setattr(self, attr, False)
|
||||
|
|
@ -127,10 +142,9 @@ class Config:
|
|||
Returns:
|
||||
str -- the new style
|
||||
"""
|
||||
style_sheet = cssutils.parseString(
|
||||
open(os.path.join(current_app.config['STATIC_FOLDER'],
|
||||
'css/variables.css')).read()
|
||||
)
|
||||
vars_path = os.path.join(current_app.config['STATIC_FOLDER'], 'css/variables.css')
|
||||
with open(vars_path, 'r', encoding='utf-8') as f:
|
||||
style_sheet = cssutils.parseString(f.read())
|
||||
|
||||
modified_sheet = cssutils.parseString(self.style_modified)
|
||||
for rule in modified_sheet:
|
||||
|
|
@ -240,9 +254,34 @@ class Config:
|
|||
return param_str
|
||||
|
||||
def _get_fernet_key(self, password: str) -> bytes:
    """Derive a Fernet-compatible key from a password using PBKDF2.

    Note: This uses a static salt for simplicity. This is a breaking change
    from the previous MD5-based implementation. Existing encrypted
    preferences will need to be re-encrypted.

    Args:
        password: The password to derive the key from

    Returns:
        bytes: A URL-safe base64 encoded 32-byte key suitable for Fernet
    """
    # Static, application-wide salt: the same password always yields the
    # same key. Per-user salts would be stronger but would have to be
    # stored alongside the encrypted preferences.
    salt = b'whoogle-preferences-salt-v2'

    # Derive a 32-byte key using PBKDF2 with SHA256 and 100,000 iterations
    kdf_key = hashlib.pbkdf2_hmac(
        'sha256',
        password.encode('utf-8'),
        salt,
        100000,
        dklen=32
    )

    # Fernet requires a URL-safe base64 encoded key
    return urlsafe_b64encode(kdf_key)
|
||||
|
||||
def _encode_preferences(self) -> str:
|
||||
preferences_json = json.dumps(self.get_attrs()).encode()
|
||||
|
|
|
|||
213
app/request.py
213
app/request.py
|
|
@ -1,10 +1,9 @@
|
|||
from app.models.config import Config
|
||||
from app.utils.misc import read_config_bool
|
||||
from datetime import datetime
|
||||
from app.services.provider import get_http_client
|
||||
from app.utils.ua_generator import load_ua_pool, get_random_ua, DEFAULT_FALLBACK_UA
|
||||
from defusedxml import ElementTree as ET
|
||||
import random
|
||||
import requests
|
||||
from requests import Response, ConnectionError
|
||||
import httpx
|
||||
import urllib.parse as urlparse
|
||||
import os
|
||||
from stem import Signal, SocketError
|
||||
|
|
@ -16,9 +15,6 @@ MAPS_URL = 'https://maps.google.com/maps'
|
|||
AUTOCOMPLETE_URL = ('https://suggestqueries.google.com/'
|
||||
'complete/search?client=toolbar&')
|
||||
|
||||
MOBILE_UA = '{}/5.0 (Android 0; Mobile; rv:54.0) Gecko/54.0 {}/59.0'
|
||||
DESKTOP_UA = '{}/5.0 (X11; {} x86_64; rv:75.0) Gecko/20100101 {}/75.0'
|
||||
|
||||
# Valid query params
|
||||
VALID_PARAMS = ['tbs', 'tbm', 'start', 'near', 'source', 'nfpr']
|
||||
|
||||
|
|
@ -73,25 +69,51 @@ def send_tor_signal(signal: Signal) -> bool:
|
|||
|
||||
|
||||
def gen_user_agent(config, is_mobile) -> str:
    """Select the User-Agent string to send with outbound requests.

    The choice is driven by ``config.user_agent``:

    * ``'custom'``: ``config.custom_user_agent``, when it is non-empty.
    * ``'env_conf'``: the ``WHOOGLE_USER_AGENT_MOBILE`` (mobile) or
      ``WHOOGLE_USER_AGENT`` (desktop) environment variable, or
      ``DEFAULT_FALLBACK_UA`` when that variable is unset or empty.
    * ``'default'``: an entry chosen by ``get_random_ua`` from the UA pool,
      or ``DEFAULT_FALLBACK_UA`` if the pool cannot be obtained.
    * anything else (including ``'custom'`` with no custom string):
      ``DEFAULT_FALLBACK_UA``.

    Args:
        config: The user's config; ``user_agent`` and ``custom_user_agent``
            are read.
        is_mobile: Whether the request is made on behalf of a mobile client.

    Returns:
        str: The User-Agent header value.
    """
    # If using custom user agent, return the custom string
    if config.user_agent == 'custom' and config.custom_user_agent:
        return config.custom_user_agent

    # If using environment configuration
    if config.user_agent == 'env_conf':
        env_var = ('WHOOGLE_USER_AGENT_MOBILE' if is_mobile
                   else 'WHOOGLE_USER_AGENT')
        # An unset or empty variable falls back to the default UA
        return os.getenv(env_var, '') or DEFAULT_FALLBACK_UA

    # If using default user agent - use auto-generated UA pool
    if config.user_agent == 'default':
        try:
            try:
                # Prefer the pool already held in the Flask app config
                from flask import current_app
                ua_pool = current_app.config['UA_POOL']
            except (ImportError, RuntimeError, AttributeError, KeyError):
                # No Flask context available or UA_POOL not in config,
                # load the pool from disk instead
                config_path = os.environ.get(
                    'CONFIG_VOLUME',
                    os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                 'static', 'config'))
                cache_path = os.path.join(config_path, 'ua_cache.json')
                ua_pool = load_ua_pool(cache_path, count=10)

            return get_random_ua(ua_pool)
        except Exception as e:
            # Best effort: a broken pool must never block a search
            print(f"Warning: Could not load UA pool, using fallback Opera UA: {e}")
            return DEFAULT_FALLBACK_UA

    # Fallback for backwards compatibility (old configs or invalid
    # user_agent values)
    return DEFAULT_FALLBACK_UA
|
||||
|
||||
|
||||
def gen_query(query, args, config) -> str:
|
||||
|
|
@ -125,6 +147,10 @@ def gen_query(query, args, config) -> str:
|
|||
# Pass along type of results (news, images, books, etc)
|
||||
if 'tbm' in args:
|
||||
param_dict['tbm'] = '&tbm=' + args.get('tbm')
|
||||
# Google Images now expects the modern udm=2 layout; force it when
|
||||
# requesting images to avoid redirects to the new AI/text layout.
|
||||
if args.get('tbm') == 'isch' and 'udm' not in args:
|
||||
param_dict['udm'] = '&udm=2'
|
||||
|
||||
# Get results page start value (10 per page, ie page 2 start val = 20)
|
||||
if 'start' in args:
|
||||
|
|
@ -189,12 +215,16 @@ class Request:
|
|||
config: the user's current whoogle configuration
|
||||
"""
|
||||
|
||||
def __init__(self, normal_ua, root_path, config: Config):
|
||||
self.search_url = 'https://www.google.com/search?gbv=1&num=' + str(
|
||||
os.getenv('WHOOGLE_RESULTS_PER_PAGE', 10)) + '&q='
|
||||
# Send heartbeat to Tor, used in determining if the user can or cannot
|
||||
# enable Tor for future requests
|
||||
send_tor_signal(Signal.HEARTBEAT)
|
||||
def __init__(self, normal_ua, root_path, config: Config, http_client=None):
|
||||
self.search_url = 'https://www.google.com/search?gbv=1&q='
|
||||
# Google Images rejects the lightweight gbv=1 interface. Use the
|
||||
# modern udm=2 entrypoint specifically for image searches to avoid the
|
||||
# "update your browser" interstitial.
|
||||
self.image_search_url = 'https://www.google.com/search?udm=2&q='
|
||||
# Optionally send heartbeat to Tor to determine availability
|
||||
# Only when Tor is enabled in config to avoid unnecessary socket usage
|
||||
if config.tor:
|
||||
send_tor_signal(Signal.HEARTBEAT)
|
||||
|
||||
self.language = config.lang_search if config.lang_search else ''
|
||||
self.country = config.country if config.country else ''
|
||||
|
|
@ -212,6 +242,13 @@ class Request:
|
|||
if not self.mobile:
|
||||
self.modified_user_agent_mobile = gen_user_agent(config, True)
|
||||
|
||||
# Dedicated modern UA to use when Google rejects legacy ones (e.g. Images)
|
||||
self.image_user_agent = (
|
||||
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
|
||||
'AppleWebKit/537.36 (KHTML, like Gecko) '
|
||||
'Chrome/127.0.0.0 Safari/537.36'
|
||||
)
|
||||
|
||||
# Set up proxy configuration
|
||||
proxy_path = os.environ.get('WHOOGLE_PROXY_LOC', '')
|
||||
if proxy_path:
|
||||
|
|
@ -236,6 +273,8 @@ class Request:
|
|||
self.tor = config.tor
|
||||
self.tor_valid = False
|
||||
self.root_path = root_path
|
||||
# Initialize HTTP client (shared per proxies)
|
||||
self.http_client = http_client or get_http_client(self.proxies)
|
||||
|
||||
def __getitem__(self, name):
|
||||
return getattr(self, name)
|
||||
|
|
@ -250,30 +289,39 @@ class Request:
|
|||
list: The list of matches for possible search suggestions
|
||||
|
||||
"""
|
||||
ac_query = dict(q=query)
|
||||
if self.language:
|
||||
ac_query['lr'] = self.language
|
||||
if self.country:
|
||||
ac_query['gl'] = self.country
|
||||
if self.lang_interface:
|
||||
ac_query['hl'] = self.lang_interface
|
||||
|
||||
response = self.send(base_url=AUTOCOMPLETE_URL,
|
||||
query=urlparse.urlencode(ac_query)).text
|
||||
|
||||
if not response:
|
||||
# Check if autocomplete is disabled via environment variable
|
||||
if os.environ.get('WHOOGLE_AUTOCOMPLETE', '1') == '0':
|
||||
return []
|
||||
|
||||
|
||||
try:
|
||||
root = ET.fromstring(response)
|
||||
return [_.attrib['data'] for _ in
|
||||
root.findall('.//suggestion/[@data]')]
|
||||
except ET.ParseError:
|
||||
# Malformed XML response
|
||||
ac_query = dict(q=query)
|
||||
if self.language:
|
||||
ac_query['lr'] = self.language
|
||||
if self.country:
|
||||
ac_query['gl'] = self.country
|
||||
if self.lang_interface:
|
||||
ac_query['hl'] = self.lang_interface
|
||||
|
||||
response = self.send(base_url=AUTOCOMPLETE_URL,
|
||||
query=urlparse.urlencode(ac_query)).text
|
||||
|
||||
if not response:
|
||||
return []
|
||||
|
||||
try:
|
||||
root = ET.fromstring(response)
|
||||
return [_.attrib['data'] for _ in
|
||||
root.findall('.//suggestion/[@data]')]
|
||||
except ET.ParseError:
|
||||
# Malformed XML response
|
||||
return []
|
||||
except Exception as e:
|
||||
# Log the error but don't crash - autocomplete is non-essential
|
||||
print(f"Autocomplete error: {str(e)}")
|
||||
return []
|
||||
|
||||
def send(self, base_url='', query='', attempt=0,
|
||||
force_mobile=False, user_agent='') -> Response:
|
||||
force_mobile=False, user_agent=''):
|
||||
"""Sends an outbound request to a URL. Optionally sends the request
|
||||
using Tor, if enabled by the user.
|
||||
|
||||
|
|
@ -298,21 +346,53 @@ class Request:
|
|||
else:
|
||||
modified_user_agent = self.modified_user_agent
|
||||
|
||||
# Some Google endpoints (notably Images) now refuse legacy user agents.
|
||||
# If an image search is detected and the generated UA isn't Chromium-
|
||||
# like, retry with a modern Chrome string to avoid the "update your
|
||||
# browser" interstitial.
|
||||
if (('tbm=isch' in query) or ('udm=2' in query)) and 'Chrome' not in modified_user_agent:
|
||||
modified_user_agent = self.image_user_agent
|
||||
|
||||
headers = {
|
||||
'User-Agent': modified_user_agent
|
||||
'User-Agent': modified_user_agent,
|
||||
'Accept': ('text/html,application/xhtml+xml,application/xml;'
|
||||
'q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8'),
|
||||
'Accept-Language': 'en-US,en;q=0.9',
|
||||
'Accept-Encoding': 'gzip, deflate, br',
|
||||
'Connection': 'keep-alive',
|
||||
'Cache-Control': 'max-age=0',
|
||||
'Pragma': 'no-cache',
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
'Sec-Fetch-Site': 'none',
|
||||
'Sec-Fetch-Mode': 'navigate',
|
||||
'Sec-Fetch-User': '?1',
|
||||
'Sec-Fetch-Dest': 'document'
|
||||
}
|
||||
# Only attach client hints when using a Chromium-like user agent to
|
||||
# avoid sending conflicting information that can trigger unsupported
|
||||
# browser pages.
|
||||
if 'Chrome' in headers['User-Agent']:
|
||||
headers.update({
|
||||
'Sec-CH-UA': (
|
||||
'"Not/A)Brand";v="8", '
|
||||
'"Chromium";v="127", '
|
||||
'"Google Chrome";v="127"'
|
||||
),
|
||||
'Sec-CH-UA-Mobile': '?0',
|
||||
'Sec-CH-UA-Platform': '"Windows"'
|
||||
})
|
||||
|
||||
# Adding the Accept-Language to the Header if possible
|
||||
|
||||
# Add Accept-Language header tied to the current config if requested
|
||||
if self.lang_interface:
|
||||
headers.update({'Accept-Language':
|
||||
self.lang_interface.replace('lang_', '')
|
||||
+ ';q=1.0'})
|
||||
headers['Accept-Language'] = (
|
||||
self.lang_interface.replace('lang_', '') + ';q=1.0'
|
||||
)
|
||||
|
||||
# view is suppressed correctly
|
||||
now = datetime.now()
|
||||
cookies = {
|
||||
# Consent cookies keep Google from showing the interstitial consent wall
|
||||
consent_cookies = {
|
||||
'CONSENT': 'PENDING+987',
|
||||
'SOCS': 'CAESHAgBEhIaAB',
|
||||
'SOCS': 'CAESHAgBEhIaAB'
|
||||
}
|
||||
|
||||
# Validate Tor conn and request new identity if the last one failed
|
||||
|
|
@ -326,8 +406,9 @@ class Request:
|
|||
# Make sure that the tor connection is valid, if enabled
|
||||
if self.tor:
|
||||
try:
|
||||
tor_check = requests.get('https://check.torproject.org/',
|
||||
proxies=self.proxies, headers=headers)
|
||||
tor_check = self.http_client.get('https://check.torproject.org/',
|
||||
headers=headers,
|
||||
retries=1)
|
||||
self.tor_valid = 'Congratulations' in tor_check.text
|
||||
|
||||
if not self.tor_valid:
|
||||
|
|
@ -335,22 +416,28 @@ class Request:
|
|||
"Tor connection succeeded, but the connection could "
|
||||
"not be validated by torproject.org",
|
||||
disable=True)
|
||||
except ConnectionError:
|
||||
except httpx.RequestError:
|
||||
raise TorError(
|
||||
"Error raised during Tor connection validation",
|
||||
disable=True)
|
||||
|
||||
response = requests.get(
|
||||
(base_url or self.search_url) + query,
|
||||
proxies=self.proxies,
|
||||
headers=headers,
|
||||
cookies=cookies)
|
||||
search_base = base_url or self.search_url
|
||||
if not base_url and ('tbm=isch' in query or 'udm=2' in query):
|
||||
search_base = self.image_search_url
|
||||
|
||||
try:
|
||||
response = self.http_client.get(
|
||||
search_base + query,
|
||||
headers=headers,
|
||||
cookies=consent_cookies)
|
||||
except httpx.HTTPError as e:
|
||||
raise
|
||||
|
||||
# Retry query with new identity if using Tor (max 10 attempts)
|
||||
if 'form id="captcha-form"' in response.text and self.tor:
|
||||
attempt += 1
|
||||
if attempt > 10:
|
||||
raise TorError("Tor query failed -- max attempts exceeded 10")
|
||||
return self.send((base_url or self.search_url), query, attempt)
|
||||
return self.send(search_base, query, attempt)
|
||||
|
||||
return response
|
||||
|
|
|
|||
324
app/routes.py
324
app/routes.py
|
|
@ -3,7 +3,6 @@ import base64
|
|||
import io
|
||||
import json
|
||||
import os
|
||||
import pickle
|
||||
import re
|
||||
import urllib.parse as urlparse
|
||||
import uuid
|
||||
|
|
@ -18,6 +17,7 @@ from app import app
|
|||
from app.models.config import Config
|
||||
from app.models.endpoint import Endpoint
|
||||
from app.request import Request, TorError
|
||||
from app.services.cse_client import CSEException
|
||||
from app.utils.bangs import suggest_bang, resolve_bang
|
||||
from app.utils.misc import empty_gif, placeholder_img, get_proxy_host_url, \
|
||||
fetch_favicon
|
||||
|
|
@ -32,8 +32,7 @@ from app.utils.session import valid_user_session
|
|||
from bs4 import BeautifulSoup as bsoup
|
||||
from flask import jsonify, make_response, request, redirect, render_template, \
|
||||
send_file, session, url_for, g
|
||||
from requests import exceptions
|
||||
from requests.models import PreparedRequest
|
||||
import httpx
|
||||
from cryptography.fernet import Fernet, InvalidToken
|
||||
from cryptography.exceptions import InvalidSignature
|
||||
from werkzeug.datastructures import MultiDict
|
||||
|
|
@ -103,9 +102,8 @@ def session_required(f):
|
|||
if os.path.getsize(file_path) > app.config['MAX_SESSION_SIZE']:
|
||||
continue
|
||||
|
||||
with open(file_path, 'rb') as session_file:
|
||||
_ = pickle.load(session_file)
|
||||
data = pickle.load(session_file)
|
||||
with open(file_path, 'r', encoding='utf-8') as session_file:
|
||||
data = json.load(session_file)
|
||||
if isinstance(data, dict) and 'valid' in data:
|
||||
continue
|
||||
invalid_sessions.append(file_path)
|
||||
|
|
@ -166,7 +164,8 @@ def before_request_func():
|
|||
g.user_request = Request(
|
||||
request.headers.get('User-Agent'),
|
||||
get_request_url(request.url_root),
|
||||
config=g.user_config)
|
||||
config=g.user_config
|
||||
)
|
||||
|
||||
g.app_location = g.user_config.url
|
||||
|
||||
|
|
@ -176,19 +175,28 @@ def after_request_func(resp):
|
|||
resp.headers['X-Content-Type-Options'] = 'nosniff'
|
||||
resp.headers['X-Frame-Options'] = 'DENY'
|
||||
resp.headers['Cache-Control'] = 'max-age=86400'
|
||||
|
||||
# Security headers
|
||||
resp.headers['Referrer-Policy'] = 'no-referrer'
|
||||
resp.headers['Permissions-Policy'] = 'geolocation=(), microphone=(), camera=()'
|
||||
|
||||
# Add HSTS header if HTTPS is enabled
|
||||
if os.environ.get('HTTPS_ONLY', False):
|
||||
resp.headers['Strict-Transport-Security'] = 'max-age=31536000; includeSubDomains'
|
||||
|
||||
if os.getenv('WHOOGLE_CSP', False):
|
||||
# Enable CSP by default (can be disabled via env var)
|
||||
if os.getenv('WHOOGLE_CSP', '1') != '0':
|
||||
resp.headers['Content-Security-Policy'] = app.config['CSP']
|
||||
if os.environ.get('HTTPS_ONLY', False):
|
||||
resp.headers['Content-Security-Policy'] += \
|
||||
'upgrade-insecure-requests'
|
||||
' upgrade-insecure-requests'
|
||||
|
||||
return resp
|
||||
|
||||
|
||||
@app.errorhandler(404)
def unknown_page(e):
    """Handle a 404 by logging it and redirecting to the app location.

    Args:
        e: The error passed in by Flask for the unmatched request.

    Returns:
        A redirect response to ``g.app_location``.
    """
    # Logger.warn is a deprecated alias; log once via Logger.warning
    app.logger.warning(e)
    return redirect(g.app_location)
|
||||
|
||||
|
||||
|
|
@ -217,9 +225,7 @@ def index():
|
|||
translation=app.config['TRANSLATIONS'][
|
||||
g.user_config.get_localization_lang()
|
||||
],
|
||||
logo=render_template(
|
||||
'logo.html',
|
||||
dark=g.user_config.dark),
|
||||
logo=render_template('logo.html'),
|
||||
config_disabled=(
|
||||
app.config['CONFIG_DISABLE'] or
|
||||
not valid_user_session(session)),
|
||||
|
|
@ -283,11 +289,43 @@ def autocomplete():
|
|||
#
|
||||
# Note: If Tor is enabled, this returns nothing, as the request is
|
||||
# almost always rejected
|
||||
# Also check if autocomplete is disabled globally
|
||||
autocomplete_enabled = os.environ.get('WHOOGLE_AUTOCOMPLETE', '1') != '0'
|
||||
return jsonify([
|
||||
q,
|
||||
g.user_request.autocomplete(q) if not g.user_config.tor else []
|
||||
g.user_request.autocomplete(q) if (not g.user_config.tor and autocomplete_enabled) else []
|
||||
])
|
||||
|
||||
def clean_text_spacing(text: str) -> str:
    """Clean up text spacing issues from HTML extraction.

    Args:
        text: Text extracted from HTML that may have spacing issues

    Returns:
        Cleaned text with proper spacing. Falsy input ('' or None) is
        returned unchanged.
    """
    if not text:
        return text

    # Collapse every run of whitespace (including newlines) to one space
    text = re.sub(r'\s+', ' ', text)

    # Fix domain names: remove space before period followed by domain extension
    # Examples: "weather .com" -> "weather.com", "example .org" -> "example.org"
    text = re.sub(r'\s+\.([a-zA-Z]{2,})\b', r'.\1', text)

    # Fix www/http/https patterns the previous rule does not reach, such as
    # a period followed by a digit: "www .3m.com" -> "www.3m.com"
    text = re.sub(r'\b(www|http|https)\s+\.', r'\1.', text)

    # Fix spaces before common punctuation
    text = re.sub(r'\s+([,;:])', r'\1', text)

    # Strip leading/trailing whitespace
    return text.strip()
|
||||
|
||||
|
||||
@app.route(f'/{Endpoint.search}', methods=['GET', 'POST'])
|
||||
@session_required
|
||||
@auth_required
|
||||
|
|
@ -299,7 +337,7 @@ def search():
|
|||
get_req_str = urlparse.urlencode(post_data)
|
||||
return redirect(url_for('.search') + '?' + get_req_str)
|
||||
|
||||
search_util = Search(request, g.user_config, g.session_key)
|
||||
search_util = Search(request, g.user_config, g.session_key, user_request=g.user_request)
|
||||
query = search_util.new_search_query()
|
||||
|
||||
bang = resolve_bang(query)
|
||||
|
|
@ -319,8 +357,40 @@ def search():
|
|||
session['config']['tor'] = False if e.disable else session['config'][
|
||||
'tor']
|
||||
return redirect(url_for('.index'))
|
||||
except CSEException as e:
|
||||
localization_lang = g.user_config.get_localization_lang()
|
||||
translation = app.config['TRANSLATIONS'][localization_lang]
|
||||
wants_json = (
|
||||
request.args.get('format') == 'json' or
|
||||
'application/json' in request.headers.get('Accept', '') or
|
||||
'application/*+json' in request.headers.get('Accept', '')
|
||||
)
|
||||
error_msg = f"Custom Search API Error: {e.message}"
|
||||
if e.is_quota_error:
|
||||
error_msg = ("Google Custom Search API quota exceeded. "
|
||||
"Free tier allows 100 queries/day. "
|
||||
"Wait until midnight PT or disable CSE in settings.")
|
||||
if wants_json:
|
||||
return jsonify({
|
||||
'error': True,
|
||||
'error_message': error_msg,
|
||||
'query': urlparse.unquote(query)
|
||||
}), e.code
|
||||
return render_template(
|
||||
'error.html',
|
||||
error_message=error_msg,
|
||||
translation=translation,
|
||||
config=g.user_config), e.code
|
||||
|
||||
wants_json = (
|
||||
request.args.get('format') == 'json' or
|
||||
'application/json' in request.headers.get('Accept', '') or
|
||||
'application/*+json' in request.headers.get('Accept', '')
|
||||
)
|
||||
|
||||
if search_util.feeling_lucky:
|
||||
if wants_json:
|
||||
return jsonify({'redirect': response}), 303
|
||||
return redirect(response, code=303)
|
||||
|
||||
# If the user is attempting to translate a string, determine the correct
|
||||
|
|
@ -341,17 +411,26 @@ def search():
|
|||
app.logger.error('503 (CAPTCHA)')
|
||||
fallback_engine = os.environ.get('WHOOGLE_FALLBACK_ENGINE_URL', '')
|
||||
if (fallback_engine):
|
||||
if wants_json:
|
||||
return jsonify({'redirect': fallback_engine + query}), 302
|
||||
return redirect(fallback_engine + query)
|
||||
|
||||
return render_template(
|
||||
'error.html',
|
||||
blocked=True,
|
||||
error_message=translation['ratelimit'],
|
||||
translation=translation,
|
||||
farside='https://farside.link',
|
||||
config=g.user_config,
|
||||
query=urlparse.unquote(query),
|
||||
params=g.user_config.to_params(keys=['preferences'])), 503
|
||||
if wants_json:
|
||||
return jsonify({
|
||||
'blocked': True,
|
||||
'error_message': translation['ratelimit'],
|
||||
'query': urlparse.unquote(query)
|
||||
}), 503
|
||||
else:
|
||||
return render_template(
|
||||
'error.html',
|
||||
blocked=True,
|
||||
error_message=translation['ratelimit'],
|
||||
translation=translation,
|
||||
farside='https://farside.link',
|
||||
config=g.user_config,
|
||||
query=urlparse.unquote(query),
|
||||
params=g.user_config.to_params(keys=['preferences'])), 503
|
||||
|
||||
response = bold_search_terms(response, query)
|
||||
|
||||
|
|
@ -363,12 +442,23 @@ def search():
|
|||
elif search_util.widget == 'calculator' and not 'nojs' in request.args:
|
||||
response = add_calculator_card(html_soup)
|
||||
|
||||
# Update tabs content
|
||||
# Update tabs content (fallback to the raw query if full_query isn't set)
|
||||
full_query_val = getattr(search_util, 'full_query', query)
|
||||
tabs = get_tabs_content(app.config['HEADER_TABS'],
|
||||
search_util.full_query,
|
||||
full_query_val,
|
||||
search_util.search_type,
|
||||
g.user_config.preferences,
|
||||
translation)
|
||||
|
||||
# Filter out unsupported tabs when CSE is enabled
|
||||
# CSE only supports web (all) and image search, not videos/news
|
||||
use_cse = (
|
||||
g.user_config.use_cse and
|
||||
g.user_config.cse_api_key and
|
||||
g.user_config.cse_id
|
||||
)
|
||||
if use_cse:
|
||||
tabs = {k: v for k, v in tabs.items() if k in ['all', 'images', 'maps']}
|
||||
|
||||
# Feature to display currency_card
|
||||
# Since this is determined by more than just the
|
||||
|
|
@ -382,6 +472,125 @@ def search():
|
|||
home_url = f"home?preferences={preferences}" if preferences else "home"
|
||||
cleanresponse = str(response).replace("andlt;","<").replace("andgt;",">")
|
||||
|
||||
if wants_json:
|
||||
# Build a parsable JSON from the filtered soup
|
||||
json_soup = bsoup(str(response), 'html.parser')
|
||||
results = []
|
||||
seen = set()
|
||||
|
||||
# Find all result containers (using known result classes)
|
||||
result_divs = json_soup.find_all('div', class_=['ZINbbc', 'ezO2md'])
|
||||
|
||||
if result_divs:
|
||||
# Process structured Google results with container divs
|
||||
for div in result_divs:
|
||||
# Find the first valid link in this result container
|
||||
link = None
|
||||
for a in div.find_all('a', href=True):
|
||||
if a['href'].startswith('http'):
|
||||
link = a
|
||||
break
|
||||
|
||||
if not link:
|
||||
continue
|
||||
|
||||
href = link['href']
|
||||
if href in seen:
|
||||
continue
|
||||
|
||||
# Get all text from the result container, not just the link
|
||||
text = clean_text_spacing(div.get_text(separator=' ', strip=True))
|
||||
if not text:
|
||||
continue
|
||||
|
||||
# Extract title and content separately
|
||||
# Title is typically in an h3 tag, CVA68e span, or the main link text
|
||||
title = ''
|
||||
# First try h3 tag
|
||||
h3_tag = div.find('h3')
|
||||
if h3_tag:
|
||||
title = clean_text_spacing(h3_tag.get_text(separator=' ', strip=True))
|
||||
else:
|
||||
# Try CVA68e class (common title class in Google results)
|
||||
title_span = div.find('span', class_='CVA68e')
|
||||
if title_span:
|
||||
title = clean_text_spacing(title_span.get_text(separator=' ', strip=True))
|
||||
elif link:
|
||||
# Fallback to link text, but exclude URL breadcrumb
|
||||
title = clean_text_spacing(link.get_text(separator=' ', strip=True))
|
||||
|
||||
# Content is the description/snippet text
|
||||
# Look for description/snippet elements
|
||||
content = ''
|
||||
# Common classes for snippets/descriptions in Google results
|
||||
snippet_selectors = [
|
||||
{'class_': 'VwiC3b'}, # Standard snippet
|
||||
{'class_': 'FrIlee'}, # Alternative snippet class (common in current Google)
|
||||
{'class_': 's'}, # Another snippet class
|
||||
{'class_': 'st'}, # Legacy snippet class
|
||||
]
|
||||
|
||||
for selector in snippet_selectors:
|
||||
snippet_elem = div.find('span', selector) or div.find('div', selector)
|
||||
if snippet_elem:
|
||||
# Get text but exclude any nested links (like "Related searches")
|
||||
content = clean_text_spacing(snippet_elem.get_text(separator=' ', strip=True))
|
||||
# Only use if it's substantial content (not just the URL breadcrumb)
|
||||
if content and not content.startswith('www.') and '›' not in content:
|
||||
break
|
||||
else:
|
||||
content = ''
|
||||
|
||||
# If no specific content found, use text minus title as fallback
|
||||
if not content and title:
|
||||
# Try to extract content by removing title from full text
|
||||
if text.startswith(title):
|
||||
content = text[len(title):].strip()
|
||||
else:
|
||||
content = text
|
||||
elif not content:
|
||||
content = text
|
||||
|
||||
seen.add(href)
|
||||
results.append({
|
||||
'href': href,
|
||||
'text': text,
|
||||
'title': title,
|
||||
'content': content
|
||||
})
|
||||
else:
|
||||
# Fallback: extract links directly if no result containers found
|
||||
for a in json_soup.find_all('a', href=True):
|
||||
href = a['href']
|
||||
if not href.startswith('http'):
|
||||
continue
|
||||
if href in seen:
|
||||
continue
|
||||
text = clean_text_spacing(a.get_text(separator=' ', strip=True))
|
||||
if not text:
|
||||
continue
|
||||
seen.add(href)
|
||||
# In fallback mode, the link text serves as both title and text
|
||||
results.append({
|
||||
'href': href,
|
||||
'text': text,
|
||||
'title': text,
|
||||
'content': ''
|
||||
})
|
||||
|
||||
return jsonify({
|
||||
'query': urlparse.unquote(query),
|
||||
'search_type': search_util.search_type,
|
||||
'results': results
|
||||
})
|
||||
|
||||
# Get the user agent that was used for the search
|
||||
used_user_agent = ''
|
||||
if search_util.user_request:
|
||||
used_user_agent = search_util.user_request.modified_user_agent
|
||||
elif hasattr(g, 'user_request') and g.user_request:
|
||||
used_user_agent = g.user_request.modified_user_agent
|
||||
|
||||
return render_template(
|
||||
'display.html',
|
||||
has_update=app.config['HAS_UPDATE'],
|
||||
|
|
@ -403,6 +612,7 @@ def search():
|
|||
) and not search_util.search_type, # Standard search queries only
|
||||
response=cleanresponse,
|
||||
version_number=app.config['VERSION_NUMBER'],
|
||||
used_user_agent=used_user_agent,
|
||||
search_header=render_template(
|
||||
'header.html',
|
||||
home_url=home_url,
|
||||
|
|
@ -411,7 +621,7 @@ def search():
|
|||
languages=app.config['LANGUAGES'],
|
||||
countries=app.config['COUNTRIES'],
|
||||
time_periods=app.config['TIME_PERIODS'],
|
||||
logo=render_template('logo.html', dark=g.user_config.dark),
|
||||
logo=render_template('logo.html'),
|
||||
query=urlparse.unquote(query),
|
||||
search_type=search_util.search_type,
|
||||
mobile=g.user_request.mobile,
|
||||
|
|
@ -436,10 +646,11 @@ def config():
|
|||
return json.dumps(g.user_config.__dict__)
|
||||
elif request.method == 'PUT' and not config_disabled:
|
||||
if name:
|
||||
config_pkl = os.path.join(app.config['CONFIG_PATH'], name)
|
||||
session['config'] = (pickle.load(open(config_pkl, 'rb'))
|
||||
if os.path.exists(config_pkl)
|
||||
else session['config'])
|
||||
config_file = os.path.join(app.config['CONFIG_PATH'], name)
|
||||
if os.path.exists(config_file):
|
||||
with open(config_file, 'r', encoding='utf-8') as f:
|
||||
session['config'] = json.load(f)
|
||||
# else keep existing session['config']
|
||||
return json.dumps(session['config'])
|
||||
else:
|
||||
return json.dumps({})
|
||||
|
|
@ -455,18 +666,18 @@ def config():
|
|||
# Keep both the selection and the custom string
|
||||
if 'custom_user_agent' in config_data:
|
||||
config_data['custom_user_agent'] = config_data['custom_user_agent']
|
||||
print(f"Setting custom user agent to: {config_data['custom_user_agent']}") # Debug log
|
||||
app.logger.debug(f"Setting custom user agent to: {config_data['custom_user_agent']}")
|
||||
else:
|
||||
config_data['use_custom_user_agent'] = False
|
||||
config_data['custom_user_agent'] = ''
|
||||
# Only clear custom_user_agent if not using custom option
|
||||
if config_data['user_agent'] != 'custom':
|
||||
config_data['custom_user_agent'] = ''
|
||||
|
||||
# Save config by name to allow a user to easily load later
|
||||
if name:
|
||||
pickle.dump(
|
||||
config_data,
|
||||
open(os.path.join(
|
||||
app.config['CONFIG_PATH'],
|
||||
name), 'wb'))
|
||||
config_file = os.path.join(app.config['CONFIG_PATH'], name)
|
||||
with open(config_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(config_data, f, indent=2)
|
||||
|
||||
session['config'] = config_data
|
||||
return redirect(config_data['url'])
|
||||
|
|
@ -519,7 +730,7 @@ def element():
|
|||
tmp_mem.seek(0)
|
||||
|
||||
return send_file(tmp_mem, mimetype=src_type)
|
||||
except exceptions.RequestException:
|
||||
except httpx.HTTPError:
|
||||
pass
|
||||
|
||||
return send_file(io.BytesIO(empty_gif), mimetype='image/gif')
|
||||
|
|
@ -628,8 +839,9 @@ def internal_error(e):
|
|||
|
||||
# Attempt to parse the query
|
||||
try:
|
||||
search_util = Search(request, g.user_config, g.session_key)
|
||||
query = search_util.new_search_query()
|
||||
if hasattr(g, 'user_config') and hasattr(g, 'session_key'):
|
||||
search_util = Search(request, g.user_config, g.session_key)
|
||||
query = search_util.new_search_query()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
|
@ -637,18 +849,28 @@ def internal_error(e):
|
|||
|
||||
fallback_engine = os.environ.get('WHOOGLE_FALLBACK_ENGINE_URL', '')
|
||||
if (fallback_engine):
|
||||
return redirect(fallback_engine + query)
|
||||
return redirect(fallback_engine + (query or ''))
|
||||
|
||||
localization_lang = g.user_config.get_localization_lang()
|
||||
# Safely get localization language with fallback
|
||||
if hasattr(g, 'user_config'):
|
||||
localization_lang = g.user_config.get_localization_lang()
|
||||
else:
|
||||
localization_lang = 'lang_en'
|
||||
translation = app.config['TRANSLATIONS'][localization_lang]
|
||||
return render_template(
|
||||
'error.html',
|
||||
error_message='Internal server error (500)',
|
||||
translation=translation,
|
||||
farside='https://farside.link',
|
||||
config=g.user_config,
|
||||
query=urlparse.unquote(query),
|
||||
params=g.user_config.to_params(keys=['preferences'])), 500
|
||||
# Build template context with safe defaults
|
||||
template_context = {
|
||||
'error_message': 'Internal server error (500)',
|
||||
'translation': translation,
|
||||
'farside': 'https://farside.link',
|
||||
'query': urlparse.unquote(query or '')
|
||||
}
|
||||
|
||||
# Add user config if available
|
||||
if hasattr(g, 'user_config'):
|
||||
template_context['config'] = g.user_config
|
||||
template_context['params'] = g.user_config.to_params(keys=['preferences'])
|
||||
|
||||
return render_template('error.html', **template_context), 500
|
||||
|
||||
|
||||
def run_app() -> None:
|
||||
|
|
|
|||
2
app/services/__init__.py
Normal file
2
app/services/__init__.py
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
|
||||
|
||||
452
app/services/cse_client.py
Normal file
452
app/services/cse_client.py
Normal file
|
|
@ -0,0 +1,452 @@
|
|||
"""Google Custom Search Engine (CSE) API Client
|
||||
|
||||
This module provides a client for Google's Custom Search JSON API,
|
||||
allowing users to bring their own API key (BYOK) for search functionality.
|
||||
"""
|
||||
|
||||
import httpx
|
||||
from typing import Optional
|
||||
from dataclasses import dataclass
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from flask import render_template
|
||||
|
||||
|
||||
# Google Custom Search API endpoint
|
||||
CSE_API_URL = 'https://www.googleapis.com/customsearch/v1'
|
||||
|
||||
|
||||
class CSEException(Exception):
    """Error raised when a Custom Search Engine API call fails.

    Attributes:
        message: Description of the failure; also used as the exception text.
        code: Numeric status associated with the failure (defaults to 500).
        is_quota_error: Flag supplied by the raiser, stored unchanged.
    """

    def __init__(self, message: str, code: int = 500, is_quota_error: bool = False):
        # Initialise the base class with the message so str(exc) returns it.
        super().__init__(message)
        self.message, self.code, self.is_quota_error = message, code, is_quota_error
|
||||
|
||||
|
||||
@dataclass
class CSEError:
    """Error details attached to a failed CSE response."""
    code: int  # numeric status of the failure
    message: str  # descriptive text of the failure

    @property
    def is_quota_exceeded(self) -> bool:
        """True for status 429, or when the message mentions "quota" (any case)."""
        if self.code == 429:
            return True
        return 'quota' in self.message.lower()

    @property
    def is_invalid_key(self) -> bool:
        """True for status 400, or when the message contains "invalid" (any case)."""
        if self.code == 400:
            return True
        return 'invalid' in self.message.lower()
|
||||
|
||||
|
||||
@dataclass
class CSEResult:
    """One item of a CSE API result list (web or image)."""
    # Plain-text fields, always populated by the caller.
    title: str
    link: str
    snippet: str
    display_link: str
    # Optional HTML-formatted variants of title and snippet.
    html_title: Optional[str] = None
    html_snippet: Optional[str] = None
    # Optional image metadata; left as None when not supplied.
    image_url: Optional[str] = None
    thumbnail_url: Optional[str] = None
    image_width: Optional[int] = None
    image_height: Optional[int] = None
    context_link: Optional[str] = None  # page on which the image was found
|
||||
|
||||
|
||||
@dataclass
class CSEResponse:
    """Outcome of one CSE API call: either a result list or an error."""
    results: list[CSEResult]
    total_results: str  # kept as a string, as supplied by the caller
    search_time: float
    query: str
    start_index: int
    is_image_search: bool = False
    error: Optional[CSEError] = None

    @property
    def has_error(self) -> bool:
        """True when an error object is attached to this response."""
        return not (self.error is None)

    @property
    def has_results(self) -> bool:
        """True when at least one result is present."""
        return len(self.results) >= 1
|
||||
|
||||
|
||||
class CSEClient:
    """Client for the Google Custom Search Engine JSON API.

    Holds an ``httpx.Client`` for its lifetime; call ``close()`` or use the
    instance as a context manager to release it.

    Usage:
        client = CSEClient(api_key='your-key', cse_id='your-cse-id')
        response = client.search('python programming')

        if response.has_error:
            print(f"Error: {response.error.message}")
        else:
            for result in response.results:
                print(f"{result.title}: {result.link}")
    """

    def __init__(self, api_key: str, cse_id: str, timeout: float = 10.0):
        """Store the credentials and open the HTTP client.

        Args:
            api_key: Google API key with Custom Search API enabled
            cse_id: Custom Search Engine ID (sent as the ``cx`` parameter)
            timeout: Request timeout in seconds
        """
        self.api_key = api_key
        self.cse_id = cse_id
        self.timeout = timeout
        self._client = httpx.Client(timeout=timeout)

    def search(
        self,
        query: str,
        start: int = 1,
        num: int = 10,
        safe: str = 'off',
        language: str = '',
        country: str = '',
        search_type: str = ''
    ) -> CSEResponse:
        """Run one query against the CSE API.

        Failures never propagate: API-reported errors, timeouts, transport
        errors and any other exception are all returned as a CSEResponse
        whose ``error`` field is set.

        Args:
            query: Search query string
            start: Starting result index (1-based, for pagination)
            num: Number of results to request (capped at 10)
            safe: Safe search setting ('off', 'medium', 'high')
            language: Language restriction (e.g., 'lang_en'), sent as ``lr``
            country: Country restriction (e.g., 'countryUS'), sent as ``cr``
            search_type: 'image' for image search, '' for web search

        Returns:
            CSEResponse with results or error information
        """
        wants_images = search_type == 'image'

        request_params = {
            'key': self.api_key,
            'cx': self.cse_id,
            'q': query,
            'start': start,
            'num': min(num, 10),  # API max is 10
            'safe': safe,
        }
        # Optional parameters are only sent when they carry a value.
        if wants_images:
            request_params['searchType'] = 'image'
        if language:
            request_params['lr'] = language
        if country:
            request_params['cr'] = country

        try:
            payload = self._client.get(CSE_API_URL, params=request_params).json()

            # The API reports failures inside the JSON body.
            if 'error' in payload:
                api_error = payload['error']
                return self._failure(
                    query,
                    start,
                    api_error.get('code', 500),
                    api_error.get('message', 'Unknown error'),
                )

            timing = payload.get('searchInformation', {})

            parsed = []
            for entry in payload.get('items', []):
                # Image metadata is nested; absent for plain web results.
                image_meta = entry.get('image', {})
                parsed.append(CSEResult(
                    title=entry.get('title', ''),
                    link=entry.get('link', ''),
                    snippet=entry.get('snippet', ''),
                    display_link=entry.get('displayLink', ''),
                    html_title=entry.get('htmlTitle'),
                    html_snippet=entry.get('htmlSnippet'),
                    image_url=entry.get('link') if wants_images else None,
                    thumbnail_url=image_meta.get('thumbnailLink'),
                    image_width=image_meta.get('width'),
                    image_height=image_meta.get('height'),
                    context_link=image_meta.get('contextLink'),
                ))

            return CSEResponse(
                results=parsed,
                total_results=timing.get('totalResults', '0'),
                search_time=float(timing.get('searchTime', 0)),
                query=query,
                start_index=start,
                is_image_search=wants_images,
            )

        except httpx.TimeoutException:
            return self._failure(query, start, 408, 'Request timed out')
        except httpx.RequestError as e:
            return self._failure(query, start, 500, f'Request failed: {e!s}')
        except Exception as e:
            return self._failure(query, start, 500, f'Unexpected error: {e!s}')

    @staticmethod
    def _failure(query: str, start: int, code: int, message: str) -> CSEResponse:
        """Build the empty CSEResponse used for every error outcome."""
        return CSEResponse(
            results=[],
            total_results='0',
            search_time=0.0,
            query=query,
            start_index=start,
            error=CSEError(code=code, message=message),
        )

    def close(self):
        """Close the HTTP client"""
        self._client.close()

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
|
||||
|
||||
|
||||
def cse_results_to_html(response: CSEResponse, query: str) -> str:
    """Convert CSE API response to HTML matching Whoogle's result format

    This generates HTML that mimics the structure expected by Whoogle's
    existing filter and result processing pipeline.

    Args:
        response: CSEResponse from the API
        query: Original search query

    Returns:
        HTML string formatted like Google search results. Errors and empty
        result sets are rendered as small standalone pages; image searches
        are delegated to the image results renderer.
    """
    if response.has_error:
        error = response.error
        # Quota is checked first: a response can match both predicates.
        if error.is_quota_exceeded:
            return _error_html(
                'API Quota Exceeded',
                'Your Google Custom Search API quota has been exceeded. '
                'Free tier allows 100 queries/day. Wait until midnight PT '
                'or enable billing in Google Cloud Console.'
            )
        elif error.is_invalid_key:
            return _error_html(
                'Invalid API Key',
                'Your Google Custom Search API key is invalid. '
                'Please check your API key and CSE ID in settings.'
            )
        else:
            return _error_html('Search Error', error.message)

    if not response.has_results:
        return _no_results_html(query)

    # Use different HTML structure for image vs web results
    if response.is_image_search:
        return _image_results_html(response, query)

    # Build HTML results matching Whoogle's expected structure
    results_html = []

    for result in response.results:
        # Escape everything interpolated into markup, including the link:
        # it lands inside a double-quoted href attribute, so an unescaped
        # quote or angle bracket would break out of the attribute.
        title = _escape_html(result.title)
        snippet = _escape_html(result.snippet)
        link = _escape_html(result.link)
        display_link = _escape_html(result.display_link)

        # Use HTML versions if available (they have bold tags for query terms).
        # NOTE(review): these are inserted as-is and are assumed to be
        # already-safe markup from the API — confirm.
        if result.html_title:
            title = result.html_title
        if result.html_snippet:
            snippet = result.html_snippet

        # Match the structure used by Google/mock results
        result_html = f'''
<div class="ZINbbc xpd O9g5cc uUPGi">
<div class="kCrYT">
<a href="{link}">
<h3 class="BNeawe vvjwJb AP7Wnd">{title}</h3>
<div class="BNeawe UPmit AP7Wnd luh4tb" style="color: var(--whoogle-result-url);">{display_link}</div>
</a>
</div>
<div class="kCrYT">
<div class="BNeawe s3v9rd AP7Wnd">
<span class="VwiC3b">{snippet}</span>
</div>
</div>
</div>
'''
        results_html.append(result_html)

    # total_results is carried as a string; treat anything unparsable as
    # "no further pages" instead of failing the whole render.
    try:
        total = int(response.total_results)
    except (TypeError, ValueError):
        total = 0

    # Build pagination if needed
    pagination_html = ''
    if total > 10:
        pagination_html = _pagination_html(response.start_index, response.query)

    joined_results = ''.join(results_html)

    # Wrap in expected structure
    # Add data-cse attribute to prevent collapse_sections from collapsing these results
    return f'''
<html>
<body>
<div id="main" data-cse="true">
<div id="cnt">
<div id="rcnt">
<div id="center_col">
<div id="res">
<div id="search">
<div id="rso">
{joined_results}
</div>
</div>
</div>
{pagination_html}
</div>
</div>
</div>
</div>
</body>
</html>
'''
|
||||
|
||||
|
||||
def _escape_html(text: str) -> str:
|
||||
"""Escape HTML special characters"""
|
||||
if not text:
|
||||
return ''
|
||||
return (text
|
||||
.replace('&', '&')
|
||||
.replace('<', '<')
|
||||
.replace('>', '>')
|
||||
.replace('"', '"')
|
||||
.replace("'", '''))
|
||||
|
||||
|
||||
def _error_html(title: str, message: str) -> str:
    """Render a minimal standalone page showing an error heading and text.

    Both arguments are passed through ``_escape_html`` before being placed
    into the markup.
    """
    safe_title = _escape_html(title)
    safe_message = _escape_html(message)
    # Markup lines stay flush-left so the emitted text is unchanged.
    return f'''
<html>
<body>
<div id="main">
<div style="padding: 20px; text-align: center;">
<h2 style="color: #d93025;">{safe_title}</h2>
<p>{safe_message}</p>
</div>
</div>
</body>
</html>
'''
|
||||
|
||||
|
||||
def _no_results_html(query: str) -> str:
    """Render a minimal standalone page stating that the query had no results.

    The query is passed through ``_escape_html`` before being shown in bold.
    """
    shown_query = _escape_html(query)
    # Markup lines stay flush-left so the emitted text is unchanged.
    return f'''
<html>
<body>
<div id="main">
<div style="padding: 20px;">
<p>No results found for <b>{shown_query}</b></p>
</div>
</div>
</body>
</html>
'''
|
||||
|
||||
|
||||
def _image_results_html(response: CSEResponse, query: str) -> str:
    """Generate HTML for image search results using the imageresults template

    Args:
        response: CSEResponse with image results
        query: Original search query (plain text; it is URL-encoded here
            before being placed in the next-page link)

    Returns:
        HTML string formatted for image results display
    """
    # Function-scope import: the module itself only imports urlparse.
    from urllib.parse import quote_plus

    # Convert CSE results to the format expected by imageresults.html template
    results = []
    for result in response.results:
        # Each field falls back to the next best value when the API omitted it.
        image_url = result.image_url or result.link
        thumbnail_url = result.thumbnail_url or image_url
        web_page = result.context_link or result.link
        domain = urlparse(web_page).netloc if web_page else result.display_link

        results.append({
            'domain': domain,
            'img_url': image_url,
            'web_page': web_page,
            'img_tbn': thumbnail_url
        })

    # total_results is carried as a string; an unparsable value simply
    # means no next-page link instead of a failed render.
    try:
        total = int(response.total_results)
    except (TypeError, ValueError):
        total = 0

    # Build pagination link if more results exist beyond this page
    next_link = None
    if total > response.start_index + len(response.results) - 1:
        next_start = response.start_index + 10
        # Encode the query so characters such as '&', '#' or spaces cannot
        # truncate or corrupt the link's query string.
        encoded_query = quote_plus(query or '')
        next_link = f'search?q={encoded_query}&tbm=isch&start={next_start}'

    # Use the same template as regular image results
    return render_template(
        'imageresults.html',
        length=len(results),
        results=results,
        view_label="View Image",
        next_link=next_link
    )
|
||||
|
||||
|
||||
def _pagination_html(current_start: int, query: str) -> str:
|
||||
"""Generate pagination links"""
|
||||
# CSE API uses 1-based indexing, 10 results per page
|
||||
current_page = (current_start - 1) // 10 + 1
|
||||
|
||||
prev_link = ''
|
||||
next_link = ''
|
||||
|
||||
if current_page > 1:
|
||||
prev_start = (current_page - 2) * 10 + 1
|
||||
prev_link = f'<a href="search?q={query}&start={prev_start}">Previous</a>'
|
||||
|
||||
next_start = current_page * 10 + 1
|
||||
next_link = f'<a href="search?q={query}&start={next_start}">Next</a>'
|
||||
|
||||
return f'''
|
||||
<div id="foot" style="text-align: center; padding: 20px;">
|
||||
{prev_link}
|
||||
<span style="margin: 0 20px;">Page {current_page}</span>
|
||||
{next_link}
|
||||
</div>
|
||||
'''
|
||||
219
app/services/http_client.py
Normal file
219
app/services/http_client.py
Normal file
|
|
@ -0,0 +1,219 @@
|
|||
import threading
|
||||
import time
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
import httpx
|
||||
from cachetools import TTLCache
|
||||
import ssl
|
||||
import os
|
||||
|
||||
# Import h2 exceptions for better error handling
|
||||
try:
|
||||
from h2.exceptions import ProtocolError as H2ProtocolError
|
||||
except ImportError:
|
||||
H2ProtocolError = None
|
||||
|
||||
|
||||
class HttpxClient:
|
||||
"""Thin wrapper around httpx.Client providing simple retries and optional TTL caching.
|
||||
|
||||
The client is intended to be safe for reuse across requests. Per-request
|
||||
overrides for headers/cookies are supported.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
proxies: Optional[Dict[str, str]] = None,
|
||||
timeout_seconds: float = 15.0,
|
||||
cache_ttl_seconds: int = 30,
|
||||
cache_maxsize: int = 256,
|
||||
http2: bool = True) -> None:
|
||||
# Allow disabling HTTP/2 via environment variable
|
||||
# HTTP/2 can sometimes cause protocol errors with certain servers
|
||||
if os.environ.get('WHOOGLE_DISABLE_HTTP2', '').lower() in ('1', 'true', 't', 'yes', 'y'):
|
||||
http2 = False
|
||||
|
||||
client_kwargs = dict(http2=http2,
|
||||
timeout=timeout_seconds,
|
||||
follow_redirects=True)
|
||||
# Prefer future-proof mounts when proxies are provided; fall back to proxies=
|
||||
self._proxies = proxies or {}
|
||||
self._http2 = http2
|
||||
|
||||
# Determine verify behavior and initialize client with fallbacks
|
||||
self._verify = self._determine_verify_setting()
|
||||
try:
|
||||
self._client = self._build_client(client_kwargs, self._verify)
|
||||
except ssl.SSLError:
|
||||
# Fallback to system trust store
|
||||
try:
|
||||
system_ctx = ssl.create_default_context()
|
||||
self._client = self._build_client(client_kwargs, system_ctx)
|
||||
self._verify = system_ctx
|
||||
except ssl.SSLError:
|
||||
insecure_fallback = os.environ.get('WHOOGLE_INSECURE_FALLBACK', '0').lower() in ('1', 'true', 't', 'yes', 'y')
|
||||
if insecure_fallback:
|
||||
self._client = self._build_client(client_kwargs, False)
|
||||
self._verify = False
|
||||
else:
|
||||
raise
|
||||
self._timeout_seconds = timeout_seconds
|
||||
self._cache = TTLCache(maxsize=cache_maxsize, ttl=cache_ttl_seconds)
|
||||
self._cache_lock = threading.Lock()
|
||||
|
||||
def _determine_verify_setting(self):
|
||||
"""Determine SSL verification setting from environment.
|
||||
|
||||
Honors:
|
||||
- WHOOGLE_CA_BUNDLE: path to CA bundle file
|
||||
- WHOOGLE_SSL_VERIFY: '0' to disable verification
|
||||
- WHOOGLE_SSL_BACKEND: 'system' to prefer system trust store
|
||||
"""
|
||||
ca_bundle = os.environ.get('WHOOGLE_CA_BUNDLE', '').strip()
|
||||
if ca_bundle:
|
||||
return ca_bundle
|
||||
|
||||
verify_env = os.environ.get('WHOOGLE_SSL_VERIFY', '1').lower()
|
||||
if verify_env in ('0', 'false', 'no', 'n'):
|
||||
return False
|
||||
|
||||
backend = os.environ.get('WHOOGLE_SSL_BACKEND', '').lower()
|
||||
if backend == 'system':
|
||||
return ssl.create_default_context()
|
||||
|
||||
return True
|
||||
|
||||
def _build_client(self, client_kwargs: Dict[str, Any], verify: Any) -> httpx.Client:
|
||||
"""Construct httpx.Client with proxies and provided verify setting."""
|
||||
kwargs = dict(client_kwargs)
|
||||
kwargs['verify'] = verify
|
||||
if self._proxies:
|
||||
proxy_values = list(self._proxies.values())
|
||||
single_proxy = proxy_values[0] if proxy_values and all(v == proxy_values[0] for v in proxy_values) else None
|
||||
if single_proxy:
|
||||
try:
|
||||
return httpx.Client(proxy=single_proxy, **kwargs)
|
||||
except TypeError:
|
||||
try:
|
||||
return httpx.Client(proxies=self._proxies, **kwargs)
|
||||
except TypeError:
|
||||
mounts: Dict[str, httpx.Proxy] = {}
|
||||
for scheme_key, url in self._proxies.items():
|
||||
prefix = f"{scheme_key}://"
|
||||
mounts[prefix] = httpx.Proxy(url)
|
||||
return httpx.Client(mounts=mounts, **kwargs)
|
||||
else:
|
||||
try:
|
||||
return httpx.Client(proxies=self._proxies, **kwargs)
|
||||
except TypeError:
|
||||
mounts: Dict[str, httpx.Proxy] = {}
|
||||
for scheme_key, url in self._proxies.items():
|
||||
prefix = f"{scheme_key}://"
|
||||
mounts[prefix] = httpx.Proxy(url)
|
||||
return httpx.Client(mounts=mounts, **kwargs)
|
||||
else:
|
||||
return httpx.Client(**kwargs)
|
||||
|
||||
@property
|
||||
def proxies(self) -> Dict[str, str]:
|
||||
return self._proxies
|
||||
|
||||
def _cache_key(self, method: str, url: str, headers: Optional[Dict[str, str]]) -> Tuple[str, str, Tuple[Tuple[str, str], ...]]:
|
||||
normalized_headers = tuple(sorted((headers or {}).items()))
|
||||
return (method.upper(), url, normalized_headers)
|
||||
|
||||
def get(self,
|
||||
url: str,
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
cookies: Optional[Dict[str, str]] = None,
|
||||
retries: int = 2,
|
||||
backoff_seconds: float = 0.5,
|
||||
use_cache: bool = False) -> httpx.Response:
|
||||
if use_cache:
|
||||
key = self._cache_key('GET', url, headers)
|
||||
with self._cache_lock:
|
||||
cached = self._cache.get(key)
|
||||
if cached is not None:
|
||||
return cached
|
||||
|
||||
last_exc: Optional[Exception] = None
|
||||
attempt = 0
|
||||
while attempt <= retries:
|
||||
try:
|
||||
# Check if client is closed and recreate if needed
|
||||
if self._client.is_closed:
|
||||
self._recreate_client()
|
||||
|
||||
response = self._client.get(url, headers=headers, cookies=cookies)
|
||||
if use_cache and response.status_code == 200:
|
||||
with self._cache_lock:
|
||||
self._cache[key] = response
|
||||
return response
|
||||
except Exception as exc:
|
||||
last_exc = exc
|
||||
# Check for specific errors that require client recreation
|
||||
should_recreate = False
|
||||
|
||||
if isinstance(exc, (httpx.HTTPError, RuntimeError)):
|
||||
if "client has been closed" in str(exc).lower():
|
||||
should_recreate = True
|
||||
|
||||
# Handle H2 protocol errors (connection state issues)
|
||||
if H2ProtocolError and isinstance(exc, H2ProtocolError):
|
||||
should_recreate = True
|
||||
|
||||
# Also check if the error message contains h2 protocol error info
|
||||
if "ProtocolError" in str(exc) or "ConnectionState.CLOSED" in str(exc):
|
||||
should_recreate = True
|
||||
|
||||
if should_recreate:
|
||||
self._recreate_client()
|
||||
if attempt < retries:
|
||||
time.sleep(backoff_seconds * (2 ** attempt))
|
||||
attempt += 1
|
||||
continue
|
||||
|
||||
# For non-recoverable errors or last attempt, raise
|
||||
if attempt == retries:
|
||||
raise
|
||||
|
||||
# For other errors, still retry with backoff
|
||||
time.sleep(backoff_seconds * (2 ** attempt))
|
||||
attempt += 1
|
||||
|
||||
# Should not reach here
|
||||
if last_exc:
|
||||
raise last_exc
|
||||
raise httpx.HTTPError('Unknown HTTP error')
|
||||
|
||||
def _recreate_client(self) -> None:
|
||||
"""Recreate the HTTP client when it has been closed."""
|
||||
try:
|
||||
self._client.close()
|
||||
except Exception:
|
||||
pass # Client might already be closed
|
||||
|
||||
# Recreate with same configuration
|
||||
client_kwargs = dict(timeout=self._timeout_seconds,
|
||||
follow_redirects=True,
|
||||
http2=self._http2)
|
||||
|
||||
try:
|
||||
self._client = self._build_client(client_kwargs, self._verify)
|
||||
except ssl.SSLError:
|
||||
try:
|
||||
system_ctx = ssl.create_default_context()
|
||||
self._client = self._build_client(client_kwargs, system_ctx)
|
||||
self._verify = system_ctx
|
||||
except ssl.SSLError:
|
||||
insecure_fallback = os.environ.get('WHOOGLE_INSECURE_FALLBACK', '0').lower() in ('1', 'true', 't', 'yes', 'y')
|
||||
if insecure_fallback:
|
||||
self._client = self._build_client(client_kwargs, False)
|
||||
self._verify = False
|
||||
else:
|
||||
raise
|
||||
|
||||
def close(self) -> None:
    """Close the underlying HTTP client held in ``self._client``."""
    self._client.close()
|
||||
|
||||
|
||||
40
app/services/provider.py
Normal file
40
app/services/provider.py
Normal file
|
|
@ -0,0 +1,40 @@
|
|||
import os
|
||||
from typing import Dict, Tuple
|
||||
|
||||
from app.services.http_client import HttpxClient
|
||||
|
||||
|
||||
_clients: Dict[tuple, HttpxClient] = {}
|
||||
|
||||
|
||||
def _proxies_key(
    proxies: Dict[str, str],
) -> Tuple[Tuple[Tuple[str, str], ...], Tuple[Tuple[str, str], ...]]:
    """Build a hashable, order-independent cache key for a proxy mapping.

    Args:
        proxies: Proxy mapping (e.g. ``{'http': ..., 'https': ...}``).
            ``None`` or an empty mapping is treated as "no proxies".

    Returns:
        A pair of identical tuples, each holding the mapping's
        ``(key, value)`` items in sorted order, or a pair of empty tuples
        when no proxies are given. The pair shape is kept so that keys
        stay identical to those produced previously.
    """
    if not proxies:
        return (), ()
    # Sorting makes the key independent of dict insertion order
    items = tuple(sorted(proxies.items()))
    return items, items
|
||||
|
||||
|
||||
def get_http_client(proxies: Dict[str, str]) -> HttpxClient:
    """Return the cached client for this proxy / HTTP/2 configuration.

    A client is created and stored in the module-level cache the first time
    a given combination of proxies and HTTP/2 setting is requested; later
    calls with the same combination return that same instance.

    Args:
        proxies: Proxy mapping; a falsy value is passed to the client as
            ``None``.

    Returns:
        The cached (or newly created) ``HttpxClient``.
    """
    # HTTP/2 defaults to on; WHOOGLE_HTTP2 must be one of the truthy
    # spellings below to keep it enabled once the variable is set
    http2_enabled = os.environ.get('WHOOGLE_HTTP2', '1').lower() in (
        '1', 'true', 't', 'yes', 'y')

    cache_key = (_proxies_key(proxies or {}), http2_enabled)
    cached = _clients.get(cache_key)
    if cached is None:
        cached = HttpxClient(proxies=proxies or None, http2=http2_enabled)
        _clients[cache_key] = cached
    return cached
|
||||
|
||||
|
||||
def close_all_clients() -> None:
    """Close every cached client, then empty the cache.

    Errors raised while closing an individual client are ignored so that
    the remaining clients are still closed.
    """
    # Iterate over a snapshot of the cached clients
    cached_clients = list(_clients.values())
    for cached in cached_clients:
        try:
            cached.close()
        except Exception:
            # Best effort: move on to the next client
            continue
    _clients.clear()
|
||||
|
||||
|
||||
|
|
@ -32,15 +32,15 @@ textarea {
|
|||
color: var(--whoogle-dark-text) !important;
|
||||
}
|
||||
|
||||
a:visited h3 div {
|
||||
a:visited h3 div, a:visited .qXLe6d {
|
||||
color: var(--whoogle-dark-result-visited) !important;
|
||||
}
|
||||
|
||||
a:link h3 div {
|
||||
a:link h3 div, a:link .qXLe6d {
|
||||
color: var(--whoogle-dark-result-title) !important;
|
||||
}
|
||||
|
||||
a:link div {
|
||||
a:link div, a:link .fYyStc {
|
||||
color: var(--whoogle-dark-result-url) !important;
|
||||
}
|
||||
|
||||
|
|
@ -62,7 +62,7 @@ select {
|
|||
background-color: var(--whoogle-dark-page-bg) !important;
|
||||
}
|
||||
|
||||
.ZINbbc {
|
||||
.ZINbbc, .ezO2md {
|
||||
overflow: hidden;
|
||||
box-shadow: 0 0 0 0 !important;
|
||||
background-color: var(--whoogle-dark-result-bg) !important;
|
||||
|
|
|
|||
|
|
@ -75,15 +75,15 @@ select {
|
|||
}
|
||||
|
||||
|
||||
a:visited h3 div {
|
||||
a:visited div, a:visited .qXLe6d {
|
||||
color: var(--whoogle-result-visited) !important;
|
||||
}
|
||||
|
||||
a:link h3 div {
|
||||
a:link div, a:link .qXLe6d {
|
||||
color: var(--whoogle-result-title) !important;
|
||||
}
|
||||
|
||||
a:link div {
|
||||
a:link div, a:link .fYyStc {
|
||||
color: var(--whoogle-result-url) !important;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -7,6 +7,12 @@ body {
|
|||
font-size: 16px !important;
|
||||
}
|
||||
|
||||
.ezO2md {
|
||||
border-radius: 10px;
|
||||
border: 0 !important;
|
||||
box-shadow: 0 3px 5px rgb(0 0 0 / 0.2);
|
||||
}
|
||||
|
||||
.autocomplete {
|
||||
position: relative;
|
||||
display: inline-block;
|
||||
|
|
|
|||
|
|
@ -34,6 +34,20 @@ const setupConfigLayout = () => {
|
|||
|
||||
content.classList.toggle("open");
|
||||
});
|
||||
|
||||
// Setup user agent dropdown handler
|
||||
const userAgentSelect = document.getElementById("config-user-agent");
|
||||
const customUserAgentDiv = document.querySelector(".config-div-custom-user-agent");
|
||||
|
||||
if (userAgentSelect && customUserAgentDiv) {
|
||||
userAgentSelect.addEventListener("change", function() {
|
||||
if (this.value === "custom") {
|
||||
customUserAgentDiv.style.display = "block";
|
||||
} else {
|
||||
customUserAgentDiv.style.display = "none";
|
||||
}
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
const loadConfig = event => {
|
||||
|
|
|
|||
|
|
@ -128,8 +128,6 @@
|
|||
{"name": "Lithuania", "value": "LT"},
|
||||
{"name": "Luxembourg", "value": "LU"},
|
||||
{"name": "Macao", "value": "MO"},
|
||||
{"name": "Macedonia, the Former Yugosalv Republic of",
|
||||
"value": "MK"},
|
||||
{"name": "Madagascar", "value": "MG"},
|
||||
{"name": "Malawi", "value": "MW"},
|
||||
{"name": "Malaysia", "value": "MY"},
|
||||
|
|
@ -162,6 +160,7 @@
|
|||
{"name": "Nigeria", "value": "NG"},
|
||||
{"name": "Niue", "value": "NU"},
|
||||
{"name": "Norfolk Island", "value": "NF"},
|
||||
{"name": "North Macedonia", "value": "MK"},
|
||||
{"name": "Northern Mariana Islands", "value": "MP"},
|
||||
{"name": "Norway", "value": "NO"},
|
||||
{"name": "Oman", "value": "OM"},
|
||||
|
|
@ -201,8 +200,7 @@
|
|||
{"name": "Solomon Islands", "value": "SB"},
|
||||
{"name": "Somalia", "value": "SO"},
|
||||
{"name": "South Africa", "value": "ZA"},
|
||||
{"name": "South Georgia and the South Sandwich Islands",
|
||||
"value": "GS"},
|
||||
{"name": "South Georgia and the South Sandwich Islands", "value": "GS"},
|
||||
{"name": "Spain", "value": "ES"},
|
||||
{"name": "Sri Lanka", "value": "LK"},
|
||||
{"name": "Sudan", "value": "SD"},
|
||||
|
|
@ -221,10 +219,10 @@
|
|||
{"name": "Tonga", "value": "TO"},
|
||||
{"name": "Trinidad and Tobago", "value": "TT"},
|
||||
{"name": "Tunisia", "value": "TN"},
|
||||
{"name": "Turkey", "value": "TR"},
|
||||
{"name": "Turkmenistan", "value": "TM"},
|
||||
{"name": "Turks and Caicos Islands", "value": "TC"},
|
||||
{"name": "Tuvalu", "value": "TV"},
|
||||
{"name": "Türkiye", "value": "TR"},
|
||||
{"name": "Uganda", "value": "UG"},
|
||||
{"name": "Ukraine", "value": "UA"},
|
||||
{"name": "United Arab Emirates", "value": "AE"},
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@
|
|||
{"name": "Swahili (Kiswahili)", "value": "lang_sw"},
|
||||
{"name": "Swedish (Svenska)", "value": "lang_sv"},
|
||||
{"name": "Thai (ไทย)", "value": "lang_th"},
|
||||
{"name": "Turkish (Türk)", "value": "lang_tr"},
|
||||
{"name": "Turkish (Türkçe)", "value": "lang_tr"},
|
||||
{"name": "Ukrainian (Українська)", "value": "lang_uk"},
|
||||
{"name": "Vietnamese (Tiếng Việt)", "value": "lang_vi"},
|
||||
{"name": "Welsh (Cymraeg)", "value": "lang_cy"},
|
||||
|
|
|
|||
|
|
@ -1286,5 +1286,61 @@
|
|||
"qdr:w": "Τελευταία Βδομάδα",
|
||||
"qdr:m": "Τελευταίος Μήνας",
|
||||
"qdr:y": "Τελευταίος Χρόνος"
|
||||
}
|
||||
},
|
||||
"lang_tr": {
|
||||
"": "--",
|
||||
"search": "Ara",
|
||||
"config": "Seçenekler",
|
||||
"config-country": "Ülke",
|
||||
"config-lang": "Arayüz Dili",
|
||||
"config-lang-search": "Arama Dili",
|
||||
"config-near": "Yakınında",
|
||||
"config-near-help": "Şehir Adı",
|
||||
"config-block": "Engelle",
|
||||
"config-block-help": "Virgülle ayrılmış site listesi",
|
||||
"config-block-title": "Başlığa Göre Engelle",
|
||||
"config-block-title-help": "Regex kullan",
|
||||
"config-block-url": "URL'ye Göre Engelle",
|
||||
"config-block-url-help": "Regex kullan",
|
||||
"config-theme": "Tema",
|
||||
"config-nojs": "Anonim Görünümde Javascript'i Kaldır",
|
||||
"config-anon-view": "Anonim Görünüm Bağlantılarını Göster",
|
||||
"config-dark": "Karanlık Mod",
|
||||
"config-safe": "Güvenli Arama",
|
||||
"config-alts": "Sosyal Medya Bağlantılarını Değiştir",
|
||||
"config-alts-help": "Twitter/YouTube/vb. bağlantıları gizliliğe saygılı alternatiflerle değiştirir.",
|
||||
"config-new-tab": "Bağlantıları Yeni Sekmede Aç",
|
||||
"config-images": "Tam Boyutlu Görsel Arama",
|
||||
"config-images-help": "(Deneysel) Masaüstü görsel aramalarına 'Görseli Görüntüle' seçeneği ekler. Bu, görsel sonuç küçük resimlerinin daha düşük çözünürlükte olmasına neden olur.",
|
||||
"config-tor": "Tor Kullan",
|
||||
"config-get-only": "Yalnızca GET İstekleri",
|
||||
"config-url": "Kök URL",
|
||||
"config-pref-url": "Tercihler URL'si",
|
||||
"config-pref-encryption": "Tercihleri Şifrele",
|
||||
"config-pref-help": "WHOOGLE_CONFIG_PREFERENCES_KEY gerektirir, aksi takdirde bu göz ardı edilir.",
|
||||
"config-css": "Özel CSS",
|
||||
"config-time-period": "Zaman Aralığı",
|
||||
"load": "Yükle",
|
||||
"apply": "Uygula",
|
||||
"save-as": "Farklı Kaydet...",
|
||||
"github-link": "GitHub'da Görüntüle",
|
||||
"translate": "çevir",
|
||||
"light": "açık",
|
||||
"dark": "koyu",
|
||||
"system": "sistem",
|
||||
"ratelimit": "Sunucu hız sınırına ulaştı",
|
||||
"continue-search": "Aramanızı Farside ile sürdürün",
|
||||
"all": "Tümü",
|
||||
"images": "Görseller",
|
||||
"maps": "Haritalar",
|
||||
"videos": "Videolar",
|
||||
"news": "Haberler",
|
||||
"books": "Kitaplar",
|
||||
"anon-view": "Anonim Görünüm",
|
||||
"qdr:h": "Son saat",
|
||||
"qdr:d": "Son 24 saat",
|
||||
"qdr:w": "Geçen hafta",
|
||||
"qdr:m": "Geçen ay",
|
||||
"qdr:y": "Geçen yıl"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -193,10 +193,13 @@ const calc = () => {
|
|||
(statement.match(/\(/g) || []).length >
|
||||
(statement.match(/\)/g) || []).length
|
||||
) statement += ")"; else break;
|
||||
// evaluate the expression.
|
||||
// evaluate the expression via the Function constructor (note: like eval(), this still executes arbitrary JavaScript)
|
||||
console.log("calculating [" + statement + "]");
|
||||
try {
|
||||
var result = eval(statement);
|
||||
// Evaluate in strict mode inside a new function that receives Math as its only named argument.
|
||||
// NOTE: new Function() is not a sandbox: globals stay reachable, so `statement` must already be limited to math input
|
||||
const safeEval = new Function('Math', `'use strict'; return (${statement})`);
|
||||
var result = safeEval(Math);
|
||||
document.getElementById("prev-equation").innerHTML = mathtext.innerHTML + " = ";
|
||||
mathtext.innerHTML = result;
|
||||
mathtext.classList.remove("error-border");
|
||||
|
|
|
|||
|
|
@ -9,10 +9,14 @@
|
|||
{% endif %}
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<meta name="referrer" content="no-referrer">
|
||||
<link rel="stylesheet" href="{{ cb_url('logo.css') }}">
|
||||
<link rel="stylesheet" href="{{ cb_url('input.css') }}">
|
||||
<link rel="stylesheet" href="{{ cb_url('search.css') }}">
|
||||
<link rel="stylesheet" href="{{ cb_url('header.css') }}">
|
||||
{% if bundle_static() %}
|
||||
<link rel="stylesheet" href="/{{ cb_url('bundle.css') }}">
|
||||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url('logo.css') }}">
|
||||
<link rel="stylesheet" href="{{ cb_url('input.css') }}">
|
||||
<link rel="stylesheet" href="{{ cb_url('search.css') }}">
|
||||
<link rel="stylesheet" href="{{ cb_url('header.css') }}">
|
||||
{% endif %}
|
||||
{% if config.theme %}
|
||||
{% if config.theme == 'system' %}
|
||||
<style>
|
||||
|
|
@ -22,10 +26,12 @@
|
|||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url(config.theme + '-theme.css') }}"/>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url(('dark' if config.dark else 'light') + '-theme.css') }}"/>
|
||||
{% endif %}
|
||||
<style>{{ config.style }}</style>
|
||||
{% if config.style %}
|
||||
<style>
|
||||
{{ config.style }}
|
||||
</style>
|
||||
{% endif %}
|
||||
<title>{{ clean_query(query) }} - Whoogle Search</title>
|
||||
</head>
|
||||
<body>
|
||||
|
|
@ -39,10 +45,14 @@
|
|||
{{ response|safe }}
|
||||
</body>
|
||||
{% include 'footer.html' %}
|
||||
{% if autocomplete_enabled == '1' %}
|
||||
<script src="{{ cb_url('autocomplete.js') }}"></script>
|
||||
{% if bundle_static() %}
|
||||
<script src="/{{ cb_url('bundle.js') }}" defer></script>
|
||||
{% else %}
|
||||
{% if autocomplete_enabled == '1' %}
|
||||
<script src="{{ cb_url('autocomplete.js') }}"></script>
|
||||
{% endif %}
|
||||
<script src="{{ cb_url('utils.js') }}"></script>
|
||||
<script src="{{ cb_url('keyboard.js') }}"></script>
|
||||
<script src="{{ cb_url('currency.js') }}"></script>
|
||||
{% endif %}
|
||||
<script src="{{ cb_url('utils.js') }}"></script>
|
||||
<script src="{{ cb_url('keyboard.js') }}"></script>
|
||||
<script src="{{ cb_url('currency.js') }}"></script>
|
||||
</html>
|
||||
|
|
|
|||
|
|
@ -7,11 +7,13 @@
|
|||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url(config.theme + '-theme.css') }}"/>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url(('dark' if config.dark else 'light') + '-theme.css') }}"/>
|
||||
{% endif %}
|
||||
{% if bundle_static() %}
|
||||
<link rel="stylesheet" href="/{{ cb_url('bundle.css') }}">
|
||||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url('main.css') }}">
|
||||
<link rel="stylesheet" href="{{ cb_url('error.css') }}">
|
||||
{% endif %}
|
||||
<style>{{ config.style }}</style>
|
||||
<div>
|
||||
<h1>Error</h1>
|
||||
|
|
@ -43,6 +45,16 @@
|
|||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
<a href="https://git.lolcat.ca/lolcat/4get">4get</a>
|
||||
<ul>
|
||||
<li>
|
||||
<a class="link-color" href="{{farside}}/4get/web?s={{query}}&scraper=google">
|
||||
{{farside}}/4get/web?s={{query}}&scraper=google
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<hr>
|
||||
<h4>Other options:</h4>
|
||||
|
|
@ -58,6 +70,16 @@
|
|||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
<a href="https://4get.ca">4get</a>
|
||||
<ul>
|
||||
<li>
|
||||
<a class="link-color" href="https://4get.ca/web?s={{query}}">
|
||||
4get.ca/web?s={{query}}
|
||||
</a>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
<a href="https://duckduckgo.com">DuckDuckGo</a>
|
||||
<ul>
|
||||
|
|
|
|||
|
|
@ -5,5 +5,8 @@
|
|||
{% if has_update %}
|
||||
|| <span class="update_available">Update Available 🟢</span>
|
||||
{% endif %}
|
||||
{% if config.show_user_agent and used_user_agent %}
|
||||
<br><span class="user-agent-display" style="font-size: 0.85em; color: #666;">User Agent: {{ used_user_agent }}</span>
|
||||
{% endif %}
|
||||
</p>
|
||||
</footer>
|
||||
|
|
|
|||
|
|
@ -155,4 +155,8 @@
|
|||
</div>
|
||||
</div>
|
||||
|
||||
{% if bundle_static() %}
|
||||
<script src="/{{ cb_url('bundle.js') }}" defer></script>
|
||||
{% else %}
|
||||
<script type="text/javascript" src="{{ cb_url('header.js') }}"></script>
|
||||
{% endif %}
|
||||
|
|
|
|||
|
|
@ -10,9 +10,9 @@
|
|||
background-color: #fff;
|
||||
}
|
||||
body {
|
||||
padding: 0 8px;
|
||||
padding: 0 12px;
|
||||
margin: 0 auto;
|
||||
max-width: 736px;
|
||||
max-width: 1200px;
|
||||
}
|
||||
a {
|
||||
text-decoration: none;
|
||||
|
|
@ -161,13 +161,13 @@
|
|||
.e3goi {
|
||||
vertical-align: top;
|
||||
padding: 0;
|
||||
height: 180px;
|
||||
}
|
||||
.GpQGbf {
|
||||
margin: auto;
|
||||
border-collapse: collapse;
|
||||
border-spacing: 0;
|
||||
width: 100%;
|
||||
table-layout: fixed;
|
||||
}
|
||||
.X6ZCif {
|
||||
color: #202124;
|
||||
|
|
@ -210,19 +210,20 @@
|
|||
text-align: center;
|
||||
}
|
||||
.RAyV4b {
|
||||
width: 162px;
|
||||
height: 140px;
|
||||
line-height: 140px;
|
||||
overflow: "hidden";
|
||||
height: 220px;
|
||||
line-height: 220px;
|
||||
overflow: hidden;
|
||||
text-align: center;
|
||||
}
|
||||
.t0fcAb {
|
||||
text-align: center;
|
||||
margin: auto;
|
||||
vertical-align: middle;
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
object-fit: contain;
|
||||
object-fit: cover;
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
max-height: 220px;
|
||||
display: block;
|
||||
}
|
||||
.Tor4Ec {
|
||||
padding-top: 2px;
|
||||
|
|
@ -318,6 +319,24 @@
|
|||
a .CVA68e:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
.e3goi {
|
||||
width: 25%;
|
||||
padding: 10px;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
.svla5d {
|
||||
max-width: 100%;
|
||||
}
|
||||
@media (max-width: 900px) {
|
||||
.e3goi {
|
||||
width: 50%;
|
||||
}
|
||||
}
|
||||
@media (max-width: 600px) {
|
||||
.e3goi {
|
||||
width: 100%;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
<div>
|
||||
<div>
|
||||
|
|
|
|||
|
|
@ -17,13 +17,21 @@
|
|||
<meta name="referrer" content="no-referrer">
|
||||
<meta name="msapplication-TileColor" content="#ffffff">
|
||||
<meta name="msapplication-TileImage" content="static/img/favicon/ms-icon-144x144.png">
|
||||
{% if autocomplete_enabled == '1' %}
|
||||
<script src="{{ cb_url('autocomplete.js') }}"></script>
|
||||
{% if bundle_static() %}
|
||||
<script src="/{{ cb_url('bundle.js') }}" defer></script>
|
||||
{% else %}
|
||||
{% if autocomplete_enabled == '1' %}
|
||||
<script src="{{ cb_url('autocomplete.js') }}"></script>
|
||||
{% endif %}
|
||||
<script type="text/javascript" src="{{ cb_url('controller.js') }}"></script>
|
||||
{% endif %}
|
||||
<script type="text/javascript" src="{{ cb_url('controller.js') }}"></script>
|
||||
<link rel="search" href="opensearch.xml" type="application/opensearchdescription+xml" title="Whoogle Search">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<link rel="stylesheet" href="{{ cb_url('logo.css') }}">
|
||||
{% if bundle_static() %}
|
||||
<link rel="stylesheet" href="/{{ cb_url('bundle.css') }}">
|
||||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url('logo.css') }}">
|
||||
{% endif %}
|
||||
{% if config.theme %}
|
||||
{% if config.theme == 'system' %}
|
||||
<style>
|
||||
|
|
@ -33,10 +41,10 @@
|
|||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url(config.theme + '-theme.css') }}"/>
|
||||
{% endif %}
|
||||
{% else %}
|
||||
<link rel="stylesheet" href="{{ cb_url(('dark' if config.dark else 'light') + '-theme.css') }}"/>
|
||||
{% endif %}
|
||||
<link rel="stylesheet" href="{{ cb_url('main.css') }}">
|
||||
{% if not bundle_static() %}
|
||||
<link rel="stylesheet" href="{{ cb_url('main.css') }}">
|
||||
{% endif %}
|
||||
<noscript>
|
||||
<style>
|
||||
#main {
|
||||
|
|
@ -194,10 +202,6 @@
|
|||
</select>
|
||||
</div>
|
||||
<!-- DEPRECATED -->
|
||||
<!--<div class="config-div config-div-dark">-->
|
||||
<!--<label for="config-dark">{{ translation['config-dark'] }}: </label>-->
|
||||
<!--<input type="checkbox" name="dark" id="config-dark" {{ 'checked' if config.dark else '' }}>-->
|
||||
<!--</div>-->
|
||||
<div class="config-div config-div-safe">
|
||||
<label for="config-safe">{{ translation['config-safe'] }}: </label>
|
||||
<input type="checkbox" name="safe" id="config-safe" {{ 'checked' if config.safe else '' }}>
|
||||
|
|
@ -231,8 +235,8 @@
|
|||
<div class="config-div config-div-user-agent">
|
||||
<label for="config-user-agent">User Agent: </label>
|
||||
<select name="user_agent" id="config-user-agent">
|
||||
<option value="LYNX_UA" {% if not config.user_agent or config.user_agent == 'LYNX_UA' %}selected{% endif %}>Lynx Browser</option>
|
||||
<option value="" {% if config.user_agent == '' and config.user_agent != 'LYNX_UA' %}selected{% endif %}>Original (Random)</option>
|
||||
<option value="env_conf" {% if config.user_agent == 'env_conf' %}selected{% endif %}>Use ENV Conf</option>
|
||||
<option value="default" {% if config.user_agent == 'default' %}selected{% endif %}>Default</option>
|
||||
<option value="custom" {% if config.user_agent == 'custom' %}selected{% endif %}>Custom</option>
|
||||
</select>
|
||||
</div>
|
||||
|
|
@ -248,6 +252,35 @@
|
|||
<input type="checkbox" name="accept_language"
|
||||
id="config-accept-language" {{ 'checked' if config.accept_language else '' }}>
|
||||
</div>
|
||||
<div class="config-div config-div-show-user-agent">
|
||||
<label for="config-show-user-agent">Show User Agent in Footer: </label>
|
||||
<input type="checkbox" name="show_user_agent"
|
||||
id="config-show-user-agent" {{ 'checked' if config.show_user_agent else '' }}>
|
||||
</div>
|
||||
<!-- Google Custom Search Engine (BYOK) Settings -->
|
||||
<div class="config-div config-div-cse-header" style="margin-top: 20px; border-top: 1px solid var(--result-bg); padding-top: 15px;">
|
||||
<strong>Google Custom Search (BYOK)</strong>
|
||||
<div><span class="info-text"> — <a href="https://github.com/benbusby/whoogle-search#google-custom-search-byok">Setup Guide</a></span></div>
|
||||
</div>
|
||||
<div class="config-div config-div-use-cse">
|
||||
<label for="config-use-cse">Use Custom Search API: </label>
|
||||
<input type="checkbox" name="use_cse" id="config-use-cse" {{ 'checked' if config.use_cse else '' }}>
|
||||
<div><span class="info-text"> — Enable to use your own Google API key (100 free queries/day)</span></div>
|
||||
</div>
|
||||
<div class="config-div config-div-cse-api-key">
|
||||
<label for="config-cse-api-key">CSE API Key: </label>
|
||||
<input type="password" name="cse_api_key" id="config-cse-api-key"
|
||||
value="{{ config.cse_api_key }}"
|
||||
placeholder="AIza..."
|
||||
autocomplete="off">
|
||||
</div>
|
||||
<div class="config-div config-div-cse-id">
|
||||
<label for="config-cse-id">CSE ID: </label>
|
||||
<input type="text" name="cse_id" id="config-cse-id"
|
||||
value="{{ config.cse_id }}"
|
||||
placeholder="abc123..."
|
||||
autocomplete="off">
|
||||
</div>
|
||||
<div class="config-div config-div-root-url">
|
||||
<label for="config-url">{{ translation['config-url'] }}: </label>
|
||||
<input type="text" name="url" id="config-url" value="{{ config.url }}">
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import json
|
||||
import requests
|
||||
import httpx
|
||||
import urllib.parse as urlparse
|
||||
import os
|
||||
import glob
|
||||
|
|
@ -43,7 +43,8 @@ def load_all_bangs(ddg_bangs_file: str, ddg_bangs: dict = {}):
|
|||
|
||||
for i, bang_file in enumerate(bang_files):
|
||||
try:
|
||||
bangs |= json.load(open(bang_file))
|
||||
with open(bang_file, 'r', encoding='utf-8') as f:
|
||||
bangs |= json.load(f)
|
||||
except json.decoder.JSONDecodeError:
|
||||
# Ignore decoding error only for the ddg bangs file, since this can
|
||||
# occur if file is still being written
|
||||
|
|
@ -63,12 +64,9 @@ def gen_bangs_json(bangs_file: str) -> None:
|
|||
None
|
||||
|
||||
"""
|
||||
try:
|
||||
# Request full list from DDG
|
||||
r = requests.get(DDG_BANGS)
|
||||
r.raise_for_status()
|
||||
except requests.exceptions.HTTPError as err:
|
||||
raise SystemExit(err)
|
||||
# Request full list from DDG
|
||||
r = httpx.get(DDG_BANGS)
|
||||
r.raise_for_status()
|
||||
|
||||
# Convert to json
|
||||
data = json.loads(r.text)
|
||||
|
|
@ -83,7 +81,8 @@ def gen_bangs_json(bangs_file: str) -> None:
|
|||
'suggestion': bang_command + ' (' + row['s'] + ')'
|
||||
}
|
||||
|
||||
json.dump(bangs_data, open(bangs_file, 'w'))
|
||||
with open(bangs_file, 'w', encoding='utf-8') as f:
|
||||
json.dump(bangs_data, f)
|
||||
print('* Finished creating ddg bangs json')
|
||||
load_all_bangs(bangs_file, bangs_data)
|
||||
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ import io
|
|||
import os
|
||||
import re
|
||||
|
||||
from requests import exceptions, get
|
||||
import httpx
|
||||
from urllib.parse import urlparse
|
||||
from bs4 import BeautifulSoup as bsoup
|
||||
from cryptography.fernet import Fernet
|
||||
|
|
@ -36,7 +36,7 @@ def fetch_favicon(url: str) -> bytes:
|
|||
bytes - the favicon bytes, or a placeholder image if one
|
||||
was not returned
|
||||
"""
|
||||
response = get(f'{ddg_favicon_site}/{urlparse(url).netloc}.ico')
|
||||
response = httpx.get(f'{ddg_favicon_site}/{urlparse(url).netloc}.ico')
|
||||
|
||||
if response.status_code == 200 and len(response.content) > 0:
|
||||
tmp_mem = io.BytesIO()
|
||||
|
|
@ -48,7 +48,8 @@ def fetch_favicon(url: str) -> bytes:
|
|||
|
||||
|
||||
def gen_file_hash(path: str, static_file: str) -> str:
|
||||
file_contents = open(os.path.join(path, static_file), 'rb').read()
|
||||
with open(os.path.join(path, static_file), 'rb') as f:
|
||||
file_contents = f.read()
|
||||
file_hash = hashlib.md5(file_contents).hexdigest()[:8]
|
||||
filename_split = os.path.splitext(static_file)
|
||||
|
||||
|
|
@ -97,8 +98,8 @@ def get_proxy_host_url(r: Request, default: str, root=False) -> str:
|
|||
def check_for_update(version_url: str, current: str) -> int:
|
||||
# Check for the latest version of Whoogle
|
||||
has_update = ''
|
||||
with contextlib.suppress(exceptions.ConnectionError, AttributeError):
|
||||
update = bsoup(get(version_url).text, 'html.parser')
|
||||
with contextlib.suppress(httpx.RequestError, AttributeError):
|
||||
update = bsoup(httpx.get(version_url).text, 'html.parser')
|
||||
latest = update.select_one('[class="Link--primary"]').string[1:]
|
||||
current = int(''.join(filter(str.isdigit, current)))
|
||||
latest = int(''.join(filter(str.isdigit, latest)))
|
||||
|
|
|
|||
|
|
@ -1,7 +1,8 @@
|
|||
from app.models.config import Config
|
||||
from app.models.endpoint import Endpoint
|
||||
from app.utils.misc import list_to_dict
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
from bs4 import BeautifulSoup, NavigableString, MarkupResemblesLocatorWarning
|
||||
import warnings
|
||||
import copy
|
||||
from flask import current_app
|
||||
import html
|
||||
|
|
@ -9,7 +10,7 @@ import os
|
|||
import urllib.parse as urlparse
|
||||
from urllib.parse import parse_qs
|
||||
import re
|
||||
import warnings
|
||||
warnings.filterwarnings('ignore', category=MarkupResemblesLocatorWarning)
|
||||
|
||||
SKIP_ARGS = ['ref_src', 'utm']
|
||||
SKIP_PREFIX = ['//www.', '//mobile.', '//m.']
|
||||
|
|
@ -114,7 +115,7 @@ def bold_search_terms(response: str, query: str) -> BeautifulSoup:
|
|||
for word in re.split(r'\s+(?=[^"]*(?:"[^"]*"[^"]*)*$)', query):
|
||||
word = re.sub(r'[@_!#$%^&*()<>?/\|}{~:]+', '', word)
|
||||
target = response.find_all(
|
||||
text=re.compile(r'' + re.escape(word), re.I))
|
||||
string=re.compile(r'' + re.escape(word), re.I))
|
||||
for nav_str in target:
|
||||
replace_any_case(nav_str, word)
|
||||
|
||||
|
|
@ -136,7 +137,7 @@ def has_ad_content(element: str) -> bool:
|
|||
or 'ⓘ' in element)
|
||||
|
||||
|
||||
def get_first_link(soup: BeautifulSoup) -> str:
|
||||
def get_first_link(soup) -> str:
|
||||
"""Retrieves the first result link from the query response
|
||||
|
||||
Args:
|
||||
|
|
@ -147,24 +148,18 @@ def get_first_link(soup: BeautifulSoup) -> str:
|
|||
|
||||
"""
|
||||
first_link = ''
|
||||
orig_details = []
|
||||
|
||||
# Temporarily remove details so we don't grab those links
|
||||
for details in soup.find_all('details'):
|
||||
temp_details = soup.new_tag('removed_details')
|
||||
orig_details.append(details.replace_with(temp_details))
|
||||
|
||||
# Replace hrefs with only the intended destination (no "utm" type tags)
|
||||
# Find the first valid search result link, excluding details elements
|
||||
for a in soup.find_all('a', href=True):
|
||||
# Skip links that are inside details elements (collapsible sections)
|
||||
if a.find_parent('details'):
|
||||
continue
|
||||
|
||||
# Return the first search result URL
|
||||
if a['href'].startswith('http://') or a['href'].startswith('https://'):
|
||||
first_link = a['href']
|
||||
break
|
||||
|
||||
# Add the details back
|
||||
for orig_detail, details in zip(orig_details, soup.find_all('removed_details')):
|
||||
details.replace_with(orig_detail)
|
||||
|
||||
return first_link
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ from app.filter import Filter
|
|||
from app.request import gen_query
|
||||
from app.utils.misc import get_proxy_host_url
|
||||
from app.utils.results import get_first_link
|
||||
from app.services.cse_client import CSEClient, cse_results_to_html
|
||||
from bs4 import BeautifulSoup as bsoup
|
||||
from cryptography.fernet import Fernet, InvalidToken
|
||||
from flask import g
|
||||
|
|
@ -55,7 +56,7 @@ class Search:
|
|||
config: the current user config settings
|
||||
session_key: the flask user fernet key
|
||||
"""
|
||||
def __init__(self, request, config, session_key, cookies_disabled=False):
|
||||
def __init__(self, request, config, session_key, cookies_disabled=False, user_request=None):
|
||||
method = request.method
|
||||
self.request = request
|
||||
self.request_params = request.args if method == 'GET' else request.form
|
||||
|
|
@ -66,6 +67,7 @@ class Search:
|
|||
self.query = ''
|
||||
self.widget = ''
|
||||
self.cookies_disabled = cookies_disabled
|
||||
self.user_request = user_request
|
||||
self.search_type = self.request_params.get(
|
||||
'tbm') if 'tbm' in self.request_params else ''
|
||||
|
||||
|
|
@ -103,7 +105,7 @@ class Search:
|
|||
pass
|
||||
|
||||
# Strip '!' for "feeling lucky" queries
|
||||
if match := re.search("(^|\s)!($|\s)", q):
|
||||
if match := re.search(r"(^|\s)!($|\s)", q):
|
||||
self.feeling_lucky = True
|
||||
start, end = match.span()
|
||||
self.query = " ".join([seg for seg in [q[:start], q[end:]] if seg])
|
||||
|
|
@ -139,7 +141,91 @@ class Search:
|
|||
root_url=root_url,
|
||||
mobile=mobile,
|
||||
config=self.config,
|
||||
query=self.query)
|
||||
query=self.query,
|
||||
page_url=self.request.url)
|
||||
|
||||
# Check if CSE (Custom Search Engine) should be used
|
||||
use_cse = (
|
||||
self.config.use_cse and
|
||||
self.config.cse_api_key and
|
||||
self.config.cse_id
|
||||
)
|
||||
|
||||
if use_cse:
|
||||
# Use Google Custom Search API
|
||||
return self._generate_cse_response(content_filter, root_url, mobile)
|
||||
|
||||
# Default: Use traditional scraping method
|
||||
return self._generate_scrape_response(content_filter, root_url, mobile)
|
||||
|
||||
def _generate_cse_response(self, content_filter: Filter, root_url: str, mobile: bool) -> str:
    """Generate response using Google Custom Search API

    Args:
        content_filter: Filter instance for processing results
        root_url: Root URL of the instance (not used by this method)
        mobile: Whether this is a mobile request (not used by this method)

    Returns:
        str: HTML response string, or the link of the first result when
            this is a "feeling lucky" query that produced results
    """
    # The pagination index comes straight from the request. Fall back to
    # the first page when it is not a valid integer instead of letting
    # the conversion error escape.
    try:
        start = int(self.request_params.get('start', 1))
    except (TypeError, ValueError):
        start = 1

    # Determine safe search setting
    safe = 'high' if self.config.safe else 'off'

    # Determine search type (web or image)
    # tbm=isch or udm=2 indicates image search
    search_type = ''
    if self.search_type == 'isch' or self.request_params.get('udm') == '2':
        search_type = 'image'

    # Create CSE client and perform search
    with CSEClient(
        api_key=self.config.cse_api_key,
        cse_id=self.config.cse_id
    ) as client:
        response = client.search(
            query=self.query,
            start=start,
            safe=safe,
            language=self.config.lang_search,
            country=self.config.country,
            search_type=search_type
        )

    # Convert CSE response to HTML
    html_content = cse_results_to_html(response, self.query)

    # Store full query for tabs
    self.full_query = self.query

    # Handle feeling lucky before parsing, since the parsed HTML is not
    # needed when we return the first result's link directly
    if self.feeling_lucky:
        if response.has_results and response.results:
            return response.results[0].link
        # No result to jump to: fall back to showing the results page
        self.feeling_lucky = False

    # Parse and filter the HTML
    html_soup = bsoup(html_content, 'html.parser')

    # Apply content filter (encrypts links, applies CSS, etc.)
    formatted_results = content_filter.clean(html_soup)

    return str(formatted_results)
|
||||
|
||||
def _generate_scrape_response(self, content_filter: Filter, root_url: str, mobile: bool) -> str:
|
||||
"""Generate response using traditional HTML scraping
|
||||
|
||||
Args:
|
||||
content_filter: Filter instance for processing results
|
||||
root_url: Root URL of the instance
|
||||
mobile: Whether this is a mobile request
|
||||
|
||||
Returns:
|
||||
str: HTML response string
|
||||
"""
|
||||
full_query = gen_query(self.query,
|
||||
self.request_params,
|
||||
self.config)
|
||||
|
|
@ -147,24 +233,31 @@ class Search:
|
|||
|
||||
# force mobile search when view image is true and
|
||||
# the request is not already made by a mobile
|
||||
view_image = ('tbm=isch' in full_query
|
||||
and self.config.view_image
|
||||
and not g.user_request.mobile)
|
||||
is_image_query = ('tbm=isch' in full_query) or ('udm=2' in full_query)
|
||||
# Always parse image results when hitting the images endpoint (udm=2)
|
||||
# to avoid Google returning only text/AI blocks.
|
||||
view_image = is_image_query
|
||||
|
||||
get_body = g.user_request.send(query=full_query,
|
||||
force_mobile=view_image,
|
||||
user_agent=self.user_agent)
|
||||
client = self.user_request or g.user_request
|
||||
get_body = client.send(query=full_query,
|
||||
force_mobile=self.config.view_image,
|
||||
user_agent=self.user_agent)
|
||||
|
||||
# Produce cleanable html soup from response
|
||||
get_body_safed = get_body.text.replace("<","andlt;").replace(">","andgt;")
|
||||
html_soup = bsoup(get_body_safed, 'html.parser')
|
||||
|
||||
# Ensure we extract only the content within <html> if it exists
|
||||
# This prevents doctype declarations from appearing in the output
|
||||
if html_soup.html:
|
||||
html_soup = html_soup.html
|
||||
|
||||
# Replace current soup if view_image is active
|
||||
if view_image:
|
||||
html_soup = content_filter.view_image(html_soup)
|
||||
|
||||
# Indicate whether or not a Tor connection is active
|
||||
if g.user_request.tor_valid:
|
||||
if (self.user_request or g.user_request).tor_valid:
|
||||
html_soup.insert(0, bsoup(TOR_BANNER, 'html.parser'))
|
||||
|
||||
formatted_results = content_filter.clean(html_soup)
|
||||
|
|
@ -188,4 +281,3 @@ class Search:
|
|||
link['href'] += param_str
|
||||
|
||||
return str(formatted_results)
|
||||
|
||||
|
|
|
|||
336
app/utils/ua_generator.py
Normal file
336
app/utils/ua_generator.py
Normal file
|
|
@ -0,0 +1,336 @@
|
|||
"""
|
||||
User Agent Generator for Opera-based UA strings.
|
||||
|
||||
This module generates realistic Opera User Agent strings based on patterns
|
||||
found in working UA strings that successfully bypass Google's restrictions.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
from typing import List, Dict
|
||||
|
||||
|
||||
# Default fallback UA if generation fails
|
||||
DEFAULT_FALLBACK_UA = "Opera/9.80 (iPad; Opera Mini/5.0.17381/503; U; eu) Presto/2.6.35 Version/11.10)"
|
||||
|
||||
# Opera UA Pattern Templates
|
||||
OPERA_PATTERNS = [
|
||||
# Opera Mini (J2ME/MIDP)
|
||||
"Opera/9.80 (J2ME/MIDP; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}",
|
||||
|
||||
# Opera Mobile (Android)
|
||||
"Opera/9.80 (Android; Linux; Opera Mobi/{build}; U; {lang}) Presto/{presto} Version/{final}",
|
||||
|
||||
# Opera Mobile (iPhone)
|
||||
"Opera/9.80 (iPhone; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}",
|
||||
|
||||
# Opera Mobile (iPad)
|
||||
"Opera/9.80 (iPad; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}",
|
||||
]
|
||||
|
||||
# Randomization pools based on working UAs
|
||||
OPERA_MINI_VERSIONS = [
|
||||
"4.0", "4.1.11321", "4.1.12965", "4.1.13573", "4.1.13907", "4.1.14287",
|
||||
"4.1.15082", "4.2.13057", "4.2.13221", "4.2.13265", "4.2.13337",
|
||||
"4.2.13400", "4.2.13918", "4.2.13943", "4.2.14320", "4.2.14409",
|
||||
"4.2.14753", "4.2.14881", "4.2.14885", "4.2.14912", "4.2.15066",
|
||||
"4.2.15410", "4.2.16007", "4.2.16320", "4.2.18887", "4.2.19634",
|
||||
"4.2.21465", "4.2.22228", "4.2.23453", "4.2.24721", "4.3.13337",
|
||||
"4.3.24214", "4.4.26736", "4.4.29476", "4.5.33867", "4.5.40312",
|
||||
"5.0.15650", "5.0.16823", "5.0.17381", "5.0.17443", "5.0.18635",
|
||||
"5.0.18741", "5.0.19683", "5.0.19693", "5.0.20873", "5.0.22349",
|
||||
"5.1.21051", "5.1.21126", "5.1.21214", "5.1.21415", "5.1.21594",
|
||||
"5.1.21595", "5.1.22296", "5.1.22303", "5.1.22396", "5.1.22460",
|
||||
"5.1.22783", "5.1.22784", "6.0.24095", "6.0.24212", "6.0.24455",
|
||||
"6.1.25375", "6.1.25378", "6.1.25759", "6.24093", "6.24096",
|
||||
"6.24209", "6.24288", "6.5.26955", "6.5.29702", "7.0.29952",
|
||||
"7.1.32052", "7.1.32444", "7.1.32694", "7.29530", "7.5.33361",
|
||||
"7.6.35766", "9.80", "36.2.2254"
|
||||
]
|
||||
|
||||
OPERA_MOBI_BUILDS = [
|
||||
"27", "49", "447", "498", "1181", "1209", "3730",
|
||||
"ADR-1011151731", "ADR-1012211514", "ADR-1012221546", "ADR-1012272315",
|
||||
"SYB-1103211396", "SYB-1104061449", "SYB-1107071606",
|
||||
"ADR-1111101157"
|
||||
]
|
||||
|
||||
BUILD_NUMBERS = [
|
||||
"18.678", "18.684", "18.738", "18.794", "19.892", "19.916",
|
||||
"20.2477", "20.2479", "20.2485", "20.2489", "21.529", "22.387",
|
||||
"22.394", "22.401", "22.414", "22.453", "22.478", "23.317",
|
||||
"23.333", "23.334", "23.377", "23.390", "24.741", "24.743",
|
||||
"24.746", "24.783", "24.838", "24.871", "24.899", "25.657",
|
||||
"25.677", "25.729", "25.872", "26.1305", "27.1366", "27.1407",
|
||||
"27.1573", "28.2075", "28.2555", "28.2647", "28.2766", "29.3594",
|
||||
"30.3316", "31.1350", "35.2883", "35.5706", "37.6584", "119.132",
|
||||
"170.51", "170.54", "764", "870", "886", "490", "503"
|
||||
]
|
||||
|
||||
PRESTO_VERSIONS = [
|
||||
"2.2.0", "2.4.15", "2.4.154.15", "2.4.18", "2.5.25", "2.5.28",
|
||||
"2.6.35", "2.7.60", "2.7.81", "2.8.119", "2.8.149", "2.8.191",
|
||||
"2.9.201", "2.12.423"
|
||||
]
|
||||
|
||||
FINAL_VERSIONS = [
|
||||
"10.00", "10.1", "10.5", "10.54", "10.5454", "11.00", "11.10",
|
||||
"12.02", "12.16", "13.00"
|
||||
]
|
||||
|
||||
LANGUAGES = [
|
||||
# English variants
|
||||
"en", "en-US", "en-GB", "en-CA", "en-AU", "en-NZ", "en-ZA", "en-IN", "en-SG",
|
||||
# Western European
|
||||
"de", "de-DE", "de-AT", "de-CH",
|
||||
"fr", "fr-FR", "fr-CA", "fr-BE", "fr-CH", "fr-LU",
|
||||
"es", "es-ES", "es-MX", "es-AR", "es-CO", "es-CL", "es-PE", "es-VE", "es-LA",
|
||||
"it", "it-IT", "it-CH",
|
||||
"pt", "pt-PT", "pt-BR",
|
||||
"nl", "nl-NL", "nl-BE",
|
||||
# Nordic languages
|
||||
"da", "da-DK",
|
||||
"sv", "sv-SE",
|
||||
"no", "no-NO", "nb", "nn",
|
||||
"fi", "fi-FI",
|
||||
"is", "is-IS",
|
||||
# Eastern European
|
||||
"pl", "pl-PL",
|
||||
"cs", "cs-CZ",
|
||||
"sk", "sk-SK",
|
||||
"hu", "hu-HU",
|
||||
"ro", "ro-RO",
|
||||
"bg", "bg-BG",
|
||||
"hr", "hr-HR",
|
||||
"sr", "sr-RS",
|
||||
"sl", "sl-SI",
|
||||
"uk", "uk-UA",
|
||||
"ru", "ru-RU",
|
||||
# Asian languages
|
||||
"zh", "zh-CN", "zh-TW", "zh-HK",
|
||||
"ja", "ja-JP",
|
||||
"ko", "ko-KR",
|
||||
"th", "th-TH",
|
||||
"vi", "vi-VN",
|
||||
"id", "id-ID",
|
||||
"ms", "ms-MY",
|
||||
"fil", "tl",
|
||||
# Middle Eastern
|
||||
"tr", "tr-TR",
|
||||
"ar", "ar-SA", "ar-AE", "ar-EG",
|
||||
"he", "he-IL",
|
||||
"fa", "fa-IR",
|
||||
# Other
|
||||
"hi", "hi-IN",
|
||||
"bn", "bn-IN",
|
||||
"ta", "ta-IN",
|
||||
"te", "te-IN",
|
||||
"mr", "mr-IN",
|
||||
"el", "el-GR",
|
||||
"ca", "ca-ES",
|
||||
"eu", "eu-ES"
|
||||
]
|
||||
|
||||
|
||||
|
||||
def generate_opera_ua() -> str:
    """
    Build one random Opera User Agent string.

    A template is picked from OPERA_PATTERNS and each placeholder it
    contains is filled with a random entry from the matching pool.

    Returns:
        str: The formatted UA string
    """
    template = random.choice(OPERA_PATTERNS)

    # The language placeholder is always filled.
    values = {'lang': random.choice(LANGUAGES)}

    # "Opera Mobi" templates draw their build from the Mobi pool; all other
    # templates use the regular build numbers.
    build_pool = OPERA_MOBI_BUILDS if "Opera Mobi" in template else BUILD_NUMBERS

    # Optional placeholders, listed in the order their values are drawn.
    optional_fields = (
        ('version', OPERA_MINI_VERSIONS),
        ('build', build_pool),
        ('presto', PRESTO_VERSIONS),
        ('final', FINAL_VERSIONS),
    )
    for field, pool in optional_fields:
        if '{' + field + '}' in template:
            values[field] = random.choice(pool)

    return template.format(**values)
|
||||
|
||||
|
||||
def generate_ua_pool(count: int = 10) -> List[str]:
    """
    Produce a list of `count` Opera User Agent strings.

    Generated strings are de-duplicated; if fewer than `count` distinct
    strings are obtained, the list is padded with DEFAULT_FALLBACK_UA.

    Args:
        count: Number of UA strings wanted (default: 10)

    Returns:
        List[str]: The UA strings. If generation raises before anything
        was produced, a single-element list with DEFAULT_FALLBACK_UA.
    """
    unique_uas = set()

    # Bounded attempt budget so duplicate draws cannot loop forever.
    remaining_attempts = count * 100

    try:
        while remaining_attempts > 0 and len(unique_uas) < count:
            unique_uas.add(generate_opera_ua())
            remaining_attempts -= 1
    except Exception:
        # Nothing generated at all: hand back just the default fallback.
        if not unique_uas:
            return [DEFAULT_FALLBACK_UA]

    # Pad with the default UA when not enough unique strings were produced.
    pool = list(unique_uas)
    while len(pool) < count:
        pool.append(DEFAULT_FALLBACK_UA)

    return pool
|
||||
|
||||
|
||||
def save_ua_pool(uas: List[str], cache_path: str) -> None:
    """
    Write the UA pool to a JSON cache file.

    The file holds the generation timestamp (ISO format, local time) under
    'generated_at' and the UA list under 'user_agents'.

    Args:
        uas: UA strings to store
        cache_path: Destination path of the cache file
    """
    payload = {'generated_at': datetime.now().isoformat()}
    payload['user_agents'] = uas

    # Create the parent directory when the path has one and it is missing.
    parent_dir = os.path.dirname(cache_path)
    if parent_dir and not os.path.exists(parent_dir):
        os.makedirs(parent_dir, exist_ok=True)

    with open(cache_path, 'w', encoding='utf-8') as cache_file:
        json.dump(payload, cache_file, indent=2)
|
||||
|
||||
|
||||
def load_custom_ua_list(file_path: str) -> List[str]:
    """
    Load custom UA list from a text file.

    Blank lines are ignored and surrounding whitespace is stripped from
    every entry.

    Args:
        file_path: Path to text file containing UA strings (one per line)

    Returns:
        List[str]: List of UA strings, or empty list if the file cannot be
        read (missing, not a regular file, no permission, other I/O error),
        is not valid UTF-8, or contains no entries
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            return [ua for ua in stripped if ua]
    except (OSError, UnicodeDecodeError):
        # OSError covers FileNotFoundError and PermissionError as well as
        # IsADirectoryError and other I/O failures, so a misconfigured path
        # yields an empty list instead of an unhandled exception.
        return []
|
||||
|
||||
|
||||
def load_ua_pool(cache_path: str, count: int = 10) -> List[str]:
    """
    Load UA pool from custom list file, cache, or generate new one.

    Priority order:
    1. Custom UA list file (if WHOOGLE_UA_LIST_FILE is set)
    2. Cached auto-generated UAs
    3. Newly generated UAs

    Environment variables read:
    - WHOOGLE_UA_LIST_FILE: path of a custom UA list (one UA per line)
    - WHOOGLE_UA_CACHE_PERSISTENT: the cache is read only when this is '1'
      (the default); a freshly generated pool is written to the cache
      either way
    - WHOOGLE_UA_CACHE_REFRESH_DAYS: cache age in days after which a new
      pool is generated; 0 (the default) or a non-integer value disables
      expiry

    Args:
        cache_path: Path to cache file
        count: Number of UAs to generate if cache is invalid (default: 10)

    Returns:
        List[str]: List of UA strings
    """
    # Check for custom UA list file first (highest priority)
    custom_ua_file = os.environ.get('WHOOGLE_UA_LIST_FILE', '').strip()
    if custom_ua_file:
        custom_uas = load_custom_ua_list(custom_ua_file)
        if custom_uas:
            return custom_uas
        # Custom file specified but invalid, warn and fall back
        print(f"Warning: Custom UA list file '{custom_ua_file}' not found or invalid, falling back to auto-generated UAs")

    use_cache = os.environ.get('WHOOGLE_UA_CACHE_PERSISTENT', '1') == '1'

    # A malformed refresh setting must not abort loading; treat it as
    # "never expire", which is also the default.
    raw_refresh_days = os.environ.get('WHOOGLE_UA_CACHE_REFRESH_DAYS', '0')
    try:
        refresh_days = int(raw_refresh_days)
    except ValueError:
        print(f"Warning: Invalid WHOOGLE_UA_CACHE_REFRESH_DAYS value '{raw_refresh_days}', cache expiry disabled")
        refresh_days = 0

    if use_cache and os.path.exists(cache_path):
        try:
            with open(cache_path, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)

            # Check if cache is expired (only when refresh_days > 0)
            expired = False
            if refresh_days > 0:
                generated_at = datetime.fromisoformat(cache_data['generated_at'])
                expired = (datetime.now() - generated_at).days >= refresh_days

            if not expired:
                cached_uas = cache_data['user_agents']
                # Only trust a non-empty list; anything else is regenerated.
                if isinstance(cached_uas, list) and cached_uas:
                    return cached_uas
        except (OSError, json.JSONDecodeError, KeyError, TypeError, ValueError):
            # Cache file is unreadable or corrupted, generate new
            pass

    # Cache disabled, expired, missing or invalid: generate a new pool
    uas = generate_ua_pool(count)
    try:
        save_ua_pool(uas, cache_path)
    except OSError as e:
        # The cache is only an optimisation; an unwritable location must
        # not prevent the freshly generated pool from being used.
        print(f"Warning: Could not write UA cache '{cache_path}': {e}")
    return uas
|
||||
|
||||
|
||||
def get_random_ua(ua_pool: List[str]) -> str:
    """
    Pick a random UA string from the given pool.

    Args:
        ua_pool: List of UA strings

    Returns:
        str: A random entry of the pool. For an empty pool, a freshly
        generated UA, or DEFAULT_FALLBACK_UA if generation raises.
    """
    if ua_pool:
        return random.choice(ua_pool)

    # Empty pool: generate a single UA on the fly.
    try:
        fresh_ua = generate_opera_ua()
    except Exception:
        # Generation failed, use the default fallback.
        return DEFAULT_FALLBACK_UA
    return fresh_ua
|
||||
|
||||
|
|
@ -4,4 +4,5 @@ optional_dev_tag = ''
|
|||
if os.getenv('DEV_BUILD'):
|
||||
optional_dev_tag = '.dev' + os.getenv('DEV_BUILD')
|
||||
|
||||
__version__ = '0.9.2' + optional_dev_tag
|
||||
__version__ = '1.2.2' + optional_dev_tag
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ name: whoogle
|
|||
description: A self hosted search engine on Kubernetes
|
||||
type: application
|
||||
version: 0.1.0
|
||||
appVersion: 0.9.2
|
||||
appVersion: 0.9.4
|
||||
|
||||
icon: https://github.com/benbusby/whoogle-search/raw/main/app/static/img/favicon/favicon-96x96.png
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
{{- if .Values.autoscaling.enabled }}
|
||||
{{- if semverCompare ">=1.23-0" .Capabilities.KubeVersion.GitVersion -}}
|
||||
apiVersion: autoscaling/v2
|
||||
{{- else -}}
|
||||
apiVersion: autoscaling/v2beta1
|
||||
{{- end }}
|
||||
kind: HorizontalPodAutoscaler
|
||||
metadata:
|
||||
name: {{ include "whoogle.fullname" . }}
|
||||
|
|
@ -17,12 +21,24 @@ spec:
|
|||
- type: Resource
|
||||
resource:
|
||||
name: cpu
|
||||
{{- if semverCompare ">=1.23-0" .Capabilities.KubeVersion.GitVersion }}
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
|
||||
{{- else -}}
|
||||
targetAverageUtilization: {{ .Values.autoscaling.targetCPUUtilizationPercentage }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.autoscaling.targetMemoryUtilizationPercentage }}
|
||||
- type: Resource
|
||||
resource:
|
||||
name: memory
|
||||
{{- if semverCompare ">=1.23-0" .Capabilities.KubeVersion.GitVersion }}
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
|
||||
{{- else -}}
|
||||
targetAverageUtilization: {{ .Values.autoscaling.targetMemoryUtilizationPercentage }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
# can't use mem_limit in a 3.x docker-compose file in non swarm mode
|
||||
# see https://github.com/docker/compose/issues/4513
|
||||
version: "2.4"
|
||||
# Modern docker-compose format (v2+) does not require version specification
|
||||
# Memory limits are supported in Compose v2+ without version field
|
||||
|
||||
services:
|
||||
whoogle-search:
|
||||
|
|
|
|||
363
misc/check_google_user_agents.py
Executable file
363
misc/check_google_user_agents.py
Executable file
|
|
@ -0,0 +1,363 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Test User Agent strings against Google to find which ones return actual search results
|
||||
instead of JavaScript pages or upgrade browser messages.
|
||||
|
||||
Usage:
|
||||
python check_google_user_agents.py <user_agent_file> [--output <output_file>] [--query <search_query>]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
from typing import List, Tuple
|
||||
import requests
|
||||
|
||||
# Common search queries to cycle through for more realistic testing
|
||||
DEFAULT_SEARCH_QUERIES = [
|
||||
"python programming",
|
||||
"weather today",
|
||||
"news",
|
||||
"how to cook pasta",
|
||||
"best movies 2025",
|
||||
"restaurants near me",
|
||||
"translate hello",
|
||||
"calculator",
|
||||
"time",
|
||||
"maps",
|
||||
"images",
|
||||
"videos",
|
||||
"shopping",
|
||||
"travel",
|
||||
"sports scores",
|
||||
"stock market",
|
||||
"recipes",
|
||||
"music",
|
||||
"books",
|
||||
"technology",
|
||||
"AI",
|
||||
"AI programming",
|
||||
"Why does google hate users?"
|
||||
]
|
||||
|
||||
# Markers that indicate blocked/JS pages
|
||||
BLOCK_MARKERS = [
|
||||
"unusual traffic",
|
||||
"sorry but your computer",
|
||||
"solve the captcha",
|
||||
"request looks automated",
|
||||
"g-recaptcha",
|
||||
"upgrade your browser",
|
||||
"browser is not supported",
|
||||
"please upgrade",
|
||||
"isn't supported",
|
||||
"isn\"t supported", # With escaped quote
|
||||
"upgrade to a recent version",
|
||||
"update your browser",
|
||||
"your browser isn't supported",
|
||||
]
|
||||
|
||||
# Markers that indicate actual search results
|
||||
SUCCESS_MARKERS = [
|
||||
'<div class="g"', # Google search result container
|
||||
'<div id="search"', # Search results container
|
||||
'<div class="rc"', # Result container
|
||||
'class="yuRUbf"', # Result link container
|
||||
'class="LC20lb"', # Result title
|
||||
'- Google Search</title>', # Page title indicator
|
||||
'id="rso"', # Results container
|
||||
'class="g"', # Result class (without div tag)
|
||||
]
|
||||
|
||||
|
||||
def read_user_agents(file_path: str) -> List[str]:
    """Return the non-blank, whitespace-stripped lines of a UA file.

    Prints an error to stderr and exits with status 1 if the file is
    missing or cannot be read.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            stripped_lines = [raw.strip() for raw in handle]
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.", file=sys.stderr)
        sys.exit(1)
    except Exception as exc:
        print(f"Error reading file: {exc}", file=sys.stderr)
        sys.exit(1)

    # Drop lines that were empty or whitespace-only.
    return [entry for entry in stripped_lines if entry]
|
||||
|
||||
|
||||
def test_user_agent(user_agent: str, query: str = "test", timeout: float = 10.0) -> Tuple[bool, str]:
    """
    Test a user agent against Google search.

    Sends one GET request to https://www.google.com/search with the given
    User-Agent and classifies the response using BLOCK_MARKERS and
    SUCCESS_MARKERS.

    Args:
        user_agent: User-Agent header value to test
        query: Search query sent as the "q" parameter
        timeout: Request timeout in seconds

    Returns:
        Tuple of (is_working: bool, reason: str)

    Raises:
        Exception: When the response indicates rate limiting (HTTP 429).
            The message contains "Rate limited (429)" so the caller can
            recognise it.
    """
    url = "https://www.google.com/search"
    params = {"q": query, "gbv": "1", "num": "10"}

    headers = {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)

        # Check HTTP status
        if response.status_code == 429:
            # Rate limited - raise this so the caller can handle it specially
            raise Exception("Rate limited (429)")
        if response.status_code >= 500:
            return False, f"Server error ({response.status_code})"
        if response.status_code == 403:
            return False, f"Blocked ({response.status_code})"
        if response.status_code >= 400:
            return False, f"HTTP {response.status_code}"

        body_lower = response.text.lower()

        # Check for block markers (case-insensitive)
        for marker in BLOCK_MARKERS:
            if marker.lower() in body_lower:
                return False, f"Blocked: {marker}"

        # A script redirect on a page that does not mention google.com is
        # treated as a non-working response.
        has_redirect = ("window.location" in body_lower or "location.href" in body_lower) and "google.com" not in body_lower
        if has_redirect:
            return False, "JavaScript redirect detected"

        # Check for noscript redirect (another indicator of JS-only page)
        if 'noscript' in body_lower and 'http-equiv="refresh"' in body_lower:
            return False, "NoScript redirect page"

        # Success markers are matched case-sensitively against the raw body.
        if any(marker in response.text for marker in SUCCESS_MARKERS):
            return True, "OK - Has search results"

        # Very short responses are likely error pages
        if len(response.text) < 1000:
            return False, "Response too short (likely error page)"

        # Without success markers the response is not considered working,
        # even if it is substantial and has no block markers.
        return False, "No search results found"

    except requests.Timeout:
        return False, "Request timeout"
    except requests.RequestException as e:
        # Compare against None explicitly: a requests Response is falsy for
        # 4xx/5xx status codes, so a plain truthiness test would never see
        # a 429 response attached to the exception.
        error_response = getattr(e, 'response', None)
        if error_response is not None and error_response.status_code == 429:
            raise Exception(f"Rate limited (429) - {str(e)}") from e
        if isinstance(e, requests.HTTPError):
            return False, f"HTTP error: {str(e)}"
        return False, f"Request error: {str(e)}"
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: test every UA in a file against Google.

    Working user agents are appended to the --output file as they are
    found (or printed to stdout when no output file is given). Progress,
    warnings and the final summary go to stderr. UAs already present in
    the output file are skipped.

    Returns:
        int: 0 if at least one working UA was found and the run was not
        stopped by rate limiting, otherwise 1.
    """
    parser = argparse.ArgumentParser(
        description="Test User Agent strings against Google to find working ones.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        # %(prog)s is expanded by argparse, so the examples always show the
        # real script name.
        epilog="""
Examples:
  python %(prog)s UAs.txt
  python %(prog)s UAs.txt --output working_uas.txt
  python %(prog)s UAs.txt --query "python programming"
        """
    )
    parser.add_argument(
        "user_agent_file",
        help="Path to file containing user agent strings (one per line)"
    )
    parser.add_argument(
        "--output", "-o",
        help="Output file to write working user agents (default: stdout)"
    )
    parser.add_argument(
        "--query", "-q",
        default=None,
        help="Search query to use for testing (default: cycles through random queries)"
    )
    parser.add_argument(
        "--random-queries", "-r",
        action="store_true",
        help="Use random queries from a predefined list (default: True if --query not specified)"
    )
    parser.add_argument(
        "--timeout", "-t",
        type=float,
        default=10.0,
        help="Request timeout in seconds (default: 10.0)"
    )
    parser.add_argument(
        "--delay", "-d",
        type=float,
        default=0.5,
        help="Delay between requests in seconds (default: 0.5)"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Show detailed results for each user agent"
    )

    args = parser.parse_args()

    # Determine query strategy: either cycle through the shuffled default
    # queries or always use the single query given on the command line.
    use_random_queries = args.random_queries or (args.query is None)
    if use_random_queries:
        search_queries = DEFAULT_SEARCH_QUERIES.copy()
        random.shuffle(search_queries)  # Shuffle for variety
        query_display = f"cycling through {len(search_queries)} random queries"
    else:
        search_queries = [args.query]
        query_display = f"'{args.query}'"
    current_query_idx = 0

    # Read user agents
    user_agents = read_user_agents(args.user_agent_file)
    if not user_agents:
        print("No user agents found in file.", file=sys.stderr)
        sys.exit(1)
    total = len(user_agents)

    print(f"Testing {total} user agents against Google...", file=sys.stderr)
    print(f"Query: {query_display}", file=sys.stderr)
    if args.output:
        print(f"Output file: {args.output} (appending results incrementally)", file=sys.stderr)
    print(file=sys.stderr)

    # Load existing working user agents from output file to avoid duplicates
    existing_working = set()
    if args.output:
        try:
            with open(args.output, 'r', encoding='utf-8') as f:
                existing_working = {line.strip() for line in f if line.strip()}
            if existing_working:
                print(f"Found {len(existing_working)} existing user agents in output file", file=sys.stderr)
        except FileNotFoundError:
            # File doesn't exist yet, that's fine
            pass
        except Exception as e:
            print(f"Warning: Could not read existing output file: {e}", file=sys.stderr)

    # Open output file for incremental writing if specified (append mode)
    output_file = None
    if args.output:
        try:
            output_file = open(args.output, 'a', encoding='utf-8')
        except Exception as e:
            print(f"Error opening output file: {e}", file=sys.stderr)
            sys.exit(1)

    working_agents = []
    failed_count = 0
    skipped_count = 0
    last_successful_idx = 0
    rate_limited = False

    try:
        for idx, ua in enumerate(user_agents, 1):
            # Skip testing if this UA is already in the working file. UAs
            # written during this run are tracked too, so a repeated input
            # line is tested and written only once.
            if args.output and ua in existing_working:
                skipped_count += 1
                if args.verbose:
                    print(f"[{idx}/{total}] ⊘ SKIPPED - Already in working file", file=sys.stderr)
                last_successful_idx = idx
                continue

            try:
                # Get the next query (cycles when several are available)
                query = search_queries[current_query_idx % len(search_queries)]
                current_query_idx += 1

                is_working, reason = test_user_agent(ua, query, args.timeout)

                if is_working:
                    working_agents.append(ua)
                    status = "✓"
                    if output_file:
                        # Write immediately so results survive an abort
                        output_file.write(ua + '\n')
                        output_file.flush()
                        existing_working.add(ua)
                    else:
                        # No output file: working UAs go to stdout
                        print(ua)

                    if args.verbose:
                        print(f"[{idx}/{total}] {status} WORKING - {reason}", file=sys.stderr)
                else:
                    failed_count += 1
                    status = "✗"
                    if args.verbose:
                        print(f"[{idx}/{total}] {status} FAILED - {reason}", file=sys.stderr)

                last_successful_idx = idx

                # Progress indicator for non-verbose mode
                if not args.verbose and idx % 10 == 0:
                    print(f"Progress: {idx}/{total} tested ({len(working_agents)} working, {failed_count} failed)", file=sys.stderr)

                # Delay between requests to avoid rate limiting
                if idx < total:
                    time.sleep(args.delay)

            except KeyboardInterrupt:
                print(file=sys.stderr)
                print(f"\nInterrupted by user at index {idx}/{total}", file=sys.stderr)
                print(f"Last successful test: {last_successful_idx}/{total}", file=sys.stderr)
                break
            except Exception as e:
                # Handle unexpected errors (like network issues or rate limits)
                error_msg = str(e)
                if "429" in error_msg or "Rate limited" in error_msg:
                    print(file=sys.stderr)
                    print(f"\n⚠️ RATE LIMIT DETECTED at index {idx}/{total}", file=sys.stderr)
                    print(f"Last successful test: {last_successful_idx}/{total}", file=sys.stderr)
                    print(f"Working user agents found so far: {len(working_agents)}", file=sys.stderr)
                    if args.output:
                        print(f"Results saved to: {args.output}", file=sys.stderr)
                    print(f"\nTo resume later, you can skip the first {last_successful_idx} user agents.", file=sys.stderr)
                    # Stop testing but still print the summary below; the
                    # exit status stays non-zero to signal the abort.
                    rate_limited = True
                    break

                print(f"[{idx}/{total}] ERROR - {error_msg}", file=sys.stderr)
                failed_count += 1
                last_successful_idx = idx
                if idx < total:
                    time.sleep(args.delay)

    finally:
        # Close output file if opened
        if output_file:
            output_file.close()

    # Summary
    print(file=sys.stderr)
    print(f"Summary: {len(working_agents)} working, {failed_count} failed, {skipped_count} skipped out of {last_successful_idx} processed (of {total} total)", file=sys.stderr)
    if last_successful_idx < total:
        print(f"Note: Processing stopped at index {last_successful_idx}. {total - last_successful_idx} user agents not processed.", file=sys.stderr)
    if args.output:
        print(f"Results saved to: {args.output}", file=sys.stderr)

    return 0 if working_agents and not rate_limited else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
sys.exit(main())
|
||||
|
||||
198
misc/generate_uas.py
Executable file
198
misc/generate_uas.py
Executable file
|
|
@ -0,0 +1,198 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Standalone Opera User Agent String Generator
|
||||
|
||||
This tool generates Opera-based User Agent strings that can be used with Whoogle.
|
||||
It can be run independently to generate and display UA strings on demand.
|
||||
|
||||
Usage:
|
||||
python misc/generate_uas.py [count]
|
||||
|
||||
Arguments:
|
||||
count: Number of UA strings to generate (default: 10)
|
||||
|
||||
Examples:
|
||||
python misc/generate_uas.py # Generate 10 UAs
|
||||
python misc/generate_uas.py 20 # Generate 20 UAs
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Default fallback UA if generation fails
|
||||
DEFAULT_FALLBACK_UA = "Opera/9.30 (Nintendo Wii; U; ; 3642; en)"
|
||||
|
||||
# Try to import from the app module if available
|
||||
try:
|
||||
# Add parent directory to path to allow imports
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
||||
from app.utils.ua_generator import generate_ua_pool
|
||||
USE_APP_MODULE = True
|
||||
except ImportError:
|
||||
USE_APP_MODULE = False
|
||||
# Self-contained version if app module is not available
|
||||
import random
|
||||
|
||||
# Opera UA Pattern Templates
|
||||
OPERA_PATTERNS = [
|
||||
"Opera/9.80 (J2ME/MIDP; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}",
|
||||
"Opera/9.80 (Android; Linux; Opera Mobi/{build}; U; {lang}) Presto/{presto} Version/{final}",
|
||||
"Opera/9.80 (iPhone; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}",
|
||||
"Opera/9.80 (iPad; Opera Mini/{version}/{build}; U; {lang}) Presto/{presto} Version/{final}",
|
||||
]
|
||||
|
||||
OPERA_MINI_VERSIONS = [
|
||||
"4.0", "4.1.11321", "4.2.13337", "4.2.14912", "4.2.15410", "4.3.24214",
|
||||
"5.0.18741", "5.1.22296", "5.1.22783", "6.0.24095", "6.24093", "7.1.32444",
|
||||
"7.6.35766", "36.2.2254"
|
||||
]
|
||||
|
||||
OPERA_MOBI_BUILDS = [
|
||||
"27", "49", "447", "1209", "3730", "ADR-1012221546", "SYB-1107071606"
|
||||
]
|
||||
|
||||
BUILD_NUMBERS = [
|
||||
"22.387", "22.478", "23.334", "23.377", "24.746", "24.783", "25.657",
|
||||
"27.1407", "28.2647", "35.5706", "119.132", "870", "886"
|
||||
]
|
||||
|
||||
PRESTO_VERSIONS = [
|
||||
"2.4.15", "2.4.18", "2.5.25", "2.8.119", "2.12.423"
|
||||
]
|
||||
|
||||
FINAL_VERSIONS = [
|
||||
"10.00", "10.1", "10.54", "11.10", "12.16", "13.00"
|
||||
]
|
||||
|
||||
LANGUAGES = [
|
||||
# English variants
|
||||
"en", "en-US", "en-GB", "en-CA", "en-AU", "en-NZ", "en-ZA", "en-IN", "en-SG",
|
||||
# Western European
|
||||
"de", "de-DE", "de-AT", "de-CH",
|
||||
"fr", "fr-FR", "fr-CA", "fr-BE", "fr-CH", "fr-LU",
|
||||
"es", "es-ES", "es-MX", "es-AR", "es-CO", "es-CL", "es-PE", "es-VE", "es-LA",
|
||||
"it", "it-IT", "it-CH",
|
||||
"pt", "pt-PT", "pt-BR",
|
||||
"nl", "nl-NL", "nl-BE",
|
||||
# Nordic languages
|
||||
"da", "da-DK",
|
||||
"sv", "sv-SE",
|
||||
"no", "no-NO", "nb", "nn",
|
||||
"fi", "fi-FI",
|
||||
"is", "is-IS",
|
||||
# Eastern European
|
||||
"pl", "pl-PL",
|
||||
"cs", "cs-CZ",
|
||||
"sk", "sk-SK",
|
||||
"hu", "hu-HU",
|
||||
"ro", "ro-RO",
|
||||
"bg", "bg-BG",
|
||||
"hr", "hr-HR",
|
||||
"sr", "sr-RS",
|
||||
"sl", "sl-SI",
|
||||
"uk", "uk-UA",
|
||||
"ru", "ru-RU",
|
||||
# Asian languages
|
||||
"zh", "zh-CN", "zh-TW", "zh-HK",
|
||||
"ja", "ja-JP",
|
||||
"ko", "ko-KR",
|
||||
"th", "th-TH",
|
||||
"vi", "vi-VN",
|
||||
"id", "id-ID",
|
||||
"ms", "ms-MY",
|
||||
"fil", "tl",
|
||||
# Middle Eastern
|
||||
"tr", "tr-TR",
|
||||
"ar", "ar-SA", "ar-AE", "ar-EG",
|
||||
"he", "he-IL",
|
||||
"fa", "fa-IR",
|
||||
# Other
|
||||
"hi", "hi-IN",
|
||||
"bn", "bn-IN",
|
||||
"ta", "ta-IN",
|
||||
"te", "te-IN",
|
||||
"mr", "mr-IN",
|
||||
"el", "el-GR",
|
||||
"ca", "ca-ES",
|
||||
"eu", "eu-ES"
|
||||
]
|
||||
|
||||
def generate_opera_ua():
    """Build one random Opera User Agent string.

    A template is drawn from OPERA_PATTERNS, then a random value is drawn
    for each placeholder that the chosen template actually contains.

    Returns:
        The template with every placeholder filled in.
    """
    template = random.choice(OPERA_PATTERNS)
    fields = {'lang': random.choice(LANGUAGES)}

    if '{version}' in template:
        fields['version'] = random.choice(OPERA_MINI_VERSIONS)

    if '{build}' in template:
        # Opera Mobi templates draw from their own list of build identifiers.
        build_pool = OPERA_MOBI_BUILDS if "Opera Mobi" in template else BUILD_NUMBERS
        fields['build'] = random.choice(build_pool)

    for key, pool in (('presto', PRESTO_VERSIONS), ('final', FINAL_VERSIONS)):
        if '{' + key + '}' in template:
            fields[key] = random.choice(pool)

    return template.format(**fields)
|
||||
|
||||
def generate_ua_pool(count=10):
    """Generate a pool of Opera User Agent strings.

    Distinct strings are collected until ``count`` of them exist or the
    attempt budget (``count * 100`` draws) is used up. Any shortfall is then
    padded with DEFAULT_FALLBACK_UA, so padded entries may repeat.

    Args:
        count: Number of strings wanted (default: 10).

    Returns:
        A list of User Agent strings. If generation raises before a single
        string was produced, the list holds only DEFAULT_FALLBACK_UA.
    """
    unique_uas = set()
    attempt_budget = count * 100
    tries = 0

    try:
        while tries < attempt_budget and len(unique_uas) < count:
            unique_uas.add(generate_opera_ua())
            tries += 1
    except Exception:
        # Nothing usable was generated: hand back the fallback on its own.
        if not unique_uas:
            return [DEFAULT_FALLBACK_UA]

    # Top up with the fallback when too few distinct strings were found.
    padded = [*unique_uas]
    while len(padded) < count:
        padded.append(DEFAULT_FALLBACK_UA)

    return padded
|
||||
|
||||
|
||||
def main():
    """Parse the optional count argument, then print that many UA strings.

    The UA strings go to stdout, one per line. Status and error messages go
    to stderr so that stdout can be piped cleanly. Exits with status 1 when
    the argument is not an integer or is less than 1.
    """
    count = 10  # Default
    cli_args = sys.argv[1:]
    if cli_args:
        raw_count = cli_args[0]
        try:
            count = int(raw_count)
        except ValueError:
            print(f"Error: Invalid count '{raw_count}'. Must be an integer.", file=sys.stderr)
            sys.exit(1)
        if count < 1:
            print("Error: Count must be a positive integer", file=sys.stderr)
            sys.exit(1)

    # Report which generator is in use (stderr, so stdout stays clean)
    if USE_APP_MODULE:
        mode_note = "# Using app.utils.ua_generator module"
    else:
        mode_note = "# Using standalone generator (app module not available)"
    print(mode_note, file=sys.stderr)

    print(f"# Generating {count} Opera User Agent strings...\n", file=sys.stderr)

    generated = generate_ua_pool(count)

    # One UA per line, no numbering
    for user_agent in generated:
        print(user_agent)

    # Summary also goes to stderr so it does not interfere with piping
    print(f"\n# Generated {len(generated)} unique User Agent strings", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
|
|
@ -1,24 +1,6 @@
|
|||
https://search.albony.xyz
|
||||
https://search.garudalinux.org
|
||||
https://search.dr460nf1r3.org
|
||||
https://search.nezumi.party
|
||||
https://s.tokhmi.xyz
|
||||
https://search.sethforprivacy.com
|
||||
https://whoogle.dcs0.hu
|
||||
https://whoogle.lunar.icu
|
||||
https://gowogle.voring.me
|
||||
https://whoogle.privacydev.net
|
||||
https://whoogle.hostux.net
|
||||
https://wg.vern.cc
|
||||
https://whoogle.hxvy0.gq
|
||||
https://whoogle.ungovernable.men
|
||||
https://whoogle2.ungovernable.men
|
||||
https://whoogle3.ungovernable.men
|
||||
https://wgl.frail.duckdns.org
|
||||
https://whoogle.no-logs.com
|
||||
https://whoogle.ftw.lol
|
||||
https://whoogle-search--replitcomreside.repl.co
|
||||
https://search.notrustverify.ch
|
||||
https://whoogle.datura.network
|
||||
https://whoogle.yepserver.xyz
|
||||
https://search.snine.nl
|
||||
https://whoogle.lunar.icu
|
||||
https://whoogle.4040940.xyz
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import json
|
||||
import pathlib
|
||||
import requests
|
||||
import httpx
|
||||
|
||||
lingva = 'https://lingva.ml/api/v1/en'
|
||||
|
||||
|
|
@ -25,7 +25,7 @@ def translate(v: str, lang: str) -> str:
|
|||
|
||||
lingva_req = f'{lingva}/{lang}/{v}'
|
||||
|
||||
response = requests.get(lingva_req).json()
|
||||
response = httpx.get(lingva_req).json()
|
||||
|
||||
if 'translation' in response:
|
||||
return response['translation']
|
||||
|
|
|
|||
|
|
@ -1,3 +1,16 @@
|
|||
[build-system]
|
||||
requires = ["setuptools", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 100
|
||||
target-version = "py312"
|
||||
lint.select = [
|
||||
"E", "F", "W", # pycodestyle/pyflakes
|
||||
"I", # isort
|
||||
]
|
||||
lint.ignore = []
|
||||
|
||||
[tool.black]
|
||||
line-length = 100
|
||||
target-version = ['py312']
|
||||
|
|
|
|||
|
|
@ -1,37 +1,34 @@
|
|||
attrs==22.2.0
|
||||
beautifulsoup4==4.11.2
|
||||
brotli==1.0.9
|
||||
cachelib==0.10.2
|
||||
certifi==2024.7.4
|
||||
cffi==1.17.1
|
||||
chardet==5.1.0
|
||||
click==8.1.3
|
||||
cryptography==3.3.2; platform_machine == 'armv7l'
|
||||
cryptography==43.0.1; platform_machine != 'armv7l'
|
||||
cssutils==2.7.0
|
||||
attrs==25.3.0
|
||||
beautifulsoup4==4.13.5
|
||||
brotli==1.2.0
|
||||
certifi==2025.8.3
|
||||
cffi==2.0.0
|
||||
click==8.3.0
|
||||
cryptography==46.0.1
|
||||
cssutils==2.11.1
|
||||
defusedxml==0.7.1
|
||||
Flask==2.3.2
|
||||
idna==3.7
|
||||
itsdangerous==2.1.2
|
||||
Jinja2==3.1.4
|
||||
MarkupSafe==2.1.2
|
||||
more-itertools==9.0.0
|
||||
packaging==23.0
|
||||
pluggy==1.0.0
|
||||
pycodestyle==2.10.0
|
||||
Flask==3.1.2
|
||||
idna==3.10
|
||||
itsdangerous==2.2.0
|
||||
Jinja2==3.1.6
|
||||
MarkupSafe==3.0.2
|
||||
more-itertools==10.8.0
|
||||
packaging==25.0
|
||||
pluggy==1.6.0
|
||||
pycodestyle==2.14.0
|
||||
pycparser==2.22
|
||||
pyOpenSSL==19.1.0; platform_machine == 'armv7l'
|
||||
pyOpenSSL==24.2.1; platform_machine != 'armv7l'
|
||||
pyparsing==3.0.9
|
||||
PySocks==1.7.1
|
||||
pytest==7.2.1
|
||||
python-dateutil==2.8.2
|
||||
requests==2.32.2
|
||||
soupsieve==2.4
|
||||
stem==1.8.1
|
||||
urllib3==1.26.19
|
||||
validators==0.22.0
|
||||
waitress==3.0.1
|
||||
wcwidth==0.2.6
|
||||
Werkzeug==3.0.6
|
||||
python-dotenv==0.21.1
|
||||
pyOpenSSL==25.3.0
|
||||
pyparsing==3.2.5
|
||||
pytest==8.3.3
|
||||
python-dateutil==2.9.0.post0
|
||||
httpx[http2,socks]==0.28.1
|
||||
cachetools==6.2.0
|
||||
soupsieve==2.8
|
||||
stem==1.8.2
|
||||
httpcore>=1.0.9
|
||||
h11>=0.16.0
|
||||
validators==0.35.0
|
||||
waitress==3.0.2
|
||||
wcwidth==0.2.14
|
||||
Werkzeug==3.1.4
|
||||
python-dotenv==1.1.1
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ install_requires=
|
|||
defusedxml
|
||||
Flask
|
||||
python-dotenv
|
||||
requests
|
||||
httpx[http2,socks]
|
||||
stem
|
||||
validators
|
||||
waitress
|
||||
|
|
|
|||
|
|
@ -1,11 +1,13 @@
|
|||
from app import app
|
||||
from app.request import Request
|
||||
from app.utils.session import generate_key
|
||||
from test.mock_google import build_mock_response
|
||||
import httpx
|
||||
import pytest
|
||||
import random
|
||||
|
||||
demo_config = {
|
||||
'near': random.choice(['Seattle', 'New York', 'San Francisco']),
|
||||
'dark': str(random.getrandbits(1)),
|
||||
'nojs': str(random.getrandbits(1)),
|
||||
'lang_interface': random.choice(app.config['LANGUAGES'])['value'],
|
||||
'lang_search': random.choice(app.config['LANGUAGES'])['value'],
|
||||
|
|
@ -13,6 +15,38 @@ demo_config = {
|
|||
}
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def mock_google(monkeypatch):
    """Replace Request.send and Request.autocomplete with local fakes.

    Requests with an empty base URL, or one containing 'google.com/search',
    get a canned HTML response from build_mock_response. Every other request
    is passed to the original Request.send.
    """
    real_send = Request.send

    def fake_send(self, base_url='', query='', attempt=0,
                  force_mobile=False, user_agent=''):
        # Anything that is clearly not a Google search uses the real method.
        if base_url and 'google.com/search' not in base_url:
            return real_send(self, base_url, query, attempt,
                             force_mobile, user_agent)

        language = getattr(self, 'language', '')
        country = getattr(self, 'country', '')
        body = build_mock_response(query, language, country)
        mock_request = httpx.Request('GET', (base_url or self.search_url) + query)
        return httpx.Response(200, request=mock_request, text=body)

    def fake_autocomplete(self, q):
        text = q.replace('+', ' ').lower()
        hits = []
        canned = (
            ('green eggs and', 'green eggs and ham'),
            ('the cat in the', 'the cat in the hat'),
        )
        for needle, completion in canned:
            if needle in text:
                hits.append(completion)
        if text.startswith('who'):
            hits += ['whoogle', 'whoogle search']
        return hits

    monkeypatch.setattr(Request, 'send', fake_send)
    monkeypatch.setattr(Request, 'autocomplete', fake_autocomplete)
    yield
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def client():
|
||||
with app.test_client() as client:
|
||||
|
|
|
|||
136
test/mock_google.py
Normal file
136
test/mock_google.py
Normal file
|
|
@ -0,0 +1,136 @@
|
|||
from urllib.parse import parse_qs, unquote, quote
|
||||
|
||||
from app.models.config import Config
|
||||
|
||||
DEFAULT_RESULTS = [
|
||||
('Example Domain', 'https://example.com/{slug}', 'Example information about {term}.'),
|
||||
('Whoogle Search', 'https://github.com/benbusby/whoogle-search', 'Private self-hosted Google proxy'),
|
||||
('Wikipedia', 'https://en.wikipedia.org/wiki/{title}', '{title} – encyclopedia entry.'),
|
||||
]
|
||||
|
||||
|
||||
def _result_block(title, href, snippet):
|
||||
encoded_href = quote(href, safe=':/')
|
||||
return (
|
||||
f'<div class="ZINbbc xpd O9g5cc uUPGi">'
|
||||
f'<div class="kCrYT">'
|
||||
f'<a href="/url?q={encoded_href}&sa=U&ved=2ahUKE">'
|
||||
f'<h3 class="BNeawe vvjwJb AP7Wnd">{title}</h3>'
|
||||
f'<span class="CVA68e">{title}</span>'
|
||||
f'</a>'
|
||||
f'<div class="VwiC3b">{snippet}</div>'
|
||||
f'</div>'
|
||||
f'</div>'
|
||||
)
|
||||
|
||||
|
||||
def _main_results(query, params, language='', country=''):
    """Build the HTML for the mock result cards of one query.

    Keyword-specific results (wikipedia, pinterest, whoogle, github) come
    first, followed by the DEFAULT_RESULTS templates. Entries whose URL was
    already seen are dropped before rendering.

    Args:
        query: Decoded search term.
        params: Extra query parameters, mapping each name to a list of values.
        language: Language code taken from the request object, if any.
        country: Country code taken from the request object, if any.

    Returns:
        The concatenated HTML of all result cards.
    """
    term = query.lower()
    slug = query.replace(' ', '-')
    candidates = []

    # Language/country hints may also arrive inside an encoded preferences blob.
    pref_lang = ''
    pref_country = ''
    if 'preferences' in params:
        try:
            pref_data = Config(**{})._decode_preferences(params['preferences'][0])
            pref_lang = str(pref_data.get('lang_interface', '') or '').lower()
            pref_country = str(pref_data.get('country', '') or '').lower()
        except Exception:
            pref_lang = pref_country = ''

    if 'wikipedia' in term:
        hl = str(params.get('hl', [''])[0] or '').lower()
        gl = str(params.get('gl', [''])[0] or '').lower()
        lr = str(params.get('lr', [''])[0] or '').lower()
        language_code = str(language or '').lower()
        country_code = str(country or '').lower()
        japanese_signals = (
            hl.startswith('ja'),
            gl.startswith('jp'),
            lr.endswith('lang_ja'),
            language_code.endswith('lang_ja'),
            country_code.startswith('jp'),
            pref_lang.endswith('lang_ja'),
            pref_country.startswith('jp'),
        )
        if any(japanese_signals):
            wiki_entry = (
                'ウィキペディア',
                'https://ja.wikipedia.org/wiki/ウィキペディア',
                '日本語版ウィキペディアの記事です。'
            )
        else:
            wiki_entry = (
                'Wikipedia',
                'https://www.wikipedia.org/wiki/Wikipedia',
                'Wikipedia is a free online encyclopedia.'
            )
        candidates.append(wiki_entry)

    keyword_entries = (
        ('pinterest', (
            'Pinterest',
            'https://www.pinterest.com/ideas/',
            'Discover recipes, home ideas, style inspiration and other ideas.'
        )),
        ('whoogle', (
            'Whoogle Search GitHub',
            'https://github.com/benbusby/whoogle-search',
            'Source code for Whoogle Search.'
        )),
        ('github', (
            'GitHub',
            f'https://github.com/search?q={slug}',
            'GitHub is a development platform to host and review code.'
        )),
    )
    for keyword, entry in keyword_entries:
        if keyword in term:
            candidates.append(entry)

    for title, url, snippet in DEFAULT_RESULTS:
        candidates.append((
            title,
            url.format(slug=slug, term=term, title=title.replace(' ', '_')),
            snippet.format(term=query, title=title),
        ))

    # Keep only the first entry for each URL, preserving order.
    seen_urls = set()
    kept = []
    for entry in candidates:
        if entry[1] not in seen_urls:
            seen_urls.add(entry[1])
            kept.append(entry)

    return ''.join(_result_block(*entry) for entry in kept)
|
||||
|
||||
|
||||
def build_mock_response(raw_query, language='', country=''):
    """Return a complete mock results page for ``raw_query``.

    Everything before the first '&' is the (still URL-encoded) search term;
    the remainder is parsed as extra query parameters. The page holds the
    result cards, a search form pre-filled with the term (double quotes
    removed) and two pagination links.
    """
    q_part, _, extra = raw_query.partition('&')

    query = unquote(q_part)
    params = parse_qs(extra)

    results_html = _main_results(query, params, language, country)
    safe_query = query.replace('"', '')
    pagination = ''.join([
        f'<a href="/search?q={q_part}&start=10">Next</a>',
        f'<a href="/search?q={q_part}&start=20">More</a>',
    ])

    page = [
        '<html>',
        '<head><title>Mock Google Results</title></head>',
        '<body>',
        f'<div id="main">{results_html}</div>',
        '<form action="/search" method="GET">',
        f'<input name="q" value="{safe_query}">',
        '</form>',
        f'<footer class="TuS8Ad">{pagination}</footer>',
        '</body>',
        '</html>',
    ]
    return ''.join(page)
|
||||
114
test/test_alts.py
Normal file
114
test/test_alts.py
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
import copy
|
||||
import os
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from app import app
|
||||
from app.filter import Filter
|
||||
from app.models.config import Config
|
||||
from app.utils.session import generate_key
|
||||
from app.utils import results as results_mod
|
||||
|
||||
|
||||
def build_soup(html: str):
    """Parse ``html`` with the built-in 'html.parser' backend."""
    parsed = BeautifulSoup(html, 'html.parser')
    return parsed
|
||||
|
||||
|
||||
def make_filter(soup: BeautifulSoup):
    """Create a Filter with site alternatives enabled, bound to ``soup``."""
    # NOTE(review): the extent of the app-context block is not recoverable
    # from this view; everything is kept inside it here — confirm.
    with app.app_context():
        alt_filter = Filter(
            user_key=generate_key(),
            config=Config(**{'alts': True}),
        )
        alt_filter.soup = soup
        return alt_filter
|
||||
|
||||
|
||||
def test_no_duplicate_alt_prefix_reddit(monkeypatch):
    """Text that already shows the alt host must not get the prefix twice."""
    saved_alts = copy.deepcopy(results_mod.SITE_ALTS)
    try:
        # Simulate a user pointing reddit.com at old.reddit.com
        monkeypatch.setitem(results_mod.SITE_ALTS, 'reddit.com', 'old.reddit.com')

        page = build_soup('''
            <div id="main">
                <a href="https://www.reddit.com/r/whoogle">www.reddit.com</a>
                <div>www.reddit.com</div>
                <div>old.reddit.com</div>
            </div>
        ''')
        make_filter(page).site_alt_swap()

        # The link target is rewritten to the alt host
        link = page.find('a')
        assert link['href'].startswith('https://old.reddit.com')

        # Bare domain replaced, but already-alt text stays as is (no old.old...)
        div_texts = []
        for div in page.find_all('div'):
            content = div.get_text()
            if content.strip():
                div_texts.append(content)
        assert 'old.reddit.com' in div_texts
        assert 'old.old.reddit.com' not in ''.join(div_texts)
    finally:
        results_mod.SITE_ALTS.clear()
        results_mod.SITE_ALTS.update(saved_alts)
|
||||
|
||||
|
||||
def test_wikipedia_simple_no_lang_param(monkeypatch):
    """A simple.wikipedia.org link is swapped to the alt host without '?lang='."""
    saved_alts = copy.deepcopy(results_mod.SITE_ALTS)
    try:
        monkeypatch.setitem(results_mod.SITE_ALTS, 'wikipedia.org', 'https://wikiless.example')

        page = build_soup('''
            <div id="main">
                <a href="https://simple.wikipedia.org/wiki/Whoogle">https://simple.wikipedia.org/wiki/Whoogle</a>
                <div>simple.wikipedia.org</div>
            </div>
        ''')
        make_filter(page).site_alt_swap()

        # The href must point at the alt host and carry no ?lang parameter
        href = page.find('a')['href']
        assert href.startswith('https://wikiless.example')
        assert '?lang=' not in href

        # The displayed host is replaced as well
        shown_text = page.find('div').get_text()
        assert 'wikiless.example' in shown_text
        assert 'simple.wikipedia.org' not in shown_text
    finally:
        results_mod.SITE_ALTS.clear()
        results_mod.SITE_ALTS.update(saved_alts)
|
||||
|
||||
|
||||
def test_single_pass_description_replacement(monkeypatch):
    """The description host is swapped once: no doubled scheme or host."""
    saved_alts = copy.deepcopy(results_mod.SITE_ALTS)
    try:
        monkeypatch.setitem(results_mod.SITE_ALTS, 'twitter.com', 'https://nitter.example')

        page = build_soup('''
            <div id="main">
                <a href="https://twitter.com/whoogle">https://twitter.com/whoogle</a>
                <div>https://www.twitter.com</div>
            </div>
        ''')
        make_filter(page).site_alt_swap()

        assert page.find('a')['href'].startswith('https://nitter.example')

        # In this fixture the description is the first div nested under #main
        container = page.find('div', id='main')
        description = container.find_all('div')[0].get_text().strip()
        assert description.startswith('https://nitter.example')
        for broken in ('https://https://', 'nitter.examplenitter.example'):
            assert broken not in description
    finally:
        results_mod.SITE_ALTS.clear()
        results_mod.SITE_ALTS.update(saved_alts)
|
||||
|
||||
|
||||
31
test/test_autocomplete_xml.py
Normal file
31
test/test_autocomplete_xml.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
from app import app
|
||||
from app.request import Request
|
||||
from app.models.config import Config
|
||||
|
||||
|
||||
class FakeHttpClient:
    """HTTP client stand-in that always answers with a fixed XML document."""

    def get(self, url, headers=None, cookies=None, retries=0, backoff_seconds=0.5, use_cache=False):
        """Return an object whose ``text`` is a two-suggestion XML payload.

        All arguments are accepted and ignored.
        """
        # Minimal XML in Google Toolbar Autocomplete format
        lines = [
            '<?xml version="1.0"?>',
            '<topp>',
            ' <CompleteSuggestion><suggestion data="whoogle"/></CompleteSuggestion>',
            ' <CompleteSuggestion><suggestion data="whoogle search"/></CompleteSuggestion>',
            '</topp>',
        ]
        payload = '\n'.join(lines)

        class R:
            text = payload

        return R()

    def close(self):
        """Nothing to release."""
        pass
|
||||
|
||||
|
||||
def test_autocomplete_parsing():
    """Request.autocomplete extracts both suggestions from the fake XML."""
    # NOTE(review): the extent of the app-context block is not recoverable
    # from this view; everything is kept inside it here — confirm.
    with app.app_context():
        request = Request(
            normal_ua='UA',
            root_path='http://localhost:5000',
            config=Config(**{}),
            http_client=FakeHttpClient(),
        )
        found = request.autocomplete('who')
        for expected in ('whoogle', 'whoogle search'):
            assert expected in found
|
||||
|
||||
33
test/test_http_client.py
Normal file
33
test/test_http_client.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
import types
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from app.services.http_client import HttpxClient
|
||||
|
||||
|
||||
def test_httpxclient_follow_redirects_and_proxy(monkeypatch):
    """HttpxClient builds one httpx.Client with redirects and a proxy option."""
    recorded_kwargs = []

    class FakeClient:
        """Records the keyword arguments it was constructed with."""

        def __init__(self, *args, **kwargs):
            recorded_kwargs.append(kwargs)

        def get(self, *args, **kwargs):
            class EmptyOk:
                status_code = 200
                text = ''
            return EmptyOk()

        def close(self):
            pass

    monkeypatch.setattr(httpx, 'Client', FakeClient)

    tor_proxy = 'socks5://127.0.0.1:9050'
    client = HttpxClient(proxies={'http': tor_proxy, 'https': tor_proxy})

    # Exactly one underlying client, built with follow_redirects and one of
    # the proxy-style keyword arguments
    assert len(recorded_kwargs) == 1
    used = recorded_kwargs[0]
    assert used.get('follow_redirects') is True
    assert any(option in used for option in ('proxy', 'proxies', 'mounts'))
|
||||
|
||||
79
test/test_json.py
Normal file
79
test/test_json.py
Normal file
|
|
@ -0,0 +1,79 @@
|
|||
import json
|
||||
import types
|
||||
|
||||
import pytest
|
||||
|
||||
from app.models.endpoint import Endpoint
|
||||
from app.utils import search as search_mod
|
||||
|
||||
|
||||
@pytest.fixture
def stubbed_search_response(monkeypatch):
    """Stub Search so each request sees the query 'whoogle' and fixed HTML."""
    # Minimal filtered HTML snippet: two absolute links and one relative link
    canned_html = ''.join([
        '<div id="main">',
        ' <a href="https://example.com/page">Example Page</a>',
        ' <a href="/relative">Relative</a>',
        ' <a href="https://example.org/other">Other</a>',
        '</div>',
    ])

    def fake_new_query(self):
        # Stable query regardless of what the request asked for
        self.query = 'whoogle'
        return self.query

    def fake_generate(self):
        return canned_html

    search_cls = search_mod.Search
    monkeypatch.setattr(search_cls, 'new_search_query', fake_new_query)
    monkeypatch.setattr(search_cls, 'generate_response', fake_generate)
|
||||
|
||||
|
||||
def test_search_json_accept(client, stubbed_search_response):
    """An 'Accept: application/json' header yields JSON search results."""
    response = client.get(
        f'/{Endpoint.search}?q=whoogle',
        headers={'Accept': 'application/json'},
    )
    assert response._status_code == 200

    payload = json.loads(response.data)
    assert payload['query'] == 'whoogle'
    assert isinstance(payload['results'], list)

    links = {entry['href'] for entry in payload['results']}
    assert 'https://example.com/page' in links
    assert 'https://example.org/other' in links
    # Relative href should be excluded
    assert all(not link.endswith('/relative') for link in links)

    # 'href' and 'text' are the original fields; 'title' and 'content' are new
    for entry in payload['results']:
        for field in ('href', 'text', 'title', 'content'):
            assert field in entry
|
||||
|
||||
|
||||
def test_search_json_format_param(client, stubbed_search_response):
    """The 'format=json' query parameter also yields JSON search results."""
    response = client.get(f'/{Endpoint.search}?q=whoogle&format=json')
    assert response._status_code == 200

    payload = json.loads(response.data)
    assert payload['query'] == 'whoogle'
    assert len(payload['results']) >= 2
|
||||
|
||||
|
||||
def test_search_json_feeling_lucky(client, monkeypatch):
    """A feeling-lucky query answers 303 with the redirect target in JSON."""
    lucky_url = 'https://example.com/lucky'

    def fake_new_query(self):
        # Emulate new_search_query flagging the query as feeling lucky
        self.query = 'whoogle !'
        self.feeling_lucky = True
        return self.query

    def fake_generate(self):
        return lucky_url

    search_cls = search_mod.Search
    monkeypatch.setattr(search_cls, 'new_search_query', fake_new_query)
    monkeypatch.setattr(search_cls, 'generate_response', fake_generate)

    response = client.get(
        f'/{Endpoint.search}?q=whoogle%20!',
        headers={'Accept': 'application/json'},
    )
    assert response._status_code == 303
    payload = json.loads(response.data)
    assert payload['redirect'] == lucky_url
|
||||
|
||||
|
||||
|
|
@ -3,6 +3,7 @@ from app.filter import Filter
|
|||
from app.models.config import Config
|
||||
from app.models.endpoint import Endpoint
|
||||
from app.utils import results
|
||||
from app.utils import search as search_mod
|
||||
from app.utils.session import generate_key
|
||||
from datetime import datetime
|
||||
from dateutil.parser import ParserError, parse
|
||||
|
|
@ -32,18 +33,24 @@ def get_search_results(data):
|
|||
return result_divs
|
||||
|
||||
|
||||
def test_get_results(client):
|
||||
# FIXME: Temporary fix while #1211 is investigated
|
||||
return
|
||||
def test_get_results(client, monkeypatch):
|
||||
def fake_generate(self):
|
||||
# Build 10 results under #main, each with a single inner div
|
||||
items = []
|
||||
for i in range(10):
|
||||
items.append(f'<div><div><a href="https://example.com/{i}">Item {i}</a></div></div>')
|
||||
return f'<div id="main">{"".join(items)}</div>'
|
||||
|
||||
monkeypatch.setattr(search_mod.Search, 'generate_response', fake_generate)
|
||||
|
||||
rv = client.get(f'/{Endpoint.search}?q=test')
|
||||
assert rv._status_code == 200
|
||||
|
||||
# Depending on the search, there can be more
|
||||
# than 10 result divs
|
||||
results = get_search_results(rv.data)
|
||||
assert len(results) >= 10
|
||||
assert len(results) <= 15
|
||||
results_divs = get_search_results(rv.data)
|
||||
assert len(results_divs) >= 10
|
||||
assert len(results_divs) <= 15
|
||||
|
||||
|
||||
def test_post_results(client):
|
||||
|
|
@ -87,9 +94,12 @@ def test_block_results(client):
|
|||
assert result_site not in 'pinterest.com'
|
||||
|
||||
|
||||
def test_view_my_ip(client):
|
||||
# FIXME: Temporary fix while #1211 is investigated
|
||||
return
|
||||
def test_view_my_ip(client, monkeypatch):
|
||||
def fake_generate(self):
|
||||
# Minimal page; ip card is injected later by routes when widget == 'ip'
|
||||
return '<div id="main"></div>'
|
||||
|
||||
monkeypatch.setattr(search_mod.Search, 'generate_response', fake_generate)
|
||||
|
||||
rv = client.get(f'/{Endpoint.search}?q=my ip address')
|
||||
assert rv._status_code == 200
|
||||
|
|
@ -100,9 +110,16 @@ def test_view_my_ip(client):
|
|||
assert '127.0.0.1' in str_data
|
||||
|
||||
|
||||
def test_recent_results(client):
|
||||
# FIXME: Temporary fix while #1211 is investigated
|
||||
return
|
||||
def test_recent_results(client, monkeypatch):
|
||||
def fake_generate(self):
|
||||
# Create results with a span containing today's date so it passes all windows
|
||||
today = datetime.now().strftime('%b %d, %Y')
|
||||
items = []
|
||||
for i in range(5):
|
||||
items.append(f'<div><div><span>{today}</span></div></div>')
|
||||
return f'<div id="main">{"".join(items)}</div>'
|
||||
|
||||
monkeypatch.setattr(search_mod.Search, 'generate_response', fake_generate)
|
||||
|
||||
times = {
|
||||
'tbs=qdr:y': 365,
|
||||
|
|
|
|||
|
|
@ -75,14 +75,14 @@ def test_config(client):
|
|||
|
||||
# Test disabling changing config from client
|
||||
app.config['CONFIG_DISABLE'] = 1
|
||||
dark_mod = not demo_config['dark']
|
||||
demo_config['dark'] = dark_mod
|
||||
nojs_mod = not bool(int(demo_config['nojs']))
|
||||
demo_config['nojs'] = str(int(nojs_mod))
|
||||
rv = client.post(f'/{Endpoint.config}', data=demo_config)
|
||||
assert rv._status_code == 403
|
||||
|
||||
rv = client.get(f'/{Endpoint.config}')
|
||||
config = json.loads(rv.data)
|
||||
assert config['dark'] != dark_mod
|
||||
assert config['nojs'] != nojs_mod
|
||||
|
||||
|
||||
def test_opensearch(client):
|
||||
|
|
|
|||
26
test/test_routes_json.py
Normal file
26
test/test_routes_json.py
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from app.models.endpoint import Endpoint
|
||||
from app.utils import search as search_mod
|
||||
|
||||
|
||||
def test_captcha_json_block(client, monkeypatch):
    """A captcha marker in the response yields a 503 JSON 'blocked' reply."""

    def fake_new_query(self):
        self.query = 'test'
        return self.query

    def fake_generate(self):
        # HTML carrying the captcha marker text
        return '<div>div class="g-recaptcha"</div>'

    search_cls = search_mod.Search
    monkeypatch.setattr(search_cls, 'new_search_query', fake_new_query)
    monkeypatch.setattr(search_cls, 'generate_response', fake_generate)

    response = client.get(f'/{Endpoint.search}?q=test&format=json')
    assert response._status_code == 503

    payload = json.loads(response.data)
    assert payload['blocked'] is True
    assert 'error_message' in payload
|
||||
|
||||
52
test/test_tor.py
Normal file
52
test/test_tor.py
Normal file
|
|
@ -0,0 +1,52 @@
|
|||
import pytest
|
||||
|
||||
from app import app
|
||||
from app.request import Request, TorError
|
||||
from app.models.config import Config
|
||||
|
||||
|
||||
class FakeResponse:
    """Bare response object exposing ``text``, ``status_code`` and ``content``."""

    def __init__(self, text: str = '', status_code: int = 200, content: bytes = b''):
        # A falsy content value (e.g. None) is normalised to empty bytes.
        self.content = content if content else b''
        self.status_code = status_code
        self.text = text
|
||||
|
||||
|
||||
class FakeHttpClient:
    """HTTP client stand-in whose Tor check result is fixed at construction."""

    def __init__(self, tor_ok: bool):
        self._tor_ok = tor_ok

    def get(self, url, headers=None, cookies=None, retries=0, backoff_seconds=0.5, use_cache=False):
        """Return a canned response; only ``url`` influences the result."""
        if 'check.torproject.org' not in url:
            return FakeResponse(text='', status_code=200, content=b'OK')
        verdict = 'Congratulations' if self._tor_ok else 'Not Tor'
        return FakeResponse(text=verdict)

    def close(self):
        """Nothing to release."""
        pass
|
||||
|
||||
|
||||
def build_config(tor: bool) -> Config:
    """Create a minimal Config, inside an app context, with only 'tor' set."""
    settings = {'tor': tor}
    with app.app_context():
        return Config(**settings)
|
||||
|
||||
|
||||
def test_tor_validation_success(monkeypatch):
    """When the Tor check reports success, send() sets tor_valid and returns 200."""
    # send_tor_signal is stubbed so no real Tor signal is attempted
    monkeypatch.setattr('app.request.send_tor_signal', lambda signal: True)

    request = Request(
        normal_ua='TestUA',
        root_path='http://localhost:5000',
        config=build_config(tor=True),
        http_client=FakeHttpClient(tor_ok=True),
    )
    response = request.send(base_url='https://example.com', query='')

    assert request.tor_valid is True
    assert response.status_code == 200
|
||||
|
||||
|
||||
def test_tor_validation_failure(monkeypatch):
    """When the Tor check does not report success, send() raises TorError."""
    # send_tor_signal is stubbed so no real Tor signal is attempted
    monkeypatch.setattr('app.request.send_tor_signal', lambda signal: True)

    request = Request(
        normal_ua='TestUA',
        root_path='http://localhost:5000',
        config=build_config(tor=True),
        http_client=FakeHttpClient(tor_ok=False),
    )
    with pytest.raises(TorError):
        request.send(base_url='https://example.com', query='')
|
||||
|
||||
|
|
@ -72,9 +72,6 @@
|
|||
# Remove everything except basic result cards from all search queries
|
||||
#WHOOGLE_MINIMAL=0
|
||||
|
||||
# Set the number of results per page
|
||||
#WHOOGLE_RESULTS_PER_PAGE=10
|
||||
|
||||
# Controls visibility of autocomplete/search suggestions
|
||||
#WHOOGLE_AUTOCOMPLETE=1
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue