mirror of
https://github.com/benbusby/whoogle-search.git
synced 2026-03-11 08:54:34 +00:00
Enhance link extraction logic in search function to handle cases with no result containers, improving robustness and accuracy of results.
This commit is contained in:
parent
442060b2ef
commit
457725ee5a
2 changed files with 52 additions and 23 deletions
20
.github/workflows/pypi.yml
vendored
20
.github/workflows/pypi.yml
vendored
|
|
@@ -38,23 +38,37 @@ jobs:
|
|||
password: ${{ secrets.TEST_PYPI_API_TOKEN }}
|
||||
repository_url: https://test.pypi.org/legacy/
|
||||
publish:
|
||||
# Gate real PyPI publishing to stable SemVer tags only (e.g., v1.2.3 or 1.2.3)
|
||||
if: startsWith(github.ref, 'refs/tags/') && (github.ref_name matches '^v?\\d+\\.\\d+\\.\\d+$')
|
||||
# Gate real PyPI publishing to stable SemVer tags only
|
||||
if: startsWith(github.ref, 'refs/tags/')
|
||||
name: Build and publish to PyPI
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- name: Check if stable release
|
||||
id: check_tag
|
||||
run: |
|
||||
TAG="${{ github.ref_name }}"
|
||||
if echo "$TAG" | grep -qE '^v?[0-9]+\.[0-9]+\.[0-9]+$'; then
|
||||
echo "is_stable=true" >> $GITHUB_OUTPUT
|
||||
echo "Tag '$TAG' is a stable release. Will publish to PyPI."
|
||||
else
|
||||
echo "is_stable=false" >> $GITHUB_OUTPUT
|
||||
echo "Tag '$TAG' is not a stable release (contains pre-release suffix). Skipping PyPI publish."
|
||||
fi
|
||||
- name: Set up Python 3.9
|
||||
if: steps.check_tag.outputs.is_stable == 'true'
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: 3.9
|
||||
- name: Install pypa/build
|
||||
if: steps.check_tag.outputs.is_stable == 'true'
|
||||
run: >-
|
||||
python -m
|
||||
pip install
|
||||
build
|
||||
--user
|
||||
- name: Build binary wheel and source tarball
|
||||
if: steps.check_tag.outputs.is_stable == 'true'
|
||||
run: >-
|
||||
python -m
|
||||
build
|
||||
|
|
@@ -63,7 +77,7 @@ jobs:
|
|||
--outdir dist/
|
||||
.
|
||||
- name: Publish distribution to PyPI
|
||||
if: startsWith(github.ref, 'refs/tags')
|
||||
if: steps.check_tag.outputs.is_stable == 'true'
|
||||
uses: pypa/gh-action-pypi-publish@master
|
||||
with:
|
||||
password: ${{ secrets.PYPI_API_TOKEN }}
|
||||
|
|
|
|||
|
|
@@ -411,28 +411,43 @@ def search():
|
|||
# Find all result containers (using known result classes)
|
||||
result_divs = json_soup.find_all('div', class_=['ZINbbc', 'ezO2md'])
|
||||
|
||||
for div in result_divs:
|
||||
# Find the first valid link in this result container
|
||||
link = None
|
||||
for a in div.find_all('a', href=True):
|
||||
if a['href'].startswith('http'):
|
||||
link = a
|
||||
break
|
||||
|
||||
if not link:
|
||||
continue
|
||||
if result_divs:
|
||||
# Process structured Google results with container divs
|
||||
for div in result_divs:
|
||||
# Find the first valid link in this result container
|
||||
link = None
|
||||
for a in div.find_all('a', href=True):
|
||||
if a['href'].startswith('http'):
|
||||
link = a
|
||||
break
|
||||
|
||||
href = link['href']
|
||||
if href in seen:
|
||||
continue
|
||||
|
||||
# Get all text from the result container, not just the link
|
||||
text = div.get_text(separator=' ', strip=True)
|
||||
if not text:
|
||||
continue
|
||||
if not link:
|
||||
continue
|
||||
|
||||
href = link['href']
|
||||
if href in seen:
|
||||
continue
|
||||
|
||||
seen.add(href)
|
||||
results.append({'href': href, 'text': text})
|
||||
# Get all text from the result container, not just the link
|
||||
text = div.get_text(separator=' ', strip=True)
|
||||
if not text:
|
||||
continue
|
||||
|
||||
seen.add(href)
|
||||
results.append({'href': href, 'text': text})
|
||||
else:
|
||||
# Fallback: extract links directly if no result containers found
|
||||
for a in json_soup.find_all('a', href=True):
|
||||
href = a['href']
|
||||
if not href.startswith('http'):
|
||||
continue
|
||||
if href in seen:
|
||||
continue
|
||||
text = a.get_text(strip=True)
|
||||
if not text:
|
||||
continue
|
||||
seen.add(href)
|
||||
results.append({'href': href, 'text': text})
|
||||
|
||||
return jsonify({
|
||||
'query': urlparse.unquote(query),
|
||||
|
|
|
|||
Loading…
Reference in a new issue