Enhance link extraction logic in search function to handle cases with no result containers, improving robustness and accuracy of results.

This commit is contained in:
Don-Swanson 2025-09-30 20:32:51 -05:00
parent 442060b2ef
commit 457725ee5a
No known key found for this signature in database
GPG key ID: C6A6ACD574A005E5
2 changed files with 52 additions and 23 deletions

View file

@ -38,23 +38,37 @@ jobs:
password: ${{ secrets.TEST_PYPI_API_TOKEN }}
repository_url: https://test.pypi.org/legacy/
publish:
# Gate real PyPI publishing to stable SemVer tags only (e.g., v1.2.3 or 1.2.3)
if: startsWith(github.ref, 'refs/tags/') && (github.ref_name matches '^v?\\d+\\.\\d+\\.\\d+$')
# Run this job for any tag; the check_tag step below gates real PyPI publishing to stable SemVer tags only (e.g., v1.2.3 or 1.2.3)
if: startsWith(github.ref, 'refs/tags/')
name: Build and publish to PyPI
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Check if stable release
id: check_tag
run: |
TAG="${{ github.ref_name }}"
if echo "$TAG" | grep -qE '^v?[0-9]+\.[0-9]+\.[0-9]+$'; then
echo "is_stable=true" >> $GITHUB_OUTPUT
echo "Tag '$TAG' is a stable release. Will publish to PyPI."
else
echo "is_stable=false" >> $GITHUB_OUTPUT
echo "Tag '$TAG' is not a stable release (contains pre-release suffix). Skipping PyPI publish."
fi
- name: Set up Python 3.9
if: steps.check_tag.outputs.is_stable == 'true'
uses: actions/setup-python@v5
with:
python-version: 3.9
- name: Install pypa/build
if: steps.check_tag.outputs.is_stable == 'true'
run: >-
python -m
pip install
build
--user
- name: Build binary wheel and source tarball
if: steps.check_tag.outputs.is_stable == 'true'
run: >-
python -m
build
@ -63,7 +77,7 @@ jobs:
--outdir dist/
.
- name: Publish distribution to PyPI
if: startsWith(github.ref, 'refs/tags')
if: steps.check_tag.outputs.is_stable == 'true'
uses: pypa/gh-action-pypi-publish@master
with:
password: ${{ secrets.PYPI_API_TOKEN }}

View file

@ -411,28 +411,43 @@ def search():
# Find all result containers (using known result classes)
result_divs = json_soup.find_all('div', class_=['ZINbbc', 'ezO2md'])
for div in result_divs:
# Find the first valid link in this result container
link = None
for a in div.find_all('a', href=True):
if a['href'].startswith('http'):
link = a
break
if not link:
continue
if result_divs:
# Process structured Google results with container divs
for div in result_divs:
# Find the first valid link in this result container
link = None
for a in div.find_all('a', href=True):
if a['href'].startswith('http'):
link = a
break
href = link['href']
if href in seen:
continue
# Get all text from the result container, not just the link
text = div.get_text(separator=' ', strip=True)
if not text:
continue
if not link:
continue
href = link['href']
if href in seen:
continue
seen.add(href)
results.append({'href': href, 'text': text})
# Get all text from the result container, not just the link
text = div.get_text(separator=' ', strip=True)
if not text:
continue
seen.add(href)
results.append({'href': href, 'text': text})
else:
# Fallback: extract links directly if no result containers found
for a in json_soup.find_all('a', href=True):
href = a['href']
if not href.startswith('http'):
continue
if href in seen:
continue
text = a.get_text(strip=True)
if not text:
continue
seen.add(href)
results.append({'href': href, 'text': text})
return jsonify({
'query': urlparse.unquote(query),