Implements the anti-spam check

This commit is contained in:
Alicia Sykes 2026-03-07 16:54:02 +00:00
parent f1b012efb2
commit f564947c20
3 changed files with 118 additions and 13 deletions

View file

@ -32,6 +32,7 @@ jobs:
PR_TITLE: ${{ github.event.pull_request.title }}
PR_BODY: ${{ github.event.pull_request.body }}
PR_DRAFT: ${{ github.event.pull_request.draft }}
BASE_REF: ${{ github.event.pull_request.base.sha }}
README_FAILED: ${{ steps.readme.outcome == 'failure' && 'true' || 'false' }}
run: python lib/checks/check-pr-meta.py
- name: Upload findings
@ -117,6 +118,7 @@ jobs:
- name: Check project health
env:
PR_USER: ${{ github.event.pull_request.user.login }}
PR_BODY: ${{ github.event.pull_request.body }}
GITHUB_TOKEN: ${{ github.token }}
run: python lib/checks/check-project.py
- name: Upload findings

View file

@ -3,6 +3,7 @@
import json
import os
import re
import subprocess
import sys
FINDINGS_PATH = "/tmp/findings-compliance.json"
@ -32,6 +33,15 @@ README_MSG = (
" content in `awesome-privacy.yml`, and so your changes will be overridden!"
" Instead, only modify the YAML file, and be sure to follow our Contributing Guidelines."
)
BOT_MSG = (
"Submissions are only accepted from humans."
" This PR appears to have been authored by a bot or AI assistant."
)
_BOT_AUTHOR_RE = re.compile(
r"(?:noreply@anthropic\.com|devin-ai-integration|copilot-swe-agent|noreply@cursor\.com)",
re.IGNORECASE,
)
def extract_section(body, header):
@ -83,6 +93,24 @@ def check_checkboxes(body):
return None
def check_bot_coauthors(base_ref):
"""Return a finding if any commit in the PR has a bot author or co-author."""
if not base_ref:
return None
try:
result = subprocess.run(
["git", "log", f"{base_ref}..HEAD", "--format=%aN <%aE>%n%B"],
capture_output=True, text=True, timeout=30,
)
if result.returncode != 0:
return None
if _BOT_AUTHOR_RE.search(result.stdout):
return BOT_MSG
except Exception:
pass
return None
def check_readme(readme_failed):
"""Return a finding if the README check reported a failure."""
if readme_failed == "true":
@ -104,6 +132,11 @@ def main():
body = os.environ.get("PR_BODY", "")
draft = os.environ.get("PR_DRAFT", "false")
readme_failed = os.environ.get("README_FAILED", "false")
base_ref = os.environ.get("BASE_REF", "")
finding = check_bot_coauthors(base_ref)
if finding:
findings.append(finding)
finding = check_title(title)
if finding:

View file

@ -2,8 +2,9 @@
import json
import os
import re
import sys
from datetime import datetime, timezone
from datetime import datetime, timedelta, timezone
import requests
import yaml
@ -18,11 +19,14 @@ USER_AGENT = "awesome-privacy-ci/1.0"
MIN_STARS = 100
INACTIVE_DAYS = 90
MIN_AGE_DAYS = 120
AI_COMMIT_THRESHOLD = 5
AI_COMMIT_RATIO = 0.2
AI_BOT_AUTHORS = [
"noreply@anthropic.com",
"devin-ai-integration[bot]",
"copilot-swe-agent.github.com",
"noreply@cursor.com",
]
SPAM_PR_THRESHOLD = 5
LINK_MSG = (
"Our automated checks were unable to verify the link(s) you included"
@ -33,7 +37,7 @@ AUTHOR_MSG = (
" have clearly disclosed this in your PR body for transparency"
)
STARS_MSG = (
"It looks like your submission is adding a quite small project."
"It looks like your submission is quite a small project without a lot of users yet."
" In some circumstances we may ask you to resubmit this once the project"
" is more mature and has a proven track record of good practices and maintenance."
)
@ -64,6 +68,10 @@ SECURITY_MSG = (
"This project has open security vulnerabilities (critical or high severity)"
" flagged by GitHub Dependabot. Please verify these have been addressed"
)
SPAM_MSG = (
"This user has opened up a large number of PRs to other awesome-* repos"
" in the past 24 hours, and appears to be spamming"
)
def load_diff(path):
@ -183,6 +191,21 @@ def check_links(diff, head):
return None
def _commit_has_bot(commit, bot_set):
"""Check if a commit was authored or co-authored by a known AI bot."""
author = commit.get("commit", {}).get("author", {})
email = (author.get("email") or "").lower()
name = (author.get("name") or "").lower()
if email in bot_set or name in bot_set:
return True
message = (commit.get("commit", {}).get("message") or "").lower()
for line in message.splitlines():
if line.strip().startswith("co-authored-by:"):
if any(bot in line for bot in bot_set):
return True
return False
def check_ai_commits(owner, repo, token):
"""Return AI_CODE_MSG if recent commits contain significant AI bot activity."""
try:
@ -195,15 +218,12 @@ def check_ai_commits(owner, repo, token):
)
if resp.status_code != 200:
return None
commits = resp.json()
if not commits:
return None
bot_set = {a.lower() for a in AI_BOT_AUTHORS}
count = 0
for commit in resp.json():
author = commit.get("commit", {}).get("author", {})
email = (author.get("email") or "").lower()
name = (author.get("name") or "").lower()
if email in bot_set or name in bot_set:
count += 1
if count >= AI_COMMIT_THRESHOLD:
count = sum(1 for c in commits if _commit_has_bot(c, bot_set))
if count / len(commits) >= AI_COMMIT_RATIO:
return AI_CODE_MSG
except Exception:
pass
@ -228,7 +248,51 @@ def check_security_alerts(owner, repo, token):
return None
def check_repo_signals(diff, pr_user, token):
def check_spam_prs(pr_user, token):
"""Return SPAM_MSG if the user has opened many PRs to other awesome-* repos recently."""
if not pr_user or not token:
return None
try:
since = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y-%m-%d")
headers = {"Accept": "application/vnd.github.v3+json", "User-Agent": USER_AGENT}
headers["Authorization"] = f"token {token}"
resp = requests.get(
"https://api.github.com/search/issues",
headers=headers, timeout=TIMEOUT,
params={"q": f"type:pr author:{pr_user} created:>={since}"},
)
if resp.status_code != 200:
return None
items = resp.json().get("items", [])
this_repo = os.environ.get("GITHUB_REPOSITORY", "Lissy93/awesome-privacy").lower()
count = 0
for item in items:
repo_url = item.get("repository_url", "")
# repository_url looks like https://api.github.com/repos/owner/repo-name
repo_full = "/".join(repo_url.rstrip("/").split("/")[-2:]).lower()
repo_name = repo_url.rstrip("/").split("/")[-1].lower()
if repo_name.startswith("awesome-") and repo_full != this_repo:
count += 1
if count >= SPAM_PR_THRESHOLD:
return SPAM_MSG
except Exception:
pass
return None
_DISCLOSURE_RE = re.compile(
r"i am the author|i'm the author|my project|i created|i develop"
r"|i maintain|i built|my own project|i made|author of|maintainer of",
re.IGNORECASE,
)
def _pr_discloses_authorship(pr_body):
"""Return True if the PR body already discloses the submitter is the author."""
return bool(pr_body and _DISCLOSURE_RE.search(pr_body))
def check_repo_signals(diff, pr_user, token, pr_body=""):
"""Check GitHub repo author match, stars, and activity for added services."""
findings = []
if not token:
@ -252,6 +316,7 @@ def check_repo_signals(diff, pr_user, token):
and repo_owner.get("type") == "User"
and repo_owner.get("login", "").lower() == pr_user.lower()
and AUTHOR_MSG not in findings
and not _pr_discloses_authorship(pr_body)
):
findings.append(AUTHOR_MSG)
@ -316,8 +381,13 @@ def main():
findings.append(finding)
pr_user = os.environ.get("PR_USER", "")
pr_body = os.environ.get("PR_BODY", "")
token = os.environ.get("GITHUB_TOKEN", "")
findings.extend(check_repo_signals(diff, pr_user, token))
findings.extend(check_repo_signals(diff, pr_user, token, pr_body))
finding = check_spam_prs(pr_user, token)
if finding:
findings.append(finding)
except Exception:
pass