mirror of
https://github.com/Lissy93/awesome-privacy.git
synced 2026-03-11 08:55:33 +00:00
Implements the anti-spam check
This commit is contained in:
parent
f1b012efb2
commit
f564947c20
3 changed files with 118 additions and 13 deletions
2
.github/workflows/pr-check.yml
vendored
2
.github/workflows/pr-check.yml
vendored
|
|
@ -32,6 +32,7 @@ jobs:
|
|||
PR_TITLE: ${{ github.event.pull_request.title }}
|
||||
PR_BODY: ${{ github.event.pull_request.body }}
|
||||
PR_DRAFT: ${{ github.event.pull_request.draft }}
|
||||
BASE_REF: ${{ github.event.pull_request.base.sha }}
|
||||
README_FAILED: ${{ steps.readme.outcome == 'failure' && 'true' || 'false' }}
|
||||
run: python lib/checks/check-pr-meta.py
|
||||
- name: Upload findings
|
||||
|
|
@ -117,6 +118,7 @@ jobs:
|
|||
- name: Check project health
|
||||
env:
|
||||
PR_USER: ${{ github.event.pull_request.user.login }}
|
||||
PR_BODY: ${{ github.event.pull_request.body }}
|
||||
GITHUB_TOKEN: ${{ github.token }}
|
||||
run: python lib/checks/check-project.py
|
||||
- name: Upload findings
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
FINDINGS_PATH = "/tmp/findings-compliance.json"
|
||||
|
|
@ -32,6 +33,15 @@ README_MSG = (
|
|||
" content in `awesome-privacy.yml`, and so your changes will be overridden!"
|
||||
" Instead, only modify the YAML file, and be sure to follow our Contributing Guidelines."
|
||||
)
|
||||
BOT_MSG = (
|
||||
"Submissions are only accepted from humans."
|
||||
" This PR appears to have been authored by a bot or AI assistant."
|
||||
)
|
||||
|
||||
_BOT_AUTHOR_RE = re.compile(
|
||||
r"(?:noreply@anthropic\.com|devin-ai-integration|copilot-swe-agent|noreply@cursor\.com)",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def extract_section(body, header):
|
||||
|
|
@ -83,6 +93,24 @@ def check_checkboxes(body):
|
|||
return None
|
||||
|
||||
|
||||
def check_bot_coauthors(base_ref):
    """Return a finding if any commit in the PR has a bot author or co-author.

    Runs ``git log <base_ref>..HEAD`` and inspects, for every commit, the
    author identity line and any ``Co-authored-by:`` line of the message.
    Other message text is ignored, so a commit that merely mentions a bot
    identifier in its description is not flagged.

    Args:
        base_ref: Commit-ish marking the base of the PR. Falsy values skip
            the check.

    Returns:
        BOT_MSG if a bot identity is found, otherwise None. None is also
        returned when git cannot be run or exits with a non-zero status
        (the check is best-effort).
    """
    if not base_ref:
        return None
    try:
        result = subprocess.run(
            # %x00 marks the start of each commit, so the author line can be
            # told apart from ordinary message text.
            ["git", "log", f"{base_ref}..HEAD", "--format=%x00%aN <%aE>%n%B"],
            capture_output=True, text=True, timeout=30,
            # Commit messages are arbitrary bytes; never fail on decoding.
            encoding="utf-8", errors="replace",
        )
    except (OSError, subprocess.SubprocessError) as err:
        # git missing or timed out: report it, but do not fail the check.
        print(f"check_bot_coauthors: could not run git log: {err}", file=sys.stderr)
        return None
    if result.returncode != 0:
        return None
    for entry in result.stdout.split("\x00"):
        lines = entry.splitlines()
        if not lines:
            continue
        # First line is "Name <email>" of the author; the rest is the message.
        identities = [lines[0]]
        identities.extend(
            line for line in lines[1:]
            if line.strip().lower().startswith("co-authored-by:")
        )
        if any(_BOT_AUTHOR_RE.search(identity) for identity in identities):
            return BOT_MSG
    return None
|
||||
|
||||
|
||||
def check_readme(readme_failed):
|
||||
"""Return a finding if the README check reported a failure."""
|
||||
if readme_failed == "true":
|
||||
|
|
@ -104,6 +132,11 @@ def main():
|
|||
body = os.environ.get("PR_BODY", "")
|
||||
draft = os.environ.get("PR_DRAFT", "false")
|
||||
readme_failed = os.environ.get("README_FAILED", "false")
|
||||
base_ref = os.environ.get("BASE_REF", "")
|
||||
|
||||
finding = check_bot_coauthors(base_ref)
|
||||
if finding:
|
||||
findings.append(finding)
|
||||
|
||||
finding = check_title(title)
|
||||
if finding:
|
||||
|
|
|
|||
|
|
@ -2,8 +2,9 @@
|
|||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
import requests
|
||||
import yaml
|
||||
|
|
@ -18,11 +19,14 @@ USER_AGENT = "awesome-privacy-ci/1.0"
|
|||
MIN_STARS = 100
|
||||
INACTIVE_DAYS = 90
|
||||
MIN_AGE_DAYS = 120
|
||||
AI_COMMIT_THRESHOLD = 5
|
||||
AI_COMMIT_RATIO = 0.2
|
||||
AI_BOT_AUTHORS = [
|
||||
"noreply@anthropic.com",
|
||||
"devin-ai-integration[bot]",
|
||||
"copilot-swe-agent.github.com",
|
||||
"noreply@cursor.com",
|
||||
]
|
||||
SPAM_PR_THRESHOLD = 5
|
||||
|
||||
LINK_MSG = (
|
||||
"Our automated checks were unable to verify the link(s) you included"
|
||||
|
|
@ -33,7 +37,7 @@ AUTHOR_MSG = (
|
|||
" have clearly disclosed this in your PR body for transparency"
|
||||
)
|
||||
STARS_MSG = (
|
||||
"It looks like your submission is adding a quite small project."
|
||||
"It looks like your submission is quite a small project without a lot of users yet."
|
||||
" In some circumstances we may ask you to resubmit this once the project"
|
||||
" is more mature and has a proven track record of good practices and maintenance."
|
||||
)
|
||||
|
|
@ -64,6 +68,10 @@ SECURITY_MSG = (
|
|||
"This project has open security vulnerabilities (critical or high severity)"
|
||||
" flagged by GitHub Dependabot. Please verify these have been addressed"
|
||||
)
|
||||
SPAM_MSG = (
|
||||
"This user has opened up a large number of PRs to other awesome-* repos"
|
||||
" in the past 24 hours, and appears to be spamming"
|
||||
)
|
||||
|
||||
|
||||
def load_diff(path):
|
||||
|
|
@ -183,6 +191,21 @@ def check_links(diff, head):
|
|||
return None
|
||||
|
||||
|
||||
def _commit_has_bot(commit, bot_set):
|
||||
"""Check if a commit was authored or co-authored by a known AI bot."""
|
||||
author = commit.get("commit", {}).get("author", {})
|
||||
email = (author.get("email") or "").lower()
|
||||
name = (author.get("name") or "").lower()
|
||||
if email in bot_set or name in bot_set:
|
||||
return True
|
||||
message = (commit.get("commit", {}).get("message") or "").lower()
|
||||
for line in message.splitlines():
|
||||
if line.strip().startswith("co-authored-by:"):
|
||||
if any(bot in line for bot in bot_set):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def check_ai_commits(owner, repo, token):
|
||||
"""Return AI_CODE_MSG if recent commits contain significant AI bot activity."""
|
||||
try:
|
||||
|
|
@ -195,15 +218,12 @@ def check_ai_commits(owner, repo, token):
|
|||
)
|
||||
if resp.status_code != 200:
|
||||
return None
|
||||
commits = resp.json()
|
||||
if not commits:
|
||||
return None
|
||||
bot_set = {a.lower() for a in AI_BOT_AUTHORS}
|
||||
count = 0
|
||||
for commit in resp.json():
|
||||
author = commit.get("commit", {}).get("author", {})
|
||||
email = (author.get("email") or "").lower()
|
||||
name = (author.get("name") or "").lower()
|
||||
if email in bot_set or name in bot_set:
|
||||
count += 1
|
||||
if count >= AI_COMMIT_THRESHOLD:
|
||||
count = sum(1 for c in commits if _commit_has_bot(c, bot_set))
|
||||
if count / len(commits) >= AI_COMMIT_RATIO:
|
||||
return AI_CODE_MSG
|
||||
except Exception:
|
||||
pass
|
||||
|
|
@ -228,7 +248,51 @@ def check_security_alerts(owner, repo, token):
|
|||
return None
|
||||
|
||||
|
||||
def check_repo_signals(diff, pr_user, token):
|
||||
def check_spam_prs(pr_user, token):
    """Return SPAM_MSG if the user has opened many PRs to other awesome-* repos recently.

    Queries the GitHub issue search API for pull requests authored by
    ``pr_user`` and created within the last 24 hours, then counts those whose
    repository name starts with ``awesome-``. The repository named by
    ``GITHUB_REPOSITORY`` is excluded from the count.

    Args:
        pr_user: Login of the PR author. Falsy values skip the check.
        token: GitHub API token. Falsy values skip the check.

    Returns:
        SPAM_MSG when the count reaches SPAM_PR_THRESHOLD, otherwise None.
        None is also returned on any request or parsing failure (the check
        is best-effort and must not break the calling job).
    """
    if not pr_user or not token:
        return None
    try:
        # Use a full timestamp: a date-only value would widen the window to
        # the start of the previous day (up to ~48 hours).
        cutoff = datetime.now(timezone.utc) - timedelta(days=1)
        since = cutoff.strftime("%Y-%m-%dT%H:%M:%SZ")
        headers = {
            "Accept": "application/vnd.github.v3+json",
            "User-Agent": USER_AGENT,
            "Authorization": f"token {token}",
        }
        resp = requests.get(
            "https://api.github.com/search/issues",
            headers=headers, timeout=TIMEOUT,
            params={
                "q": f"type:pr author:{pr_user} created:>={since}",
                # The default page size is 30; ask for the maximum so PRs to
                # awesome-* repos are not hidden behind unrelated ones.
                "per_page": 100,
            },
        )
        if resp.status_code != 200:
            return None
        this_repo = os.environ.get("GITHUB_REPOSITORY", "Lissy93/awesome-privacy").lower()
        count = 0
        for item in resp.json().get("items", []):
            # repository_url looks like https://api.github.com/repos/owner/repo-name
            parts = (item.get("repository_url") or "").rstrip("/").lower().split("/")
            repo_name = parts[-1]
            repo_full = "/".join(parts[-2:])
            if repo_name.startswith("awesome-") and repo_full != this_repo:
                count += 1
        if count >= SPAM_PR_THRESHOLD:
            return SPAM_MSG
    except Exception as err:
        # Deliberately broad: the check is best-effort. Report the failure
        # instead of hiding it, but never raise.
        print(f"check_spam_prs: skipped ({type(err).__name__}: {err})", file=sys.stderr)
    return None
|
||||
|
||||
|
||||
_DISCLOSURE_RE = re.compile(
|
||||
r"i am the author|i'm the author|my project|i created|i develop"
|
||||
r"|i maintain|i built|my own project|i made|author of|maintainer of",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def _pr_discloses_authorship(pr_body):
|
||||
"""Return True if the PR body already discloses the submitter is the author."""
|
||||
return bool(pr_body and _DISCLOSURE_RE.search(pr_body))
|
||||
|
||||
|
||||
def check_repo_signals(diff, pr_user, token, pr_body=""):
|
||||
"""Check GitHub repo author match, stars, and activity for added services."""
|
||||
findings = []
|
||||
if not token:
|
||||
|
|
@ -252,6 +316,7 @@ def check_repo_signals(diff, pr_user, token):
|
|||
and repo_owner.get("type") == "User"
|
||||
and repo_owner.get("login", "").lower() == pr_user.lower()
|
||||
and AUTHOR_MSG not in findings
|
||||
and not _pr_discloses_authorship(pr_body)
|
||||
):
|
||||
findings.append(AUTHOR_MSG)
|
||||
|
||||
|
|
@ -316,8 +381,13 @@ def main():
|
|||
findings.append(finding)
|
||||
|
||||
pr_user = os.environ.get("PR_USER", "")
|
||||
pr_body = os.environ.get("PR_BODY", "")
|
||||
token = os.environ.get("GITHUB_TOKEN", "")
|
||||
findings.extend(check_repo_signals(diff, pr_user, token))
|
||||
findings.extend(check_repo_signals(diff, pr_user, token, pr_body))
|
||||
|
||||
finding = check_spam_prs(pr_user, token)
|
||||
if finding:
|
||||
findings.append(finding)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue