Initial updates for the new Pushshift API

This should make things mostly work with the new API; however, there's
a lot more to do, and the API is (probably) still in flux.
This commit is contained in:
Christopher Gurnee 2022-12-13 22:43:05 +00:00
parent d45daec442
commit cb316a541e
2 changed files with 19 additions and 19 deletions

View file

@ -1,10 +1,10 @@
import { fetchJson, sleep } from '../../utils'
export const chunkSize = 250;
const postURL = 'https://api.pushshift.io/reddit/submission/search/?fields=author,created_utc,domain,edited,id,link_flair_text,num_comments,permalink,position,removed_by_category,retrieved_on,retrieved_utc,score,selftext,subreddit,thumbnail,thumbnail_height,thumbnail_width,title,url&ids='
const commentURL = 'https://api.pushshift.io/reddit/comment/search/?fields=author,body,created_utc,id,link_id,parent_id,retrieved_on,retrieved_utc,score,subreddit&'
export const chunkSize = 1000;
const postURL = 'https://api.pushshift.io/reddit/search/submission?filter=author,created_utc,domain,edited,id,link_flair_text,num_comments,permalink,position,removed_by_category,retrieved_on,retrieved_utc,score,selftext,subreddit,thumbnail,thumbnail_height,thumbnail_width,title,url&ids='
const commentURL = 'https://api.pushshift.io/reddit/search/comment?filter=author,body,created_utc,id,link_id,parent_id,retrieved_on,retrieved_utc,score,subreddit&'
const commentURLbyIDs = `${commentURL}ids=`
const commentURLbyLink = `${commentURL}metadata=true&size=${chunkSize}&sort=asc&link_id=`
const commentURLbyLink = `${commentURL}limit=${chunkSize}&sort=id&order=asc&link_id=`
const errorHandler = (msg, origError, from) => {
console.error(from + ': ' + origError)
@ -62,12 +62,12 @@ class TokenBucket {
}
}
const pushshiftTokenBucket = new TokenBucket(2015, 7)
const pushshiftTokenBucket = new TokenBucket(515, 7)
export const getPost = async threadID => {
await pushshiftTokenBucket.waitForToken()
try {
return (await fetchJson(`${postURL}${threadID}`)).data[0]
return (await fetchJson(`${postURL}${parseInt(threadID, 36)}`)).data[0]
} catch (error) {
errorHandler('Could not get removed/edited post', error, 'pushshift.getPost')
}
@ -91,7 +91,7 @@ export const getCommentsFromIds = async commentIDs => {
while (true) {
await pushshiftTokenBucket.waitForToken()
try {
response = await fetchJson(`${commentURLbyIDs}${commentIDs.join()}`)
response = await fetchJson(`${commentURLbyIDs}${commentIDs.map(id => parseInt(id, 36)).join()}`)
break
} catch (error) {
if (delay >= 2000) // after ~4s of consecutive failures
@ -120,17 +120,16 @@ export const getCommentsFromIds = async commentIDs => {
// The callback() function is called with an Array of comments after each chunk is
// retrieved. It should return as quickly as possible (scheduling time-taking work
// later), and may return false to cause getComments to exit early, or true otherwise.
export const getComments = async (callback, threadID, maxComments, after = 0, before = undefined) => {
export const getComments = async (callback, threadID, maxComments, after = -1, before = undefined) => {
let chunks = Math.floor(maxComments / chunkSize), response, lastCreatedUtc = 1
while (true) {
let query = commentURLbyLink + threadID
let query = commentURLbyLink + parseInt(threadID, 36)
//if (!inBrokenRange(after))
query += '&q=*'
if (after)
query += `&after=${after}`
// query += '&q=*'
query += `&since=${after + 1}`
if (before)
query += `&before=${before}`
query += `&until=${before}`
let delay = 0
while (true) {
await pushshiftTokenBucket.waitForToken()
@ -150,7 +149,7 @@ export const getComments = async (callback, threadID, maxComments, after = 0, be
const comments = response.data
const exitEarly = !callback(comments.map(c => ({
...c,
parent_id: c.parent_id?.substring(3) || threadID,
parent_id: c.parent_id ? toBase36(c.parent_id) : threadID,
link_id: c.link_id?.substring(3) || threadID
})))
@ -159,9 +158,10 @@ export const getComments = async (callback, threadID, maxComments, after = 0, be
// return getComments(callback, threadID, maxComments, 1503014401, before)
//firstChunk = false
const loadedAllComments = Object.prototype.hasOwnProperty.call(response.metadata, 'total_results') ?
response.metadata.results_returned >= response.metadata.total_results :
comments.length < chunkSize/2
//const loadedAllComments = Object.prototype.hasOwnProperty.call(response.metadata, 'total_results') ?
// response.metadata.results_returned >= response.metadata.total_results :
// comments.length < chunkSize/2
const loadedAllComments = comments.length < chunkSize*3/4
if (comments.length)
lastCreatedUtc = comments[comments.length - 1].created_utc
if (loadedAllComments || chunks <= 1 || exitEarly)

View file

@ -96,8 +96,8 @@ const About = props => {
<p>
Occasionally, Pushshift (the service used by Unddit) goes offline for a while.
This can result in &ldquo;Could not get removed post/comments&rdquo; errors on Unddit.
To check its status, click <a href='https://api.pushshift.io/reddit/comment/search/?size=1&sort=asc&fields=body&q=*&link_id=wdla1b'>this direct link to Pushshift</a>.
You should either get a short message saying that Pushshift is up, or an error.
To check its status, click <a href='https://api.pushshift.io/reddit/search/comment?limit=1&filter=body&since=1659370272&until=1659370273&link_id=1957745423'>this direct link to Pushshift</a>.
You should either get a message saying that &ldquo;Pushshift is UP!&rdquo; within the first few words of the message, or an error.
</p>
</div>
<div id='difference' className={hash == '#difference' ? 'highlighted' : undefined}>