Work around Pushshift bug for certain time ranges

Comments in the time ranges below need a workaround that slows down
Pushshift queries but is required to retrieve them correctly:

 * Sep/1/2017 0:00 - Sep/30/2017 23:59:59 UTC
 * Feb/1/2018 0:00 - Mar/31/2018 23:59:59 UTC
This commit is contained in:
Christopher Gurnee 2022-05-09 19:55:18 +00:00
parent 0ca5630ffe
commit 08991dfbb0

View file

@ -2,7 +2,7 @@ import { fetchJson, sleep } from '../../utils'
export const chunkSize = 100;
const postURL = 'https://api.pushshift.io/reddit/submission/search/?fields=author,created_utc,domain,edited,id,link_flair_text,num_comments,permalink,position,removed_by_category,retrieved_on,retrieved_utc,score,selftext,subreddit,thumbnail,thumbnail_height,thumbnail_width,title,url&ids='
const commentURL = `https://api.pushshift.io/reddit/comment/search/?metadata=true&size=${chunkSize}&sort=asc&fields=author,body,created_utc,id,link_id,parent_id,retrieved_on,retrieved_utc,score,subreddit&q=*&link_id=`
const commentURL = `https://api.pushshift.io/reddit/comment/search/?metadata=true&size=${chunkSize}&sort=asc&fields=author,body,created_utc,id,link_id,parent_id,retrieved_on,retrieved_utc,score,subreddit&link_id=`
const errorHandler = (msg, origError, from) => {
console.error(from + ': ' + origError)
@ -71,18 +71,33 @@ export const getPost = async threadID => {
}
}
// Comments whose created_utc falls in either range below must be queried
// *without* the faster `q=*` parameter:
//   1504224000 - 1506815999 (Sep/1/2017 0:00 - Sep/30/2017 23:59:59 UTC)
//   1517443200 - 1522540799 (Feb/1/2018 0:00 - Mar/31/2018 23:59:59 UTC)
// inBrokenRange() tests a timestamp against those ranges after widening each one
// by two weeks at its start and, only when looseEnd is true, by two weeks at its
// end as well. Both bounds of the widened ranges are exclusive.
const inBrokenRange = (utc, looseEnd = false) => {
  const margin = 14 * 24 * 60 * 60  // two weeks, in seconds
  const brokenRanges = [
    [1504224000, 1506815999],
    [1517443200, 1522540799]
  ]
  // The end of a range is only pushed out when the caller asks for a loose end
  const endMargin = looseEnd ? margin : 0
  return brokenRanges.some(([start, end]) => utc > start - margin && utc < end + endMargin)
}
// The callback() function is called with an Array of comments after each chunk is
// retrieved. It should return as quickly as possible (scheduling time-consuming work
// for later), and may return false to cause getComments to exit early, or true otherwise.
export const getComments = async (callback, threadID, maxComments, after = 0, before = undefined) => {
let chunks = Math.floor(maxComments / chunkSize), response, lastCreatedUtc = 1
let chunks = Math.floor(maxComments / chunkSize), firstChunk = true, response, lastCreatedUtc = 1
while (true) {
let delay = 0
while (true) {
let query = commentURL + threadID
if (!inBrokenRange(after))
query += '&q=*'
if (after)
query += `&after=${after}`
if (before)
query += `&before=${before}`
await pushshiftTokenBucket.waitForToken()
try {
response = await fetchJson(`${commentURL}${threadID}${after ? `&after=${after}` : ''}${before ? `&before=${before}` : ``}`)
response = await fetchJson(query)
break
} catch (error) {
if (delay >= 8000) // after ~16s of consecutive failures
@ -101,7 +116,14 @@ export const getComments = async (callback, threadID, maxComments, after = 0, be
link_id: c.link_id?.substring(3) || threadID
})))
const loadedAllComments = response.metadata.results_returned >= response.metadata.total_results
// If there's a chance the comments are in a broken range, restart the retrieval
if (firstChunk && !after && (comments.length === 0 || inBrokenRange(comments[0].created_utc, true)))
return getComments(callback, threadID, maxComments, 1503014401, before)
firstChunk = false
const loadedAllComments = response.metadata.hasOwnProperty('total_results') ?
response.metadata.results_returned >= response.metadata.total_results :
comments.length < chunkSize/2
if (comments.length)
lastCreatedUtc = comments[comments.length - 1].created_utc
if (loadedAllComments || chunks <= 1 || exitEarly)