From 08991dfbb0903e66c4a4bdcbb9453ee717bbab83 Mon Sep 17 00:00:00 2001 From: Christopher Gurnee Date: Mon, 9 May 2022 19:55:18 +0000 Subject: [PATCH] Work around Pushshift bug for certain time ranges Comments in the time ranges below need a workaround which slows down Pushshift queries, but it's required to retrieve them correctly * Sep/1/2017 0:00 - Sep/30/2017 23:59:59 UTC * Feb/1/2018 0:00 - Mar/31/2018 23:59:59 UTC --- src/api/pushshift/index.js | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/api/pushshift/index.js b/src/api/pushshift/index.js index 53df5b1..66e0d94 100644 --- a/src/api/pushshift/index.js +++ b/src/api/pushshift/index.js @@ -2,7 +2,7 @@ import { fetchJson, sleep } from '../../utils' export const chunkSize = 100; const postURL = 'https://api.pushshift.io/reddit/submission/search/?fields=author,created_utc,domain,edited,id,link_flair_text,num_comments,permalink,position,removed_by_category,retrieved_on,retrieved_utc,score,selftext,subreddit,thumbnail,thumbnail_height,thumbnail_width,title,url&ids=' -const commentURL = `https://api.pushshift.io/reddit/comment/search/?metadata=true&size=${chunkSize}&sort=asc&fields=author,body,created_utc,id,link_id,parent_id,retrieved_on,retrieved_utc,score,subreddit&q=*&link_id=` +const commentURL = `https://api.pushshift.io/reddit/comment/search/?metadata=true&size=${chunkSize}&sort=asc&fields=author,body,created_utc,id,link_id,parent_id,retrieved_on,retrieved_utc,score,subreddit&link_id=` const errorHandler = (msg, origError, from) => { console.error(from + ': ' + origError) @@ -71,18 +71,33 @@ export const getPost = async threadID => { } } +// Comments w/a created_utc in the ranges below must be queried *without* the faster `q=*` parameter: +// 1504224000 - 1506815999 (Sep/1/2017 0:00 - Sep/30/2017 23:59:59 UTC) +// 1517443200 - 1522540799 (Feb/1/2018 0:00 - Mar/31/2018 23:59:59 UTC) +// The ranges below subtract two weeks from the range start and 
optionally add two weeks to the end. +const inBrokenRange = (utc, looseEnd = false) => looseEnd ? + utc > 1503014400 && utc < 1508025599 || utc > 1516233600 && utc < 1523750399 : + utc > 1503014400 && utc < 1506815999 || utc > 1516233600 && utc < 1522540799 + // The callback() function is called with an Array of comments after each chunk is // retrieved. It should return as quickly as possible (scheduling time-taking work // later), and may return false to cause getComments to exit early, or true otherwise. export const getComments = async (callback, threadID, maxComments, after = 0, before = undefined) => { - let chunks = Math.floor(maxComments / chunkSize), response, lastCreatedUtc = 1 + let chunks = Math.floor(maxComments / chunkSize), firstChunk = true, response, lastCreatedUtc = 1 while (true) { let delay = 0 while (true) { + let query = commentURL + threadID + if (!inBrokenRange(after)) + query += '&q=*' + if (after) + query += `&after=${after}` + if (before) + query += `&before=${before}` await pushshiftTokenBucket.waitForToken() try { - response = await fetchJson(`${commentURL}${threadID}${after ? `&after=${after}` : ''}${before ? `&before=${before}` : ``}`) + response = await fetchJson(query) break } catch (error) { if (delay >= 8000) // after ~16s of consecutive failures @@ -101,7 +116,14 @@ export const getComments = async (callback, threadID, maxComments, after = 0, be link_id: c.link_id?.substring(3) || threadID }))) - const loadedAllComments = response.metadata.results_returned >= response.metadata.total_results + // If there's a chance the comments are in a broken range, restart the retrieval + if (firstChunk && !after && (comments.length === 0 || inBrokenRange(comments[0].created_utc, true))) + return getComments(callback, threadID, maxComments, 1503014401, before) + firstChunk = false + + const loadedAllComments = response.metadata.hasOwnProperty('total_results') ? 
+ response.metadata.results_returned >= response.metadata.total_results : + comments.length < chunkSize/2 if (comments.length) lastCreatedUtc = comments[comments.length - 1].created_utc if (loadedAllComments || chunks <= 1 || exitEarly)