mirror of
https://github.com/gurnec/removeddit.git
synced 2026-03-11 08:54:27 +00:00
Work around Pushshift bug for certain time ranges
Comments created in the time ranges below need a workaround. The workaround slows down Pushshift queries, but it is required to retrieve these comments correctly:
* Sep/1/2017 0:00 - Sep/30/2017 23:59:59 UTC
* Feb/1/2018 0:00 - Mar/31/2018 23:59:59 UTC
This commit is contained in:
parent
0ca5630ffe
commit
08991dfbb0
1 changed files with 26 additions and 4 deletions
|
|
@ -2,7 +2,7 @@ import { fetchJson, sleep } from '../../utils'
|
|||
|
||||
export const chunkSize = 100;
|
||||
const postURL = 'https://api.pushshift.io/reddit/submission/search/?fields=author,created_utc,domain,edited,id,link_flair_text,num_comments,permalink,position,removed_by_category,retrieved_on,retrieved_utc,score,selftext,subreddit,thumbnail,thumbnail_height,thumbnail_width,title,url&ids='
|
||||
const commentURL = `https://api.pushshift.io/reddit/comment/search/?metadata=true&size=${chunkSize}&sort=asc&fields=author,body,created_utc,id,link_id,parent_id,retrieved_on,retrieved_utc,score,subreddit&q=*&link_id=`
|
||||
const commentURL = `https://api.pushshift.io/reddit/comment/search/?metadata=true&size=${chunkSize}&sort=asc&fields=author,body,created_utc,id,link_id,parent_id,retrieved_on,retrieved_utc,score,subreddit&link_id=`
|
||||
|
||||
const errorHandler = (msg, origError, from) => {
|
||||
console.error(from + ': ' + origError)
|
||||
|
|
@ -71,18 +71,33 @@ export const getPost = async threadID => {
|
|||
}
|
||||
}
|
||||
|
||||
// Comments w/a created_utc in the ranges below must be queried *without* the
// faster `q=*` parameter:
//   1504224000 - 1506815999 (Sep/1/2017 0:00 - Sep/30/2017 23:59:59 UTC)
//   1517443200 - 1522540799 (Feb/1/2018 0:00 - Mar/31/2018 23:59:59 UTC)
const BROKEN_RANGES = [
  { start: 1504224000, end: 1506815999 },
  { start: 1517443200, end: 1522540799 },
]

// Safety margin (two weeks, in seconds) applied around the ranges above.
const BROKEN_RANGE_MARGIN = 14 * 24 * 60 * 60

/**
 * Tells whether a timestamp falls inside one of the widened broken ranges.
 *
 * Each range is always widened by two weeks at its start. Its end is widened
 * by two weeks only when `looseEnd` is true. Both bounds are exclusive, so for
 * the first range the test is `1503014400 < utc < 1506815999` (strict) or
 * `1503014400 < utc < 1508025599` (loose).
 *
 * getComments() uses the strict form on its `after` value to decide whether
 * `q=*` may be added to a query, and the loose form on the created_utc of the
 * first returned comment to decide whether the retrieval must be restarted.
 *
 * @param {number} utc - Unix timestamp in seconds.
 * @param {boolean} [looseEnd=false] - Also extend each range's end by two weeks.
 * @returns {boolean} true if `utc` lies strictly inside a widened range.
 */
const inBrokenRange = (utc, looseEnd = false) => {
  const endMargin = looseEnd ? BROKEN_RANGE_MARGIN : 0
  return BROKEN_RANGES.some(({ start, end }) =>
    utc > start - BROKEN_RANGE_MARGIN && utc < end + endMargin
  )
}
|
||||
|
||||
// The callback() function is called with an Array of comments after each chunk is
|
||||
// retrieved. It should return as quickly as possible (scheduling time-taking work
|
||||
// later), and may return false to cause getComments to exit early, or true otherwise.
|
||||
export const getComments = async (callback, threadID, maxComments, after = 0, before = undefined) => {
|
||||
let chunks = Math.floor(maxComments / chunkSize), response, lastCreatedUtc = 1
|
||||
let chunks = Math.floor(maxComments / chunkSize), firstChunk = true, response, lastCreatedUtc = 1
|
||||
while (true) {
|
||||
|
||||
let delay = 0
|
||||
while (true) {
|
||||
let query = commentURL + threadID
|
||||
if (!inBrokenRange(after))
|
||||
query += '&q=*'
|
||||
if (after)
|
||||
query += `&after=${after}`
|
||||
if (before)
|
||||
query += `&before=${before}`
|
||||
await pushshiftTokenBucket.waitForToken()
|
||||
try {
|
||||
response = await fetchJson(`${commentURL}${threadID}${after ? `&after=${after}` : ''}${before ? `&before=${before}` : ``}`)
|
||||
response = await fetchJson(query)
|
||||
break
|
||||
} catch (error) {
|
||||
if (delay >= 8000) // after ~16s of consecutive failures
|
||||
|
|
@ -101,7 +116,14 @@ export const getComments = async (callback, threadID, maxComments, after = 0, be
|
|||
link_id: c.link_id?.substring(3) || threadID
|
||||
})))
|
||||
|
||||
const loadedAllComments = response.metadata.results_returned >= response.metadata.total_results
|
||||
// If there's a chance the comments are in a broken range, restart the retrieval
|
||||
if (firstChunk && !after && (comments.length === 0 || inBrokenRange(comments[0].created_utc, true)))
|
||||
return getComments(callback, threadID, maxComments, 1503014401, before)
|
||||
firstChunk = false
|
||||
|
||||
const loadedAllComments = response.metadata.hasOwnProperty('total_results') ?
|
||||
response.metadata.results_returned >= response.metadata.total_results :
|
||||
comments.length < chunkSize/2
|
||||
if (comments.length)
|
||||
lastCreatedUtc = comments[comments.length - 1].created_utc
|
||||
if (loadedAllComments || chunks <= 1 || exitEarly)
|
||||
|
|
|
|||
Loading…
Reference in a new issue