diff --git a/src/api/pushshift/index.js b/src/api/pushshift/index.js index b6856b0..12d5399 100644 --- a/src/api/pushshift/index.js +++ b/src/api/pushshift/index.js @@ -7,54 +7,46 @@ const commentURL = `https://api.pushshift.io/reddit/comment/search/?size=${chunk const sleep = ms => new Promise(slept => setTimeout(slept, ms)) -export const getPost = threadID => - fetchJson(`${postURL}${threadID}`) - .then(({ data }) => data[0]) - .catch(error => { - console.error('pushshift.getPost: ' + error) - throw new Error('Could not get removed post') - }) +export const getPost = async threadID => { + try { + return (await fetchJson(`${postURL}${threadID}`)).data[0] + } catch (error) { + console.error('pushshift.getPost: ' + error) + throw new Error('Could not get removed post') + } +} -// Helper function that fetches a list of comments using a binary backoff, -// and also returns the next delay which should be passed back in -const fetchComments = (threadID, after, delay) => - fetchJson(`${commentURL}${threadID}&after=${after}`) - .then(({ data }) => - [ data.map(comment => ({ - ...comment, - parent_id: comment.parent_id.substring(3) || threadID, - link_id: comment.link_id.substring(3) || threadID - })), - delay - ] - ) - .catch(error => { - if (delay > 8000) { - console.error('pushshift.fetchComments: ' + error) - throw new Error('Could not get removed comments'); +export const getComments = async (threadID, maxComments) => { + let chunks = Math.ceil(maxComments / chunkSize) + let after = 0, delay = 0, comments + const allComments = new Map() + while (true) { + + while (true) { + try { + comments = (await fetchJson(`${commentURL}${threadID}&after=${after}`)).data + break + } catch (error) { + if (delay > 4000) { + console.error('pushshift.getComments: ' + error) + throw new Error('Could not get removed comments') + } + delay = delay * 2 || 500 } - return sleep(delay) - .then(() => fetchComments(threadID, after, delay * 2)) - }) + await sleep(delay) + } -const doGetComments = (threadID, chunks = 10, after = 0, delay = 500) => - fetchComments(threadID, after, delay) - .then(([comments, newDelay]) => { - if (comments.length < chunkSize/2 || chunks <= 1) - return comments; - const newAfter = Math.max(comments[comments.length - 1].created_utc - 1, after + 1); - return (newDelay > 500 ? sleep(newDelay / 2) : Promise.resolve()) - .then(() => doGetComments(threadID, chunks - 1, newAfter, newDelay)) - .then(remainingComments => { - const seenIDs = new Set(comments.map(c => c.id)); - for (var i = 0; i < remainingComments.length; i++) { - if ( ! seenIDs.has(remainingComments[i].id) ) - break - } - comments.push(...remainingComments.slice(i)); - return comments; - }) - }) - -export const getComments = (threadID, maxComments) => - doGetComments(threadID, Math.ceil(maxComments / chunkSize)) + comments.forEach(c => allComments.set(c.id, { + ...c, + parent_id: c.parent_id?.substring(3) || threadID, + link_id: c.link_id?.substring(3) || threadID + })) + if (comments.length < chunkSize/2 || chunks <= 1) + break + chunks -= 1 + after = Math.max(comments[comments.length - 1].created_utc - 1, after + 1) + if (delay) + await sleep(delay) + } + return allComments +} diff --git a/src/pages/thread/index.js b/src/pages/thread/index.js index b1ff21a..0f5bf39 100644 --- a/src/pages/thread/index.js +++ b/src/pages/thread/index.js @@ -89,14 +89,13 @@ class Thread extends React.Component { // Get comment ids from pushshift getPushshiftComments(threadID, this.props.global.state.maxComments) - .then(pushshiftComments => { - console.log(`Pushshift: ${pushshiftComments.length} comments`) - const pushshiftCommentLookup = new Map(pushshiftComments.map(c => [c.id, c])) + .then(pushshiftCommentLookup => { + console.log(`Pushshift: ${pushshiftCommentLookup.size} comments`) const ids = [] const missingIds = new Set() // Extract ids from pushshift response - pushshiftComments.forEach(comment => { + pushshiftCommentLookup.forEach(comment => { ids.push(comment.id) if (comment.parent_id != threadID && !pushshiftCommentLookup.has(comment.parent_id) && @@ -105,7 +104,6 @@ class Thread extends React.Component { missingIds.add(comment.parent_id) } }); - pushshiftComments = undefined missingIds.clear() // Get all the comments from reddit