Rewrite api/pushshift/index.js using async/await

* much simpler, got rid of recursion
* no more Set temporaries, less copying of arrays
* now returns a Map (thread/index.js need not create it)
This commit is contained in:
Christopher Gurnee 2022-02-16 20:32:24 -05:00
parent c6585ab749
commit c98f71bd4d
2 changed files with 43 additions and 53 deletions

View file

@ -7,54 +7,46 @@ const commentURL = `https://api.pushshift.io/reddit/comment/search/?size=${chunk
// Returns a promise that resolves (with no value) once `ms` milliseconds have elapsed
const sleep = ms => new Promise(resolve => {
  setTimeout(resolve, ms)
})
// Requests the post identified by threadID and resolves to the first entry of
// the response's `data` array. Any failure is logged and replaced by a generic error.
export const getPost = threadID => {
  const pickFirstPost = response => response.data[0]
  const reportFailure = error => {
    console.error('pushshift.getPost: ' + error)
    throw new Error('Could not get removed post')
  }
  return fetchJson(`${postURL}${threadID}`)
    .then(pickFirstPost)
    .catch(reportFailure)
}
/**
 * Fetches a single post by its thread ID.
 *
 * @param {string} threadID - ID appended to the post search URL.
 * @returns {Promise<Object|undefined>} The first element of the response's
 *   `data` array (undefined when that array is empty).
 * @throws {Error} 'Could not get removed post' when the request or the
 *   response access fails; the underlying error is attached as `cause`.
 */
export const getPost = async threadID => {
  try {
    const { data } = await fetchJson(`${postURL}${threadID}`)
    return data[0]
  } catch (error) {
    console.error('pushshift.getPost: ' + error)
    // Keep the original failure reachable through error.cause instead of discarding it
    throw new Error('Could not get removed post', { cause: error })
  }
}
// Helper function that fetches a list of comments using a binary backoff,
// and also returns the next delay which should be passed back in
const fetchComments = (threadID, after, delay) =>
fetchJson(`${commentURL}${threadID}&after=${after}`)
.then(({ data }) =>
[ data.map(comment => ({
...comment,
parent_id: comment.parent_id.substring(3) || threadID,
link_id: comment.link_id.substring(3) || threadID
})),
delay
]
)
.catch(error => {
if (delay > 8000) {
console.error('pushshift.fetchComments: ' + error)
throw new Error('Could not get removed comments');
export const getComments = async (threadID, maxComments) => {
let chunks = Math.ceil(maxComments / chunkSize)
let after = 0, delay = 0, comments
const allComments = new Map()
while (true) {
while (true) {
try {
comments = (await fetchJson(`${commentURL}${threadID}&after=${after}`)).data
break
} catch (error) {
if (delay > 4000) {
console.error('pushshift.getComments: ' + error)
throw new Error('Could not get removed comments')
}
delay = delay * 2 || 500
}
return sleep(delay)
.then(() => fetchComments(threadID, after, delay * 2))
})
await sleep(delay)
}
// Fetches comments chunk by chunk, recursing until a chunk comes back less than
// half full or the chunk budget is used up. Consecutive chunks may overlap
// (the next request starts one second before the last comment seen), so the
// leading already-seen comments of each later chunk are dropped before merging.
const doGetComments = (threadID, chunks = 10, after = 0, delay = 500) => {
  // Appends to `head` the part of `tail` beginning at its first unseen comment ID
  const appendUnseen = (head, tail) => {
    const knownIDs = new Set(head.map(({ id }) => id))
    const firstNew = tail.findIndex(({ id }) => !knownIDs.has(id))
    const start = firstNew === -1 ? tail.length : firstNew
    head.push(...tail.slice(start))
    return head
  }

  return fetchComments(threadID, after, delay).then(([batch, nextDelay]) => {
    const isLastBatch = batch.length < chunkSize / 2 || chunks <= 1
    if (isLastBatch) {
      return batch
    }
    // Always advance by at least one second so the recursion makes progress
    const nextAfter = Math.max(batch[batch.length - 1].created_utc - 1, after + 1)
    // Only pause between requests once the backoff delay has grown past its initial value
    const pause = nextDelay > 500 ? sleep(nextDelay / 2) : Promise.resolve()
    return pause
      .then(() => doGetComments(threadID, chunks - 1, nextAfter, nextDelay))
      .then(rest => appendUnseen(batch, rest))
  })
}
// Converts the requested comment limit into a number of chunks (rounded up)
// and delegates the actual fetching to doGetComments.
export const getComments = (threadID, maxComments) => {
  const chunkCount = Math.ceil(maxComments / chunkSize)
  return doGetComments(threadID, chunkCount)
}
comments.forEach(c => allComments.set(c.id, {
...c,
parent_id: c.parent_id?.substring(3) || threadID,
link_id: c.link_id?.substring(3) || threadID
}))
if (comments.length < chunkSize/2 || chunks <= 1)
break
chunks -= 1
after = Math.max(comments[comments.length - 1].created_utc - 1, after + 1)
if (delay)
await sleep(delay)
}
return allComments
}

View file

@ -89,14 +89,13 @@ class Thread extends React.Component {
// Get comment ids from pushshift
getPushshiftComments(threadID, this.props.global.state.maxComments)
.then(pushshiftComments => {
console.log(`Pushshift: ${pushshiftComments.length} comments`)
const pushshiftCommentLookup = new Map(pushshiftComments.map(c => [c.id, c]))
.then(pushshiftCommentLookup => {
console.log(`Pushshift: ${pushshiftCommentLookup.size} comments`)
const ids = []
const missingIds = new Set()
// Extract ids from pushshift response
pushshiftComments.forEach(comment => {
pushshiftCommentLookup.forEach(comment => {
ids.push(comment.id)
if (comment.parent_id != threadID &&
!pushshiftCommentLookup.has(comment.parent_id) &&
@ -105,7 +104,6 @@ class Thread extends React.Component {
missingIds.add(comment.parent_id)
}
});
pushshiftComments = undefined
missingIds.clear()
// Get all the comments from reddit