mirror of
https://github.com/gurnec/removeddit.git
synced 2026-03-11 08:54:27 +00:00
Rewrite api/pushshift/index.js using async/await
* much simpler, got rid of recursion * no more Set temporaries, less copying of arrays * now returns a Map (thread/index.js need not create it)
This commit is contained in:
parent
c6585ab749
commit
c98f71bd4d
2 changed files with 43 additions and 53 deletions
|
|
@ -7,54 +7,46 @@ const commentURL = `https://api.pushshift.io/reddit/comment/search/?size=${chunk
|
|||
// Returns a promise that settles (with no value) once `ms` milliseconds have
// elapsed, so promise-based callers can pause between requests.
const sleep = ms =>
  new Promise(resolve => {
    setTimeout(resolve, ms)
  })
|
||||
|
||||
// Requests `${postURL}${threadID}` and resolves with the first entry of the
// response's `data` array. Any failure (in the request or while reading the
// response) is logged and replaced by a generic Error.
export const getPost = threadID => {
  const firstPost = fetchJson(`${postURL}${threadID}`)
    .then(response => response.data[0])

  return firstPost.catch(reason => {
    console.error('pushshift.getPost: ' + reason)
    throw new Error('Could not get removed post')
  })
}
|
||||
// Requests `${postURL}${threadID}` and resolves with the first entry of the
// response's `data` array (undefined when that array is empty).
//
// Rejects with Error('Could not get removed post') when the request fails or
// the response cannot be read; the underlying error is logged and attached
// to the thrown Error as `cause`.
export const getPost = async threadID => {
  try {
    const { data } = await fetchJson(`${postURL}${threadID}`)
    return data[0]
  } catch (error) {
    console.error('pushshift.getPost: ' + error)
    // Keep the original failure reachable by callers and debuggers instead of
    // reducing it to a log line (engines without `cause` ignore the option).
    throw new Error('Could not get removed post', { cause: error })
  }
}
|
||||
|
||||
// Helper function that fetches a list of comments using a binary backoff,
|
||||
// and also returns the next delay which should be passed back in
|
||||
const fetchComments = (threadID, after, delay) =>
|
||||
fetchJson(`${commentURL}${threadID}&after=${after}`)
|
||||
.then(({ data }) =>
|
||||
[ data.map(comment => ({
|
||||
...comment,
|
||||
parent_id: comment.parent_id.substring(3) || threadID,
|
||||
link_id: comment.link_id.substring(3) || threadID
|
||||
})),
|
||||
delay
|
||||
]
|
||||
)
|
||||
.catch(error => {
|
||||
if (delay > 8000) {
|
||||
console.error('pushshift.fetchComments: ' + error)
|
||||
throw new Error('Could not get removed comments');
|
||||
export const getComments = async (threadID, maxComments) => {
|
||||
let chunks = Math.ceil(maxComments / chunkSize)
|
||||
let after = 0, delay = 0, comments
|
||||
const allComments = new Map()
|
||||
while (true) {
|
||||
|
||||
while (true) {
|
||||
try {
|
||||
comments = (await fetchJson(`${commentURL}${threadID}&after=${after}`)).data
|
||||
break
|
||||
} catch (error) {
|
||||
if (delay > 4000) {
|
||||
console.error('pushshift.getComments: ' + error)
|
||||
throw new Error('Could not get removed comments')
|
||||
}
|
||||
delay = delay * 2 || 500
|
||||
}
|
||||
return sleep(delay)
|
||||
.then(() => fetchComments(threadID, after, delay * 2))
|
||||
})
|
||||
await sleep(delay)
|
||||
}
|
||||
|
||||
// Pages through a thread's comments recursively, one chunk per call.
// Resolves with a single array: this call's chunk, extended in place with the
// comments from later chunks that follow the leading run it already contains.
const doGetComments = (threadID, chunks = 10, after = 0, delay = 500) =>
  fetchComments(threadID, after, delay).then(([comments, newDelay]) => {
    // Stop when a chunk comes back less than half full or the chunk budget is spent
    const isLastChunk = comments.length < chunkSize/2 || chunks <= 1
    if (isLastChunk) {
      return comments
    }

    // Resume just before the newest created_utc seen, but always move forward
    const newestComment = comments[comments.length - 1]
    const newAfter = Math.max(newestComment.created_utc - 1, after + 1)

    // Only pause when the delay handed back is above the 500 default
    const pause = newDelay > 500 ? sleep(newDelay / 2) : Promise.resolve()

    return pause
      .then(() => doGetComments(threadID, chunks - 1, newAfter, newDelay))
      .then(remainingComments => {
        // Drop the leading comments of the next chunk that this chunk already has
        const seenIDs = new Set(comments.map(c => c.id))
        const firstUnseen = remainingComments.findIndex(c => !seenIDs.has(c.id))
        if (firstUnseen !== -1) {
          comments.push(...remainingComments.slice(firstUnseen))
        }
        return comments
      })
  })
|
||||
|
||||
// Public entry point: converts the comment budget into a number of chunks
// (rounded up) and delegates the paging to doGetComments.
export const getComments = (threadID, maxComments) => {
  const chunkCount = Math.ceil(maxComments / chunkSize)
  return doGetComments(threadID, chunkCount)
}
|
||||
comments.forEach(c => allComments.set(c.id, {
|
||||
...c,
|
||||
parent_id: c.parent_id?.substring(3) || threadID,
|
||||
link_id: c.link_id?.substring(3) || threadID
|
||||
}))
|
||||
if (comments.length < chunkSize/2 || chunks <= 1)
|
||||
break
|
||||
chunks -= 1
|
||||
after = Math.max(comments[comments.length - 1].created_utc - 1, after + 1)
|
||||
if (delay)
|
||||
await sleep(delay)
|
||||
}
|
||||
return allComments
|
||||
}
|
||||
|
|
|
|||
|
|
@ -89,14 +89,13 @@ class Thread extends React.Component {
|
|||
|
||||
// Get comment ids from pushshift
|
||||
getPushshiftComments(threadID, this.props.global.state.maxComments)
|
||||
.then(pushshiftComments => {
|
||||
console.log(`Pushshift: ${pushshiftComments.length} comments`)
|
||||
const pushshiftCommentLookup = new Map(pushshiftComments.map(c => [c.id, c]))
|
||||
.then(pushshiftCommentLookup => {
|
||||
console.log(`Pushshift: ${pushshiftCommentLookup.size} comments`)
|
||||
const ids = []
|
||||
const missingIds = new Set()
|
||||
|
||||
// Extract ids from pushshift response
|
||||
pushshiftComments.forEach(comment => {
|
||||
pushshiftCommentLookup.forEach(comment => {
|
||||
ids.push(comment.id)
|
||||
if (comment.parent_id != threadID &&
|
||||
!pushshiftCommentLookup.has(comment.parent_id) &&
|
||||
|
|
@ -105,7 +104,6 @@ class Thread extends React.Component {
|
|||
missingIds.add(comment.parent_id)
|
||||
}
|
||||
});
|
||||
pushshiftComments = undefined
|
||||
missingIds.clear()
|
||||
|
||||
// Get all the comments from reddit
|
||||
|
|
|
|||
Loading…
Reference in a new issue