Update the endpoint to api.pushshift.io v4

Also update the Reddit endpoint for posts, and add the capability
to download from Pushshift in multiple batches, with a binary
exponential backoff; it is currently set to download up to 1,000 comments.

Credit (once again) to @ayan4m1 for a couple snippets of code.
This commit is contained in:
Christopher Gurnee 2021-10-15 16:59:57 -04:00
parent b35beaae52
commit c8b0ac8523
3 changed files with 58 additions and 63 deletions

View file

@ -1,57 +1,57 @@
import { toBase10, toBase36 } from '../../utils'
// Number of comments requested per Pushshift call (interpolated into `size=` below)
const chunkSize = 100;
// Pushshift submission search; the thread ID is appended after `ids=`
const postURL = 'https://api.pushshift.io/reddit/submission/search/?ids='
// Pushshift comment search (`sort=asc`, restricted to the listed fields);
// the thread ID is appended after `link_id=`
const commentURL = `https://api.pushshift.io/reddit/comment/search/?size=${chunkSize}&sort=asc&fields=author,body,created_utc,id,link_id,parent_id,score,subreddit&q=*&link_id=`
const postURL = 'https://elastic.pushshift.io/rs/submissions/_search?source='
const commentURL = 'https://elastic.pushshift.io/rc/comments/_search?source='
// Returns a promise that resolves (with no value) once `ms` milliseconds have elapsed
const sleep = ms =>
  new Promise(resolve => {
    setTimeout(resolve, ms)
  })
export const getPost = threadID => {
const elasticQuery = {
query: {
term: {
id: toBase10(threadID)
}
}
}
// Returns `a` when it is strictly greater than `b`, otherwise `b`
const max = (a, b) => {
  if (a > b) {
    return a
  }
  return b
}
return window.fetch(postURL + JSON.stringify(elasticQuery))
export const getPost = threadID =>
window.fetch(`${postURL}${threadID}`)
.then(response => response.json())
.then(response => {
const post = response.hits.hits[0]._source
post.id = toBase36(post.id)
return post
.then(({ data }) => data[0])
.catch(() => {
throw new Error('Could not get removed post')
})
.catch(() => { throw new Error('Could not get removed post') })
}
export const getComments = threadID => {
const elasticQuery = {
query: {
match: {
link_id: toBase10(threadID)
}
},
size: 20000,
_source: [
'author', 'body', 'created_utc', 'parent_id', 'score', 'subreddit', 'link_id'
]
}
return window.fetch(commentURL + JSON.stringify(elasticQuery))
// Helper function that fetches a list of comments using a binary backoff,
// and also returns the next delay which should be passed back in
const fetchComments = (threadID, after, delay) =>
window.fetch(`${commentURL}${threadID}&after=${after}`)
.then(response => response.json())
.then(response => {
const comments = response.hits.hits
return comments.map(comment => {
comment._source.id = toBase36(comment._id)
comment._source.link_id = toBase36(comment._source.link_id)
// Missing parent id === direct reply to thread
if (!comment._source.parent_id) {
comment._source.parent_id = threadID
} else {
comment._source.parent_id = toBase36(comment._source.parent_id)
}
return comment._source
})
.then(({ data }) =>
[ data.map(comment => ({
...comment,
parent_id: comment.parent_id.substring(3) || threadID,
link_id: comment.link_id.substring(3) || threadID
})),
delay
]
)
.catch(() => {
if (delay > 8000)
throw new Error('Could not get removed comments');
return sleep(delay)
.then(() => fetchComments(threadID, after, delay * 2))
})
// Download a thread's comments in up to `chunks` successive batches.
//
// threadID - thread whose comments are fetched
// chunks   - maximum number of batches still allowed (default 10)
// after    - `after` value handed to fetchComments for this batch (default 0)
// delay    - backoff delay in ms handed to fetchComments (default 500)
//
// Resolves to one array of comments. The next batch starts at the last
// comment's created_utc minus one (but always beyond the previous `after`),
// so adjacent batches can overlap; every comment of the following batches
// whose id was already seen in this batch is dropped, wherever it appears.
export const getComments = (threadID, chunks = 10, after = 0, delay = 500) =>
  fetchComments(threadID, after, delay)
    .then(([comments, newDelay]) => {
      // Stop when the batch holds fewer than chunkSize/2 comments or the
      // batch budget is used up
      if (comments.length < chunkSize / 2 || chunks <= 1) {
        return comments
      }

      const lastCreated = comments[comments.length - 1].created_utc
      const newAfter = max(lastCreated - 1, after + 1)

      // If fetchComments had to back off, wait half of its final delay
      // before issuing the next request
      const pause = newDelay > 500 ? sleep(newDelay / 2) : Promise.resolve()

      return pause
        .then(() => getComments(threadID, chunks - 1, newAfter, newDelay))
        .then(remainingComments => {
          const seenIDs = new Set(comments.map(c => c.id))
          // Filter rather than skip a leading run: a duplicate may follow a
          // new comment when several share the boundary timestamp
          const fresh = remainingComments.filter(c => !seenIDs.has(c.id))
          return [...comments, ...fresh]
        })
    })
.catch(() => { throw new Error('Could not get removed comments') })
}

View file

@ -5,24 +5,23 @@ const errorHandler = () => {
throw new Error('Could not connect to Reddit')
}
// Thread = Post + Comments
// Return the post itself
export const getPost = (subreddit, threadID) => (
getAuth()
.then(auth => window.fetch(`https://oauth.reddit.com/r/${subreddit}/comments/${threadID}/_/`, auth))
.then(auth => window.fetch(`https://oauth.reddit.com/comments/${threadID}.json?limit=1`, auth))
.then(response => response.json())
.then(thread => thread[0].data.children[0].data)
.catch(errorHandler)
)
// Look up several threads with a single call to Reddit's info endpoint.
// Each ID is prefixed with `t3_` and the list is comma-joined into the
// `id` query parameter; resolves to the `data` object of every child.
export const getThreads = threadIDs =>
  getAuth()
    .then(auth => {
      const fullnames = threadIDs.map(id => `t3_${id}`).join()
      return window.fetch(`https://oauth.reddit.com/api/info?id=${fullnames}`, auth)
    })
    .then(response => response.json())
    .then(listing => listing.data.children.map(child => child.data))
    .catch(errorHandler)
//// Fetch multiple threads (via the info endpoint)
//export const getThreads = threadIDs => {
// return getAuth()
// .then(auth => window.fetch(`https://oauth.reddit.com/api/info?id=${threadIDs.map(id => `t3_${id}`).join()}`, auth))
// .then(response => response.json())
// .then(response => response.data.children.map(threadData => threadData.data))
// .catch(errorHandler)
//}
// Helper function that fetches a list of comments
const fetchComments = (commentIDs, auth) => {

View file

@ -17,10 +17,6 @@ export const chunk = (arr, size) => {
return chunks
}
// Change bases
// Parse `number` as a decimal integer and render it as a base-36 string
export const toBase36 = number => {
  const decimal = parseInt(number, 10)
  return decimal.toString(36)
}
// Parse a base-36 string into a number
export const toBase10 = numberString => {
  const value = parseInt(numberString, 36)
  return value
}
// Reddit's markers for deleted content: the backslash-escaped form is the one
// Reddit returns, the plain form is the one Pushshift returns
export const isDeleted = textBody =>
  ['\\[deleted\\]', '[deleted]'].includes(textBody)