mirror of
https://github.com/gurnec/removeddit.git
synced 2026-03-11 08:54:27 +00:00
Update the endpoint to api.pushshift.io v4
Also update the Reddit endpoint for posts, and add the capability to download from Pushshift in multiple batches, with binary exponential backoff; it's currently set to download up to 1,000 comments. Credit (once again) to @ayan4m1 for a couple of snippets of code.
This commit is contained in:
parent
b35beaae52
commit
c8b0ac8523
3 changed files with 58 additions and 63 deletions
|
|
@ -1,57 +1,57 @@
|
|||
import { toBase10, toBase36 } from '../../utils'
|
||||
const chunkSize = 100;
|
||||
const postURL = 'https://api.pushshift.io/reddit/submission/search/?ids='
|
||||
const commentURL = `https://api.pushshift.io/reddit/comment/search/?size=${chunkSize}&sort=asc&fields=author,body,created_utc,id,link_id,parent_id,score,subreddit&q=*&link_id=`
|
||||
|
||||
const postURL = 'https://elastic.pushshift.io/rs/submissions/_search?source='
|
||||
const commentURL = 'https://elastic.pushshift.io/rc/comments/_search?source='
|
||||
// Promise-based delay: settles (with no value) once `ms` milliseconds have elapsed
const sleep = ms => new Promise(resolve => {
  setTimeout(resolve, ms)
})
|
||||
|
||||
export const getPost = threadID => {
|
||||
const elasticQuery = {
|
||||
query: {
|
||||
term: {
|
||||
id: toBase10(threadID)
|
||||
}
|
||||
}
|
||||
}
|
||||
// Return `a` only when it compares strictly greater than `b`; otherwise return `b`
const max = (a, b) => {
  if (a > b) {
    return a
  }
  return b
}
|
||||
|
||||
return window.fetch(postURL + JSON.stringify(elasticQuery))
|
||||
export const getPost = threadID =>
|
||||
window.fetch(`${postURL}${threadID}`)
|
||||
.then(response => response.json())
|
||||
.then(response => {
|
||||
const post = response.hits.hits[0]._source
|
||||
post.id = toBase36(post.id)
|
||||
return post
|
||||
.then(({ data }) => data[0])
|
||||
.catch(() => {
|
||||
throw new Error('Could not get removed post')
|
||||
})
|
||||
.catch(() => { throw new Error('Could not get removed post') })
|
||||
}
|
||||
|
||||
export const getComments = threadID => {
|
||||
const elasticQuery = {
|
||||
query: {
|
||||
match: {
|
||||
link_id: toBase10(threadID)
|
||||
}
|
||||
},
|
||||
size: 20000,
|
||||
_source: [
|
||||
'author', 'body', 'created_utc', 'parent_id', 'score', 'subreddit', 'link_id'
|
||||
]
|
||||
}
|
||||
|
||||
return window.fetch(commentURL + JSON.stringify(elasticQuery))
|
||||
// Helper function that fetches a list of comments using a binary backoff,
|
||||
// and also returns the next delay which should be passed back in
|
||||
const fetchComments = (threadID, after, delay) =>
|
||||
window.fetch(`${commentURL}${threadID}&after=${after}`)
|
||||
.then(response => response.json())
|
||||
.then(response => {
|
||||
const comments = response.hits.hits
|
||||
return comments.map(comment => {
|
||||
comment._source.id = toBase36(comment._id)
|
||||
comment._source.link_id = toBase36(comment._source.link_id)
|
||||
|
||||
// Missing parent id === direct reply to thread
|
||||
if (!comment._source.parent_id) {
|
||||
comment._source.parent_id = threadID
|
||||
} else {
|
||||
comment._source.parent_id = toBase36(comment._source.parent_id)
|
||||
}
|
||||
|
||||
return comment._source
|
||||
})
|
||||
.then(({ data }) =>
|
||||
[ data.map(comment => ({
|
||||
...comment,
|
||||
parent_id: comment.parent_id.substring(3) || threadID,
|
||||
link_id: comment.link_id.substring(3) || threadID
|
||||
})),
|
||||
delay
|
||||
]
|
||||
)
|
||||
.catch(() => {
|
||||
if (delay > 8000)
|
||||
throw new Error('Could not get removed comments');
|
||||
return sleep(delay)
|
||||
.then(() => fetchComments(threadID, after, delay * 2))
|
||||
})
|
||||
|
||||
/**
 * Download a thread's comments in up to `chunks` successive batches.
 *
 * Each batch comes from `fetchComments`, which resolves to a
 * `[comments, delay]` pair. Paging stops when a batch is less than half of
 * `chunkSize` or when the batch budget is used up.
 *
 * @param {string} threadID - ID of the thread whose comments are wanted
 * @param {number} [chunks=10] - maximum number of batches still allowed
 * @param {number} [after=0] - `after` value passed to `fetchComments` for this batch
 * @param {number} [delay=500] - current backoff delay in ms, threaded through `fetchComments`
 * @returns {Promise<Array>} the collected comments, with no ID appearing twice
 *   across adjacent batches
 */
export const getComments = (threadID, chunks = 10, after = 0, delay = 500) =>
  fetchComments(threadID, after, delay)
    .then(([comments, newDelay]) => {
      // A batch under half the requested size, or an exhausted budget, ends paging
      if (comments.length < chunkSize / 2 || chunks <= 1) {
        return comments;
      }

      // Restart one below the last comment's timestamp (presumably so comments
      // sharing that timestamp are not lost), but always move past `after`
      // so that every request makes progress.
      const newAfter = max(comments[comments.length - 1].created_utc - 1, after + 1);

      // If the delay has grown beyond its initial 500 ms, wait half of it
      // before asking for the next batch.
      const pause = newDelay > 500 ? sleep(newDelay / 2) : Promise.resolve();

      return pause
        .then(() => getComments(threadID, chunks - 1, newAfter, newDelay))
        .then(remainingComments => {
          // The overlapping window can return comments we already hold. Drop
          // every one of them, not just a leading run, because the overlap is
          // not guaranteed to come back as a strict prefix of the next batch.
          const seenIDs = new Set(comments.map(c => c.id));
          const unseen = remainingComments.filter(c => !seenIDs.has(c.id));
          return [...comments, ...unseen];
        });
    })
|
||||
.catch(() => { throw new Error('Could not get removed comments') })
|
||||
}
|
||||
|
|
|
|||
|
|
@ -5,24 +5,23 @@ const errorHandler = () => {
|
|||
throw new Error('Could not connect to Reddit')
|
||||
}
|
||||
|
||||
// Thread = Post + Comments
|
||||
// Return the post itself
|
||||
export const getPost = (subreddit, threadID) => (
|
||||
getAuth()
|
||||
.then(auth => window.fetch(`https://oauth.reddit.com/r/${subreddit}/comments/${threadID}/_/`, auth))
|
||||
.then(auth => window.fetch(`https://oauth.reddit.com/comments/${threadID}.json?limit=1`, auth))
|
||||
.then(response => response.json())
|
||||
.then(thread => thread[0].data.children[0].data)
|
||||
.catch(errorHandler)
|
||||
)
|
||||
|
||||
// Fetch multiple threads (via the info endpoint)
|
||||
export const getThreads = threadIDs => {
  // Ask the info endpoint for every thread at once, as comma-joined `t3_`-prefixed IDs
  const requestInfo = auth => {
    const fullnames = threadIDs.map(id => `t3_${id}`).join()
    return window.fetch(`https://oauth.reddit.com/api/info?id=${fullnames}`, auth)
  }

  // Unwrap each listing child down to its `data` payload
  const extractThreads = ({ data }) => data.children.map(child => child.data)

  return getAuth()
    .then(requestInfo)
    .then(response => response.json())
    .then(extractThreads)
    .catch(errorHandler)
}
|
||||
//// Fetch multiple threads (via the info endpoint)
|
||||
//export const getThreads = threadIDs => {
|
||||
// return getAuth()
|
||||
// .then(auth => window.fetch(`https://oauth.reddit.com/api/info?id=${threadIDs.map(id => `t3_${id}`).join()}`, auth))
|
||||
// .then(response => response.json())
|
||||
// .then(response => response.data.children.map(threadData => threadData.data))
|
||||
// .catch(errorHandler)
|
||||
//}
|
||||
|
||||
// Helper function that fetches a list of comments
|
||||
const fetchComments = (commentIDs, auth) => {
|
||||
|
|
|
|||
|
|
@ -17,10 +17,6 @@ export const chunk = (arr, size) => {
|
|||
return chunks
|
||||
}
|
||||
|
||||
// Change bases
|
||||
// Parse the input as a base-10 integer and render it as a base-36 string
export const toBase36 = number => {
  const parsed = Number.parseInt(number, 10)
  return parsed.toString(36)
}
|
||||
// Parse a base-36 string into a number (NaN when no leading digits parse)
export const toBase10 = numberString => {
  const radix = 36
  return Number.parseInt(numberString, radix)
}
|
||||
|
||||
// Reddit's way of indicating that something is deleted (the backslash-escaped form comes from Reddit, the plain form from Pushshift)
|
||||
// True when the text body is exactly one of the two known "deleted" markers
export const isDeleted = textBody => {
  const deletedMarkers = ['\\[deleted\\]', '[deleted]']
  return deletedMarkers.includes(textBody)
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue