I’m currently working on retrieving data from a subreddit, aiming to capture all posts since its inception. However, I’m encountering a limitation where I can only access information for the latest 1000 posts. The code I’m currently using is included below, after my question.
Is there a workaround or alternative method that would allow me to fetch all posts from the subreddit, starting from its earliest entries? Any guidance or assistance would be greatly appreciated! Thank you.
# Handle for the subreddit to scrape, looked up by name on `reddit`.
# NOTE(review): `reddit` is not defined in this snippet -- presumably an
# authenticated praw.Reddit client created earlier; confirm.
subreddit = reddit.subreddit("AfricanCichlids")
%%time
def scrape_subreddit_data(subreddit, *, request_delay=2):
    """Collect per-submission fields from a subreddit's "new" listing.

    Iterates ``subreddit.new(limit=None)`` and, for every submission, records
    its own metadata, metadata of the subreddit it belongs to, and the bodies
    of all of its comments.

    NOTE: ``limit=None`` asks the listing for everything it is willing to
    return, not for the subreddit's full history. In practice the listing
    yields only about the 1000 most recent posts, so older posts cannot be
    reached through this function.

    Args:
        subreddit: Object exposing ``new(limit=None)`` that yields submissions
            (presumably a PRAW ``Subreddit`` -- confirm against the caller).
        request_delay: Seconds to pause after each submission, before its
            comment tree is expanded. Values <= 0 disable the pause.

    Returns:
        A dict mapping column name to a list with one entry per submission,
        suitable for ``pd.DataFrame(...)``. ``'Author'`` holds the author's
        name (``None`` when the submission has no author) and ``'Comments'``
        holds a list of comment bodies.
    """
    columns = {
        'Title': [],
        'Author': [],
        'Upvotes': [],
        'Downvotes': [],
        'Score': [],
        'Number of Comments': [],
        'Creation Time': [],
        'Is NSFW': [],
        'Is Spoiler': [],
        'Is Locked': [],
        'Is Archived': [],
        'Subreddit Subscribers': [],
        'Subreddit Active Users': [],
        'Subreddit Creation Time': [],
        'Comments': [],
        # Add more fields as needed
    }

    # Subreddit-level values are the same for every post of a given
    # subreddit, so read them once per subreddit instead of once per post.
    # Keyed by name because a listing may mix posts from several subreddits.
    subreddit_info = {}

    for submission in subreddit.new(limit=None):
        author = submission.author
        columns['Title'].append(submission.title)
        # Store the plain name rather than the live author object; a missing
        # author stays None.
        columns['Author'].append(author.name if author is not None else None)
        # Each column reads the attribute that matches its label.
        columns['Upvotes'].append(submission.ups)
        columns['Downvotes'].append(submission.downs)
        columns['Score'].append(submission.score)
        columns['Number of Comments'].append(submission.num_comments)
        columns['Creation Time'].append(
            pd.to_datetime(submission.created_utc, unit='s')
        )
        columns['Is NSFW'].append(submission.over_18)
        columns['Is Spoiler'].append(submission.spoiler)
        columns['Is Locked'].append(submission.locked)
        columns['Is Archived'].append(submission.archived)

        home = submission.subreddit
        info = subreddit_info.get(home.display_name)
        if info is None:
            info = (
                home.subscribers,
                home.accounts_active,
                pd.to_datetime(home.created_utc, unit='s'),
            )
            subreddit_info[home.display_name] = info
        subscribers, active_users, subreddit_created = info
        columns['Subreddit Subscribers'].append(subscribers)
        columns['Subreddit Active Users'].append(active_users)
        columns['Subreddit Creation Time'].append(subreddit_created)

        # Pause between submissions before the (potentially request-heavy)
        # comment expansion.
        if request_delay > 0:
            time.sleep(request_delay)

        # Expand the comment tree fully so that list() below returns every
        # comment rather than stopping at unexpanded placeholders.
        submission.comments.replace_more(limit=None)
        columns['Comments'].append(
            [comment.body for comment in submission.comments.list()]
        )
        # Add more fields as needed

    return columns
# Run the scraper against the chosen subreddit; the result is a dict of
# equally long lists, one list per field.
data = scrape_subreddit_data(subreddit)
# Turn that column-oriented dict into a table: keys become the column names.
df = pd.DataFrame.from_dict(data)