I want to store authors from Quotes To Scrape in a list.
I have some functions to get quotes and authors, wich are good : in the console, my scraping is perfect, and the script is finishing with a perfect list of quotes.
But my script can’t store the authors, and my list of authors is empty.
Here is my functions :
library(rvest)
library(tidyverse)
start_url <- 'http://quotes.toscrape.com/'
start_page <- read_html(start_url)
session <- session(start_url)
get_quotes_elements <- function(page_url) {
page <- read_html(page_url)
quotes_elements <- html_nodes(page,".quote")
return(quotes_elements)
}
get_quote <- function(quote_element) {
quote <- list()
quote_text = html_nodes(quote_element,'.text') %>% html_text()
quote_author = html_nodes(quote_element,'.author') %>% html_text()
quote_tags = html_nodes(quote_element,'.tags') %>% html_text()
quote['Author'] <- quote_author
quote['Quote'] <- quote_text
quote['Tags'] <- quote_tags
author_page_url <- paste0('https://quotes.toscrape.com', html_nodes(quote_element,"a[href*='/author/']") %>% html_attr("href"))
author <- get_author(author_page_url)
authors <- append(authors, author)
print(paste(" Quote from ", author["Name"], " added"))
return(quote)
}
get_quotes_pages <- function(start_url) {
start_page_number <- 1
pages_count <- 9
quotes_pages <- list()
quotes_pages <- append(quotes_pages, start_url)
for (i in start_page_number:pages_count) {
new_quotes_page_url <- quotes_pages[[i]]
# print(paste("Processing ", new_quotes_page_url))
new_quotes_page <- read_html(session %>% session_jump_to(new_quotes_page_url))
next_quotes_page_url <- paste0('https://quotes.toscrape.com', new_quotes_page %>% html_nodes('li.next a') %>% html_attr("href"))
# print(paste("next_quotes_page_url = ", next_quotes_page_url))
quotes_pages <- append(quotes_pages, next_quotes_page_url)
}
return(quotes_pages)
}
get_author <- function(author_page_url) {
author <- list()
first_author_page <- read_html(session %>% session_jump_to(author_page_url))
author_name = html_nodes(first_author_page,'.author-title') %>% html_text()
# print(paste(" Author name ", author_name))
author_born_date = html_nodes(first_author_page,'.author-born-date') %>% html_text()
# print(paste(" Author born date ", author_born_date))
author_description = html_nodes(first_author_page,'.author-description') %>% html_text()
# print(paste(" Author description ", author_description))
author['Name'] <- author_name
author['BornDate'] <- author_born_date
author['Description'] <- author_description
return(author)
}
And I lunch it with this script (I have limited the exemple to the first page of quotes) :
quotes_pages <- get_quotes_pages(start_url)
first_quotes_page <- quotes_pages[[1]]
quotes <- list()
authors <- list()
for (quotes_page in first_quotes_page) {
print(paste("Processing ", quotes_page))
new_quotes <- list()
new_quotes <- lapply(get_quotes_elements(quotes_page), get_quote)
quotes <- append(quotes, new_quotes)
}
I was thinking that authors was declared as global variable, so the list authors can be filled with it, but apparently not 🙁
Many thanks in advance for your help !