I have written a spider to scrape the headlines, URL links, and the article content behind them.
import scrapy
from scrapy.selector import Selector


class FoolSpider(scrapy.Spider):
    name = "fool"

    def start_requests(self):
        url = 'https://www.fool.com/earnings-call-transcripts/'
        yield scrapy.Request(url, cb_kwargs={"page": 1})

    def parse(self, response, page=None):
        if page > 1:
            # after the first page, extract the HTML fragment from the JSON response
            text = response.json()["html"]
            # wrap it in a parent tag and create a Scrapy selector
            response = Selector(text=f"<html>{text}</html>")
        for headline in response.css('a.text-gray-1100'):
            headline_text = headline.css('h5.font-medium::text').get()
            url_links = headline.css('::attr(href)').get()
            yield response.follow(url_links, self.parse_content, meta={'headline': headline_text})
        yield scrapy.Request(
            f"https://www.fool.com/earnings-call-transcripts/filtered_articles_by_page/?page={page+1}",
            cb_kwargs={"page": page+1},
            headers={"X-Requested-With": "fetch"}
        )

    def parse_content(self, response):
        content = response.css('div.article-body p::text').get()
        # include the content in the yielded item
        yield {
            'headline': response.meta['headline'],
            'url': response.url,
            'content': content
        }
With this code I get no content at all, and I only get the headline and URL for the first page. I want the content, headline, and URL links for all pages. Please help me fix the code.
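From reading the Scrapy docs I suspect two things: parse_content uses .get(), which returns only the first matching text node, and after the first page the response is rebound to a plain Selector, which has no .follow() method, so the follow-up requests probably never go out. Below is an untested sketch of the direction I am considering. It keeps the same CSS classes as above (a.text-gray-1100, h5.font-medium, div.article-body), which I have not re-verified against the site's current markup:

import scrapy
from scrapy.selector import Selector


class FoolSpider(scrapy.Spider):
    name = "fool"

    def start_requests(self):
        yield scrapy.Request(
            "https://www.fool.com/earnings-call-transcripts/",
            cb_kwargs={"page": 1},
        )

    def parse(self, response, page=1):
        if page > 1:
            # Follow-up pages return JSON; the article list is an HTML
            # fragment stored under the "html" key.
            selector = Selector(text=f"<html>{response.json()['html']}</html>")
        else:
            selector = response

        articles = selector.css("a.text-gray-1100")
        for headline in articles:
            headline_text = headline.css("h5.font-medium::text").get()
            href = headline.css("::attr(href)").get()
            if href:
                # A plain Selector has no .follow(), so build the request
                # from an absolute URL instead of relying on response.follow.
                yield scrapy.Request(
                    response.urljoin(href),
                    callback=self.parse_content,
                    cb_kwargs={"headline": headline_text},
                )

        # Only ask for the next page while the current one still has articles
        # (assumed stop condition; I have not confirmed how the endpoint
        # behaves once the listing runs out).
        if articles:
            yield scrapy.Request(
                "https://www.fool.com/earnings-call-transcripts/"
                f"filtered_articles_by_page/?page={page + 1}",
                cb_kwargs={"page": page + 1},
                headers={"X-Requested-With": "fetch"},
            )

    def parse_content(self, response, headline=None):
        # .get() returns only the first matching text node, which can easily
        # be empty; .getall() collects every paragraph of the transcript.
        paragraphs = response.css("div.article-body p::text").getall()
        yield {
            "headline": headline,
            "url": response.url,
            "content": "\n".join(p.strip() for p in paragraphs if p.strip()),
        }

Is this the right way to go about it, or is there a better approach?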