I’m working on a Scrapy project and have a custom spider defined as follows:
from scrapy import Request, Spider


class JosephCrawlSpider(Spider):
    # JosephParseSpider is a project-specific parser defined elsewhere.
    parse_spider = JosephParseSpider()

    def start_requests(self):
        for url in self.start_urls:
            category = self.extract_category_from_url(url)
            yield Request(
                url, callback=self.parse, cookies=self.cookies,
                meta={'category': category}
            )

    def parse(self, response):
        product_links = response.css(
            '.product-name a.name-link::attr(href)'
        ).getall()
        if not product_links:
            return

        category = response.meta.get('category', 'unknown')
        for link in product_links:
            yield response.follow(
                link, callback=self.parse_product, cookies=self.cookies,
                meta={'category': category}
            )

        # Build the next listing page by hand; listings step by 12 items
        # (assumes the current URL already carries a start= parameter).
        start = int(response.url.split('start=')[-1].split('&')[0])
        next_start = start + 12
        next_page = (
            f"https://www.joseph-fashion.com/en-gb/womens/"
            f"?start={next_start}&sz=12&format=page-element&rowNumber=4"
            f"&currentView=model_view"
        )
        yield Request(
            url=next_page, callback=self.parse, cookies=self.cookies,
            meta={'category': category}
        )

    def parse_product(self, response):
        response.meta['raw_product'] = response
        yield from self.parse_item(response)

    def parse_item(self, response):
        return self.parse_spider.parse(response)

    def extract_category_from_url(self, url):
        # Returns up to three path segments starting at "womens",
        # e.g. ['womens', 'clothing', 'dresses'].
        parts = url.split('/')
        try:
            start_index = parts.index("womens")
            return parts[start_index:start_index + 3]
        except ValueError:
            return []
I would like to refactor this crawler to use Scrapy’s Rule and LinkExtractor to handle the extraction of product links and pagination. How can I modify the spider to use these classes? I’m particularly interested in how to set up one Rule for extracting product links and another for handling pagination in the JosephCrawlSpider class. Any guidance or examples on how to integrate Rule and LinkExtractor would be greatly appreciated!
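For reference, here is the direction I was imagining: a minimal, untested sketch based on the CrawlSpider docs. The add_meta hook name and the restrict_css/allow patterns are my own guesses at how to mirror my current selectors, and the two-argument form of process_request needs Scrapy 1.7 or later.

from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class JosephCrawlSpider(CrawlSpider):
    parse_spider = JosephParseSpider()

    # CrawlSpider reserves parse() for its own rule handling, so the
    # product callback needs a different name (parse_item here).
    rules = (
        # Product links: restrict_css mirrors the original
        # '.product-name a.name-link' selector (my guess at the region).
        Rule(
            LinkExtractor(restrict_css='.product-name'),
            callback='parse_item',
            process_request='add_meta',
        ),
        # Pagination: follow any listing link carrying a start= parameter.
        Rule(
            LinkExtractor(allow=r'start=\d+'),
            follow=True,
            process_request='add_meta',
        ),
    )

    def start_requests(self):
        # Still seed cookies and category on the first listing request; the
        # default callback is CrawlSpider's parse(), which applies the rules.
        for url in self.start_urls:
            yield Request(
                url, cookies=self.cookies,
                meta={'category': self.extract_category_from_url(url)},
            )

    def add_meta(self, request, response):
        # process_request receives the originating response as a second
        # argument since Scrapy 1.7; use it to carry cookies and category
        # forward the way start_requests used to.
        category = response.meta.get(
            'category', self.extract_category_from_url(response.url)
        )
        return request.replace(
            cookies=self.cookies, meta={**request.meta, 'category': category}
        )

    def parse_item(self, response):
        response.meta['raw_product'] = response
        return self.parse_spider.parse(response)

    def extract_category_from_url(self, url):
        parts = url.split('/')
        try:
            start_index = parts.index("womens")
            return parts[start_index:start_index + 3]
        except ValueError:
            return []

One thing I’m unsure about: my current parse() builds the next start= URL by hand, so if the listing pages don’t actually render next-page anchors in the HTML, the pagination rule would find nothing and that part might have to stay manual. Does this look like the right way to wire up the rules?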