I am currently working with the Python library Scrapy and I am inheriting from CrawlSpider so that I can override/define custom Rules. I have defined rules that should block all URLs containing auth/ and allow only URLs containing /tags.
My solution:
class DockerhubDockerRegistrySpider(CrawlSpider):
    name = "dockerhubDockerQueriedRegistrySpider"
    allowed_domains = ["hub.docker.com"]

    rules = (
        Rule(
            LinkExtractor(allow='/tags'),
            callback='parse_additional_page',
            follow=False
        ),
        Rule(
            LinkExtractor(deny='auth/'),
            follow=False
        ),
    )
The problem is that when the follow method is called, it always automatically requests https://hub.docker.com/auth/profile, and this page cannot be accessed, so I want to block it, but somehow my rules do not fire.
Follow URL call:
yield response.follow(
    additional_data_url_absolute,
    self.parse_additional_page,
    cb_kwargs=dict(item=item),
    meta=dict(
        playwright=True,
        playwright_include_page=True,
        playwright_page_methods={
            "wait_for_page_load": PageMethod("wait_for_selector", 'body[aria-describedby="global-progress"]')
        },
    )
)
Follow Callback:
def parse_additional_page(self, response, item):
    self.logger.info("Additional page meta: %s", response.meta)
    self.logger.info("Additional page HTML: %s", response.css('title::text').get())
    self.logger.info("Additional page HTML Repo-Name: %s", response.css('h2[data-testid="repoName"]::text').get())
    item['additional_data'] = response.css('h2[data-testid="repoName"]::text').get()
    yield item
Debug log:
2024-07-16 20:28:17 [scrapy-playwright] DEBUG: [Context=default] Response: <401 https://hub.docker.com/auth/profile>
2024-07-16 20:28:18 [scrapy-playwright] DEBUG: [Context=default] Response: <401 https://hub.docker.com/auth/profile>
2024-07-16 20:28:19 [scrapy-playwright] DEBUG: [Context=default] Request: <GET https://hub.docker.com/auth/profile> (resource type: fetch, referrer: https://hub.docker.com/search?q=python&page=1)
2024-07-16 20:44:23 [scrapy-playwright] DEBUG: [Context=default] Request: <GET https://hub.docker.com/auth/profile> (resource type: fetch, referrer: https://hub.docker.com/r/google/guestbook-python-redis/tags)
How can I block the requests for https://hub.docker.com/auth/profile?
My whole solution:
class DockerhubDockerRegistrySpider(CrawlSpider):
    name = "dockerhubDockerQueriedRegistrySearchSpiderTemp"
    allowed_domains = ["hub.docker.com"]

    rules = (
        Rule(
            LinkExtractor(allow=r'/tags$'),  # Only allow URLs that end with '/tags'
            callback='parse_additional_page',
            follow=False
        ),
        Rule(
            LinkExtractor(allow='search'),
            callback='parse_registry',
            follow=True
        ),
    )

    def __init__(self, query=None, *args, **kwargs):
        super(DockerhubDockerRegistrySpider, self).__init__(*args, **kwargs)
        self.query = query
        self.start_urls = [f'https://hub.docker.com/search?q={query}&page={i}' for i in range(1, 12)]

    def start_requests(self):
        for index, url in enumerate(self.start_urls, start=1):
            self.logger.info(f"Starting request: {url}")
            yield scrapy.Request(
                url,
                meta=dict(
                    page_number=index,
                    playwright=True,
                    playwright_include_page=True,
                    playwright_page_methods={
                        "wait_for_search_results": PageMethod("wait_for_selector", "div#searchResults"),
                    }
                ),
                callback=self.parse_registry
            )

    async def parse_registry(self, response):
        page = response.meta["playwright_page"]
        if await page.title() == "hub.docker.com":
            await page.close()
            await page.context.close()
        page_number = response.meta.get("page_number")
        if page_number is None:
            self.logger.warning("Page number not found in meta: %s", response.url)
            return
        search_results = response.xpath('//a[@data-testid="imageSearchResult"]')
        for result in search_results:
            item = DockerImageItem()
            item['page_number'] = page_number
            item['name'] = result.css('[data-testid="product-title"]::text').get()
            uploader_elem = result.css("span::text").re(r"^By (.+)")
            if uploader_elem:
                item["uploader"] = uploader_elem[0].strip()
            item['is_official_image'] = bool(result.css('[data-testid="official-icon"]'))
            item['is_verified_publisher'] = bool(result.css('[data-testid="verified_publisher-icon"]'))
            item['last_update'] = self.parse_update_string(result.css('span:contains("Updated")::text').get())
            item['description'] = result.xpath(
                './/span[contains(text(), "Updated")]/ancestor::div[1]/following-sibling::p[1]/text()').get()
            item['chips'] = result.css('[data-testid="productChip"] span::text').getall()
            # Extract pulls last week
            pulls_elem = (
                result.css('p:contains("Pulls:")')
                .xpath("following-sibling::p/text()")
                .get()
            )
            item["pulls_last_week"] = (
                pulls_elem.replace(",", "") if pulls_elem else None
            )
            item['downloads'] = result.css('[data-testid="DownloadIcon"] + p::text').get()
            item['stars'] = result.css('svg[data-testid="StarOutlineIcon"] + span > strong::text').get()
            # Clean up text fields
            item['name'] = item['name'].strip() if item['name'] else None
            item['description'] = item['description'].strip() if item['description'] else None
            item['downloads'] = item['downloads'].strip() if item['downloads'] else None
            item['pulls_last_week'] = item['pulls_last_week'].replace(",", "") if item['pulls_last_week'] else None
            item['stars'] = item['stars'].strip() if item['stars'] else None
            additional_data_url = result.attrib.get('href')
            if additional_data_url and ('/r/' in additional_data_url or '/_/' in additional_data_url):  # detail page for an image
                additional_data_url_absolute = f"https://hub.docker.com{additional_data_url}/tags"
                self.logger.info("Entered Additional Page: %s", additional_data_url_absolute)
                yield response.follow(
                    additional_data_url_absolute,
                    callback=self.parse_additional_page,
                    cb_kwargs=dict(item=item),
                    errback=self.close_context_on_error,  # errback belongs on the request, not inside meta
                    meta=dict(
                        playwright=True,
                        playwright_include_page=True,
                        playwright_page_methods={
                            "wait_for_page_load": PageMethod("wait_for_selector", 'div[data-testid="repotagsTagListItem"]'),
                        },
                        playwright_context_kwargs={
                            "ignore_https_errors": True,
                        }
                    )
                )
            else:
                self.logger.error("No additional page available for: %s", item['name'])
                yield item

    async def close_context_on_error(self, failure):
        page = failure.request.meta["playwright_page"]
        await page.close()
        await page.context.close()

    def parse_additional_page(self, response, item):
        self.logger.info("Additional page meta: %s", response.meta)
        self.logger.info("Additional page HTML: %s", response.css('title::text').get())
        self.logger.info("Additional page HTML Repo-Name: %s", response.css('h2[data-testid="repoName"]::text').get())
        item['additional_data'] = response.css('h2[data-testid="repoName"]::text').get()
        return item

    @staticmethod
    def parse_update_string(update_string):
        return update_string.strip() if update_string else None
It’s not clear what you are trying to achieve, but I assume that you are searching for Python images and then trying to extract tag information for each of them.
I don’t actually think that the https://hub.docker.com/auth/profile requests should be an issue.
The crawler below grabs the links from the first page of results and then visits the page for each image.
import logging

from scrapy import Request
from scrapy.spiders import CrawlSpider
from scrapy_playwright.page import PageMethod

logging.getLogger('scrapy-playwright').setLevel(logging.WARNING)
logging.getLogger('playwright').setLevel(logging.WARNING)


class DockerhubDockerRegistrySpider(CrawlSpider):
    name = "dockerhub"
    allowed_domains = ["hub.docker.com"]

    def start_requests(self):
        url = "https://hub.docker.com/search?q=python"
        yield Request(
            url,
            callback=self.parse,
            meta=dict(
                playwright=True,
                playwright_include_page=True,
                playwright_page_methods=[
                    PageMethod('wait_for_selector', '#searchResults .MuiStack-root > a.MuiLink-root', state='visible'),
                ]
            ),
        )

    async def parse(self, response):
        page = response.meta["playwright_page"]
        links = response.css("#searchResults .MuiStack-root > a.MuiLink-root::attr(href)").getall()
        for link in links:
            link = response.urljoin(link)
            yield Request(
                link,
                callback=self.parse_image,
                meta=dict(
                    playwright=True,
                    playwright_include_page=True,
                    playwright_page_methods=[
                        PageMethod('wait_for_selector', '.MuiTypography-root.MuiTypography-h2', state='visible'),
                    ]
                ),
            )
        await page.close()

    async def parse_image(self, response):
        page = response.meta["playwright_page"]
        title = response.css(".MuiTypography-root.MuiTypography-h2::text").get()
        await page.close()
        yield {"title": title}
The most important thing I found when putting this together is that you have to reduce the rate at which you send requests; otherwise you start getting 429 responses. So I have the following in settings.py:
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 0.5
AUTOTHROTTLE_DEBUG = True
DOWNLOAD_DELAY = 5
CONCURRENT_REQUESTS = 1
CONCURRENT_REQUESTS_PER_DOMAIN = 1
CONCURRENT_REQUESTS_PER_IP = 1
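If you still want to suppress the https://hub.docker.com/auth/profile calls, keep in mind that they are issued by the page itself inside the browser (your log shows resource type: fetch), so CrawlSpider rules never see them; rules only filter links that Scrapy extracts from downloaded responses. One way to drop them before they leave the browser is scrapy-playwright's PLAYWRIGHT_ABORT_REQUEST setting. A minimal sketch, also in settings.py (the should_abort_request name is just an example):

def should_abort_request(request):
    # Abort the in-page fetch to the profile endpoint; let everything else through.
    return "hub.docker.com/auth/profile" in request.url

PLAYWRIGHT_ABORT_REQUEST = should_abort_request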