I am trying to download PDFs from certain URLs (e.g. https://example.com) using the Norconex Web Crawler (v3.0) with the configuration below, but have had no luck. Can someone please help me with this?
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Norconex HTTP Collector configuration: crawl https://example.com and keep
  only the documents whose URL has a PDF extension.

  NOTE(review): no committer and no keepDownloads option is configured here,
  so the documents that pass the filter may not be written anywhere useful.
  Confirm against the Norconex documentation for the version in use.
-->
<httpcollector id="PDF Finder Config HTTP Collector">
  <workDir>./pdf-finder-output</workDir>
  <crawlers>
    <crawler id="PDF Finder">
      <!-- Single seed URL; the attributes ask the crawler to stay on the
           seed's domain while also accepting its sub-domains. -->
      <startURLs stayOnDomain="true" includeSubdomains="true">
        <url>https://example.com</url>
      </startURLs>
      <!-- A maxDepth of minus one is presumably "no depth limit"; verify. -->
      <maxDepth>-1</maxDepth>
      <sitemapResolver ignore="true"/>
      <!-- Default delay between requests; presumably milliseconds. -->
      <delay default="1000"/>
      <!-- Kept as a document filter (not a reference filter) so that HTML
           pages can still be fetched and their links followed; presumably
           document filters run after link extraction (TODO confirm).
           The extension is written in lower case on a single line and
           ignoreCase is enabled: in Norconex 3.x extension matching appears
           to be case-sensitive by default (TODO confirm), so the previous
           upper-case "PDF" entry would not match URLs ending in ".pdf". -->
      <documentFilters>
        <filter class="ExtensionReferenceFilter" onMatch="include" ignoreCase="true">pdf</filter>
      </documentFilters>
    </crawler>
  </crawlers>
</httpcollector>