I’m scraping text from various webshops (no images, videos, or other data). I’m no expert on user tracking, so I’d like to know whether there’s a way to write my crawler so it won’t interfere with the webshop owners’ tracking. Perhaps that’s already the case, since the crawler isn’t storing any cookies or requesting images or anything else but the actual pages, but I’d like to be sure.
What should I do when requesting the pages to make sure nothing is tracked in Google Analytics, for instance? Should I send an e-mail to the owners asking them to filter out a specific user agent, or…?
I’ve seen How to be a good citizen when crawling web sites?, where the last answer states that one should add “crawler” or “spider” to the user-agent string. I’m not sure what to make of that, as I can’t find anything to back it up.
(The crawler is written in Node.js and uses the request module to download websites.)
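In case it matters, setting such a user-agent with the request module is a one-liner; this is roughly what I’d do (the UA string itself is made up, the point is just that it contains “crawler” plus a way to reach me):

var request = require("request");

// Made-up UA string; it contains "crawler" and a contact URL, so owners
// can filter it out (in Google Analytics or in their server logs).
request({
    url: "http://website.com"
  , headers: { "User-Agent": "website-crawler/1.0 (+http://example.com/crawler-info)" }
}, function(err, response, body) {
  // handle the page as usual
});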
EDIT:
For anyone interested, here’s the infinitely simple crawler I made. It doesn’t obey robots.txt, because I’m specifying which kinds of links to follow myself (and I’m too lazy right now to write something that obeys robots.txt; there’s a sketch of how that could be added after the usage example below):
var request = require("request")
  , cheerio = require("cheerio")
  ;

exports.crawl = function(options) {
  var links = [].concat(options.url) // Takes either an array of URLs or just a string.
    , ongoingRequests = 0
    , index = 0
    , done = false
    ;

  setImmediate(ticker);

  function ticker() {
    if(ongoingRequests < options.maxRequests && index < links.length && !done) {
      var url = links[index++];
      ongoingRequests++;
      if(options.debug) console.log("Downloading " + url);
      request({ url: url, encoding: options.encoding || "utf-8" }, function(err, response) {
        ongoingRequests--;
        if(err) {
          return; // Skip URLs that fail; the ticker keeps running.
        }
        if(!done) {
          var $ = cheerio.load(response.body)
            , shouldContinue = options.data($, response, url)
            ;
          if(shouldContinue === false) {
            console.log("Crawler interrupted.");
            options.done();
            done = true;
            return;
          }
          // Only queue URLs we haven't already seen.
          var newLinks = options.extractLinks($).filter(function(url) { return links.indexOf(url) === -1; });
          if(options.debug && newLinks.length) console.log(newLinks);
          links = links.concat(newLinks);
          if((index - 1) % 5 === 0) console.log("Current index: " + (index - 1) + ", links found so far: " + links.length);
        }
      });
    } else if(!ongoingRequests) {
      if(!done) {
        options.done();
        done = true;
      }
      return;
    }
    // setImmediate (not process.nextTick) yields to the event loop here,
    // so the in-flight requests actually get a chance to complete.
    setImmediate(ticker);
  }
};
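One design note: the ticker reschedules itself with setImmediate rather than process.nextTick, because a recursive nextTick loop never yields back to the event loop, and the in-flight HTTP requests would never complete.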
Use it like this:
var crawler = require("../crawler");

crawler.crawl({
    url: "http://website.com"
  , debug: true
  , maxRequests: 5
  , data: function($, response, url) {
      if(url.indexOf("/product/") === -1) {
        return;
      }
      console.log("extract stuff from $ using CSS selectors and a nice jQuery-like API");
    }
  , done: function() {
      console.log("DONE!!");
    }
  , extractLinks: function($) {
      return $("a").map(function() {
        return $(this).attr("href");
      }).get() // .get() turns cheerio's collection into a plain array.
        .filter(function(url, i, a) {
          if(!url || url[0] !== "/") {
            return false;
          }
          return i === a.indexOf(url) // Remove duplicates.
            && url.indexOf("/cart") === -1
            && url.indexOf(".htm") > -1
            ;
        }).map(function(url) {
          return "http://website.com" + url;
        });
    }
});
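And should I ever stop being lazy, robots.txt support could be bolted on with something like this sketch. It assumes the robots-parser package from npm (any parser would do); each URL would then be checked with isAllowed() before being queued:

var request = require("request")
  , robotsParser = require("robots-parser") // npm install robots-parser
  ;

function loadRobots(origin, callback) {
  request(origin + "/robots.txt", function(err, response, body) {
    // Treat a missing or unreadable robots.txt as "everything allowed".
    callback(robotsParser(origin + "/robots.txt", err ? "" : body));
  });
}

loadRobots("http://website.com", function(robots) {
  // "website-crawler" is the same made-up UA token as above.
  if(robots.isAllowed("http://website.com/product/123.htm", "website-crawler")) {
    console.log("OK to crawl");
  }
});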
Google Analytics tracking is implemented as JavaScript that runs in the browser. Your crawler only requests the page; it never executes any JavaScript that may be found on it. So there’s no reason for you to worry about messing with the user tracking.
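If you want to convince yourself, fetch a page and inspect what you actually receive: the analytics snippet is just inert text inside a script tag, and nothing in your crawler ever runs it. A quick sketch (the URL is a stand-in):

var request = require("request")
  , cheerio = require("cheerio")
  ;

request("http://website.com", function(err, response, body) {
  if(err) return;
  // cheerio parses markup but never executes scripts, so no tracking
  // beacon is sent; the GA code is visible only as source text.
  var $ = cheerio.load(body);
  $("script").each(function() {
    console.log($(this).text().slice(0, 60)); // first 60 chars of each script
  });
});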