I have a requirement to:
- take a bunch of IDs from the user (100 or more)
- create a URL from each of those IDs
- open each URL as a separate tab and scrape info from it
- store all the scraped info as a JSON object
- write the JSON object into a CSV file
I am trying to achieve this by running a JS script in the Chrome DevTools console, but I keep ending up with Error code: SIGILL after the script performs the above actions for 20-30 IDs.
How do I resolve this?
My code is as follows:
// Function to prompt user for input and convert it to an array of strings
function promptAndConvertToArray() {
  // Prompt the user for input
  const userInput = prompt('Enter a list of IDs separated by new lines:');
  // Check if user clicked cancel or entered nothing
  if (!userInput) {
    console.log('No input provided.');
    return [];
  }
  // Split the input string by new line characters to get an array
  const idArray = userInput.split('\n').map(id => id.trim());
  return idArray;
}
// Function to generate links
function generateLinks(ids) {
  const prefix = ''; // removing the prefix as it's proprietary info
  const suffix = ''; // removing the suffix as it's proprietary info
  return ids.map(id => prefix + id + suffix);
}
// Function to write output to a CSV file
function downloadCSV(data) {
  const csvHeaders = ['Link'];
  const csvRows = [];
  // Extract all possible keys from the inner objects
  const allKeys = Object.values(data).reduce((keys, innerData) => {
    return keys.concat(Object.keys(innerData.spans));
  }, []);
  // Remove duplicate keys and sort them
  const uniqueKeys = [...new Set(allKeys)].sort();
  // Add headers for each key in the CSV
  csvHeaders.push(...uniqueKeys);
  // Generate rows for each link
  for (const [link, innerData] of Object.entries(data)) {
    const row = [link];
    // Populate row with values for each key
    for (const key of uniqueKeys) {
      row.push(innerData.spans[key] || '');
      console.log(row);
    }
    csvRows.push(row.join(','));
  }
  // Combine headers and rows
  const csvContent = [csvHeaders.join(','), ...csvRows].join('\n');
  // Create CSV file and download
  const blob = new Blob([csvContent], { type: 'text/csv' });
  const url = URL.createObjectURL(blob);
  const link = document.createElement('a');
  link.href = url;
  link.download = 'data.csv';
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);
  URL.revokeObjectURL(url);
}
// Function to scrape data from a single page
async function scrapeDataFromPage(link) {
  // Open a new window or tab for the link (you may need to adjust this depending on your environment)
  const page = window.open(link);
  // Wait for the page to load (you might need to add more sophisticated waiting logic)
  await new Promise(resolve => setTimeout(resolve, 1800000)); // 1800000 ms = 30 minutes; adjust the delay as needed
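  // NOTE (possible alternative, not part of the original script): instead of a fixed
  // delay, the wait could poll the opened window until it reports that it has finished
  // loading. This sketch assumes the opened pages are same-origin, since cross-origin
  // windows do not expose their document:
  //
  //   await new Promise(resolve => {
  //     const poll = setInterval(() => {
  //       if (page.document && page.document.readyState === 'complete') {
  //         clearInterval(poll);
  //         resolve();
  //       }
  //     }, 500);
  //   });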
  // Inject the scrapeData function into the opened window/tab
  await page.eval(`
    function scrapeData() {
      const spanElements = document.querySelectorAll('span.classname');
      const data = {
        spans: {}
      };
      // Scrape data from span elements and count unique elements
      spanElements.forEach(span => {
        const text = span.textContent.trim();
        data.spans[text] = (data.spans[text] || 0) + 1;
      });
      return data;
    }
  `);
  // Extract the scraped data using the injected function
  const scrapedData = await page.eval('scrapeData()');
  // Close the current page
  page.close();
  return scrapedData;
}
// Function to throttle requests
function throttle(delay) {
  let lastCall = 0;
  return function(fn) {
    const now = Date.now();
    if (now - lastCall < delay) {
      return new Promise(resolve => setTimeout(() => resolve(fn()), delay - (now - lastCall)));
    }
    lastCall = now;
    return fn();
  };
}
// Function to scrape data from multiple webpages in parallel
async function scrapeDataFromMultiplePages(links) {
  //const throttledScrape = throttle(10000); // Throttle to one request every 10 seconds
  //const scrapePromises = links.map(link => throttledScrape(() => scrapeDataFromPage(link)));
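  // NOTE: with the throttled version commented out above, every page in the batch is opened at once.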
  const scrapePromises = links.map(link => scrapeDataFromPage(link));
  // Wait for all scraping tasks to finish
  const scrapedDataArray = await Promise.all(scrapePromises);
  const resultObject = {};
  // Combine results from all scraping tasks into a single object
  links.forEach((link, index) => {
    resultObject[link.split('-')[1]] = scrapedDataArray[index];
    console.log(resultObject);
  });
  return resultObject;
}
// Example usage:
(async function() {
  sessionStorage.clear();
  const ids = promptAndConvertToArray();
  const links = generateLinks(ids);
  const batchSize = 9;
  // Initialize session storage
  sessionStorage.setItem('scrapedData', JSON.stringify({}));
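  // NOTE: everything scraped so far is kept in sessionStorage (re-parsed and
  // re-serialized after every batch) until the CSV is written at the very end.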
  for (let i = 0; i < links.length; i += batchSize) {
    const batchLinks = links.slice(i, i + batchSize); // Get a batch of links
    try {
      // Scrape data for the current batch
      const result = await scrapeDataFromMultiplePages(batchLinks);
      // Merge the result into the session storage
      const storedData = JSON.parse(sessionStorage.getItem('scrapedData'));
      Object.assign(storedData, result);
      sessionStorage.setItem('scrapedData', JSON.stringify(storedData));
    } catch (error) {
      console.error('Error:', error);
    }
  }
  // Once all scraping is done, retrieve data from session storage and download CSV
  const scrapedData = JSON.parse(sessionStorage.getItem('scrapedData'));
  console.log(scrapedData);
  downloadCSV(scrapedData);
})();