import { appendFileSync, writeFileSync } from 'node:fs';

// Node 18+ provides fetch globally; on older versions a fetch polyfill is needed.
const baseUrl = 'https://replicate.npmjs.com/_all_docs';
const outputFile = 'npm_packages_metadata_7.json';
const pageSize = 1000;
const maxRetries = 3;
const retryDelay = 5000; // Delay in milliseconds (5 seconds)
let hasMore = true;
let lastDocId = '';
let step = 0; // Page counter used for the per-step timers
function delay(ms: number) {
  return new Promise(resolve => setTimeout(resolve, ms));
}
async function fetchPage(startkey: string | null, outputFile: string, retryCount = 0) {
  try {
    console.time(`Step ${step + 1} time`);
    const queryParams = new URLSearchParams({
      include_docs: 'true',
      limit: pageSize.toString(),
    });
    if (startkey) {
      queryParams.set('startkey', JSON.stringify(startkey));
      queryParams.set('skip', '1'); // Skip the first item to avoid duplicating the previous page's last doc
    }
    const url = `${baseUrl}?${queryParams.toString()}`;
    console.log(`[${new Date().toISOString()}] Fetching: ${url}`);
    const response = await fetch(url);
    console.log(`[${new Date().toISOString()}] Response status: ${response.status} ${response.statusText}, ok: ${response.ok}, redirected: ${response.redirected}`);
    if (!response.ok) {
      // Throw so the catch block below handles HTTP errors with the retry logic
      throw new Error(`Request failed with status ${response.status}`);
    }
    const data: any = await response.json();
    hasMore = data.rows.length === pageSize;
    console.log(`[${new Date().toISOString()}] Fetched ${data.rows.length} rows. HasMore: ${hasMore}`);

    // Process and save the data
    const docs = data.rows.map((row: any) => row.doc);
    if (docs.length > 0) {
      docs.forEach((doc: any, index: number) => {
        const jsonString = JSON.stringify(doc, null, 2);
        appendFileSync(outputFile, jsonString + ((hasMore || index < docs.length - 1) ? ',\n' : '\n'));
      });
      // Update the last document ID
      lastDocId = docs[docs.length - 1]._id;
    }
    step = step + 1;
    console.timeEnd(`Step ${step} time`);

    // Check if there's more data to fetch
    if (hasMore) {
      lastDocId = data.rows[data.rows.length - 1].id;
      console.log(`[${new Date().toISOString()}] Last key: ${lastDocId}`);
      await fetchPage(lastDocId, outputFile);
    }
  } catch (error) {
    console.log(`[${new Date().toISOString()}] An error occurred while fetching the data:`, error);
    if (retryCount < maxRetries) {
      console.log(`[${new Date().toISOString()}] Retrying (${retryCount + 1}/${maxRetries}) after ${retryDelay}ms...`);
      await delay(retryDelay);
      await fetchPage(startkey, outputFile, retryCount + 1);
    } else {
      console.log(`[${new Date().toISOString()}] Max retries reached. Skipping this page.`);
    }
  }
}
async function fetchAndSaveNpmDocs() {
  console.time('Total time');
  writeFileSync(outputFile, '[\n'); // Start the JSON array
  try {
    await fetchPage(null, outputFile);
  } catch (error) {
    console.log('An error occurred while fetching the data:', error);
  } finally {
    appendFileSync(outputFile, '{}]\n'); // Close the array; the empty object absorbs the trailing comma
    console.log('All package metadata has been saved to', outputFile);
    console.timeEnd('Total time');
  }
}
export { fetchAndSaveNpmDocs };
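For completeness, this is roughly how I run it (the file name './fetch-npm-docs' is just what I happen to save the module above as):

import { fetchAndSaveNpmDocs } from './fetch-npm-docs';

fetchAndSaveNpmDocs().catch(err => {
  console.error('Unhandled error:', err);
  process.exit(1);
});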
For research purposes, I'd like to get the metadata for all packages available on npm. How can I do this? I have tried several scripts; with the one above I managed to download around 9 GB of data, but it is not stable and keeps getting slower over time. Is there a more efficient way to do it?
I also saw in this paper (https://arxiv.org/pdf/2112.10165) that the authors took a snapshot of the npm registry, but they did not provide details. I contacted them, but there has been no response after a month.
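One thing I have been meaning to try is replacing the recursion with a plain loop, so each page is handled in a single stack frame and the long chain of pending awaits goes away. I am not sure that is actually the cause of the slowdown, but here is a rough sketch of the same pagination written iteratively (same endpoint and parameters as my script above; the function name is just for illustration, and I have left out the retry logic to keep it short):

import { appendFileSync, writeFileSync } from 'node:fs';

// Iterative version of the same _all_docs pagination: one while loop, no recursion.
async function fetchAllDocsIteratively(outputFile: string) {
  const baseUrl = 'https://replicate.npmjs.com/_all_docs';
  const pageSize = 1000;
  let startkey: string | null = null;
  let hasMore = true;

  writeFileSync(outputFile, '[\n'); // Start the JSON array
  while (hasMore) {
    const params = new URLSearchParams({ include_docs: 'true', limit: pageSize.toString() });
    if (startkey) {
      params.set('startkey', JSON.stringify(startkey));
      params.set('skip', '1'); // Skip the doc already written on the previous page
    }
    const response = await fetch(`${baseUrl}?${params}`);
    if (!response.ok) throw new Error(`Request failed with status ${response.status}`);
    const data: any = await response.json();

    hasMore = data.rows.length === pageSize;
    for (const row of data.rows) {
      appendFileSync(outputFile, JSON.stringify(row.doc) + ',\n');
    }
    if (data.rows.length > 0) {
      startkey = data.rows[data.rows.length - 1].id; // Resume from the last key on the next iteration
    }
  }
  appendFileSync(outputFile, '{}]\n'); // Empty object absorbs the final trailing comma
}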