import { appendFileSync, writeFileSync } from 'node:fs';

// Node 18+ provides fetch globally; on older versions a fetch polyfill is needed.
const baseUrl = 'https://replicate.npmjs.com/_all_docs';
const outputFile = 'npm_packages_metadata_7.json';
const pageSize = 1000;
const maxRetries = 3;
const retryDelay = 5000; // Delay in milliseconds (5 seconds)
let hasMore = true;
let lastDocId = '';
let step = 0; // Page counter used for the per-step timers
function delay(ms: number) {
  return new Promise(resolve => setTimeout(resolve, ms));
}
async function fetchPage(startkey: string | null, outputFile: string, retryCount = 0) {
  try {
    console.time(`Step ${step + 1} time`);
    const queryParams = new URLSearchParams({
      include_docs: 'true',
      limit: pageSize.toString(),
    });
    if (startkey) {
      queryParams.set('startkey', JSON.stringify(startkey));
      queryParams.set('skip', '1'); // Skip the first item to avoid duplicating the previous page's last doc
    }
    const url = `${baseUrl}?${queryParams.toString()}`;
    console.log(`[${new Date().toISOString()}] Fetching: ${url}`);
    const response = await fetch(url);
    console.log(`[${new Date().toISOString()}] Response status: ${response.status} ${response.statusText}, ok: ${response.ok}, redirected: ${response.redirected}`);
    if (!response.ok) {
      // Throw so the catch block below handles HTTP errors with the retry logic
      throw new Error(`Request failed with status ${response.status}`);
    }
    const data: any = await response.json();
    hasMore = data.rows.length === pageSize;
    console.log(`[${new Date().toISOString()}] Fetched ${data.rows.length} rows. HasMore: ${hasMore}`);

    // Process and save the data
    const docs = data.rows.map((row: any) => row.doc);
    if (docs.length > 0) {
      docs.forEach((doc: any, index: number) => {
        const jsonString = JSON.stringify(doc, null, 2);
        appendFileSync(outputFile, jsonString + ((hasMore || index < docs.length - 1) ? ',\n' : '\n'));
      });
      // Update the last document ID
      lastDocId = docs[docs.length - 1]._id;
    }
    step = step + 1;
    console.timeEnd(`Step ${step} time`);

    // Check if there's more data to fetch
    if (hasMore) {
      lastDocId = data.rows[data.rows.length - 1].id;
      console.log(`[${new Date().toISOString()}] Last key: ${lastDocId}`);
      await fetchPage(lastDocId, outputFile);
    }
  } catch (error) {
    console.log(`[${new Date().toISOString()}] An error occurred while fetching the data:`, error);
    if (retryCount < maxRetries) {
      console.log(`[${new Date().toISOString()}] Retrying (${retryCount + 1}/${maxRetries}) after ${retryDelay}ms...`);
      await delay(retryDelay);
      await fetchPage(startkey, outputFile, retryCount + 1);
    } else {
      console.log(`[${new Date().toISOString()}] Max retries reached. Skipping this page.`);
    }
  }
}
async function fetchAndSaveNpmDocs() {
  console.time('Total time');
  writeFileSync(outputFile, '[\n'); // Start the JSON array
  try {
    await fetchPage(null, outputFile);
  } catch (error) {
    console.log('An error occurred while fetching the data:', error);
  } finally {
    appendFileSync(outputFile, '{}]\n'); // Close the array; the empty object absorbs the trailing comma
    console.log('All package metadata has been saved to', outputFile);
    console.timeEnd('Total time');
  }
}
export { fetchAndSaveNpmDocs };
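For completeness, this is roughly how I run it (the file name './fetch-npm-docs' is just what I happen to save the module above as):

import { fetchAndSaveNpmDocs } from './fetch-npm-docs';

fetchAndSaveNpmDocs().catch(err => {
  console.error('Unhandled error:', err);
  process.exit(1);
});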
For research purposes, I'd like to get the metadata for all packages available on npm. How can I do this? I have tried several scripts; with the one above I managed to download around 9 GB of data, but it is not stable and keeps getting slower over time. Is there a more efficient way to do it?
I also saw in this paper (https://arxiv.org/pdf/2112.10165) that the authors took a snapshot of the npm registry, but they did not provide details. I contacted them, but there has been no response after a month.
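One thing I have been meaning to try is replacing the recursion with a plain loop, so each page is handled in a single stack frame and the long chain of pending awaits goes away. I am not sure that is actually the cause of the slowdown, but here is a rough sketch of the same pagination written iteratively (same endpoint and parameters as my script above; the function name is just for illustration, and I have left out the retry logic to keep it short):

import { appendFileSync, writeFileSync } from 'node:fs';

// Iterative version of the same _all_docs pagination: one while loop, no recursion.
async function fetchAllDocsIteratively(outputFile: string) {
  const baseUrl = 'https://replicate.npmjs.com/_all_docs';
  const pageSize = 1000;
  let startkey: string | null = null;
  let hasMore = true;

  writeFileSync(outputFile, '[\n'); // Start the JSON array
  while (hasMore) {
    const params = new URLSearchParams({ include_docs: 'true', limit: pageSize.toString() });
    if (startkey) {
      params.set('startkey', JSON.stringify(startkey));
      params.set('skip', '1'); // Skip the doc already written on the previous page
    }
    const response = await fetch(`${baseUrl}?${params}`);
    if (!response.ok) throw new Error(`Request failed with status ${response.status}`);
    const data: any = await response.json();

    hasMore = data.rows.length === pageSize;
    for (const row of data.rows) {
      appendFileSync(outputFile, JSON.stringify(row.doc) + ',\n');
    }
    if (data.rows.length > 0) {
      startkey = data.rows[data.rows.length - 1].id; // Resume from the last key on the next iteration
    }
  }
  appendFileSync(outputFile, '{}]\n'); // Empty object absorbs the final trailing comma
}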