Here’s the HTML code.
<!-- Sample of the listing markup: one .list-row holding two .img-item cards. -->
<!-- Each .img-item has an <img> wrapped in an <a>, plus an .in-lable div whose <a> (carrying the href) wraps a span.title. -->
<div class="list-row">
<div class="list-item">
<div class="imgframe">
<div class="img-wrap">
<div class="img-item">
<a href="">
<img src="img1">
</a>
<div class="in-lable">
<a href="link1">
<span class="title">title1</span>
</a>
</div>
</div>
<div class="img-item">
<a href="">
<img src="img2">
</a>
<div class="in-lable">
<a href="link2">
<span class="title">title2</span>
</a>
</div>
</div>
</div>
</div>
</div>
</div>
Here’s my puppeteer code.
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
/**
 * Opens the novel listing page and collects, for every `.img-item` card,
 * its title, image URL and link in a single pass over the DOM, then prints
 * the resulting array of `{ title, img, link }` objects.
 *
 * Errors are logged rather than rethrown; the browser is always closed.
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    targetFilter: (target) => target.type() !== "other",
  });
  try {
    // Created inside the try so a failure here still reaches browser.close()
    const page = await browser.newPage();
    const url = 'https://booktoki351.com/novel?book=%EC%9D%BC%EB%B0%98%EC%86%8C%EC%84%A4';
    await page.goto(url, { timeout: 50000, waitUntil: 'load' });
    await page.waitForSelector('#webtoon-list');

    const titlesAndImage = await page.evaluate(() => {
      const listItems = Array.from(document.querySelectorAll('.img-item'));
      return listItems.map((item) => {
        // The href lives on the <a> inside .in-lable, not on the div itself,
        // so querying the div for 'href' always yields null.
        const anchor = item.querySelector('.in-lable a');
        // Read the title from the span so surrounding whitespace of the
        // wrapper div is not included.
        const titleNode = item.querySelector('.in-lable .title');
        const image = item.querySelector('img');
        return {
          title: titleNode ? titleNode.textContent.trim() : null,
          img: image ? image.src : null,
          link: anchor ? anchor.getAttribute('href') : null,
        };
      });
    });

    console.log(titlesAndImage);
  } catch (error) {
    console.log(error);
  } finally {
    console.log('done');
    await browser.close();
  }
})();
And here’s my console.log testing.
// Image URL of every .img-item card.
Array.from(document.querySelectorAll('.img-item'), (card) => card.querySelector('img').src);
// Title text of every span.title nested inside an .img-item card.
Array.from(document.querySelectorAll('.img-item span.title'), (span) => span.textContent);
Both work, but I want to get them inside a single map over the img-item elements (or whatever selector fits the example HTML).
I’m expecting an output like this.
[
{
title: 'title1',
img: 'img1',
link: 'link1'
},
{
title: 'title2',
img: 'img2',
link: 'link2'
},
]
2
The following solution uses Puppeteer’s “multi-selector” `$$` to retrieve all elements with class `img-item` in one asynchronous operation. It then starts parallel asynchronous operations, one per `elem`, each of which resolves the DOM representation `e` and accesses its desired properties with the synchronous DOM operation `querySelector`.
// Fetch a handle for every .img-item element with a single asynchronous call.
const handles = await page.$$(".img-item");
// Evaluate all handles in parallel; inside the page, `e` is the DOM element
// behind the handle `elem`, so plain querySelector calls can be used on it.
const titlesAndImage = await Promise.all(
  handles.map((elem) =>
    elem.evaluate((e) => ({
      title: e.querySelector(".in-lable span").textContent,
      img: e.querySelector("img").src,
      link: e.querySelector(".in-lable a").href,
    }))
  )
);
7
Currently, this is what I did: I get the title and link separately from the image.
Maybe someone can help me merge them together into one process.
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
/**
 * Scrapes the novel listing page in two passes: one over `div.in-lable`
 * for each title and link, one over `div.img-item` for each image URL.
 * It then prints the title, link and image of the first entry.
 *
 * Errors are logged rather than rethrown; the browser is always closed.
 */
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    targetFilter: (target) => target.type() !== "other",
  });
  try {
    // Created inside the try so a failure here still reaches browser.close()
    const page = await browser.newPage();
    const url = 'https://booktoki351.com/novel?book=%EC%9D%BC%EB%B0%98%EC%86%8C%EC%84%A4';
    await page.goto(url, { timeout: 50000, waitUntil: 'load' });
    await page.waitForSelector('#webtoon-list');

    // get the title and link
    const titlesAndLink = await Promise.all(
      (await page.$$("div.in-lable")).map((elem) =>
        elem.evaluate((e) => ({
          title: e.querySelector("span.title").innerText,
          link: e.querySelector("a").href,
        }))
      )
    );

    // get the image
    const imageLink = await Promise.all(
      (await page.$$("div.img-item")).map((elem) =>
        elem.evaluate((e) => ({
          img: e.querySelector("img").src,
        }))
      )
    );

    // The first entry of each list is printed below, so fail with a clear
    // message instead of an opaque TypeError when either list is empty.
    if (titlesAndLink.length === 0 || imageLink.length === 0) {
      throw new Error('No div.in-lable / div.img-item elements found on the page');
    }

    console.log(titlesAndLink[0].title, titlesAndLink[0].link, imageLink[0].img);
  } catch (error) {
    console.log(error);
  } finally {
    console.log('done');
    await browser.close();
  }
})();
1