I have taken over old code from someone who is no longer with the company.
I'm trying to fix his scrapers, since the websites they were based on have changed their layout dramatically.
Sadly, I still get errors from his code, even after making a few changes.
I changed the IDs it collects from, and changed the links as well:
Before:
// XPath building blocks for one page layout. Each *_XPATH_CLASS value is
// matched against the class attribute and the matching *_XPATH_ATTRIBUTES
// value is appended to form the full XPath.
const PATH_VARIATIONS = [
    {
        URL_XPATH_CLASS: 'job',
        URL_XPATH_ATTRIBUTES: '/header/h2/a/@href',
        TITLE_XPATH_CLASS: 'job',
        TITLE_XPATH_ATTRIBUTES: '/header/h2/a',
    },
];
After:
// XPath building blocks for one page layout. Each *_XPATH_CLASS value is
// matched against the class attribute and the matching *_XPATH_ATTRIBUTES
// value is appended to form the full XPath.
// NOTE(review): the URL path filters on 'jobs' while the title path filters
// on 'job clicky'. Titles and URLs are paired by index, so both paths must
// select the same number of nodes in the same order — verify against the
// current page markup.
const PATH_VARIATIONS = [
    {
        URL_XPATH_CLASS: 'jobs',
        URL_XPATH_ATTRIBUTES: '/header/h2/a/@href',
        TITLE_XPATH_CLASS: 'job clicky',
        TITLE_XPATH_ATTRIBUTES: '/header/h2/a',
    },
];
This is the code where it fails:
async getCurrentPageURLTitles(page, PAGE_SELECTOR) {
await page.goto(PAGE_SELECTOR, {
timeout: this.PAGE_TIMEOUT
})
.catch((value) => {
throw new Error("page.goto() → " + value);
});
let counter = 0;
let titles = [], urls = [], companies = [];
while (titles.length === 0 && counter < this.PATH_VARIATIONS.length) {
let currentObject = this.PATH_VARIATIONS[counter];
let candidateObj;
try {
if (currentObject.COMPANY_XPATH_CLASS === undefined) {
candidateObj = await this.tryPathVariationOnPage(
page,
currentObject.TITLE_XPATH_CLASS,
currentObject.TITLE_XPATH_ATTRIBUTES,
currentObject.URL_XPATH_CLASS,
currentObject.URL_XPATH_ATTRIBUTES);
console.log("CandidateObj: xpath is undefined " + candidateObj);
} else {
candidateObj = await this.tryPathVariationOnPage(
page,
currentObject.TITLE_XPATH_CLASS,
currentObject.TITLE_XPATH_ATTRIBUTES,
currentObject.URL_XPATH_CLASS,
currentObject.URL_XPATH_ATTRIBUTES,
currentObject.COMPANY_XPATH_CLASS,
currentObject.COMPANY_XPATH_ATTRIBUTES);
companies = candidateObj ? candidateObj.companyUrls : [];
console.log("CandidateObj: xpath is variation" + candidateObj);
console.log("companies: " + companies);
}
if (candidateObj) {
console.log("CandidateObj: " + JSON.stringify(candidateObj));
titles = candidateObj.titleList || [];
urls = candidateObj.urlList || [];
} else {
console.log("CandidateObj is undefined");
}
} catch (error) {
console.error("Error in tryPathVariationOnPage: ", error);
}
counter++;
}
if (titles.length === 0) {
throw new Error("No valid path found!");
}
return { PAGE_TITLES: titles, PAGE_URLS: urls, PAGE_COMPANY_URLS: companies };
}
/**
* Tries the path variations defined in PATH_VARIATIONS on the current page.
*
* @since 1.0.0
* @access private
*
* @param {Object} page The current page the scraper has reached.
* @param {String} titleClass XPath to the general element in which we are searching.
* @param {String} titleAttributes XPath to the specific children of titleClass XPath.
* @param {String} urlClass XPath to the element where the text representation of url is kept.
* @param {String} urlAttributes XPath to the specific child which keeps the text
*
* @returns {Promise<{titleList: Array, urlList: Array}>}
*/
async tryPathVariationOnPage(page, titleClass, titleAttributes, urlClass, urlAttributes, companyClass, companyAttributes) {
let titles = [], urls = [], company = [];
try {
// Sets the XPath to the elements.
let xPathTitleStr = `//*[contains(@class, "${titleClass}")]${titleAttributes}`;
//let xPathTitleStr = `//[@id="result_list_box"]/div/div[2]/div[2]/div/a[2]/b`
let xpathTitleData = await page.$x(xPathTitleStr)
.catch((error) => {
throw new Error("page.$x(): " + error);
});
console.log(xpathTitleData);
console-log("I got through the company title");
// Extract the title from the selected elements
for (let element of xpathTitleData) {
let title = await page.evaluate(el => el.getAttribute('title'), element);
titles.push(title);
}
let xpathCompany, xpathCompanyData;
if (companyClass !== undefined) {
xpathCompany = `//li[contains(@class, "${companyClass}")]${companyAttributes}`;
xpathCompanyData = await page.$x(xpathCompany)
.catch((error) => {
throw new Error("page.$x(): " + error)
})
}
let xPathUrlStr = `//*[contains(@class, "${urlClass}")]${urlAttributes}`;
//let xPathUrlStr = `//[@id="result_list_box"]/div/div[2]/div[2]/div/a[2]`
let xpathUrlData = await page.$x(xPathUrlStr)
.catch((error) => {
throw new Error("page.$x(): " + error);
});
// Runs through all advertisements with XPath on current page.
for (let i = 0; i < xpathTitleData.length; i++) {
// Retrieving elements from specific advertisement.
let xpathTitleTextContent = await xpathTitleData[i].getProperty('textContent')
.catch((error) => {
throw new Error("xpathTitleData.getProperty(): " + error);
});
let xpathUrlTextContent = await xpathUrlData[i].getProperty('textContent')
.catch((error) => {
throw new Error("xpathUrlData.getProperty(): " + error);
});
// Extracting the text values from gathered elements.
let titleText = await xpathTitleTextContent.jsonValue()
.catch((error) => {
throw new Error("xpathTitleTextContent.getProperty(): " + error);
});
titleText = titleText.trim();
let urlText = await xpathUrlTextContent.jsonValue()
.catch((error) => {
throw new Error("xpathUrlTextContent.getProperty(): " + error);
});
// If one property is empty, the advertisement is invalid.
if (titleText.length !== 0 && urlText !== 0) {
titles.push(titleText);
urls.push(urlText);
//company.push("https://www.jobindex.dk" + companyText)
}
}
// Run through company data for all ads on current page.
if (xpathCompanyData !== undefined) {
for (let i = 0; i < xpathCompanyData.length; i++) {
let xpathCompanyTextContent = await xpathCompanyData[i].getProperty('textContent')
.catch((error) => {
throw new Error("xpathCompanyData.getProperty(): " + error)
})
let companyText = await xpathCompanyTextContent.jsonValue()
.catch((error) => {
throw new Error("xpathCompanyTextContent.getProperty(): " + error);
})
company.push("https://www.jobindex.dk" + companyText)
}
}
return { titleList: titles, urlList: urls, companyUrls: company };
} catch (error) {
console.log("Error at getPageTitlesAndUrls() → " + error)
}
}
Specifically this is the part that is failing:
try {
// Sets the XPath to the elements.
let xPathTitleStr = `//*[contains(@class, "${titleClass}")]${titleAttributes}`;
//let xPathTitleStr = `//[@id="result_list_box"]/div/div[2]/div[2]/div/a[2]/b`
let xpathTitleData = await page.$x(xPathTitleStr)
.catch((error) => {
throw new Error("page.$x(): " + error);
});
console.log(xpathTitleData);
console-log("I got through the company title");
The errors I get are:
BEGINNING SCRAPING ON PAGE: 1
PAGE_SELECTOR: https://www.careerjet.dk/jobs?s=&l=Bornholm&nw=1&p=1
Error at getPageTitlesAndUrls() → TypeError: page.$x is not a function
CandidateObj: xpath is undefined undefined
CandidateObj is undefined
Sadly, the department I work for has no programmers and I'm an intern, so there is no one on the premises who can actually help.