I have a pupeteer sharp scraper in .NET Core 6 that basically looks for a date in a <nobr>
tag (that is inside and iframe in the page) and then returns the coordinates of the clickable date.
Here’s a screenshot of the page:
Page to be scraped
Basically I want to get the 10/09/2024 coordinates and click it with pupeteer sharp, and it works perfectly locally but in the AWS lambda it “misses” and clicks 09/09/2024, which messes everything up.
Here’s is my function for getting the coordinates
private static async Task<dynamic> GetCoordinatesWithStringAsync(IFrame _IFrame, dynamic iframeRect, string text, GetCoordinatesOptions? options = null)
{
options ??= new GetCoordinatesOptions();
var tag = options.Tag;
var scroll = options.Scroll;
var isSecondTdPrice = options.IsSecondTdPrice;
var className = options.Class;
var selectLower = options.SelectLower;
var result = await _IFrame.EvaluateFunctionAsync<dynamic>(@"
async (text, tag, iframeRect, scroll, isSecondTdPrice, className, selectLower) => {
const findTagRecursively = (container) => {
if (container.tagName.toLowerCase() === tag && (className === null || container.className.includes(className))) {
console.log(`Browser: found the ${tag} element with value: ` + container.textContent.trim());
return container;
}
for (let child of container.children) {
const found = findTagRecursively(child);
if (found) return found;
}
return null;
};
const waitForElementAndAdjustCoordinates = (query, timeout) => new Promise((resolve, reject) => {
let interval = 100; // Check every 100 ms
let totalTime = 0;
const checker = setInterval(() => {
const tags = document.querySelectorAll(tag);
for (let tag of tags) {
if (scroll) {
tag.scrollIntoView();
}
if (tag && (tag.textContent.trim() === text.trim()) && (className === null || tag.className.includes(className))) {
if (scroll) {
tag.scrollIntoView();
setTimeout(() => {}, 3000);
}
clearInterval(checker);
if (isSecondTdPrice)
{
const secondTd = tag.closest('tr').querySelector('td:nth-child(2)');
var secondTdTag = findTagRecursively(secondTd);
if (secondTdTag && secondTdTag.textContent.trim() === '-')
{
resolve({
Rect: null,
Failed: false,
Message: 'No hay liquidaciones para la fecha de ayer.',
ThrowError: false
});
return;
}
}
if (scroll) {
console.log('Browser: scrolling to the element')
tag.scrollIntoView();
setTimeout(() => {}, 3000);
}
const rect = tag.getBoundingClientRect();
console.log(rect);
console.log(iframeRect);
const coords = {
x: rect.left + (rect.width / 2) + iframeRect.left,
y: rect.top + (rect.height / 2) + iframeRect.top
};
if (selectLower === true)
{
console.log(rect.top)
console.log(rect.height)
console.log(iframeRect.top)
console.log(rect.top + (rect.height /2) + iframeRect.top)
coords.y = rect.top + (rect.height * 3 / 4) + iframeRect.top;
console.log(coords.y)
// get tag at coords and print it to confirm
const element = document.elementFromPoint(coords.x-iframeRect.left, coords.y-iframeRect.top);
if (element) {
console.log(element.outerHTML); // This will print the full HTML of the element
} else {
console.log('No element found at the specified coordinates.');
}
}
resolve({
Rect: coords,
Failed: false,
Message: null
});
return;
}
}
totalTime += interval;
if (totalTime >= timeout) {
clearInterval(checker);
resolve({
Rect: null,
Failed: true,
Message: 'No se pudo obtener las coordenadas del elemento con el texto ' + text
}); // Element wasn't found within the timeout
}
}, interval);
});
return await waitForElementAndAdjustCoordinates(text, 10000); // Wait up to 10 seconds
}
", text, tag, iframeRect, scroll, isSecondTdPrice, className, selectLower);
EvaluationResult evaluationResult = result.ToObject<EvaluationResult>();
if (evaluationResult.Rect != null)
{
return evaluationResult.Rect;
}
else if (evaluationResult.Message != null)
{
var throwError = evaluationResult.ThrowError ?? true;
if (!throwError)
{
return new { hasData = false };
}
throw new ScrapingException(evaluationResult.Message, evaluationResult.Failed);
}
else
{
throw new ScrapingException("No se pudo obtener las coordenadas del elemento con el texto " + text, evaluationResult.Failed);
}
}
Last detail: there are 2 identical pages as the one in the screenshot, one for debit transactions and one for credit transactions. They look the same and i use the same code on both to get the day 10/09/2024 and yet on AWS lambda it gets it right in the credit tab and after extracting all the data and going to the debit tab it misses the date and clicks the one above for some reason.
lambda logs of the coordinates
My AWS lambda is also using Windows, but I’m not exactly sure how to compare the chromium versions because that may also be the cause.
Again, very strange problem that I can’t really find a way to debug properly since it only happens in the AWS lambda.
Tried replicating the problem locally by throttling the network and changing window size, console logged every coordinate but cant replicate the problem locally.
Also pupeteer functions never work which is why I’m doing javascript magic