ciao ragazzi vorrei capire perche il mio xpath cattura soltanto laprima inserzione all interno della pagina web qualcuno sa dirmi perche ? grazie
script:
#!/bin/bash
# Cartella contenente i file HTML salvati
input_folder="salvati"
# Nome del file del database
db_file="immo.db"
# Nome della tabella nel database
sGTable="bakeca"
# Controlla se il file del database esiste
if [[ ! -f "$db_file" ]]; then
echo "Errore: Il file del database '$db_file' non esiste."
exit 1
fi
# Loop attraverso i file HTML nella cartella 'salvati'
for file in "$input_folder"/bakeka_page_*.html; do
echo "Processando il file: $file"
# Utilizzo di xidel per estrarre gli annunci dal file HTML
xidel_output=$(xidel --xpath '
//div[contains(@class, "cursor-pointer border-b relative p-3 tablet:px-0 -mx-3 mobile:mx-0 annuncio-in-elenco bg-white")] ! string-join(
(
"price=" || normalize-space(//strong[@class="text--section text-base block"]),
"size=" || normalize-space(//span[@class="text-sm font-semibold"]),
"link=" || //a[@class="flex relative"]/@href,
"desc=" || normalize-space(//p[@class="text-sm mt-2 hidden tablet:block"])
),
codepoints-to-string(9)
)
' "$file")
# Loop through the lines and insert them into the SQLite database
while IFS= read -r line; do
# Extract price, size, link, and description values from the lines using awk
prezzo=$(awk -F 'price=' '{print $2}' <<< "$line" | awk -F 'size=' '{print $1}' | tr -d '€')
size=$(awk -F 'size=' '{print $2}' <<< "$line" | awk -F 'link=' '{print $1}' | grep -o '[0-9]+')
link=$(awk -F 'link=' '{print $2}' <<< "$line" | awk -F 'desc=' '{print $1}')
descrizione=$(awk -F 'desc=' '{print $2}' <<< "$line")
# Determine if the description contains "asta"
if [[ $descrizione =~ "asta" ]]; then
asta=1
else
asta=0
fi
# Insert the data into the SQLite database
sqlite3 "$db_file" "INSERT INTO $sGTable (prezzo, link, descrizione, metratura, asta) VALUES ('$prezzo', '$link', '$descrizione', '$size', $asta)"
done < <(echo "$xidel_output")
echo "Dati estratti e inseriti dal file $file"
done
# Rimuovi i record duplicati dalla tabella SQLite
sqlite3 "$db_file" "DELETE FROM $sGTable WHERE rowid NOT IN (SELECT MIN(rowid) FROM $sGTable GROUP BY prezzo, link, descrizione, metratura, asta)"
echo "Rimozione dei record duplicati completata."
echo "Elaborazione completata."
my web page:
you can download the page for look here
the page is to big
I expect to retrieve various links of listings, prices, descriptions, and square footage. I’ve tried different combinations of XPath but it always captures only the first listing. Can anyone help me with this?