Thiết kế website giá rẻ

Question

I would like to scrape the hierarchy of institutions of German universities. The attached script successfully scrapes the div-classes of the individual institutions, but unfortunately I am not able to recreate the paths realistically in the data preparation. My question is whether anyone has an idea of how to create the paths correctly during the scraping process.

What I would need is a path variable that looks like this, for example:

https://www.gerit.org/de/institutiondetail/10282

Fakultät 1: Wirtschafts- und Sozialwissenschaftliche Fakultät > Fakultätsbereich Soziologie und Sozialpsychologie > Institut für Soziologie und Sozialpsychologie (ISS) > Lehrstuhl für Wirtschafts- und Sozialpsychologie > Professur Hölzl

Here is my script so far:

#////////////////////////////////////////////////////////////
# Scrape Gerit
#//////////////////////////////////////////////////////////// 

# loading needed libraries -----------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, rvest, xml2, readr,RSelenium,openxlsx)

#////////////////////////////////////////////////////////////
# START RSELENIUM
#//////////////////////////////////////////////////////////// 

# Launch a Selenium server together with a Chrome browser session.
driver <- rsDriver(
  port = 4112L,
  browser = "chrome",
  chromever = "125.0.6422.60"
)

# Handle on the browser client of the session started above; the scraping
# code drives the browser through this object.
rmdr <- driver$client


    
#' Scrape the list of associated institutions of one university on gerit.org
#'
#' Checks that `url` is reachable, opens it in the Selenium-driven browser,
#' clicks the first link inside the `associatedData` element, and reads the
#' institution names, their links and their hierarchy-level texts from the
#' rendered page. Each linked institution page is then fetched to collect the
#' text of its `.institutionFach` nodes.
#'
#' Relies on the global RSelenium client `rmdr` being available.
#'
#' @param url URL of the university's detail page on gerit.org.
#' @param uni_name Name of the university; stored in the `HS` column.
#' @return A tibble. On success it has the columns `Einrichtung`,
#'   `Level_0` ... `Level_6`, `noMoreSubs`, `Einrichtung_url`,
#'   `Einrichtung_Fach` (a list column) and `HS`. If `url` cannot be read, a
#'   one-row tibble with `NA` in `Einrichtung`, `Level_0`, `Einrichtung_url`
#'   and `Einrichtung_Fach` is returned instead.
scrape_gerit <- function(url, uni_name) {
  # Download and parse a page. Failure is signalled with NULL (plus a
  # warning): a parsed document is list-like, so is.na() on it does not yield
  # the single TRUE/FALSE that `if` requires, whereas is.null() always does.
  fetch_page <- function(target_url) {
    tryCatch(
      read_html(target_url),
      error = function(e) {
        warning(paste("Fehler beim Abrufen der Seite:", target_url))
        NULL
      }
    )
  }

  # Reachability check only; the content itself is read through the browser.
  if (is.null(fetch_page(url))) {
    return(tibble(
      Einrichtung = NA,
      Level_0 = NA,
      Einrichtung_url = NA,
      Einrichtung_Fach = NA,
      HS = uni_name
    ))
  }

  rmdr$navigate(url)
  rmdr$findElement(
    "xpath",
    '//*[(@id = "associatedData")]//a[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//span'
  )$clickElement()

  # Parse the rendered page once so that every column below is derived from
  # the same snapshot of the DOM.
  rendered <- read_html(rmdr$getPageSource()[[1]])

  Einrichtung <- rendered %>%
    html_nodes(".associated-institution") %>%
    html_text()

  hrefs <- rendered %>%
    html_nodes(" #associated a") %>%
    html_attr("href")

  # paste0() would turn an empty href vector into the single string
  # "https://gerit.org", which would then be scraped as if it were an
  # institution page; keep the vector empty instead.
  Einrichtung_url <- if (length(hrefs) > 0) {
    paste0("https://gerit.org", hrefs)
  } else {
    character(0)
  }

  hierarchy_nodes <- html_nodes(rendered, ".associated-contend")
  node_classes <- html_attr(hierarchy_nodes, "class")
  node_texts <- html_text(hierarchy_nodes)

  # One entry per hierarchy node: the node text where the node's class
  # attribute contains `marker`, NA otherwise.
  text_where_class <- function(marker) {
    out <- node_texts
    out[!(str_detect(node_classes, marker) %in% TRUE)] <- NA_character_
    out
  }

  # Collect the `.institutionFach` texts of one institution page; NA if the
  # page cannot be read.
  extract_data <- function(detail_url) {
    message("Starte scraping von: ", detail_url)
    detail_page <- fetch_page(detail_url)
    if (is.null(detail_page)) {
      return(NA)
    }
    data <- detail_page %>%
      html_nodes(".institutionFach") %>%
      html_text()
    message("Scraping beendet")
    data
  }

  results <- lapply(Einrichtung_url, extract_data)

  # NOTE(review): the columns come from three different selectors; tibble()
  # will fail if they do not yield the same number of nodes - confirm against
  # the live page structure.
  tibble(
    Einrichtung = Einrichtung,
    Level_0 = text_where_class("level0"),
    Level_1 = text_where_class("level1"),
    Level_2 = text_where_class("level2"),
    Level_3 = text_where_class("level3"),
    Level_4 = text_where_class("level4"),
    Level_5 = text_where_class("level5"),
    Level_6 = text_where_class("level6"),
    noMoreSubs = text_where_class("noMoreSubs"),
    Einrichtung_url = Einrichtung_url,
    Einrichtung_Fach = results,
    HS = uni_name
  )
}


# Universities to scrape: display name and the matching gerit.org detail page.
gerit_unis <- data.frame(
  Hochschule = c(
    "Universität zu Köln",
    "Technische Universität München",
    "Rheinisch-Westfälische Technische Hochschule Aachen"
  ),
  Gerit_Url = c(
    "https://gerit.org/de/institutiondetail/10282",
    "https://gerit.org/de/institutiondetail/10125",
    "https://gerit.org/de/institutiondetail/10293"
  )
)

# Scrape every university listed in gerit_unis; the result is a list with one
# tibble per row. seq_len() keeps the loop empty for a zero-row table, where
# 1:nrow() would iterate over c(1, 0). The Selenium server is stopped in
# `finally`, so it is shut down even when a scrape fails part-way through.
ergebnisse <- tryCatch(
  lapply(seq_len(nrow(gerit_unis)), function(i) {
    res <- scrape_gerit(
      url = gerit_unis$Gerit_Url[i],
      uni_name = gerit_unis$Hochschule[i]
    )
    res$HS <- gerit_unis$Hochschule[i]
    res
  }),
  finally = driver[["server"]]$stop()
)

# Make sure the output directory exists so the scraped results are not lost
# to a failing save().
dir.create("Data", showWarnings = FALSE, recursive = TRUE)
save(ergebnisse, file = "Data/ergebnisse_ext.rda")

Thiết kế website giá rẻ

Danh mục

scrape path of Institutions in gerit.org