Getting data from nested nodes in RSelenium

I am doing a literature review, so I am trying to collect the list of papers returned by certain search terms on Google Scholar. I am having trouble getting the PDF links when they are available: some references have a PDF, some do not, and others have up to two. I am only interested in the first one.

The nodes with the PDF links are located under ".gs_or_ggsm a". The problem is that when I try to extract that information with:

PDFs <- page %>% html_nodes(".gs_or_ggsm a") %>% html_attr("href")

or

PDFs <- page %>% rvest::html_nodes(".gs_or_ggsm a") %>%
        rvest::html_attr("href") %>% 
        .[startsWith(., "https://")] %>%
        .[!startsWith(., "/scholar?")]

...the number of rows I get does not match the number of references. Is there a way to get NA for references with no PDF and, for the others, to collect only the first link?

The image below shows an example of a full reference. Note how deeply the nodes are nested and how there are two href attributes at the end; I would need only the one highlighted in blue.

[Screenshot of a Google Scholar search result showing the nested nodes and the two href attributes]
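To make the mismatch concrete, here is a minimal sketch with invented markup (the class names mirror the selectors above, but the HTML itself is hypothetical). A reference with no PDF link contributes nothing to the extracted vector, while one with two links contributes twice, so the hrefs can no longer be matched back to the titles:

library(rvest)

# Hypothetical markup: reference 1 has one PDF link, reference 2 has none,
# and reference 3 has two.
example <- read_html('
  <div class="gs_r gs_or gs_scl">
    <div class="gs_or_ggsm"><a href="https://a.org/1.pdf">[PDF]</a></div>
  </div>
  <div class="gs_r gs_or gs_scl"></div>
  <div class="gs_r gs_or gs_scl">
    <div class="gs_or_ggsm">
      <a href="https://c.org/3.pdf">[PDF]</a>
      <a href="https://c.org/3-mirror.pdf">[PDF]</a>
    </div>
  </div>')

# Page-wide matching flattens every hit into a single vector: three hrefs
# for three references, with reference 2 silently skipped and reference 3
# counted twice.
example %>% html_nodes(".gs_or_ggsm a") %>% html_attr("href")

The full script follows: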

library(tidyverse)
library(RSelenium)
library(rvest)
library(httr)
library(netstat)

TERM="Eriophyidae"

rD <- rsDriver(browser = "firefox",
               chromever = NULL,
               port = free_port())
remDr <- rD$client

# Function to extract data from a page
extract_data <- function(page_source) {
  page <- read_html(page_source)
  titles <- page %>% html_nodes(".gs_rt") %>% html_text()
  authors <- page %>% html_nodes(".gs_a") %>% html_text()
  years <- str_extract(authors, "\\d{4}")
  authors <- str_replace(authors, "\\d{4}", "")
  urls <- page %>% html_nodes(".gs_rt a") %>% html_attr("href")

 # PDFs <- page %>% html_nodes(".gs_or_ggsm a")%>%  html_attr("href") #This is the line with a problem

  data.frame(Title_name = titles, Author_Names = authors, Year_Publication = years, Title_URL = urls)  #, PDF = PDFs)
}


# Function to search for a specific term on Google Scholar
search_google_scholar <- function(term) {
  tryCatch({
    remDr$navigate("https://scholar.google.com/")
    search_box <- remDr$findElement("css", "#gs_hdr_tsi")
    search_box$sendKeysToElement(list(term, key="enter"))
    Sys.sleep(5) # Allow time for page to load

    pages <- 2 # Number of pages to scrape 
    results <- data.frame()

    for (page in 1:pages) {
      page_source <- remDr$getPageSource()[[1]]
      page_data <- extract_data(page_source)
      results <- rbind(results, page_data)
  
      # findElements() returns an empty list when there is no next link,
      # whereas findElement() would throw an error before the length check
      next_button <- remDr$findElements("css", "#gs_n a")
      if (length(next_button) == 0) {
        break
      } else {
        next_button[[1]]$clickElement()
        Sys.sleep(5) # Allow time for page to load
      }
    }

    return(results)
  }, error = function(e) {
    message("An error occurred: ", conditionMessage(e))
    NULL
  })
}


search_results <- search_google_scholar(TERM)

# Close the browser
remDr$close()

# Stop the Selenium server
rD$server$stop()

There is 1 answer

Answered by Camilo:

I figured out a solution: loop through each reference and get its PDF link, returning NA when none is available.

# One node per reference
AllPDFs <- page %>% rvest::html_nodes(".gs_r.gs_or.gs_scl")

PDFs <- c()
for (i in seq_along(AllPDFs)) {
  P1 <- AllPDFs[i]
  PDFi <- P1 %>%
    rvest::html_nodes(".gs_or_ggsm a") %>%
    rvest::html_attr("href") %>%
    .[startsWith(., "https://")] %>%
    .[!startsWith(., "/scholar?")]
  if (length(PDFi) == 0) PDFi <- NA  # no PDF available for this reference
  PDFs <- c(PDFs, PDFi[1])           # keep only the first link
}
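The same per-reference logic can also be written without an explicit loop. As a sketch of an alternative (assuming the .gs_r.gs_or.gs_scl containers cover every reference), rvest's html_node() (the singular form; html_element() in newer versions) returns at most one match per container and yields a missing node where there is none, which html_attr() then turns into NA:

# One value per reference: the first PDF href if present, otherwise NA
PDFs <- page %>%
  rvest::html_nodes(".gs_r.gs_or.gs_scl") %>%  # one node per reference
  rvest::html_node(".gs_or_ggsm a") %>%        # first PDF anchor, or a missing node
  rvest::html_attr("href")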