# 批量爬取NCBI

#我的网盘/博客素材/批量爬取NCBI
# 安装R包
#install.packages('RSelenium')
# Run the two commands below in a terminal (not in the R console) to start
# the Selenium standalone server before executing the R code.
cd C:\Users\53446\Desktop\try2
# geckodriver.exe is the Firefox driver, so its path is passed through the
# webdriver.gecko.driver property (not webdriver.chrome.driver).
java -Dwebdriver.gecko.driver="geckodriver.exe" -jar selenium-server-standalone-4.0.0-alpha-2.jar
# 加载R包
library(rvest)
library(RSelenium)
# Create the client for the Selenium server on localhost:4444.
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444,
  browserName = "firefox" # "chrome" or "firefox"
)
# Start the browser session.
remDr$open()
# Search-result page to scrape. Earlier queries, kept for reference:
# url <- c('https://pubmed.ncbi.nlm.nih.gov/?term=m6a&page=1')
# https://pubmed.ncbi.nlm.nih.gov/?term=Demmer+RT+AND+%28y_5%5BFilter%5D%29
url <- "https://pubmed.ncbi.nlm.nih.gov/?term=%28Huang-Ge+Zhang%5BAuthor%5D%29+AND+%28University+of+Louisville%5BAffiliation%5D%29+AND+%28y_5%5BFilter%5D%29"
# Install Scholarscope (original note). The extraction code further down
# selects .Scholarscope_* elements -- presumably injected by that browser
# extension; TODO confirm it is active in the Selenium-driven browser.
## Navigate to the page
remDr$navigate(url)
# Show the raw page source, then the parsed HTML document.
remDr$getPageSource()
read_html(remDr$getPageSource()[[1]][1])
# Extract the fields from the single page that is currently loaded.
# The page source is fetched and parsed once so that all four columns are
# read from the same snapshot of the DOM.
page_html <- read_html(remDr$getPageSource()[[1]][1])
# Journal name of each article
journal <- page_html %>%
  html_nodes(".Scholarscope_Journal") %>%
  html_text()
journal
# Impact factor of each article
IF <- page_html %>%
  html_nodes(".Scholarscope_Factor") %>%
  html_text()
IF
# Publication year of each article
time_up <- page_html %>%
  html_nodes(".Scholarscope_Year") %>%
  html_text()
time_up
# Title of each article
title <- page_html %>%
  html_nodes(".docsum-title") %>%
  html_text(trim = TRUE)
title
# data.frame() silently recycles shorter columns when the lengths divide
# evenly, which would pair titles with the wrong journal/IF. Fail loudly
# instead when a field is missing for some article.
field_lengths <- c(
  title = length(title),
  time_up = length(time_up),
  journal = length(journal),
  IF = length(IF)
)
if (length(unique(field_lengths)) > 1L) {
  stop(
    "Scraped fields have different lengths: ",
    paste(names(field_lengths), field_lengths, sep = "=", collapse = ", "),
    call. = FALSE
  )
}
# Combine into one table
df <- data.frame(title = title,
                 time_up = time_up,
                 journal = journal,
                 IF = IF)

#https://pubmed.ncbi.nlm.nih.gov/?term=%28Huang-Ge+Zhang%5BAuthor%5D%29+AND+%28University+of+Louisville%5BAffiliation%5D%29+AND+%28y_5%5BFilter%5D%29
#https://pubmed.ncbi.nlm.nih.gov/?term=(Huang-Ge%20Zhang%5BAuthor%5D)%20AND%20(University%20of%20Louisville%5BAffiliation%5D)%20AND%20(y_5%5BFilter%5D)&page=2
# 构造网页
#url <- c()
#for (i in 1:2) {
 # url <- c(url,paste('https://pubmed.ncbi.nlm.nih.gov/?term=m6a&page=',
#                     i,sep = ''))
#}
# Build the URLs of the result pages to scrape: the search query followed
# by the page number. paste0() is vectorised, so no loop is needed.
n_pages <- 3L
query_url <- "https://pubmed.ncbi.nlm.nih.gov/?term=(Huang-Ge%20Zhang%5BAuthor%5D)%20AND%20(University%20of%20Louisville%5BAffiliation%5D)%20AND%20(y_5%5BFilter%5D)&page="
url <- paste0(query_url, seq_len(n_pages))
# Batch extraction: visit every URL in `url` and collect journal, impact
# factor, publication year and title of the listed articles.
# Each page is fetched and parsed exactly once, so the four fields of a page
# come from the same DOM snapshot.
scraped_pages <- lapply(url, function(page_url) {
  # Navigate to the page
  remDr$navigate(page_url)
  # Parse the rendered page source
  page_html <- read_html(remDr$getPageSource()[[1]][1])
  grab_text <- function(css, ...) {
    html_text(html_nodes(page_html, css), ...)
  }
  fields <- list(
    journal = grab_text(".Scholarscope_Journal"),          # journal name
    IF = grab_text(".Scholarscope_Factor"),                # impact factor
    time_up = grab_text(".Scholarscope_Year"),             # publication year
    title = grab_text(".docsum-title", trim = TRUE)        # article title
  )
  # A field missing for some article would shift every later row (and
  # data.frame() may silently recycle), so stop at the offending page.
  field_lengths <- lengths(fields)
  if (length(unique(field_lengths)) > 1L) {
    stop(
      "Scraped fields have different lengths on ", page_url, ": ",
      paste(names(field_lengths), field_lengths, sep = "=", collapse = ", "),
      call. = FALSE
    )
  }
  fields
})
# Concatenate one field across all pages, keeping page order.
collect_field <- function(field) {
  unlist(lapply(scraped_pages, `[[`, field), use.names = FALSE)
}
journal <- collect_field("journal")
IF <- collect_field("IF")
time_up <- collect_field("time_up")
title <- collect_field("title")
# Combine everything into one table
df <- data.frame(title = title,
                 time_up = time_up,
                 journal = journal,
                 IF = IF)

# 发送评论 编辑评论
#
# 发送评论编辑评论