# 批量爬取NCBI
#我的网盘/博客素材/批量爬取NCBI
# 安装R包
#install.packages('RSelenium')
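# The next two lines are shell commands (run them in a terminal, not in R).
# They start the Selenium server that RSelenium connects to:
#   cd C:\Users\53446\Desktop\try2
#   java -Dwebdriver.gecko.driver="geckodriver.exe" -jar selenium-server-standalone-4.0.0-alpha-2.jar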
# 加载R包
library(rvest)
library(RSelenium)
# Set up the Selenium client ----
# The client talks to a Selenium server that must already be listening on
# localhost:4444. `browserName` may be "chrome" or "firefox".
remDr <- remoteDriver(
  remoteServerAddr = "localhost",
  port = 4444,
  browserName = "firefox"
)

# Start a browser session on the server.
remDr$open()
# Open the page to scrape ----
# Alternative queries kept for reference:
#   https://pubmed.ncbi.nlm.nih.gov/?term=m6a&page=1
#   https://pubmed.ncbi.nlm.nih.gov/?term=Demmer+RT+AND+%28y_5%5BFilter%5D%29
url <- "https://pubmed.ncbi.nlm.nih.gov/?term=%28Huang-Ge+Zhang%5BAuthor%5D%29+AND+%28University+of+Louisville%5BAffiliation%5D%29+AND+%28y_5%5BFilter%5D%29"

# Prerequisite: install the Scholarscope browser extension.
# NOTE(review): the `.Scholarscope_*` nodes scraped later are presumably
# injected by that extension -- confirm it is active in the driven browser.

# Navigate to the search results.
remDr$navigate(url)

# Inspect the raw page source, then the parsed HTML document.
remDr$getPageSource()
read_html(remDr$getPageSource()[[1]][1])
# Extract a single results page ----
# Take ONE snapshot of the rendered page and parse it once. Every field below
# is read from this same document, so the four vectors describe the same DOM
# state (re-fetching the source per field could observe different states and
# yield vectors that no longer line up row by row).
page <- read_html(remDr$getPageSource()[[1]][1])

# Journal of each article.
journal <- page %>%
  html_nodes(".Scholarscope_Journal") %>%
  html_text()
journal

# Impact factor of each article.
IF <- page %>%
  html_nodes(".Scholarscope_Factor") %>%
  html_text()
IF

# Publication year of each article.
time_up <- page %>%
  html_nodes(".Scholarscope_Year") %>%
  html_text()
time_up

# Article titles, with surrounding whitespace trimmed.
title <- page %>%
  html_nodes(".docsum-title") %>%
  html_text(trim = TRUE)
title

# Combine into one table. data.frame() needs the four vectors to line up:
# unequal lengths either raise an error or get recycled.
df <- data.frame(
  title = title,
  time_up = time_up,
  journal = journal,
  IF = IF
)
#https://pubmed.ncbi.nlm.nih.gov/?term=%28Huang-Ge+Zhang%5BAuthor%5D%29+AND+%28University+of+Louisville%5BAffiliation%5D%29+AND+%28y_5%5BFilter%5D%29
#https://pubmed.ncbi.nlm.nih.gov/?term=(Huang-Ge%20Zhang%5BAuthor%5D)%20AND%20(University%20of%20Louisville%5BAffiliation%5D)%20AND%20(y_5%5BFilter%5D)&page=2
# 构造网页
#url <- c()
#for (i in 1:2) {
# url <- c(url,paste('https://pubmed.ncbi.nlm.nih.gov/?term=m6a&page=',
# i,sep = ''))
#}
# Build the results-page URLs ----
# The query URL ends in `&page=`, so each page's URL is the base URL followed
# by the page number. paste0() is vectorised, so no loop or incremental
# c() growth is needed.
n_pages <- 3L
base_url <- "https://pubmed.ncbi.nlm.nih.gov/?term=(Huang-Ge%20Zhang%5BAuthor%5D)%20AND%20(University%20of%20Louisville%5BAffiliation%5D)%20AND%20(y_5%5BFilter%5D)&page="
url <- paste0(base_url, seq_len(n_pages))
# Batch extraction ----
# Visit every URL in `url`, scrape the same four fields from each page, and
# combine them into one table.

# One slot per page; filled inside the loop and flattened afterwards, so no
# vector is grown with c() on every iteration.
page_fields <- vector("list", length(url))

# seq_along() yields an empty sequence for an empty `url`, whereas
# 1:length(url) would iterate over c(1, 0).
for (i in seq_along(url)) {
  # Navigate to the i-th results page.
  remDr$navigate(url[[i]])

  # Take one snapshot of the page and parse it once; all four fields are read
  # from this same document so they stay aligned row by row.
  # NOTE(review): there is no explicit wait after navigate(); if the
  # `.Scholarscope_*` nodes are added asynchronously they may be missing
  # from the snapshot -- confirm against the live site.
  page <- read_html(remDr$getPageSource()[[1]][1])

  page_fields[[i]] <- list(
    # Journal of each article.
    journal = page %>% html_nodes(".Scholarscope_Journal") %>% html_text(),
    # Impact factor of each article.
    IF = page %>% html_nodes(".Scholarscope_Factor") %>% html_text(),
    # Publication year of each article.
    time_up = page %>% html_nodes(".Scholarscope_Year") %>% html_text(),
    # Article title, with surrounding whitespace trimmed.
    title = page %>% html_nodes(".docsum-title") %>% html_text(trim = TRUE)
  )
}

# Flatten each field across pages, preserving page order. as.character()
# keeps the result a character vector even when nothing was scraped.
journal <- as.character(unlist(lapply(page_fields, `[[`, "journal"), use.names = FALSE))
IF <- as.character(unlist(lapply(page_fields, `[[`, "IF"), use.names = FALSE))
time_up <- as.character(unlist(lapply(page_fields, `[[`, "time_up"), use.names = FALSE))
title <- as.character(unlist(lapply(page_fields, `[[`, "title"), use.names = FALSE))

# Combine into one table. data.frame() needs the four vectors to line up:
# unequal lengths either raise an error or get recycled.
df <- data.frame(
  title = title,
  time_up = time_up,
  journal = journal,
  IF = IF
)
暂无评论

发送评论 编辑评论


				
|´・ω・)ノ
ヾ(≧∇≦*)ゝ
(☆ω☆)
(╯‵□′)╯︵┴─┴
 ̄﹃ ̄
(/ω\)
∠( ᐛ 」∠)_
(๑•̀ㅁ•́ฅ)
→_→
୧(๑•̀⌄•́๑)૭
٩(ˊᗜˋ*)و
(ノ°ο°)ノ
(´இ皿இ`)
⌇●﹏●⌇
(ฅ´ω`ฅ)
(╯°A°)╯︵○○○
φ( ̄∇ ̄o)
ヾ(´・ ・`。)ノ"
( ง ᵒ̌皿ᵒ̌)ง⁼³₌₃
(ó﹏ò。)
Σ(っ °Д °;)っ
( ,,´・ω・)ノ"(´っω・`。)
╮(╯▽╰)╭
o(*////▽////*)q
>﹏<
( ๑´•ω•) "(ㆆᴗㆆ)
😂
😀
😅
😊
🙂
🙃
😌
😍
😘
😜
😝
😏
😒
🙄
😳
😡
😔
😫
😱
😭
💩
👻
🙌
🖕
👍
👫
👬
👭
🌚
🌝
🙈
💊
😶
🙏
🍦
🍉
😣
Source: github.com/k4yt3x/flowerhd
颜文字
Emoji
小恐龙
花!
上一篇
下一篇