#来源于https://mp.weixin.qq.com/s/u1XxeOs9gtVYtURD4dGB7Q
library(XML)
library(httr)
library(magrittr)
library(rvest)
library(xml2)
library(stringr)
library(tidyverse)
#devtools::install_github("ChristianAmes/HMDBScraper")
library(HMDBScraper)#包在github上,注意安装方式
# web <- read_html("https://hmdb.ca/bmi_metabolomics")
# pag1 <- web %>%
# html_table() %>%.[[1]]
# pag1
dir.create("./HMDB爬虫/")
#--第一部分:bmi代谢租数据#-----------
# 294是在网页直接看到的总条数,25是每页显示的条数。
pages = 1:ceiling(294 / 25)
url <- "http://www.hmdb.ca/bmi_metabolomics?page="
url_all <- paste(url, pages, sep="")
for (i in pages) {
tem = url_all[i] %>%
read_html %>%
html_table() %>%.[[1]]
if (i == 1) {
dat = tem
} else {
dat = rbind(dat,tem)
}
}
head(dat)
write.csv(dat,"./HMDB爬虫/bim_HMDB.csv")
#--第二部分:age代谢租数据#-----------
pages = 1:ceiling(418 / 25)
url <- "https://hmdb.ca/age_metabolomics?page="
url_all <- paste(url, pages, sep="")
for (i in pages) {
tem = url_all[i] %>%
read_html %>%
html_table() %>%.[[1]]
if (i == 1) {
dat1 = tem
} else {
dat1 = rbind(dat1,tem)
}
}
dim(dat1)
head(dat1)
write.csv(dat1,"./HMDB爬虫/age_HMDB.csv")
#--第三部分:gender代谢租数据#-----------
pages = 1:ceiling(515 / 25)
url <- "https://hmdb.ca/gender_metabolomics?page="
url_all <- paste(url, pages, sep="")
for (i in pages) {
tem = url_all[i] %>%
read_html %>%
html_table() %>%.[[1]]
if (i == 1) {
dat2 = tem
} else {
dat2 = rbind(dat2,tem)
}
}
dim(dat2)
head(dat2)
write.csv(dat2,"./HMDB爬虫/gender_HMDB.csv")
#--第四部分:geno代谢租数据#-----------
pages = 1:ceiling(6777 / 25)
url <- "https://hmdb.ca/geno_metabolomics?page="
url_all <- paste(url, pages, sep="")
for (i in pages) {
tem = url_all[i] %>%
read_html %>%
html_table() %>%.[[1]]
if (i == 1) {
dat3 = tem
} else {
dat3 = rbind(dat3,tem)
}
}
head(dat3)
dim(dat3)
write.csv(dat3,"./HMDB爬虫/geno_HMDB.csv")
#--第五部分:pharmaco代谢租数据#-----------
pages = 1:ceiling(2497 / 25)
url <- "https://hmdb.ca/pharmaco_metabolomics?page="
url_all <- paste(url, pages, sep="")
for (i in pages) {
tem = url_all[i] %>%
read_html %>%
html_table() %>%.[[1]]
if (i == 1) {
dat4 = tem
} else {
dat4 = rbind(dat4,tem)
}
}
head(dat4)
dim(dat4)
write.csv(dat4,"./HMDB爬虫/pharmaco_HMDB.csv")
#--第六部分:diurnal代谢租数据#-----------
pages = 1:ceiling(2315 / 25)
url <- "https://hmdb.ca/pharmaco_metabolomics?page="
url_all <- paste(url, pages, sep="")
for (i in pages) {
tem = url_all[i] %>%
read_html %>%
html_table() %>%.[[1]]
if (i == 1) {
dat5 = tem
} else {
dat5 = rbind(dat5,tem)
}
}
head(dat5)
dim(dat5)
write.csv(dat5,"./HMDB爬虫/diurnal_HMDB.csv")
############合并数据
datall = data.frame(ID = c(dat$Metabolite,
dat1$Metabolite,
dat2$Metabolite,
dat3$Metabolite,
dat4$Metabolite,
dat5$Metabolite))
tail(datall,20)
datall$metabolite =
sapply(strsplit(datall$ID, "[(]HMDB"), `[`, 1)
datall$HMDB_ID =
sapply(strsplit(datall$ID, "[(]HMDB"), `[`, 2)
datall$HMDB_ID = gsub(")","",datall$HMDB_ID)
datall$HMDB_ID = paste("HMDB",datall$HMDB_ID,sep = "")
write.csv(datall,"./HMDB爬虫/all_HMDB_ID_metabolites.csv")
#--获取代谢物全部信息
get.entry<- function(id, prefix= "http://www.hmdb.ca/metabolites/",check_availability=T){
if(check_availability){
if(!check.availability(id)){ stop(paste(id, " id could not be found"))}
}
#create link
link<- paste(prefix,id,".xml",sep= "")
#download data
txt<- try(readLines(link))
#error handling code, just try another 3 times
i<-1
while(inherits(txt, "try-error"))
{
print(paste("Retrieving has failed for ",i,". time",sep=""))
txt<- try(readLines(link))
i= i+1
if(i>3) stop("Could not find Server or ID")
}
#process data to be usable
data<- XML::xmlTreeParse(txt,asText= T)
data<- XML::xmlToList(data)
return (data)
}
tem = read.csv("./HMDB爬虫/all_HMDB_ID_metabolites.csv",row.names = 1)
head(tem)
id0 = tem$HMDB_ID
A = c()
B = c()
C = c()
D = c()
E = c()
G = c()
#for (i in 373:length(id0)) {
for (i in 1:2) {
dat = try(get.entry(id = id0[i]) , silent = FALSE)
# 提取kegg id
if (!is.null(dat$kegg_id)) {
A[i] = dat$kegg_id
} else{
A[i] = ""
}
if (!is.null(dat$taxonomy$description)) {
# 提取描述信息
B[i] =dat$taxonomy$description
} else{
B[i] = ""
}
if (!is.null(dat$taxonomy$kingdom)) {
#-提取kingdom分类
C[i] = dat$taxonomy$kingdom
} else{
B[i] = ""
}
if (!is.null(dat$taxonomy$super_class)) {
# 提取super_class分类
D[i] = dat$taxonomy$super_class
} else{
B[i] = ""
}
if (!is.null(dat$taxonomy$class)) {
# 提取class
E[i] = dat$taxonomy$class
} else{
B[i] = ""
}
if (!is.null(dat$taxonomy$sub_class)) {
# 提取sub_class分类
G[i]= dat$taxonomy$sub_class
} else{
B[i] = ""
}
print(i)
}
# 合并全部内容
tax.hmdb = data.frame(
KEGGID = A,
Descrip = B,
Kingdom = C,
Super_class = D,
Class = E,
Sub_class = G)
暂无评论