library(KEGGREST)
library(readxl)
library(tidyverse)
library(openxlsx)
library(tidyverse)
library(magrittr)
#######读取数据,mzcloud阈值大于80,
CD_c18_pos<-read_xlsx("CD-c18-pos2.1.xlsx",sheet = "Compounds")
CD_c18_pos <- CD_c18_pos %>%
filter(!is.na(`mzCloud Best Match`)) %>%
filter(`mzCloud Best Match` > 80) %>%
group_by(Name)
CD_c18_pos_max<- CD_c18_pos %>%
summarise_all(max)
fecal<-CD_c18_pos_max
fecal$Name <- gsub("^\\(\\+\\)-", "", fecal$Name)
fecal$Name <- gsub("^\\(\\-\\)-", "", fecal$Name)
fecal$Name <- gsub("^\\(\\+/-\\)-", "", fecal$Name)
fecal$Name <- gsub("^\\(\\+/-\\)", "", fecal$Name)
fecal$Name <- gsub("^\\(\\±/-\\)", "", fecal$Name)
fecal$Name <- gsub("^\\(±\\)", "", fecal$Name)
fecal$Name <- gsub("β", "beta", fecal$Name)
fecal$Name <- gsub("α", "alpha", fecal$Name)
fecal <- fecal[!grepl("^NP-", fecal$Name), ]
dim(fecal)
###########
get_kegg_id <- function(name) {
kegg_id <- tryCatch(
{
keggFind("compound", name)
},
error = function(e) {
NA
}
)
return(kegg_id) # 返回KEGG ID或NA
}
kegg <- lapply(fecal$Name, get_kegg_id)
split_values <- data.frame(unlist(kegg))
split_values2 <- data.frame(unlist(strsplit(split_values$unlist.kegg., "; ")))
colnames(split_values2)<-"kegg"
data <- inner_join(fecal, split_values2, by = c("Name" = "kegg"))
data<-data[,1]
kegg_ids <- lapply(data$Name, get_kegg_id)
new_df<-c()
for (i in 1:length(kegg_ids)) {
kegg_id <- kegg_ids[[i]]
kegg_id_df <- as.data.frame(print(kegg_id))
kegg_id_df$name <- row.names(kegg_id_df)
name <- row.names(kegg_id_df)
colnames(kegg_id_df) <- c("kegg_key", "cpd")
for (j in 1:nrow(kegg_id_df)) {
split_values <- unlist(strsplit(kegg_id_df$kegg_key[j], "; "))
new_rows <- data.frame(kegg_key = split_values, cpd = kegg_id_df$cpd[j])
new_df <- rbind(new_df, new_rows)
}
}
df <- unique(new_df[new_df$kegg_key %in% data$Name,])
colnames(df)<-c("Name","cpd")
fecal_kegg<-left_join(df,fecal,by="Name")
distinct_fecal_kegg <- distinct(fecal_kegg, Name, .keep_all = TRUE)
distinct_fecal_kegg$cpd <- gsub("cpd:", "", distinct_fecal_kegg$cpd)
dim(distinct_fecal_kegg)
#> dim(distinct_fecal_kegg)
#[1] 88 100
#即CD-c18-pos2.1.xlsx含KEGG的ID有88
MetDNA_C18_pos<-read_xlsx("MetDNA_C18_pos.xlsx",sheet = "MetDNA_C18_pos")
dim(MetDNA_C18_pos)
#> dim(MetDNA_C18_pos)
#[1] 4489 69
MetDNA_C18_pos<- MetDNA_C18_pos %>%
filter(`id_kegg` != "NA")
dim(MetDNA_C18_pos)
#去除id_kegg为NA的行
#> dim(MetDNA_C18_pos)
#[1] 4364 69
inner<-inner_join(distinct_fecal_kegg,MetDNA_C18_pos,by=c("cpd"="id_kegg"))
dim(inner)
inner_bijiao<-inner %>% select(Name, name,confidence_level,everything())
dim(inner_bijiao)
#> dim(inner_bijiao)
#[1] 57 168
inner_level1<-inner_bijiao[inner_bijiao$confidence_level=="level1",]
dim(inner_level1)
#> dim(inner_level1)
#[1] 39 168
write.xlsx(inner_level1,"inner_level1.xlsx")
暂无评论