library(KEGGREST) library(readxl) library(tidyverse) library(openxlsx) library(tidyverse) library(magrittr) #######读取数据,mzcloud阈值大于80, CD_c18_pos<-read_xlsx("CD-c18-pos2.1.xlsx",sheet = "Compounds") CD_c18_pos <- CD_c18_pos %>% filter(!is.na(`mzCloud Best Match`)) %>% filter(`mzCloud Best Match` > 80) %>% group_by(Name) CD_c18_pos_max<- CD_c18_pos %>% summarise_all(max) fecal<-CD_c18_pos_max fecal$Name <- gsub("^\\(\\+\\)-", "", fecal$Name) fecal$Name <- gsub("^\\(\\-\\)-", "", fecal$Name) fecal$Name <- gsub("^\\(\\+/-\\)-", "", fecal$Name) fecal$Name <- gsub("^\\(\\+/-\\)", "", fecal$Name) fecal$Name <- gsub("^\\(\\±/-\\)", "", fecal$Name) fecal$Name <- gsub("^\\(±\\)", "", fecal$Name) fecal$Name <- gsub("β", "beta", fecal$Name) fecal$Name <- gsub("α", "alpha", fecal$Name) fecal <- fecal[!grepl("^NP-", fecal$Name), ] dim(fecal) ########### get_kegg_id <- function(name) { kegg_id <- tryCatch( { keggFind("compound", name) }, error = function(e) { NA } ) return(kegg_id) # 返回KEGG ID或NA } kegg <- lapply(fecal$Name, get_kegg_id) split_values <- data.frame(unlist(kegg)) split_values2 <- data.frame(unlist(strsplit(split_values$unlist.kegg., "; "))) colnames(split_values2)<-"kegg" data <- inner_join(fecal, split_values2, by = c("Name" = "kegg")) data<-data[,1] kegg_ids <- lapply(data$Name, get_kegg_id) new_df<-c() for (i in 1:length(kegg_ids)) { kegg_id <- kegg_ids[[i]] kegg_id_df <- as.data.frame(print(kegg_id)) kegg_id_df$name <- row.names(kegg_id_df) name <- row.names(kegg_id_df) colnames(kegg_id_df) <- c("kegg_key", "cpd") for (j in 1:nrow(kegg_id_df)) { split_values <- unlist(strsplit(kegg_id_df$kegg_key[j], "; ")) new_rows <- data.frame(kegg_key = split_values, cpd = kegg_id_df$cpd[j]) new_df <- rbind(new_df, new_rows) } } df <- unique(new_df[new_df$kegg_key %in% data$Name,]) colnames(df)<-c("Name","cpd") fecal_kegg<-left_join(df,fecal,by="Name") distinct_fecal_kegg <- distinct(fecal_kegg, Name, .keep_all = TRUE) distinct_fecal_kegg$cpd <- gsub("cpd:", "", distinct_fecal_kegg$cpd) dim(distinct_fecal_kegg) #> dim(distinct_fecal_kegg) #[1] 88 100 #即CD-c18-pos2.1.xlsx含KEGG的ID有88 MetDNA_C18_pos<-read_xlsx("MetDNA_C18_pos.xlsx",sheet = "MetDNA_C18_pos") dim(MetDNA_C18_pos) #> dim(MetDNA_C18_pos) #[1] 4489 69 MetDNA_C18_pos<- MetDNA_C18_pos %>% filter(`id_kegg` != "NA") dim(MetDNA_C18_pos) #去除id_kegg为NA的行 #> dim(MetDNA_C18_pos) #[1] 4364 69 inner<-inner_join(distinct_fecal_kegg,MetDNA_C18_pos,by=c("cpd"="id_kegg")) dim(inner) inner_bijiao<-inner %>% select(Name, name,confidence_level,everything()) dim(inner_bijiao) #> dim(inner_bijiao) #[1] 57 168 inner_level1<-inner_bijiao[inner_bijiao$confidence_level=="level1",] dim(inner_level1) #> dim(inner_level1) #[1] 39 168 write.xlsx(inner_level1,"inner_level1.xlsx")
暂无评论