跟着iMeta学做图｜ggplot2绘制曼哈顿图展现物种丰度差异

# 农心生信工作室
# 2022-10-05
# 代码编写及注释：农心生信工作室
# 曼哈顿图 (Manhattan Plot) 是散点图的一种，在微生物组研究中通常用于表示微生物差异丰度的结果，
# 是微生物组分析中常用的展示方式。本期我们挑选2022年6月23日刊登在iMeta上的Biochar stimulates tomato roots to recruit a bacterial assemblage contributing to disease resistance against Fusarium wilt- iMeta | 
# 东农吴凤芝/南农韦中等揭示生物炭抑制作物土传病害机理，选择文章的Figure 2D进行复现，
# 降解和探讨曼哈顿图的绘制方法，先上原图：

# 接下来，我们将通过详尽的代码逐步拆解曼哈顿图，最终实现对原图的复现。
# R包检测和安装
# 1.安装核心R包ggplot2以及一些功能辅助性R包，并载入所有R包
if (!require("ggplot2"))
  install.packages('ggplot2') 
if (!require("dplyr"))
  install.packages('dplyr') 
if (!require("ggrepel"))
  install.packages('ggrepel') 
# 加载包
library(ggplot2)
library(dplyr)
library(ggrepel)
# 2.由于并未找到原文图片对应的数据，
# 在这里我们生成测试数据。测试数据应为差异丰度分析后产生的数据框。设置随机种子并生成df
# 设置随机数种子，确保数据可重复
set.seed(123)
# 根据图片，推测至少需要以下几列关键数据
# OTU的ID：字符串，每一个OTU的ID名
# P值：0-1之间的随机数，为了更接近原图，先生成绝大多数0.05-1间的随机数，在根据图像生成一些特定数值
# RA值：这里选择0.05-5之间的随机数
# group：字符串，标明OTU分别属于哪些属
# level：字符串，标注该OTU是否显著富集
# 本例中生成1000 OTUs
df = data.frame(OTU=c(paste("OTU",seq(1,997),sep = "")),
                P=as.matrix(runif(997,min=0.05,max=1)),
                RA=as.matrix(runif(997,min=0.05,max=5)),
                Group=c(rep("Alphaproteobacteria",100),
                        rep("Betaproteobacteria",40),rep("Gammaproteobacteria",60),
                        rep("Deltaproteobacteria",80),rep("Acidobacteria",45),
                        rep("Actinobacteria",110),rep("Bacteroidetes",100),
                        rep("Chloroflexi",140),rep("Gemdfimonadetes",40),
                        rep("Firmicutes",75),rep("Planctomycetes",117),
                        rep("Others",90))
) %>% rbind(data.frame(OTU=c("OTU9998","OTU9999","OTU10000"),P=c(1e-23,1e-6,1e-7),RA=c(5,2.5,0.05),Group=c("Gammaproteobacteria","Actinobacteria","Bacteroidetes")))
#通过ifelse函数，基于P值给df添加新的一列Level，P小于0.05的属于"enriched",其他属于"nosig"
df$Level=ifelse(df$P<0.05,"enriched","nosig")
#给df添加新的一列NegativeLogP，即为P值的以10为底的逆对数
df$NegativeLogP=-log10(df$P)
#观察原图，发现x轴坐标刻度名称是固定的，基于OTU的属分组，因此在这里我们通过factor函数固定属分组顺序，并按属分组对df排序
df$Group=factor(df$Group,levels = c("Alphaproteobacteria","Betaproteobacteria","Gammaproteobacteria","Deltaproteobacteria","Acidobacteria","Actinobacteria","Bacteroidetes","Chloroflexi","Gemdfimonadetes","Firmicutes","Planctomycetes","Others"))#固定属分组顺序
df=arrange(df,Group)#dplyr包的arrange函数对df排序
df$OTU=factor(df$OTU,levels = df$OTU)#固定OTU名顺序
# 曼哈顿图绘制
# 3.使用ggplot2包绘制一个最简单的曼哈顿图（这时只是一张简单的散点图），x轴为OTU名，y轴为NegativeLogP值，
# 颜色映射到属分组，散点大小映射RA值，散点形状映射Level：
p<-ggplot(df, aes(x=OTU, y=NegativeLogP, color=Group, size=RA, shape=Level)) +
  geom_point(alpha=.7)

# 4. 设置散点的形状以及尺寸,并隐藏color和shape对应的图例
p<-ggplot(df, aes(x=OTU, y=NegativeLogP, color=Group, size=RA, shape=Level)) +
  geom_point(alpha=.7)+
  scale_shape_manual(values=c(16,1))+ #选择形状，nosig对应空心圆，enrich对应实心圆
  scale_size_continuous(range = c(0.5,5),
                        breaks = c(0.05, 0.10,0.50,1.00,5.00),
                        labels = c('0.05', '0.10','0.50','1.00','5.00'))+ #设置散点尺寸
  guides(color="none",shape="none")#隐藏图例

#5. 观察预览图，我们发现x轴刻度标签初显示是一条粗黑线，其实是1000个OTU名同时显示导致的，我们观察原图，发现原图是以属分组的属名作为x轴刻度标签，刻度线是属分组中位的OTU。为了获得作为刻度的每个属中位OTU，我们编写并运行函数getBreak()来实现：

getBreak<-function(x,y){
  freq<-as.vector(table(y))  #table()计算频数
  half_freq<-freq%/%2
  for (i in seq(2,length(freq))){
    new_num<-freq[i]+freq[i-1]
    freq[i]=new_num}
  pos<-freq-half_freq
  break_point<-as.vector(x[pos])
  return(break_point)
}
#运行函数getBreak()后，修改x轴刻度及标签，并增加x轴左右的空白：

p<-ggplot(df, aes(x=OTU, y=NegativeLogP, color=Group, size=RA, shape=Level)) +
  geom_point(alpha=.7)+
  scale_shape_manual(values=c(16,1))+ #选择形状，nosig对应空心圆，enrich对应实心圆
  scale_size_continuous(range = c(0.5,5),
                        breaks = c(0.05, 0.10,0.50,1.00,5.00),
                        labels = c('0.05', '0.10','0.50','1.00','5.00'))+ #设置散点尺寸
  guides(color="none",shape="none")+#隐藏图例
  scale_x_discrete(breaks=getBreak(df$OTU,df$Group),
                   labels=c("Alphaproteobacteria","Betaproteobacteria","Gammaproteobacteria","Deltaproteobacteria","Acidobacteria","Actinobacteria","Bacteroidetes","Chloroflexi","Gemdfimonadetes","Firmicutes","Planctomycetes","Others"),
                   expand = expansion(mult = 0.03))#expansion函数的参数mult: 百分比间距，可以接受一个向量

# 5 给散点添加文字标签，本例中需要添加标签的是OTU9998，通过ggrepel包的geom_text_repel函数来添加文字标签：

gtext<-filter(df,df$OTU=="OTU9998")

p<-ggplot(df, aes(x=OTU, y=NegativeLogP, color=Group, size=RA, shape=Level)) +
  geom_point(alpha=.7,key_glyph="point")+
  scale_shape_manual(values=c(16,1))+ #选择形状，nosig对应空心圆，enrich对应实心圆
  scale_size_continuous(range = c(0.5,5),
                        breaks = c(0.05, 0.10,0.50,1.00,5.00),
                        labels = c('0.05', '0.10','0.50','1.00','5.00'))+ #设置散点尺寸
  guides(color="none",shape="none")+#隐藏图例
  scale_x_discrete(breaks=getBreak(df$OTU,df$Group),
                   labels=c("Alphaproteobacteria","Betaproteobacteria","Gammaproteobacteria","Deltaproteobacteria","Acidobacteria","Actinobacteria","Bacteroidetes","Chloroflexi","Gemdfimonadetes","Firmicutes","Planctomycetes","Others"),
                   expand = expansion(mult = 0.03))+#expansion函数的参数mult: 百分比间距，可以接受一个向量
  geom_text_repel(aes(OTU,NegativeLogP,label="OTU9998\n(Psedomonas sp.)"),gtext,colour='black', size =5,box.padding = 1, 
                           point.padding = 0.8, 
                           segment.color = "white",
                           show.legend = F)#添加文字标签

# 6 添加表示阈值的虚线，阈值为p=0.05：
threshold<--log10(0.05)

p<-ggplot(df, aes(x=OTU, y=NegativeLogP, color=Group, size=RA, shape=Level)) +
  geom_point(alpha=.7,key_glyph="point")+
  scale_shape_manual(values=c(16,1))+ #选择形状，nosig对应空心圆，enrich对应实心圆
  scale_size_continuous(range = c(0.5,5),
                        breaks = c(0.05, 0.10,0.50,1.00,5.00),
                        labels = c('0.05', '0.10','0.50','1.00','5.00'))+ #设置散点尺寸
  guides(color="none",shape="none")+#隐藏图例
  scale_x_discrete(breaks=getBreak(df$OTU,df$Group),
                   labels=c("Alphaproteobacteria","Betaproteobacteria","Gammaproteobacteria","Deltaproteobacteria","Acidobacteria","Actinobacteria","Bacteroidetes","Chloroflexi","Gemdfimonadetes","Firmicutes","Planctomycetes","Others"),
                   expand = expansion(mult = 0.03))+#expansion函数的参数mult: 百分比间距，可以接受一个向量
  geom_text_repel(aes(OTU,NegativeLogP,label="OTU9998\n(Psedomonas sp.)"),gtext,colour='black', size =5,box.padding = 1, 
                           point.padding = 0.8, 
                           segment.color = "white",
                           show.legend = F)+#添加文字标签
  geom_hline(yintercept=threshold, linetype=2, color="grey")

# 7 改变分组颜色，优化背景，修改轴标题，倾斜x轴刻度标签：
#设置分组颜色
mycol<-c("#008087","#00A962","#86C03D","#74BC56",
         "#E53636","#3D7ABB","#EE8B15","#AF4592",
         "#7AC5C1","#906024","#6E1F86","#545556")

p<-ggplot(df, aes(x=OTU, y=NegativeLogP, color=Group, size=RA, shape=Level)) +
  geom_point(alpha=.7,key_glyph="point")+
  scale_shape_manual(values=c(16,1))+ #选择形状，nosig对应空心圆，enrich对应实心圆
  scale_size_continuous(range = c(0.5,5),
                        breaks = c(0.05, 0.10,0.50,1.00,5.00),
                        labels = c('0.05', '0.10','0.50','1.00','5.00'))+ #设置散点尺寸
  guides(color="none",shape="none")+#隐藏图例
  scale_x_discrete(breaks=getBreak(df$OTU,df$Group),
                   labels=c("Alphaproteobacteria","Betaproteobacteria","Gammaproteobacteria","Deltaproteobacteria","Acidobacteria","Actinobacteria","Bacteroidetes","Chloroflexi","Gemdfimonadetes","Firmicutes","Planctomycetes","Others"),
                   expand = expansion(mult = 0.03))+#expansion函数的参数mult: 百分比间距，可以接受一个向量
  geom_text_repel(aes(OTU,NegativeLogP,label="OTU9998\n(Psedomonas sp.)"),gtext,colour='black', size =5,box.padding = 1, 
                  point.padding = 0.8, 
                  segment.color = "white",
                  show.legend = F)+#添加文字标签
  geom_hline(yintercept=threshold, linetype=2, color="grey")+
  labs(x="", y="-log10(P)")+ #设置坐标轴标题
  scale_color_manual(values = mycol)+ #改变分组颜色
  theme(axis.line = element_line(size = 1),panel.grid=element_blank(),
        panel.background = element_rect(fill = 'white'),
        axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5))

# 8 我们注意到原图不同区块有不同的背景色，我们可以通过annotate()函数来实现，使图形背景在不同区域颜色不同，注意，因为是改变图形背景色，因此图层应在整个图形的底层：
p<-ggplot(df, aes(x=OTU, y=NegativeLogP, color=Group, size=RA, shape=Level)) +
  annotate("rect",xmin = 1, xmax = 281,
           ymin = 0, ymax = 25,
           fill="#F4F4F3")+
  annotate("rect",xmin = 327, xmax = 437,
           ymin = 0, ymax = 25,
           fill="#F4F4F3")+
  annotate("rect",xmin = 539, xmax = 678,
           ymin = 0, ymax = 25,
           fill="#F4F4F3")+
  annotate("rect",xmin = 719, xmax = 793,
           ymin = 0, ymax = 25,
           fill="#F4F4F3")+
  annotate("rect",xmin = 911, xmax = 1000,
           ymin = 0, ymax = 25,
           fill="#F4F4F3")+
  geom_point(alpha=.7,key_glyph="point")+
  scale_shape_manual(values=c(16,1))+ #选择形状，nosig对应空心圆，enrich对应实心圆
  scale_size_continuous(range = c(0.5,5),
                        breaks = c(0.05, 0.10,0.50,1.00,5.00),
                        labels = c('0.05', '0.10','0.50','1.00','5.00'))+ #设置散点尺寸
  guides(color="none",shape="none")+#隐藏图例
  scale_x_discrete(breaks=getBreak(df$OTU,df$Group),
                   labels=c("Alphaproteobacteria","Betaproteobacteria","Gammaproteobacteria","Deltaproteobacteria","Acidobacteria","Actinobacteria","Bacteroidetes","Chloroflexi","Gemdfimonadetes","Firmicutes","Planctomycetes","Others"),
                   expand = expansion(mult = 0.03))+#expansion函数的参数mult: 百分比间距，可以接受一个向量
  geom_text_repel(aes(OTU,NegativeLogP,label="OTU9998\n(Psedomonas sp.)"),gtext,colour='black', size =5,box.padding = 1, 
                  point.padding = 0.8, 
                  segment.color = "#F4F4F3",
                  show.legend = F)+#添加文字标签
  geom_hline(yintercept=threshold, linetype=2, color="grey")+
  labs(x="", y="-log10(P)")+
  scale_color_manual(values = mycol)+
  theme(axis.line = element_line(size = 1),panel.grid=element_blank(),
        panel.background = element_rect(fill = 'white'),
        axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5))

9 最后，我们通过Adobe Illustrator（AI）来美化一下图片，成品图如下

# 附.完整代码
library(ggplot2)
library(dplyr)
library(ggrepel)

# 设置随机数种子，确保数据可重复
set.seed(123)
# 根据图片，推测至少需要以下几列关键数据
# OTU的ID：字符串，每一个OTU的ID名
# P值：0-1之间的随机数，为了更接近原图，先生成绝大多数0.05-1间的随机数，在根据图像生成一些特定数值
# RA值：这里选择0.05-5之间的随机数
# group：字符串，标明OTU分别属于哪些属
# level：字符串，标注该OTU是否显著富集
# 本例中生成1000 OTUs
df = data.frame(OTU=c(paste("OTU",seq(1,997),sep = "")),
                P=as.matrix(runif(997,min=0.05,max=1)),
                RA=as.matrix(runif(997,min=0.05,max=5)),
                Group=c(rep("Alphaproteobacteria",100),
                        rep("Betaproteobacteria",40),rep("Gammaproteobacteria",60),
                        rep("Deltaproteobacteria",80),rep("Acidobacteria",45),
                        rep("Actinobacteria",110),rep("Bacteroidetes",100),
                        rep("Chloroflexi",140),rep("Gemdfimonadetes",40),
                        rep("Firmicutes",75),rep("Planctomycetes",117),
                        rep("Others",90))
) %>% rbind(data.frame(OTU=c("OTU9998","OTU9999","OTU10000"),P=c(1e-23,1e-6,1e-7),RA=c(5,2.5,0.05),Group=c("Gammaproteobacteria","Actinobacteria","Bacteroidetes")))
#通过ifelse函数，基于P值给df添加新的一列Level，P小于0.05的属于"enriched",其他属于"nosig"
df$Level=ifelse(df$P<0.05,"enriched","nosig")
#给df添加新的一列NegativeLogP，即为P值的以10为底的逆对数
df$NegativeLogP=-log10(df$P)
#观察原图，发现x轴坐标刻度名称是固定的，基于OTU的属分组，因此在这里我们通过factor函数固定属分组顺序，并按属分组对df排序
df$Group=factor(df$Group,levels = c("Alphaproteobacteria","Betaproteobacteria","Gammaproteobacteria","Deltaproteobacteria","Acidobacteria","Actinobacteria","Bacteroidetes","Chloroflexi","Gemdfimonadetes","Firmicutes","Planctomycetes","Others"))#固定属分组顺序
df=arrange(df,Group)#dplyr包的arrange函数对df排序
df$OTU=factor(df$OTU,levels = df$OTU)#固定OTU名顺序


getBreak<-function(x,y){
  freq<-as.vector(table(y))  #table()计算频数
  half_freq<-freq%/%2
  for (i in seq(2,length(freq))){
    new_num<-freq[i]+freq[i-1]
    freq[i]=new_num}
  pos<-freq-half_freq
  break_point<-as.vector(x[pos])
  return(break_point)
}

gtext<-filter(df,df$OTU=="OTU9998")
threshold<--log10(0.05)
#设置分组颜色
mycol<-c("#008087","#00A962","#86C03D","#74BC56",
         "#E53636","#3D7ABB","#EE8B15","#AF4592",
         "#7AC5C1","#906024","#6E1F86","#545556")


p<-ggplot(df, aes(x=OTU, y=NegativeLogP, color=Group, size=RA, shape=Level)) +
  annotate("rect",xmin = 1, xmax = 281,
           ymin = 0, ymax = 25,
           fill="#F4F4F3")+
  annotate("rect",xmin = 327, xmax = 437,
           ymin = 0, ymax = 25,
           fill="#F4F4F3")+
  annotate("rect",xmin = 539, xmax = 678,
           ymin = 0, ymax = 25,
           fill="#F4F4F3")+
  annotate("rect",xmin = 719, xmax = 793,
           ymin = 0, ymax = 25,
           fill="#F4F4F3")+
  annotate("rect",xmin = 911, xmax = 1000,
           ymin = 0, ymax = 25,
           fill="#F4F4F3")+
  geom_point(alpha=.7,key_glyph="point")+
  scale_shape_manual(values=c(16,1))+ #选择形状，nosig对应空心圆，enrich对应实心圆
  scale_size_continuous(range = c(0.5,5),
                        breaks = c(0.05, 0.10,0.50,1.00,5.00),
                        labels = c('0.05', '0.10','0.50','1.00','5.00'))+ #设置散点尺寸
  guides(color="none",shape="none")+#隐藏图例
  scale_x_discrete(breaks=getBreak(df$OTU,df$Group),
                   labels=c("Alphaproteobacteria","Betaproteobacteria","Gammaproteobacteria","Deltaproteobacteria","Acidobacteria","Actinobacteria","Bacteroidetes","Chloroflexi","Gemdfimonadetes","Firmicutes","Planctomycetes","Others"),
                   expand = expansion(mult = 0.03))+#expansion函数的参数mult: 百分比间距，可以接受一个向量
  geom_text_repel(aes(OTU,NegativeLogP,label="OTU9998\n(Psedomonas sp.)"),gtext,colour='black', size =5,box.padding = 1, 
                  point.padding = 0.8, 
                  segment.color = "#F4F4F3",
                  show.legend = F)+#添加文字标签
  geom_hline(yintercept=threshold, linetype=2, color="grey")+
  labs(x="", y="-log10(P)")+
  scale_color_manual(values = mycol)+
  theme(axis.line = element_line(size = 1),panel.grid=element_blank(),
        panel.background = element_rect(fill = 'white'),
        axis.text.x = element_text(angle = 45, hjust = 0.5, vjust = 0.5))

ggsave("Figure2D.pdf", p, height = 5,width = 10) # 保存图为PDF，指定宽和高

发送评论 编辑评论

发送评论编辑评论