群体遗传学分析(三):举例FST、π 和
前期,我们分享了利用vcftools计算FST、π、TajimaD,感兴趣可以移步阅读“群体遗传学(一)|vcftools 计算 FST与选择信号检测”、“群体遗传学(二)|基于 vcftools 的核苷酸多样性(π)与TajimaD的计算”。
本期,我们分享FST、π、TajimaD联合分析示例及可视化相关结果~
#这里选择注释类型为基因,把带井号的注释去掉,方便后面bedtools注释
awk '$3 == "gene" {split($9, a, ";"); for(i in a) {if(a[i] ~ /ID=/) {split(a[i], b, "="); print $1 "\t" $4-1 "\t" $5 "\t" b[2]}}}' Homo_num.gff > genes_with_names.bed
重点就是需要有CHR、START、EN坐标信息,TajimaD根据窗口大小可以增加END一列。
pi-ration文件用野生群体的PI/驯化群体的PI值得到
bedtools intersect -a genes_with_names.bed -b NSS.Tajima.D_window.bed -wa -wb > NSS_100_50_TajimaD.bed
bedtools intersect -a genes_with_names.bed -b pi_ratio_10_5.windowed.pi -wa -wb > NSS_XSS_100_50_pi.bed
bedtools intersect -a genes_with_names.bed -b NSS_XSS.windowed.weir.fst -wa -wb > NSS_XSS_fst.bed
这样就会在对对应的窗口有基因名称,方便后面分析。
fst<-read.table("fst.bed",header = F)
colnames(fst)<-c("Chr","Start","End","Gene","CHROM","BIN_START","BIN_END","N_VARIANTS","WEIGHTED_FST","MEAN_FST")
fst<-fst[!is.na(fst$WEIGHTED_FST),]
fst <- fst %>%
filter(!(WEIGHTED_FST < 0))
fst$ZFST<-(fst$WEIGHTED_FST-mean(fst$WEIGHTED_FST))/sd(fst$WEIGHTED_FST)
fst$POS<-(fst$BIN_START+fst$BIN_END)/2
fst$SNP<-paste(fst$Chr,fst$POS,sep = ":")
fstthreshold1=quantile(fst$ZFST,0.95)
fstthreshold2=quantile(fst$ZFST,0.99)
fstthreshold3=quantile(fst$ZFST,0.90)
fsttop5<-fst[fst$ZFST>fstthreshold1,]
fsttop1<-fst[fst$ZFST>fstthreshold2,]
fsttop0<-fst[fst$ZFST>fstthreshold3,]
quantile(fst$WEIGHTED_FST,0.90)
pi<-read.table("pi.bed",header = F)
colnames(pi)<-c("Chr","Start","End","Gene","CHROM","BIN_START","BIN_END","N_VARIANTS_RJF","pi_RJF","N_VARIANTS_XH","pi_XH","pi-ratio")
pithreshold1=quantile(pi
群体遗传学分析(三):举例FST、π 和 - 今日头条 pi-ratio`,0.95)pithreshold2=quantile(pi
群体遗传学分析(三):举例FST、π 和 - 今日头条 pi-ratio`,0.99)pithreshold3=quantile(pi
群体遗传学分析(三):举例FST、π 和 - 今日头条 pi-ratio`,0.90)pitop5<-pi[pi
群体遗传学分析(三):举例FST、π 和 - 今日头条 pi-ratio`>pithreshold1,]pitop1<-pi[pi
群体遗传学分析(三):举例FST、π 和 - 今日头条 pi-ratio`>pithreshold2,]pitop0<-pi[pi
群体遗传学分析(三):举例FST、π 和 - 今日头条 pi-ratio`>pithreshold3,]library(dplyr)
# 假设 pitop1 和 fsttop1 都有 Chr、BIN_START、BIN_END 三列来唯一定位窗口
top1 <- inner_join(
pitop1,
fsttop1,
by = c("Chr", "BIN_START", "BIN_END")
)
top5 <- inner_join(
pitop5,
fsttop5,
by = c("Chr", "BIN_START", "BIN_END")
)
top0 <- inner_join(
pitop0,
fsttop0,
by = c("Chr", "BIN_START", "BIN_END")
)
top的基因就是共有两种选择的基因。其他方法也是类似,比如TajimaD就是你目标群体,可以选择top的基因之后和其他方法联合。
1.3 选择信号可视化——曼哈顿图 png("top1_manha.png",height = 3000,width = 3000,res = 300)
manhattan(data, xlab="Chr", ylab="ZFST", cex.lab = 1.2, col.lab ="black", font.lab = 3, col = col,ylim=c(0,3), yaxt="n", cex.axis = 1, cex = 1, chrlabs = c(1:39), chr = "Chr", bp = "POS", p = "ZFST", snp = "SNP", logp = F, genomewideline = F, suggestiveline= F)
title(main = "ZFst(A:B)", col.main = "black", cex.main = 1, font.main = 1)
segments(-50000000,fstthreshold2,x1 = 955000000,y1 = threshold2,col = "red",lty = 1)
segments(960000000,fstthreshold1,x1 = 1040000000,y1 = threshold1,col = "blue",lty = 1)
axis(side=2,at=c(0,5,10),labels=c(0,5,10))
dev.off()
1.4 选择信号可视化——目标区域点线图 fst_snp<-read.table("A_B.weir.fst",header = T)
region_start <- 40000
region_end <- 500000
flank <- 2000 # ±2kb
plot_data <- fst_snp %>%
filter(CHROM == 1,
WEIR_AND_COCKERHAM_FST > 0,
POS >= region_start - flank,
POS <= region_end + flank) %>%
arrange(POS)
# 4. 绘图
# 定义 x 轴范围
xmin <- region_start - flank
xmax <- region_end + flank
ggplot(plot_data, aes(x = POS, y = WEIR_AND_COCKERHAM_FST)) +
# 核心区间阴影
geom_rect(aes(xmin = region_start, xmax = region_end,
ymin = -Inf, ymax = Inf),
inherit.aes = FALSE, fill = "grey90", alpha = 0.4) +
# 折线 + 散点
geom_line(color = "steelblue", size = 0.8) +
geom_point(color = "steelblue", size = 2) +
# X 轴 15 个刻度
scale_x_continuous(
breaks = seq(xmin, xmax, length.out = 15),
labels = scales::comma # 用千分位逗号格式化
) +
# 限定显示范围
coord_cartesian(xlim = c(xmin, xmax)) +
labs(
x = "",
y = "Weighted FST",
title = ""
) +
theme_minimal(base_family = "Times New Roman") +
theme(
text = element_text(color = "black"),
axis.text.x = element_text(angle = 45, hjust = 1, color = "black"),
axis.text.y = element_text(color = "black"),
panel.grid.minor = element_blank()
)
1.5 选择信号可视化——目标区域点图 region_start <- 40000
region_end <- 50000
flank <- 5000 # ±2kb
hp_sub <- hp_df %>%
filter(
CHR == 1,
START >= region_start - flank,
END <= region_end + flank
) %>%
arrange(START)
# 4. 绘图:以 START 为横坐标
ggplot(hp_sub, aes(x = START, y = Hp, color = group, shape = group)) +
# 核心区间阴影
geom_rect(aes(xmin = region_start, xmax = region_end,
ymin = -Inf, ymax = Inf),
inherit.aes = FALSE,
fill = "grey80", alpha = 0.4) +
# 散点
geom_point(size = 3) +
# 手动映射颜色和形状
scale_color_manual(values = c(A = "#F0D562", B = "#0071BE")) +
scale_shape_manual(values = c(A = 16, B = 17)) +
# X 轴 15 个等距刻度 & 千分号格式化
scale_x_continuous(
breaks = seq(region_start - flank, region_end + flank, length.out = 15),
labels = scales::comma
) +
coord_cartesian(xlim = c(region_start - flank, region_end + flank)) +
labs(
x = "Window START on Chr26 (bp)",
y = "Hp",
title = ""
) +
theme_minimal(base_family = "Times New Roman") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1, color = "black"),
axis.text.y = element_text(color = "black"),
legend.title = element_blank(),
panel.grid.minor = element_blank()
)
全局性和区域性画图就需要将数据集改变一下,大家可以根据需要选择全部数据或者某个区域数据画图即可。 (本期数据纯属虚构,无实际意义)
本期分享就到这结束啦,感谢大家关注支持,祝大家科研顺利~