1.输入文件:
2.代码
#title:boxplot-5utr-cds-3tr-ATCG的百分比分布和T检验_封装函数版
rm(list=ls(all=TRUE))
setwd("E:/R/Rscripts/5UTR_ABD_TE")
library(tidyverse)
library(ggplot2)
# library(RColorBrewer)
library(patchwork)
library(dplyr)
library(tidyr)
library(openxlsx)
library(stringr)
dfutr5<- read.table(file="lijinonextended_5utr_ATCG.fasta",na.strings = "#N/A",sep="\t",header = TRUE)
dfcds<- read.table("lijinonextended_cds_ATCG.fasta",na.strings = "#N/A",sep="\t",header = TRUE)
dfutr3<- read.table("lijinonextended_3utr_ATCG.fasta",na.strings = "#N/A",sep="\t",header = TRUE)reshape_data_frame <- function(df, id_column = NULL) {# 如果指定了ID列,则保留ID列,否则只处理核苷酸列if (!is.null(id_column)) {df_long <- df %>%pivot_longer(cols = c(A, T, C, G),names_to = "nucleotide",values_to = "percentage",id_cols = id_column # 保留ID列)} else {df_long <- df %>%select(-Sequence_ID) %>%pivot_longer(cols = c(A, T, C, G),names_to = "nucleotide",values_to = "percentage")}return(df_long)
}# 调用函数,转换数据框,假设我们想保留Id列
# reshaped_df <- reshape_data_frame(df, id_column = "Id")
# print(reshaped_df)# 如果不想保留Id列
dfutr5longer<- reshape_data_frame(dfutr5) %>% mutate(percentage1=percentage/100)
dfcdslonger<- reshape_data_frame(dfcds)%>% mutate(percentage1=percentage/100)
dfutr3longer<- reshape_data_frame(dfutr3)%>% mutate(percentage1=percentage/100)##############################################
#######定义函数用于T检验
##############################################perform_all_combinations_T_test <- function(df, group_column, score_column, df_name) {# 获取所有唯一的组unique_groups <- unique(df[[group_column]])# 生成所有可能的两两组合combinations <- combn(unique_groups, 2, simplify = FALSE)# 初始化一个空的数据框来存储结果results_df <- data.frame(Comparison = character(), Mean1 = numeric(), Mean2 = numeric(), Pvalue = numeric(), stringsAsFactors = FALSE)# 遍历每一对组合进行T检验for(combination in combinations) {group1 <- combination[1]group2 <- combination[2]# 提取两个组的指定Score值scores_group1 <- df[[score_column]][df[[group_column]] == group1]scores_group2 <- df[[score_column]][df[[group_column]] == group2]# 确保scores_group1和scores_group2不为空并且都是数值型if (length(scores_group1) > 0 && length(scores_group2) > 0 && all(is.numeric(scores_group1)) && all(is.numeric(scores_group2))) {# 进行T检验t_test_result <- t.test(scores_group1, scores_group2)# 计算两个组的均值mean_group1 <- mean(scores_group1, na.rm = TRUE)mean_group2 <- mean(scores_group2, na.rm = TRUE)# 向结果数据框添加一行comparison_value <- paste(df_name, group1, "_Vs_", df_name, group2, sep="")new_row <- data.frame(Comparison = comparison_value,Mean1 = mean_group1, Mean2 = mean_group2, Pvalue = t_test_result$p.value, stringsAsFactors = FALSE)results_df <- rbind(results_df, new_row)}}return(results_df)
}# 调用函数的例子:
result5utr <- perform_all_combinations_T_test(dfutr5longer, "nucleotide", "percentage1", "5utr")
resultcds <- perform_all_combinations_T_test(dfcdslonger, "nucleotide", "percentage1", "cds")
result3utr <- perform_all_combinations_T_test(dfutr3longer, "nucleotide", "percentage1", "3utr")# # 正确的调用方法
# t.test_result <- t.test(
# dfutr5longer$percentage1[dfutr5longer$nucleotide == "A"],
# dfutr5longer$percentage1[dfutr5longer$nucleotide == "T"]
# )
#
# # 打印测试结果
# print(t.test_result)###########################################################################
##绘制boxplot-自定义函数
##########################################################################
library(tidyverse)
library(ggplot2)
library(patchwork)# 更新函数定义以包括x轴标题作为参数
create_grouped_boxplot <- function(data, group_var, score_var, x_label = "5'UTR",y_label = "Score", y_limits = c(0, 100), y_breaks = seq(0, 100, 20), fill_values = c("#c59d94", "#afc7e8", "#dbdb8d", "#ff9896")) {data[[group_var]] <- factor(data[[group_var]], levels = c("A", "T", "C", "G"), labels = c("A", "U", "C", "G"), ordered = TRUE)p <- ggplot(data, aes(x = .data[[group_var]], y = .data[[score_var]], fill = .data[[group_var]])) +# geom_violin(trim=FALSE,color="white") + geom_errorbar(width = 0.1,size = 0.35,position = position_dodge(0.9),stat = "boxplot") +geom_boxplot(outlier.size = -1,width = 0.25,position = position_dodge(0.9),fatten = 1.2,size = 0.5) +theme_classic() +labs(y = y_label, x = x_label) +scale_y_continuous(limits = y_limits, breaks = y_breaks) +theme(strip.background = element_rect(colour = "black", fill = "#FFFFFF"),plot.title = element_text(hjust = 0.5, vjust = 1, lineheight = 1, color = "black"),panel.background = element_rect(fill = "white", colour = "black", linewidth = 0.5),axis.title.y = element_text(size = 15, face = "bold", color = "black"),axis.title.x = element_text(size = 15, face = "bold", color = "black", vjust = 0.5, hjust = 0.5, margin = margin(t = 12)),axis.text = element_text(size = 13, face = "bold", color = "black")) +scale_fill_manual(values = fill_values) +guides(fill = "none")return(p)
}
p1 <- create_grouped_boxplot(dfutr5longer, "nucleotide", "percentage", x_label = "5'UTR")
p2 <- create_grouped_boxplot(dfcdslonger, "nucleotide", "percentage", x_label = "CDS")
p3 <- create_grouped_boxplot(dfutr3longer, "nucleotide", "percentage", x_label = "3'UTR")
p4<-p1+p2+p3+plot_layout(nrow = 1,ncol = 3)
ggsave("boxplot-5utr-cds-3tr-ATCG的百分比分布和T检验.pdf",plot=p4,width=18,height=8)
3.输出结果: