- 由于 CoverM 存储库更新, 记录的代码可能与最新代码稍有不同
Contig
分析 mean 方法
- contig 方法的入口位于 src/bin/coverm::main 的 “contig” 分支中, 忽略帮助等参数, 假设已有 bam 文件, 不设置 reads 的额外
filter_params
, 则函数主体简写如下:fn main() {let mut app = build_cli();let matches = app.clone().get_matches();set_log_level(&matches, false);let mut print_stream;match matches.subcommand_name() {Some("contig") => {let m = matches.subcommand_matches("contig").unwrap();let print_zeros = true;// Add metabat filtering params since we are running contiglet filter_params = FilterParameters::generate_from_clap(m);let threads = *m.get_one::<u16>("threads").unwrap();print_stream = OutputWriter::generate(m.get_one::<String>("output-file").map(|x| &**x));let mut estimators_and_taker =EstimatorsAndTaker::generate_from_clap(m, print_stream.clone());estimators_and_taker =estimators_and_taker.print_headers("Contig", print_stream.clone());if m.contains_id("bam-files") {let bam_files: Vec<&str> = m.get_many::<String>("bam-files").unwrap().map(|x| &**x).collect();if filter_params.doing_filtering() {unreachable!()} else if m.get_flag("sharded") {unreachable!()} else {let bam_readers =coverm::bam_generator::generate_named_bam_readers_from_bam_files(bam_files);run_contig(&mut estimators_and_taker,bam_readers,print_zeros,filter_params.flag_filters,threads,&mut print_stream,);}} else {unreachable!()}}_ => {app.print_help().unwrap();println!();}} }
filter_params = FilterParameters::generate_from_clap(m);
获得了对 bam 文件过滤的参数, 这些参数的获取在 “genome”, “filter”, 和 “contig” 三个分支中是相同的, 在这里先忽略let bam_readers = coverm::bam_generator::generate_named_bam_readers_from_bam_files(bam_files);
将 bam 文件名转换为结构化的对象- 随后通过
run_contig
进行读取
run_contig
- run_contig 如下:
fn run_contig<R: coverm::bam_generator::NamedBamReader,T: coverm::bam_generator::NamedBamReaderGenerator<R>, >(estimators_and_taker: &mut EstimatorsAndTaker,bam_readers: Vec<T>,print_zeros: bool,flag_filters: FlagFilter,threads: u16,print_stream: &mut OutputWriter, ) {let reads_mapped = coverm::contig::contig_coverage(bam_readers,&mut estimators_and_taker.taker,&mut estimators_and_taker.estimators,print_zeros,&flag_filters,threads,);debug!("Finalising printing ..");estimators_and_taker.printer.finalise_printing(&estimators_and_taker.taker,print_stream,Some(&reads_mapped),&estimators_and_taker.columns_to_normalise,estimators_and_taker.rpkm_column,estimators_and_taker.tpm_column,); }
- 其首先使用
coverm::contig::contig_coverage
计算 coverage, 随后再通过estimators_and_taker.printer.finalise_printing
输出
- 其首先使用
contig_coverage
- contig_coverage 的主体框架如下:
pub fn contig_coverage<R: NamedBamReader, G: NamedBamReaderGenerator<R>, T: CoverageTaker>(bam_readers: Vec<G>,coverage_taker: &mut T,coverage_estimators: &mut Vec<CoverageEstimator>,print_zero_coverage_contigs: bool,flag_filters: &FlagFilter,threads: u16, ) -> Vec<ReadsMapped> {let mut reads_mapped_vector = vec![];for bam_generator in bam_readers {let mut bam_generated = bam_generator.start();中间步骤let reads_mapped = ReadsMapped {num_mapped_reads: num_mapped_reads_total,num_reads: bam_generated.num_detected_primary_alignments(),};reads_mapped_vector.push(reads_mapped);if bam_generated.num_detected_primary_alignments() == 0 {warn!("No primary alignments were observed for sample {} \- perhaps something went wrong in the mapping?",stoit_name);}bam_generated.finish();}reads_mapped_vector }
- 可见, coverm 逐个处理 bam 文件, 记录每个 bam 的
num_mapped_reads
和 reads 总数 (bam_generated.num_detected_primary_alignments()
, 仅不考虑is_secondary
和is_supplementary
情况, 但是允许 unmapped), 随后返回这个列表
- 可见, coverm 逐个处理 bam 文件, 记录每个 bam 的
- 在这个函数的中间步骤中首先初始化参数, 随后进入一个巨大的 loop 循环
// for record in records loop {match bam_generated.read(&mut record) {None => {break;}Some(Ok(())) => {}Some(e) => {panic!("Error reading BAM record: {:?}", e)}}
- 在这个 loop 循环中, 首先读取 reads 比对 (不考虑 paired)
- 仅处理质量足够好的 reads:
if !flag_filters.passes(&record) {trace!("Skipping read based on flag filtering");continue; } let tid = record.tid();// if mapped if !record.is_unmapped() {
- 假设 bam 文件已经按 contig 完成排序 (也就是说, 所有比对到同一个 reference 上的 reads 都在一起), 那么需要报告所有上一条 reads 的结果:
// if reference has changed, print the last record if tid != last_tid {if tid < last_tid {error!("BAM file appears to be unsorted. Input BAM files must be sorted by reference (i.e. by samtools sort)");panic!("BAM file appears to be unsorted. Input BAM files must be sorted by reference (i.e. by samtools sort)");}process_previous_contigs(last_tid,tid,coverage_estimators,&ups_and_downs,num_mapped_reads_in_current_contig,total_edit_distance_in_current_contig,total_indels_in_current_contig,&mut num_mapped_reads_total,);
- 由于这种结构, 可见 bam 中最后一个 contig 的比对序列结束后循环也结束了, 因此在循环结束后还需进行一次
process_previous_contigs
- 随后重置参数, 以处理 (当前 reads mapping 的) 下一条 contig 的结果
ups_and_downs =vec![0; header.target_len(tid as u32).expect("Corrupt BAM file?") as usize]; debug!("Working on new reference {}",std::str::from_utf8(target_names[tid as usize]).unwrap() ); last_tid = tid; num_mapped_reads_in_current_contig = 0; total_edit_distance_in_current_contig = 0; total_indels_in_current_contig = 0;
- 由于这种结构, 可见 bam 中最后一个 contig 的比对序列结束后循环也结束了, 因此在循环结束后还需进行一次
- 按照 reads mapping 的 cigar 序列, 向整数向量
ups_and_downs
中添加 mapped bases 的变化率:} if !record.is_supplementary() && !record.is_secondary() {num_mapped_reads_in_current_contig += 1; }// for each chunk of the cigar string trace!("read name {:?}",std::str::from_utf8(record.qname()).unwrap() ); let mut cursor: usize = record.pos() as usize; for cig in record.cigar().iter() {trace!("Found cigar {:} from {}", cig, cursor);match cig {Cigar::Match(_) | Cigar::Diff(_) | Cigar::Equal(_) => {// if M, X, or = increment start and decrement end indextrace!("Adding M, X, or = at {} and {}",cursor,cursor + cig.len() as usize);ups_and_downs[cursor] += 1;let final_pos = cursor + cig.len() as usize;if final_pos < ups_and_downs.len() {// True unless the read hits the contig end.ups_and_downs[final_pos] -= 1;}cursor += cig.len() as usize;}Cigar::Del(_) => {cursor += cig.len() as usize;total_indels_in_current_contig += cig.len() as u64;}Cigar::RefSkip(_) => {// if D or N, move the cursorcursor += cig.len() as usize;}Cigar::Ins(_) => {total_indels_in_current_contig += cig.len() as u64;}Cigar::SoftClip(_) | Cigar::HardClip(_) | Cigar::Pad(_) => {}} }
- 找到所有可比对的区域 (
M, X, or =
) - 比对区域起始处
ups_and_downs[cursor] += 1
- 终止处
ups_and_downs[cursor + cig.len() as usize] -= 1
- 找到所有可比对的区域 (
- 假设 bam 文件已经按 contig 完成排序 (也就是说, 所有比对到同一个 reference 上的 reads 都在一起), 那么需要报告所有上一条 reads 的结果:
process_previous_contigs
- process_previous_contigs 是一个在 contig_coverage 中定义的函数, 接受如下参数:
let mut process_previous_contigs =|last_tid,tid,coverage_estimators: &mut Vec<CoverageEstimator>,ups_and_downs: &[i32],num_mapped_reads_in_current_contig,total_edit_distance_in_current_contig,total_indels_in_current_contig,num_mapped_reads_total: &mut u64| {
- 随后 (忽略
last_tid == -2
即初始情况), 进行覆盖度的估计:for estimator in coverage_estimators.iter_mut() {estimator.add_contig(ups_and_downs,num_mapped_reads_in_current_contig,total_edit_distance_in_current_contig - total_indels_in_current_contig,) } let coverages: Vec<f32> = coverage_estimators.iter_mut().map(|estimator| estimator.calculate_coverage(&[0])).collect();
- 使用
estimator.add_contig
总结当前 contig 的 mapping 情况 - 使用
estimator.calculate_coverage
计算丰度, 此时设unobserved_contig_lengths: &[0]
即忽略较短的 contigs
- 使用
- 输出到表格对应
last_tid
的行中coverage_taker.start_entry(last_tid as usize,std::str::from_utf8(target_names[last_tid as usize]).unwrap(), ); for (coverage, estimator) incoverages.iter().zip(coverage_estimators.iter_mut()) {estimator.print_coverage(coverage, coverage_taker); } coverage_taker.finish_entry();
- 在 contig 中, 无需考虑多个 contig 的情况, 因此重置 estimator
for estimator in coverage_estimators.iter_mut() {estimator.setup(); }
- 随后 (忽略
MeanGenomeCoverageEstimator
add_contig-MeanGenomeCoverageEstimator
- 考虑 mean 方法:
fn add_contig(&mut self,ups_and_downs: &[i32],num_mapped_reads_in_contig: u64,total_mismatches_in_contig: u64, ) {match self {CoverageEstimator::MeanGenomeCoverageEstimator {ref mut total_count,ref mut total_bases,ref mut num_covered_bases,ref mut num_mapped_reads,ref mut total_mismatches,contig_end_exclusion,..} => {*num_mapped_reads += num_mapped_reads_in_contig;*total_mismatches += total_mismatches_in_contig;let len = ups_and_downs.len();match *contig_end_exclusion * 2 < len as u64 {true => *total_bases += len as u64 - 2 * *contig_end_exclusion,false => {debug!("Contig too short - less than twice the contig-end-exclusion");return; //contig is all ends, too short}}let mut cumulative_sum: i32 = 0;let start_from = *contig_end_exclusion as usize;let end_at = len - *contig_end_exclusion as usize - 1;for (i, current) in ups_and_downs.iter().enumerate() {cumulative_sum += current;if i >= start_from && i <= end_at {if cumulative_sum > 0 {*num_covered_bases += 1}*total_count += cumulative_sum as u64;}}
- 按
ups_and_downs
的信息, 更新estimator
: - 最终对应于每条 contig 的
MeanGenomeCoverageEstimator
的信息如下:total_count
: u64: 全部能 mapped 到这条 contig 上的 reads 的匹配区域的总碱基数 (modified bycontig_end_exclusion
)total_bases
: u64: 作为模板的 contig 的总长度 (modified bycontig_end_exclusion
)num_covered_bases
: u64: 作为模板的 contig 上, 有 reads 覆盖的总长度 (modified bycontig_end_exclusion
)num_mapped_reads
: u64: 匹配且是首选匹配到这条 contig 上的 reads 的数量- total_mismatches: u64
- min_fraction_covered_bases: f32
contig_end_exclusion
: u64: 默认 75, 可能是考虑到 contig 末端匹配的可靠性相对较低- exclude_mismatches: bool
- 按
calculate_coverage-MeanGenomeCoverageEstimator
- 考虑 mean 方法:
fn calculate_coverage(&mut self, unobserved_contig_lengths: &[u64]) -> f32 {match self {CoverageEstimator::MeanGenomeCoverageEstimator {total_count,total_bases,num_covered_bases,num_mapped_reads: _,total_mismatches,contig_end_exclusion,min_fraction_covered_bases,exclude_mismatches,} => {let final_total_bases = *total_bases// + 0+ CoverageEstimator::calculate_unobserved_bases(unobserved_contig_lengths,*contig_end_exclusion,);debug!("Calculating coverage with unobserved {:?}, \total bases {}, num_covered_bases {}, total_count {}, \total_mismatches {}, final_total_bases {}",unobserved_contig_lengths,total_bases,num_covered_bases,total_count,total_mismatches,final_total_bases,);if final_total_bases == 0|| (*num_covered_bases as f32 / final_total_bases as f32)< *min_fraction_covered_bases{0.0} else {let calculated_coverage = match exclude_mismatches {true => (*total_count - *total_mismatches) as f32,false => *total_count as f32,} / final_total_bases as f32;debug!("Found mean coverage {}", calculated_coverage);calculated_coverage}
- 在 contig 方法中,
unobserved_contig_lengths
设为 0 - 若
num_covered_bases / final_total_bases
较低, 则直接返回 0- 意味着所有 map 到这个 contig 上的 reads 全部不算数了~
- 在当前版本中, 暂不考虑
exclude_mismatches
- 故
mean
方法的calculated_coverage
计算方式为:total_count / total_bases
- 其中每条 contig 两侧长达
contig_end_exclusion
的片段不在计算平均值的范围内.
- 在 contig 方法中,
分析 trimmed_mean 方法
- 假设已有 bam 文件, 不设置 reads 的额外
filter_params
, contig 函数主体简写见上- 计算 coverage method 的方法选择是通过
mut estimators_and_taker = EstimatorsAndTaker::generate_from_clap(m, print_stream.clone());
设置的:let mut estimators_and_taker =EstimatorsAndTaker::generate_from_clap(m, print_stream.clone());
EstimatorsAndTaker::generate_from_clap
具有一个CoverageEstimator
列表 (estimators_and_taker.estimators
)- 前述的 “mean” 对应的即为
MeanGenomeCoverageEstimator
- “trimmed_mean” 对应的即为
TrimmedMeanGenomeCoverageEstimator
- “covered_fraction” 对应的即为
CoverageFractionGenomeCoverageEstimator
- 前述的 “mean” 对应的即为
- 计算 coverage method 的方法选择是通过
- 一直到进行覆盖度的估计, 所有中间的所有过程都没有区别, 直到
- 使用
estimator.add_contig
总结当前 contig 的 mapping 情况 - 使用
estimator.calculate_coverage
计算丰度, 此时设unobserved_contig_lengths: &[0]
即忽略较短的 contigs
- 使用
TrimmedMeanGenomeCoverageEstimator
add_contig-TrimmedMeanGenomeCoverageEstimator
- trimmed_mean 方法:
CoverageEstimator::TrimmedMeanGenomeCoverageEstimator {ref mut counts,ref mut observed_contig_length,ref mut num_covered_bases,ref mut num_mapped_reads,contig_end_exclusion,.. } => {*num_mapped_reads = num_mapped_reads_in_contig;let len1 = ups_and_downs.len();match *contig_end_exclusion * 2 < len1 as u64 {true => {debug!("Adding len1 {}", len1);*observed_contig_length += len1 as u64 - 2 * *contig_end_exclusion}false => {debug!("Contig too short - less than twice the contig-end-exclusion");return; //contig is all ends, too short}}debug!("Total observed length now {}", *observed_contig_length);let mut cumulative_sum: i32 = 0;let start_from = *contig_end_exclusion as usize;let end_at = len1 - *contig_end_exclusion as usize - 1;debug!("ups and downs {:?}", ups_and_downs);for (i, current) in ups_and_downs.iter().enumerate() {if *current != 0 {debug!("At i {}, cumulative sum {} and current {}",i, cumulative_sum, current);}cumulative_sum += current;if i >= start_from && i <= end_at {if cumulative_sum > 0 {*num_covered_bases += 1}if counts.len() <= cumulative_sum as usize {(*counts).resize(cumulative_sum as usize + 1, 0);}(*counts)[cumulative_sum as usize] += 1}}
- 此时不考虑
total_mismatches_in_contig
(错配) - 最终对应于每条 contig 的
TrimmedMeanGenomeCoverageEstimator
的信息如下:counts
: Vec:counts[n_base] = n_site
记录了 reference contig 上, 有多少 (n_site) 个碱基能比对上给定数量 (n_base) 条的 readsobserved_contig_length
: u64: 同MeanGenomeCoverageEstimator.total_bases
num_covered_bases
: u64: (同上)num_mapped_reads
: u64: 当前 contig 的num_mapped_reads
*num_mapped_reads = num_mapped_reads_in_contig
- min: f32:
- max: f32:
- min_fraction_covered_bases: f32: (同上)
contig_end_exclusion
: u64: (同上)
- 此时不考虑
calculate_coverage-TrimmedMeanGenomeCoverageEstimator
- trimmed_mean 方法 类似于 mean 方法, 但首先考虑了 coverage 为 0 的情况:
CoverageEstimator::TrimmedMeanGenomeCoverageEstimator {counts,observed_contig_length,num_covered_bases,num_mapped_reads: _,contig_end_exclusion,min_fraction_covered_bases,min,max, } => {let unobserved_contig_length = CoverageEstimator::calculate_unobserved_bases(unobserved_contig_lengths,*contig_end_exclusion,);let total_bases = *observed_contig_length + unobserved_contig_length;debug!("Calculating coverage with num_covered_bases {}, observed_length {}, unobserved_length {:?} and counts {:?}",num_covered_bases, observed_contig_length, unobserved_contig_lengths, counts);match total_bases {0 => 0.0,_ => {if (*num_covered_bases as f32 / total_bases as f32)< *min_fraction_covered_bases{0.0} else {let min_index: usize = (*min * total_bases as f32).floor() as usize;let max_index: usize = (*max * total_bases as f32).ceil() as usize;if *num_covered_bases == 0 {return 0.0;}counts[0] += unobserved_contig_length;
- 同上:
unobserved_contig_lengths
和counts[0]
在 contig 情况下为 0- 忽略所有
num_covered_bases / final_total_bases
较低的 contigs 的 reads 和丰度
- 根据
TrimmedMeanGenomeCoverageEstimator.min
和TrimmedMeanGenomeCoverageEstimator.max
, 计算了对应 contig 总长的min_index
和max_index
(尽量取大范围) - 其他 (即 coverage 不等于 0) 的情况:
let mut num_accounted_for: usize = 0;let mut total: usize = 0;let mut started = false;for (i, num_covered) in counts.iter().enumerate() {num_accounted_for += *num_covered as usize;debug!("start: i {}, num_accounted_for {}, total {}, min {}, max {}",i, num_accounted_for, total, min_index, max_index);if num_accounted_for >= min_index {debug!("inside");if started {if num_accounted_for > max_index {debug!("num_accounted_for {}, *num_covered {}",num_accounted_for, *num_covered);let num_excess =num_accounted_for - *num_covered as usize;let num_wanted = match max_index >= num_excess {true => max_index - num_excess + 1,false => 0,};debug!("num wanted1: {}", num_wanted);total += num_wanted * i;break;} else {total += *num_covered as usize * i;}} else if num_accounted_for > max_index {// all coverages are the same in the trimmed settotal = (max_index - min_index + 1) * i;started = true} else if num_accounted_for < min_index {debug!("too few on first")} else {let num_wanted = num_accounted_for - min_index + 1;debug!("num wanted2: {}", num_wanted);total = num_wanted * i;started = true;}}debug!("end i {}, num_accounted_for {}, total {}",i, num_accounted_for, total);}total as f32 / (max_index - min_index) as f32}} }
- 按 reads 覆盖数 (从小到大) 排序后, 逐步处理其中的序列
- 一开始
started = false
, 并逐步累加num_accounted_for
(当前处理的总 contig base 数量) - 当超过
min_index
后, 开始累加总匹配 reads 的 base 数到total
中 - 直到超过
min_index
, 直接中断返回
- 一开始
- 按 reads 覆盖数 (从小到大) 排序后, 逐步处理其中的序列
- 故
trimmed_mean
方法的calculated_coverage
计算方式为:total_count / total_bases
- 其中每条 contig 两侧长达
contig_end_exclusion
的片段不在计算平均值的范围内 - 此外, 覆盖度最高的
max%
和最低的min%
部分的 base 不在计算平均值的范围内
- 同上: