#!/bin/bash set -euo pipefail FF=0.05 TARGET=200 log() { echo "[MIX_BAMS:NA18508_x_NA18522_FF_5] $*"; } # ---- 1. Sample ~2000 evenly spaced BED intervals for fast cov estimation. TOTAL_INTERVALS=$(wc -l < xgen-exome-research-panel-v2-targets-hg38.bed) if [ ${TOTAL_INTERVALS} -gt 2000 ]; then awk -v n=${TOTAL_INTERVALS} 'NR % int(n/2000) == 1' xgen-exome-research-panel-v2-targets-hg38.bed > sampled.bed else cp xgen-exome-research-panel-v2-targets-hg38.bed sampled.bed fi TARGET_BP=$(awk '{sum += ($3 - $2)} END {print sum}' sampled.bed) log "BED: ${TOTAL_INTERVALS} intervals, sampled $(wc -l < sampled.bed) covering ${TARGET_BP} bp" estimate_cov() { # $1 = bam path local bases bases=$(samtools bedcov sampled.bed "$1" | awk '{sum += $NF} END {print sum}') awk -v b=${bases} -v t=${TARGET_BP} 'BEGIN {printf "%.4f", b / t}' } # ---- 2. Source BAM stats: read counts + on-target coverage. MAT_READS=$(samtools view -c 3599996111_r8_L48.bam) FET_READS=$(samtools view -c 3599996112_r8_L48.bam) if [ ${MAT_READS} -eq 0 ]; then log "FATAL: maternal BAM has 0 reads"; exit 1; fi if [ ${FET_READS} -eq 0 ]; then log "FATAL: fetal BAM has 0 reads"; exit 1; fi MAT_COV=$(estimate_cov 3599996111_r8_L48.bam) FET_COV=$(estimate_cov 3599996112_r8_L48.bam) log "maternal: 3599996111_r8_L48.bam reads=${MAT_READS} on_target_cov=${MAT_COV}x" log "fetal: 3599996112_r8_L48.bam reads=${FET_READS} on_target_cov=${FET_COV}x" # ---- 3. Compute effective target depth + fractions. # # Constraint: each subsample fraction must be <= 1.0: # mat_frac = (1-ff) * T / D_mat <= 1.0 => T <= D_mat / (1-ff) # fet_frac = ff * T / D_fet <= 1.0 => T <= D_fet / ff # If the requested T exceeds either bound, we lower T (T_eff) so the # subsample fractions stay valid. This keeps the FF *correct* at the cost # of a slightly shallower mixture — preferable to clamping one side to # 1.0, which would distort the actual FF away from the target label. # # Trailing \n is required for `read` + `set -e` (no newline => exit 1). read T_EFF MAT_FRAC FET_FRAC STATUS_DEPTH < <(awk -v t=${TARGET} -v ff=${FF} -v dm=${MAT_COV} -v df=${FET_COV} ' BEGIN { t_mat = (1.0 - ff) > 0 ? dm / (1.0 - ff) : t t_fet = ff > 0 ? df / ff : t t_eff = t status = "OK" if (t_mat < t_eff) { t_eff = t_mat; status = "WARN_MAT_INSUFFICIENT" } if (t_fet < t_eff) { t_eff = t_fet; status = "WARN_FET_INSUFFICIENT" } mat_f = (1.0 - ff) * t_eff / dm fet_f = ff * t_eff / df printf "%.4f %.4f %.4f %s\n", t_eff, mat_f, fet_f, status }') log "effective target=${T_EFF}x fractions: mat=${MAT_FRAC} fet=${FET_FRAC} status=${STATUS_DEPTH}" # ---- 4. samtools view -s expects "seed.frac" — strip leading 0 from "0.NNNN". # T_eff capping above guarantees both fractions are < 1.0 by construction # (modulo float rounding; use min(0.999999, frac) as a final guard). # Parenthesize the ternary — busybox awk needs it to disambiguate. MAT_FRAC_CAP=$(awk -v f=${MAT_FRAC} 'BEGIN { printf "%.6f", (f >= 1.0 ? 0.999999 : f) }') FET_FRAC_CAP=$(awk -v f=${FET_FRAC} 'BEGIN { printf "%.6f", (f >= 1.0 ? 0.999999 : f) }') MAT_S=42$(echo ${MAT_FRAC_CAP} | sed 's/^0//') FET_S=42$(echo ${FET_FRAC_CAP} | sed 's/^0//') samtools view -b -h -s ${MAT_S} -@ 4 -o mat.sub.bam 3599996111_r8_L48.bam samtools view -b -h -s ${FET_S} -@ 4 -o fet.sub.bam 3599996112_r8_L48.bam MAT_SUB_READS=$(samtools view -c mat.sub.bam) FET_SUB_READS=$(samtools view -c fet.sub.bam) log "after subsample: mat=${MAT_SUB_READS}/${MAT_READS} fet=${FET_SUB_READS}/${FET_READS}" # ---- 5. Merge + index. samtools merge -f -@ 4 NA18508_x_NA18522_FF_5.mix.bam mat.sub.bam fet.sub.bam samtools index -@ 4 NA18508_x_NA18522_FF_5.mix.bam # ---- 6. Post-mix verification: read count + actual coverage on sampled BED. MIX_READS=$(samtools view -c NA18508_x_NA18522_FF_5.mix.bam) MIX_COV=$(estimate_cov NA18508_x_NA18522_FF_5.mix.bam) OBS_FF=$(awk -v fr=${FET_SUB_READS} -v mr=${MAT_SUB_READS} 'BEGIN {if (fr+mr>0) printf "%.4f", fr/(fr+mr); else print "0"}') log "mixture: NA18508_x_NA18522_FF_5.mix.bam reads=${MIX_READS} on_target_cov=${MIX_COV}x observed_ff=${OBS_FF}" # ---- 7. Per-mixture report. effective_target_cov differs from target_cov # only when one source is too shallow to support the requested depth at # the requested FF — in that case we lower the depth to keep FF correct. printf "mixture_id\ttarget_ff\ttarget_coverage\teffective_target_cov\tmat_reads_in\tfet_reads_in\tmat_cov_in\tfet_cov_in\tmat_frac\tfet_frac\tmat_reads_kept\tfet_reads_kept\tmix_reads\tmix_cov\tobserved_ff\tstatus\n" > NA18508_x_NA18522_FF_5.mix_report.tsv printf "NA18508_x_NA18522_FF_5\t${FF}\t${TARGET}\t${T_EFF}\t${MAT_READS}\t${FET_READS}\t${MAT_COV}\t${FET_COV}\t${MAT_FRAC}\t${FET_FRAC}\t${MAT_SUB_READS}\t${FET_SUB_READS}\t${MIX_READS}\t${MIX_COV}\t${OBS_FF}\t${STATUS_DEPTH}\n" >> NA18508_x_NA18522_FF_5.mix_report.tsv