#!/bin/bash
set -euo pipefail
FF=0.1
TARGET=200
log() { echo "[MIX_BAMS:HG00136_x_HG00290_FF_10] $*"; }
# ---- 1. Sample ~2000 evenly spaced BED intervals for fast cov estimation.
TOTAL_INTERVALS=$(wc -l < xgen-exome-research-panel-v2-targets-hg38.bed)
if [ ${TOTAL_INTERVALS} -gt 2000 ]; then
awk -v n=${TOTAL_INTERVALS} 'NR % int(n/2000) == 1' xgen-exome-research-panel-v2-targets-hg38.bed > sampled.bed
else
cp xgen-exome-research-panel-v2-targets-hg38.bed sampled.bed
fi
TARGET_BP=$(awk '{sum += ($3 - $2)} END {print sum}' sampled.bed)
log "BED: ${TOTAL_INTERVALS} intervals, sampled $(wc -l < sampled.bed) covering ${TARGET_BP} bp"
estimate_cov() {
# $1 = bam path
local bases
bases=$(samtools bedcov sampled.bed "$1" | awk '{sum += $NF} END {print sum}')
awk -v b=${bases} -v t=${TARGET_BP} 'BEGIN {printf "%.4f", b / t}'
}
# ---- 2. Source BAM stats: read counts + on-target coverage.
MAT_READS=$(samtools view -c 3574260681_r8_L37.bam)
FET_READS=$(samtools view -c 3599996064_r8_L37.bam)
if [ ${MAT_READS} -eq 0 ]; then log "FATAL: maternal BAM has 0 reads"; exit 1; fi
if [ ${FET_READS} -eq 0 ]; then log "FATAL: fetal BAM has 0 reads"; exit 1; fi
MAT_COV=$(estimate_cov 3574260681_r8_L37.bam)
FET_COV=$(estimate_cov 3599996064_r8_L37.bam)
log "maternal: 3574260681_r8_L37.bam reads=${MAT_READS} on_target_cov=${MAT_COV}x"
log "fetal: 3599996064_r8_L37.bam reads=${FET_READS} on_target_cov=${FET_COV}x"
# ---- 3. Compute effective target depth + fractions.
#
# Constraint: each subsample fraction must be <= 1.0:
# mat_frac = (1-ff) * T / D_mat <= 1.0 => T <= D_mat / (1-ff)
# fet_frac = ff * T / D_fet <= 1.0 => T <= D_fet / ff
# If the requested T exceeds either bound, we lower T (T_eff) so the
# subsample fractions stay valid. This keeps the FF *correct* at the cost
# of a slightly shallower mixture — preferable to clamping one side to
# 1.0, which would distort the actual FF away from the target label.
#
# Trailing \n is required for `read` + `set -e` (no newline => exit 1).
read T_EFF MAT_FRAC FET_FRAC STATUS_DEPTH < <(awk -v t=${TARGET} -v ff=${FF} -v dm=${MAT_COV} -v df=${FET_COV} '
BEGIN {
t_mat = (1.0 - ff) > 0 ? dm / (1.0 - ff) : t
t_fet = ff > 0 ? df / ff : t
t_eff = t
status = "OK"
if (t_mat < t_eff) { t_eff = t_mat; status = "WARN_MAT_INSUFFICIENT" }
if (t_fet < t_eff) { t_eff = t_fet; status = "WARN_FET_INSUFFICIENT" }
mat_f = (1.0 - ff) * t_eff / dm
fet_f = ff * t_eff / df
printf "%.4f %.4f %.4f %s\n", t_eff, mat_f, fet_f, status
}')
log "effective target=${T_EFF}x fractions: mat=${MAT_FRAC} fet=${FET_FRAC} status=${STATUS_DEPTH}"
# ---- 4. samtools view -s expects "seed.frac" — strip leading 0 from "0.NNNN".
# T_eff capping above guarantees both fractions are < 1.0 by construction
# (modulo float rounding; use min(0.999999, frac) as a final guard).
# Parenthesize the ternary — busybox awk needs it to disambiguate.
MAT_FRAC_CAP=$(awk -v f=${MAT_FRAC} 'BEGIN { printf "%.6f", (f >= 1.0 ? 0.999999 : f) }')
FET_FRAC_CAP=$(awk -v f=${FET_FRAC} 'BEGIN { printf "%.6f", (f >= 1.0 ? 0.999999 : f) }')
MAT_S=42$(echo ${MAT_FRAC_CAP} | sed 's/^0//')
FET_S=42$(echo ${FET_FRAC_CAP} | sed 's/^0//')
samtools view -b -h -s ${MAT_S} -@ 4 -o mat.sub.bam 3574260681_r8_L37.bam
samtools view -b -h -s ${FET_S} -@ 4 -o fet.sub.bam 3599996064_r8_L37.bam
MAT_SUB_READS=$(samtools view -c mat.sub.bam)
FET_SUB_READS=$(samtools view -c fet.sub.bam)
log "after subsample: mat=${MAT_SUB_READS}/${MAT_READS} fet=${FET_SUB_READS}/${FET_READS}"
# ---- 5. Merge + index.
samtools merge -f -@ 4 HG00136_x_HG00290_FF_10.mix.bam mat.sub.bam fet.sub.bam
samtools index -@ 4 HG00136_x_HG00290_FF_10.mix.bam
# ---- 6. Post-mix verification: read count + actual coverage on sampled BED.
MIX_READS=$(samtools view -c HG00136_x_HG00290_FF_10.mix.bam)
MIX_COV=$(estimate_cov HG00136_x_HG00290_FF_10.mix.bam)
OBS_FF=$(awk -v fr=${FET_SUB_READS} -v mr=${MAT_SUB_READS} 'BEGIN {if (fr+mr>0) printf "%.4f", fr/(fr+mr); else print "0"}')
log "mixture: HG00136_x_HG00290_FF_10.mix.bam reads=${MIX_READS} on_target_cov=${MIX_COV}x observed_ff=${OBS_FF}"
# ---- 7. Per-mixture report. effective_target_cov differs from target_cov
# only when one source is too shallow to support the requested depth at
# the requested FF — in that case we lower the depth to keep FF correct.
printf "mixture_id\ttarget_ff\ttarget_coverage\teffective_target_cov\tmat_reads_in\tfet_reads_in\tmat_cov_in\tfet_cov_in\tmat_frac\tfet_frac\tmat_reads_kept\tfet_reads_kept\tmix_reads\tmix_cov\tobserved_ff\tstatus\n" > HG00136_x_HG00290_FF_10.mix_report.tsv
printf "HG00136_x_HG00290_FF_10\t${FF}\t${TARGET}\t${T_EFF}\t${MAT_READS}\t${FET_READS}\t${MAT_COV}\t${FET_COV}\t${MAT_FRAC}\t${FET_FRAC}\t${MAT_SUB_READS}\t${FET_SUB_READS}\t${MIX_READS}\t${MIX_COV}\t${OBS_FF}\t${STATUS_DEPTH}\n" >> HG00136_x_HG00290_FF_10.mix_report.tsv