File Info

Filename
.command.sh
Full Path
s3://natera-rnd-pltf-dev-nextflow-scratch-01/work/e5/8f0fed98db8624eda58e51104bc50e/.command.sh
Size
4.9 KB
Attempt
#!/bin/bash
set -euo pipefail

FF=0.1
TARGET=200

log() { echo "[MIX_BAMS:HG00136_x_HG00290_FF_10] $*"; }

# ---- 1. Sample ~2000 evenly spaced BED intervals for fast cov estimation.
TOTAL_INTERVALS=$(wc -l < xgen-exome-research-panel-v2-targets-hg38.bed)
if [ ${TOTAL_INTERVALS} -gt 2000 ]; then
    awk -v n=${TOTAL_INTERVALS} 'NR % int(n/2000) == 1' xgen-exome-research-panel-v2-targets-hg38.bed > sampled.bed
else
    cp xgen-exome-research-panel-v2-targets-hg38.bed sampled.bed
fi
TARGET_BP=$(awk '{sum += ($3 - $2)} END {print sum}' sampled.bed)
log "BED: ${TOTAL_INTERVALS} intervals, sampled $(wc -l < sampled.bed) covering ${TARGET_BP} bp"

estimate_cov() {
    # $1 = bam path
    local bases
    bases=$(samtools bedcov sampled.bed "$1" | awk '{sum += $NF} END {print sum}')
    awk -v b=${bases} -v t=${TARGET_BP} 'BEGIN {printf "%.4f", b / t}'
}

# ---- 2. Source BAM stats: read counts + on-target coverage.
MAT_READS=$(samtools view -c 3574260681_r8_L37.bam)
FET_READS=$(samtools view -c 3599996064_r8_L37.bam)
if [ ${MAT_READS} -eq 0 ]; then log "FATAL: maternal BAM has 0 reads"; exit 1; fi
if [ ${FET_READS} -eq 0 ]; then log "FATAL: fetal BAM has 0 reads"; exit 1; fi

MAT_COV=$(estimate_cov 3574260681_r8_L37.bam)
FET_COV=$(estimate_cov 3599996064_r8_L37.bam)
log "maternal: 3574260681_r8_L37.bam reads=${MAT_READS} on_target_cov=${MAT_COV}x"
log "fetal:    3599996064_r8_L37.bam reads=${FET_READS} on_target_cov=${FET_COV}x"

# ---- 3. Compute effective target depth + fractions.
#
# Constraint: each subsample fraction must be <= 1.0:
#     mat_frac = (1-ff) * T / D_mat <= 1.0   =>   T <= D_mat / (1-ff)
#     fet_frac =      ff * T / D_fet <= 1.0  =>   T <= D_fet /     ff
# If the requested T exceeds either bound, we lower T (T_eff) so the
# subsample fractions stay valid. This keeps the FF *correct* at the cost
# of a slightly shallower mixture — preferable to clamping one side to
# 1.0, which would distort the actual FF away from the target label.
#
# Trailing \n is required for `read` + `set -e` (no newline => exit 1).
read T_EFF MAT_FRAC FET_FRAC STATUS_DEPTH < <(awk -v t=${TARGET} -v ff=${FF} -v dm=${MAT_COV} -v df=${FET_COV} '
    BEGIN {
        t_mat = (1.0 - ff) > 0 ? dm / (1.0 - ff) : t
        t_fet =        ff  > 0 ? df /        ff  : t
        t_eff = t
        status = "OK"
        if (t_mat < t_eff) { t_eff = t_mat; status = "WARN_MAT_INSUFFICIENT" }
        if (t_fet < t_eff) { t_eff = t_fet; status = "WARN_FET_INSUFFICIENT" }
        mat_f = (1.0 - ff) * t_eff / dm
        fet_f =        ff  * t_eff / df
        printf "%.4f %.4f %.4f %s\n", t_eff, mat_f, fet_f, status
    }')
log "effective target=${T_EFF}x  fractions: mat=${MAT_FRAC} fet=${FET_FRAC}  status=${STATUS_DEPTH}"

# ---- 4. samtools view -s expects "seed.frac" — strip leading 0 from "0.NNNN".
# T_eff capping above guarantees both fractions are < 1.0 by construction
# (modulo float rounding; use min(0.999999, frac) as a final guard).
# Parenthesize the ternary — busybox awk needs it to disambiguate.
MAT_FRAC_CAP=$(awk -v f=${MAT_FRAC} 'BEGIN { printf "%.6f", (f >= 1.0 ? 0.999999 : f) }')
FET_FRAC_CAP=$(awk -v f=${FET_FRAC} 'BEGIN { printf "%.6f", (f >= 1.0 ? 0.999999 : f) }')
MAT_S=42$(echo ${MAT_FRAC_CAP} | sed 's/^0//')
FET_S=42$(echo ${FET_FRAC_CAP} | sed 's/^0//')

samtools view -b -h -s ${MAT_S} -@ 4 -o mat.sub.bam 3574260681_r8_L37.bam
samtools view -b -h -s ${FET_S} -@ 4 -o fet.sub.bam 3599996064_r8_L37.bam
MAT_SUB_READS=$(samtools view -c mat.sub.bam)
FET_SUB_READS=$(samtools view -c fet.sub.bam)
log "after subsample: mat=${MAT_SUB_READS}/${MAT_READS} fet=${FET_SUB_READS}/${FET_READS}"

# ---- 5. Merge + index.
samtools merge -f -@ 4 HG00136_x_HG00290_FF_10.mix.bam mat.sub.bam fet.sub.bam
samtools index -@ 4 HG00136_x_HG00290_FF_10.mix.bam

# ---- 6. Post-mix verification: read count + actual coverage on sampled BED.
MIX_READS=$(samtools view -c HG00136_x_HG00290_FF_10.mix.bam)
MIX_COV=$(estimate_cov HG00136_x_HG00290_FF_10.mix.bam)
OBS_FF=$(awk -v fr=${FET_SUB_READS} -v mr=${MAT_SUB_READS}         'BEGIN {if (fr+mr>0) printf "%.4f", fr/(fr+mr); else print "0"}')
log "mixture:  HG00136_x_HG00290_FF_10.mix.bam reads=${MIX_READS} on_target_cov=${MIX_COV}x observed_ff=${OBS_FF}"

# ---- 7. Per-mixture report. effective_target_cov differs from target_cov
# only when one source is too shallow to support the requested depth at
# the requested FF — in that case we lower the depth to keep FF correct.
printf "mixture_id\ttarget_ff\ttarget_coverage\teffective_target_cov\tmat_reads_in\tfet_reads_in\tmat_cov_in\tfet_cov_in\tmat_frac\tfet_frac\tmat_reads_kept\tfet_reads_kept\tmix_reads\tmix_cov\tobserved_ff\tstatus\n" > HG00136_x_HG00290_FF_10.mix_report.tsv
printf "HG00136_x_HG00290_FF_10\t${FF}\t${TARGET}\t${T_EFF}\t${MAT_READS}\t${FET_READS}\t${MAT_COV}\t${FET_COV}\t${MAT_FRAC}\t${FET_FRAC}\t${MAT_SUB_READS}\t${FET_SUB_READS}\t${MIX_READS}\t${MIX_COV}\t${OBS_FF}\t${STATUS_DEPTH}\n" >> HG00136_x_HG00290_FF_10.mix_report.tsv