File Info

Filename
.command.sh
Full Path
s3://natera-rnd-pltf-dev-nextflow-scratch-01/work/dc/5cdbc3c88f8ba846f947c05db774bc/.command.sh
Size
1.7 KB
Attempt
#!/bin/bash -Ceuo pipefail
seg_count=$(bcftools view -H -f PASS positive_somatic_control_1.vcf.gz | wc -l)

# PASS amplification calls (DUP + DUP-LOH) → BED with TCN_EM in col 4
bcftools view -f PASS -i 'SVTYPE="DUP" || SVTYPE="DUP-LOH"' positive_somatic_control_1.vcf.gz \
    | bcftools query -f '%CHROM\t%POS0\t%INFO/END\t%INFO/TCN_EM\n' \
    > calls.amps.bed

# Per-truth-row max TCN (left-outer-join keeps unrecovered rows).
bedtools intersect -loj -a hd789_cnv_amps.bed -b calls.amps.bed \
    | awk -F'\t' '
        { gene=$4; tcn=($5=="."?"":$8) }
        { if (!(gene in idx)) { idx[gene]=++n; ord[n]=gene; best[gene]="" }
          if (tcn != "" && (best[gene]=="" || tcn+0 > best[gene]+0)) best[gene]=tcn }
        END { for (i=1;i<=n;i++) print ord[i]"\t"best[ord[i]] }
      ' > per_gene_max_tcn.tsv

truth_total=$(grep -cv '^#' hd789_cnv_amps.bed)
recov=$(awk -F'\t' '$2!=""' per_gene_max_tcn.tsv | wc -l)
recov_cns=$(awk -F'\t' '$2!="" {print $1"="$2}' per_gene_max_tcn.tsv | paste -sd, -)
missed=$(awk -F'\t' '$2==""  {print $1}'        per_gene_max_tcn.tsv | paste -sd';' -)
: ${recov_cns:=NONE}
: ${missed:=NONE}

{
    printf "segment_count\t%s\n"              "$seg_count"
    printf "amp_truth_total\t%s\n"            "$truth_total"
    printf "amp_recovered_count\t%s\n"        "$recov"
    printf "amp_recovered_copy_numbers\t%s\n" "$recov_cns"
    printf "amp_missed\t%s\n"                 "$missed"
} > positive_somatic_control_1.cnv.tsv

cat <<-END_VERSIONS > versions.yml
"DAQ:CONTROL_METRICS:POSITIVE_SOMATIC_METRICS:POSITIVE_SOMATIC_CNV":
    bcftools: $(bcftools --version | head -n1 | awk '{print $2}')
    bedtools: $(bedtools --version | awk '{print $2}' | sed 's/^v//')
END_VERSIONS