#!/bin/bash -Ceuo pipefail
seg_count=$(bcftools view -H -f PASS positive_somatic_control_1.vcf.gz | wc -l)
# PASS amplification calls (DUP + DUP-LOH) → BED with TCN_EM in col 4
bcftools view -f PASS -i 'SVTYPE="DUP" || SVTYPE="DUP-LOH"' positive_somatic_control_1.vcf.gz \
| bcftools query -f '%CHROM\t%POS0\t%INFO/END\t%INFO/TCN_EM\n' \
> calls.amps.bed
# Per-truth-row max TCN (left-outer-join keeps unrecovered rows).
bedtools intersect -loj -a hd789_cnv_amps.bed -b calls.amps.bed \
| awk -F'\t' '
{ gene=$4; tcn=($5=="."?"":$8) }
{ if (!(gene in idx)) { idx[gene]=++n; ord[n]=gene; best[gene]="" }
if (tcn != "" && (best[gene]=="" || tcn+0 > best[gene]+0)) best[gene]=tcn }
END { for (i=1;i<=n;i++) print ord[i]"\t"best[ord[i]] }
' > per_gene_max_tcn.tsv
truth_total=$(grep -cv '^#' hd789_cnv_amps.bed)
recov=$(awk -F'\t' '$2!=""' per_gene_max_tcn.tsv | wc -l)
recov_cns=$(awk -F'\t' '$2!="" {print $1"="$2}' per_gene_max_tcn.tsv | paste -sd, -)
missed=$(awk -F'\t' '$2=="" {print $1}' per_gene_max_tcn.tsv | paste -sd';' -)
: ${recov_cns:=NONE}
: ${missed:=NONE}
{
printf "segment_count\t%s\n" "$seg_count"
printf "amp_truth_total\t%s\n" "$truth_total"
printf "amp_recovered_count\t%s\n" "$recov"
printf "amp_recovered_copy_numbers\t%s\n" "$recov_cns"
printf "amp_missed\t%s\n" "$missed"
} > positive_somatic_control_1.cnv.tsv
cat <<-END_VERSIONS > versions.yml
"DAQ:CONTROL_METRICS:POSITIVE_SOMATIC_METRICS:POSITIVE_SOMATIC_CNV":
bcftools: $(bcftools --version | head -n1 | awk '{print $2}')
bedtools: $(bedtools --version | awk '{print $2}' | sed 's/^v//')
END_VERSIONS