#!/bin/bash -Ceuo pipefail
# (A) header flags
header=$(bcftools view -h germline_control_1.vcf.gz)
if printf '%s\n' "$header" | grep -q '^##CNV_QC_STATUS='; then
qc_failed=true
else
qc_failed=false
fi
abn_chr=$(printf '%s\n' "$header" | sed -n 's/^##ABNORMAL_PLOIDY_CHR=//p' | head -n1)
if [ -n "$abn_chr" ]; then
abnormal_ploidy=true
else
abnormal_ploidy=false
abn_chr=NONE
fi
{
printf "qc_failed\t%s\n" "$qc_failed"
printf "abnormal_ploidy\t%s\n" "$abnormal_ploidy"
printf "abnormal_ploidy_chromosomes\t%s\n" "$abn_chr"
} > germline_control_1.cnv.tsv
# (A) PASS DEL/DUP counts + ratio
bcftools query -f '%FILTER\t%ALT\t%INFO/SVTYPE\n' germline_control_1.vcf.gz \
| awk -F'\t' '
$2=="<DEL>" && $3=="CNV" && $1=="PASS" { d++ }
$2=="<DUP>" && $3=="CNV" && $1=="PASS" { u++ }
END {
ratio=(u>0) ? sprintf("%.4f", d/u) : "NA"
printf "del_pass_count\t%d\n", d+0
printf "dup_pass_count\t%d\n", u+0
printf "del_dup_pass_ratio\t%s\n", ratio
}
' >> germline_control_1.cnv.tsv
# (B) landmark-deletion recovery — match by gene name in INFO/GENES, favor PASS
grep -v '^#' na12878_germline_cnv_dels.bed | awk -F'\t' 'NF>=4 { print $4 }' > truth_genes.txt
bcftools query -f '%FILTER\t%ALT\t%INFO/SVTYPE\t%INFO/GENES[\t%CN]\n' germline_control_1.vcf.gz \
| awk -F'\t' '
BEGIN { rank["PASS"]=0; rank["LOWCONF"]=1; rank["NOT_PRESENT"]=2 }
NR==FNR { g=$1; order[++n]=g; want[g]=1; bestf[g]="NOT_CALLED"; bestcn[g]="NA"; next }
{
filt=$1; alt=$2; svt=$3; genes=$4; cn=$5
if (alt != "<DEL>" || svt != "CNV") next
m=split(genes, ga, ",")
for (i=1; i<=m; i++) {
gene=ga[i]
if (!(gene in want)) continue
cr=(bestf[gene] in rank) ? rank[bestf[gene]] : 99
nr=(filt in rank) ? rank[filt] : 98
if (nr < cr) { bestf[gene]=filt; bestcn[gene]=cn }
}
}
END {
recov=0; missed=""
for (i=1; i<=n; i++) {
gene=order[i]; gl=tolower(gene)
printf "%s_del_filter\t%s\n", gl, bestf[gene]
printf "%s_del_cn\t%s\n", gl, bestcn[gene]
if (bestf[gene]=="PASS") recov++
else missed=(missed==""?gene:missed";"gene)
}
if (missed=="") missed="NONE"
printf "landmark_del_truth_total\t%d\n", n
printf "landmark_del_recovered_count\t%d\n", recov
printf "landmark_del_missed\t%s\n", missed
}
' truth_genes.txt - >> germline_control_1.cnv.tsv
cat <<-END_VERSIONS > versions.yml
"DAQ:CONTROL_METRICS:GERMLINE_METRICS:GERMLINE_CNV_METRICS":
bcftools: $(bcftools --version | head -n1 | awk '{print $2}')
END_VERSIONS