#!/bin/bash -Ceuo pipefail # (A) header flags header=$(bcftools view -h germline_control_1.vcf.gz) if printf '%s\n' "$header" | grep -q '^##CNV_QC_STATUS='; then qc_failed=true else qc_failed=false fi abn_chr=$(printf '%s\n' "$header" | sed -n 's/^##ABNORMAL_PLOIDY_CHR=//p' | head -n1) if [ -n "$abn_chr" ]; then abnormal_ploidy=true else abnormal_ploidy=false abn_chr=NONE fi { printf "qc_failed\t%s\n" "$qc_failed" printf "abnormal_ploidy\t%s\n" "$abnormal_ploidy" printf "abnormal_ploidy_chromosomes\t%s\n" "$abn_chr" } > germline_control_1.cnv.tsv # (A) PASS DEL/DUP counts + ratio bcftools query -f '%FILTER\t%ALT\t%INFO/SVTYPE\n' germline_control_1.vcf.gz \ | awk -F'\t' ' $2=="" && $3=="CNV" && $1=="PASS" { d++ } $2=="" && $3=="CNV" && $1=="PASS" { u++ } END { ratio=(u>0) ? sprintf("%.4f", d/u) : "NA" printf "del_pass_count\t%d\n", d+0 printf "dup_pass_count\t%d\n", u+0 printf "del_dup_pass_ratio\t%s\n", ratio } ' >> germline_control_1.cnv.tsv # (B) landmark-deletion recovery — match by gene name in INFO/GENES, favor PASS grep -v '^#' na12878_germline_cnv_dels.bed | awk -F'\t' 'NF>=4 { print $4 }' > truth_genes.txt bcftools query -f '%FILTER\t%ALT\t%INFO/SVTYPE\t%INFO/GENES[\t%CN]\n' germline_control_1.vcf.gz \ | awk -F'\t' ' BEGIN { rank["PASS"]=0; rank["LOWCONF"]=1; rank["NOT_PRESENT"]=2 } NR==FNR { g=$1; order[++n]=g; want[g]=1; bestf[g]="NOT_CALLED"; bestcn[g]="NA"; next } { filt=$1; alt=$2; svt=$3; genes=$4; cn=$5 if (alt != "" || svt != "CNV") next m=split(genes, ga, ",") for (i=1; i<=m; i++) { gene=ga[i] if (!(gene in want)) continue cr=(bestf[gene] in rank) ? rank[bestf[gene]] : 99 nr=(filt in rank) ? rank[filt] : 98 if (nr < cr) { bestf[gene]=filt; bestcn[gene]=cn } } } END { recov=0; missed="" for (i=1; i<=n; i++) { gene=order[i]; gl=tolower(gene) printf "%s_del_filter\t%s\n", gl, bestf[gene] printf "%s_del_cn\t%s\n", gl, bestcn[gene] if (bestf[gene]=="PASS") recov++ else missed=(missed==""?gene:missed";"gene) } if (missed=="") missed="NONE" printf "landmark_del_truth_total\t%d\n", n printf "landmark_del_recovered_count\t%d\n", recov printf "landmark_del_missed\t%s\n", missed } ' truth_genes.txt - >> germline_control_1.cnv.tsv cat <<-END_VERSIONS > versions.yml "DAQ:CONTROL_METRICS:GERMLINE_METRICS:GERMLINE_CNV_METRICS": bcftools: $(bcftools --version | head -n1 | awk '{print $2}') END_VERSIONS