File Info

Filename
.command.sh
Full Path
s3://natera-rnd-pltf-dev-nextflow-scratch-01/work/cf/005adf9db585dc916a55aa192f29d2/.command.sh
Size
2.8 KB
Attempt
#!/bin/bash -Ceuo pipefail
# (A) header flags
header=$(bcftools view -h germline_control_1.vcf.gz)
if printf '%s\n' "$header" | grep -q '^##CNV_QC_STATUS='; then
    qc_failed=true
else
    qc_failed=false
fi
abn_chr=$(printf '%s\n' "$header" | sed -n 's/^##ABNORMAL_PLOIDY_CHR=//p' | head -n1)
if [ -n "$abn_chr" ]; then
    abnormal_ploidy=true
else
    abnormal_ploidy=false
    abn_chr=NONE
fi

{
    printf "qc_failed\t%s\n"                   "$qc_failed"
    printf "abnormal_ploidy\t%s\n"             "$abnormal_ploidy"
    printf "abnormal_ploidy_chromosomes\t%s\n" "$abn_chr"
} > germline_control_1.cnv.tsv

# (A) PASS DEL/DUP counts + ratio
bcftools query -f '%FILTER\t%ALT\t%INFO/SVTYPE\n' germline_control_1.vcf.gz \
    | awk -F'\t' '
        $2=="<DEL>" && $3=="CNV" && $1=="PASS" { d++ }
        $2=="<DUP>" && $3=="CNV" && $1=="PASS" { u++ }
        END {
            ratio=(u>0) ? sprintf("%.4f", d/u) : "NA"
            printf "del_pass_count\t%d\n",     d+0
            printf "dup_pass_count\t%d\n",     u+0
            printf "del_dup_pass_ratio\t%s\n", ratio
        }
    ' >> germline_control_1.cnv.tsv

# (B) landmark-deletion recovery — match by gene name in INFO/GENES, favor PASS
grep -v '^#' na12878_germline_cnv_dels.bed | awk -F'\t' 'NF>=4 { print $4 }' > truth_genes.txt

bcftools query -f '%FILTER\t%ALT\t%INFO/SVTYPE\t%INFO/GENES[\t%CN]\n' germline_control_1.vcf.gz \
    | awk -F'\t' '
        BEGIN { rank["PASS"]=0; rank["LOWCONF"]=1; rank["NOT_PRESENT"]=2 }
        NR==FNR { g=$1; order[++n]=g; want[g]=1; bestf[g]="NOT_CALLED"; bestcn[g]="NA"; next }
        {
            filt=$1; alt=$2; svt=$3; genes=$4; cn=$5
            if (alt != "<DEL>" || svt != "CNV") next
            m=split(genes, ga, ",")
            for (i=1; i<=m; i++) {
                gene=ga[i]
                if (!(gene in want)) continue
                cr=(bestf[gene] in rank) ? rank[bestf[gene]] : 99
                nr=(filt in rank) ? rank[filt] : 98
                if (nr < cr) { bestf[gene]=filt; bestcn[gene]=cn }
            }
        }
        END {
            recov=0; missed=""
            for (i=1; i<=n; i++) {
                gene=order[i]; gl=tolower(gene)
                printf "%s_del_filter\t%s\n", gl, bestf[gene]
                printf "%s_del_cn\t%s\n",     gl, bestcn[gene]
                if (bestf[gene]=="PASS") recov++
                else missed=(missed==""?gene:missed";"gene)
            }
            if (missed=="") missed="NONE"
            printf "landmark_del_truth_total\t%d\n",     n
            printf "landmark_del_recovered_count\t%d\n", recov
            printf "landmark_del_missed\t%s\n",          missed
        }
    ' truth_genes.txt - >> germline_control_1.cnv.tsv


cat <<-END_VERSIONS > versions.yml
"DAQ:CONTROL_METRICS:GERMLINE_METRICS:GERMLINE_CNV_METRICS":
    bcftools: $(bcftools --version | head -n1 | awk '{print $2}')
END_VERSIONS