File Info

Filename
.command.sh
Full Path
s3://natera-rnd-pltf-dev-nextflow-scratch-01/work/7e/d2f9b6797433ca58b2acb77d8c3098/.command.sh
Size
3.1 KB
Attempt
#!/bin/bash -Ceuo pipefail
bcftools norm \
    -m -both \
    --check-ref w \
    -f Homo_sapiens_assembly38.fasta \
    positive_somatic_control_2.strelka.somatic_indels.vcf.gz \
| awk -F'\t' -v OFS='\t' -v normal="reference-NA12878" -v tumor="Sig_18_tissue" '
    /^##FORMAT=<ID=DP,/ {
        print "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
        print "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths (ref, alt) derived from TAR_tier1/TIR_tier1\">"
        print "##FORMAT=<ID=AF,Number=A,Type=Float,Description=\"Allele fraction: TIR_tier1 / (TAR_tier1 + TIR_tier1)\">"
        print "##INFO=<ID=VARIANT_TYPE,Number=1,Type=String,Description=\"Variant type: INDEL\">"
    }
    /^##/ { print; next }
    /^#CHROM/ {
        for (i=1; i<=NF; i++) {
            if ($i == "NORMAL") $i = normal
            else if ($i == "TUMOR") $i = tumor
        }
        print; next
    }
    function sgt_to_gt(s) {
        if (s == "ref") return "0/0"
        if (s == "het") return "0/1"
        if (s == "hom") return "1/1"
        return "./."
    }
    function derive_ad_af(sample,    fmt_n, fmt_keys, sam_n, sam_vals, k, tar_str, tir_str, tar, tir, ad, af, denom) {
        fmt_n = split($9, fmt_keys, ":")
        sam_n = split(sample, sam_vals, ":")
        tar_str = ""; tir_str = ""
        for (k = 1; k <= fmt_n; k++) {
            if (fmt_keys[k] == "TAR") tar_str = sam_vals[k]
            if (fmt_keys[k] == "TIR") tir_str = sam_vals[k]
        }
        split(tar_str, tar, ",")
        split(tir_str, tir, ",")
        ad = (tar[1]+0) "," (tir[1]+0)
        denom = (tar[1]+0) + (tir[1]+0)
        af = (denom > 0) ? sprintf("%.4f", (tir[1]+0) / denom) : "0.0000"
        return ad ":" af
    }
    {
        ngt = "0/0"; tgt = "0/1"
        n = split($8, infos, ";")
        for (j = 1; j <= n; j++) {
            if (infos[j] ~ /^SGT=/) {
                split(infos[j], sgtval, "=")
                split(sgtval[2], gtpair, "->")
                ngt = sgt_to_gt(gtpair[1])
                tgt = sgt_to_gt(gtpair[2])
            }
        }
        n_ad_af = derive_ad_af($10)
        t_ad_af = derive_ad_af($11)
        $9  = "GT:AD:AF:" $9
        $10 = ngt ":" n_ad_af ":" $10
        $11 = tgt ":" t_ad_af ":" $11
        $8 = ($8=="." ? "VARIANT_TYPE=INDEL" : $8 ";VARIANT_TYPE=INDEL")
        print
    }
' \
| bcftools view -Oz -o positive_somatic_control_2.snv_indel.strelka.indel.unsorted.vcf.gz

# Enforce canonical sample order (tumor, normal) so concat never sees mismatched columns
printf '%s\n%s\n' "Sig_18_tissue" "reference-NA12878" > sample_order.txt
bcftools view -S sample_order.txt positive_somatic_control_2.snv_indel.strelka.indel.unsorted.vcf.gz -Oz -o positive_somatic_control_2.snv_indel.strelka.indel.vcf.gz
rm positive_somatic_control_2.snv_indel.strelka.indel.unsorted.vcf.gz

tabix -p vcf positive_somatic_control_2.snv_indel.strelka.indel.vcf.gz

cat <<-END_VERSIONS > versions.yml
"DAQ:CONTROL_VARIANT_CALLING:CONTROL_SOMATIC_VC:VCF_SOMATIC_SNV_INDEL:PREP_STRELKA_INDEL":
    bcftools: $(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*$//')
END_VERSIONS