#!/bin/bash -Ceuo pipefail bcftools norm \ -m -both \ --check-ref w \ -f Homo_sapiens_assembly38.fasta \ negative_somatic_control_1.strelka.somatic_indels.vcf.gz \ | awk -F'\t' -v OFS='\t' -v normal="reference-NA12878" -v tumor="HCC1395_BL" ' /^##FORMAT=" print "##FORMAT=" print "##FORMAT=" print "##INFO=" } /^##/ { print; next } /^#CHROM/ { for (i=1; i<=NF; i++) { if ($i == "NORMAL") $i = normal else if ($i == "TUMOR") $i = tumor } print; next } function sgt_to_gt(s) { if (s == "ref") return "0/0" if (s == "het") return "0/1" if (s == "hom") return "1/1" return "./." } function derive_ad_af(sample, fmt_n, fmt_keys, sam_n, sam_vals, k, tar_str, tir_str, tar, tir, ad, af, denom) { fmt_n = split($9, fmt_keys, ":") sam_n = split(sample, sam_vals, ":") tar_str = ""; tir_str = "" for (k = 1; k <= fmt_n; k++) { if (fmt_keys[k] == "TAR") tar_str = sam_vals[k] if (fmt_keys[k] == "TIR") tir_str = sam_vals[k] } split(tar_str, tar, ",") split(tir_str, tir, ",") ad = (tar[1]+0) "," (tir[1]+0) denom = (tar[1]+0) + (tir[1]+0) af = (denom > 0) ? sprintf("%.4f", (tir[1]+0) / denom) : "0.0000" return ad ":" af } { ngt = "0/0"; tgt = "0/1" n = split($8, infos, ";") for (j = 1; j <= n; j++) { if (infos[j] ~ /^SGT=/) { split(infos[j], sgtval, "=") split(sgtval[2], gtpair, "->") ngt = sgt_to_gt(gtpair[1]) tgt = sgt_to_gt(gtpair[2]) } } n_ad_af = derive_ad_af($10) t_ad_af = derive_ad_af($11) $9 = "GT:AD:AF:" $9 $10 = ngt ":" n_ad_af ":" $10 $11 = tgt ":" t_ad_af ":" $11 $8 = ($8=="." ? "VARIANT_TYPE=INDEL" : $8 ";VARIANT_TYPE=INDEL") print } ' \ | bcftools view -Oz -o negative_somatic_control_1.snv_indel.strelka.indel.unsorted.vcf.gz # Enforce canonical sample order (tumor, normal) so concat never sees mismatched columns printf '%s\n%s\n' "HCC1395_BL" "reference-NA12878" > sample_order.txt bcftools view -S sample_order.txt negative_somatic_control_1.snv_indel.strelka.indel.unsorted.vcf.gz -Oz -o negative_somatic_control_1.snv_indel.strelka.indel.vcf.gz rm negative_somatic_control_1.snv_indel.strelka.indel.unsorted.vcf.gz tabix -p vcf negative_somatic_control_1.snv_indel.strelka.indel.vcf.gz cat <<-END_VERSIONS > versions.yml "DAQ:CONTROL_VARIANT_CALLING:CONTROL_SOMATIC_VC:VCF_SOMATIC_SNV_INDEL:PREP_STRELKA_INDEL": bcftools: $(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*$//') END_VERSIONS