#!/bin/bash -Ceuo pipefail
bcftools norm \
-m -both \
--check-ref w \
-f Homo_sapiens_assembly38.fasta \
negative_somatic_control_1.strelka.somatic_indels.vcf.gz \
| awk -F'\t' -v OFS='\t' -v normal="reference-NA12878" -v tumor="HCC1395_BL" '
/^##FORMAT=<ID=DP,/ {
print "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
print "##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths (ref, alt) derived from TAR_tier1/TIR_tier1\">"
print "##FORMAT=<ID=AF,Number=A,Type=Float,Description=\"Allele fraction: TIR_tier1 / (TAR_tier1 + TIR_tier1)\">"
print "##INFO=<ID=VARIANT_TYPE,Number=1,Type=String,Description=\"Variant type: INDEL\">"
}
/^##/ { print; next }
/^#CHROM/ {
for (i=1; i<=NF; i++) {
if ($i == "NORMAL") $i = normal
else if ($i == "TUMOR") $i = tumor
}
print; next
}
function sgt_to_gt(s) {
if (s == "ref") return "0/0"
if (s == "het") return "0/1"
if (s == "hom") return "1/1"
return "./."
}
function derive_ad_af(sample, fmt_n, fmt_keys, sam_n, sam_vals, k, tar_str, tir_str, tar, tir, ad, af, denom) {
fmt_n = split($9, fmt_keys, ":")
sam_n = split(sample, sam_vals, ":")
tar_str = ""; tir_str = ""
for (k = 1; k <= fmt_n; k++) {
if (fmt_keys[k] == "TAR") tar_str = sam_vals[k]
if (fmt_keys[k] == "TIR") tir_str = sam_vals[k]
}
split(tar_str, tar, ",")
split(tir_str, tir, ",")
ad = (tar[1]+0) "," (tir[1]+0)
denom = (tar[1]+0) + (tir[1]+0)
af = (denom > 0) ? sprintf("%.4f", (tir[1]+0) / denom) : "0.0000"
return ad ":" af
}
{
ngt = "0/0"; tgt = "0/1"
n = split($8, infos, ";")
for (j = 1; j <= n; j++) {
if (infos[j] ~ /^SGT=/) {
split(infos[j], sgtval, "=")
split(sgtval[2], gtpair, "->")
ngt = sgt_to_gt(gtpair[1])
tgt = sgt_to_gt(gtpair[2])
}
}
n_ad_af = derive_ad_af($10)
t_ad_af = derive_ad_af($11)
$9 = "GT:AD:AF:" $9
$10 = ngt ":" n_ad_af ":" $10
$11 = tgt ":" t_ad_af ":" $11
$8 = ($8=="." ? "VARIANT_TYPE=INDEL" : $8 ";VARIANT_TYPE=INDEL")
print
}
' \
| bcftools view -Oz -o negative_somatic_control_1.snv_indel.strelka.indel.unsorted.vcf.gz
# Enforce canonical sample order (tumor, normal) so concat never sees mismatched columns
printf '%s\n%s\n' "HCC1395_BL" "reference-NA12878" > sample_order.txt
bcftools view -S sample_order.txt negative_somatic_control_1.snv_indel.strelka.indel.unsorted.vcf.gz -Oz -o negative_somatic_control_1.snv_indel.strelka.indel.vcf.gz
rm negative_somatic_control_1.snv_indel.strelka.indel.unsorted.vcf.gz
tabix -p vcf negative_somatic_control_1.snv_indel.strelka.indel.vcf.gz
cat <<-END_VERSIONS > versions.yml
"DAQ:CONTROL_VARIANT_CALLING:CONTROL_SOMATIC_VC:VCF_SOMATIC_SNV_INDEL:PREP_STRELKA_INDEL":
bcftools: $(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*$//')
END_VERSIONS