#!/bin/bash -Ceuo pipefail
#######################################
# Annotate equal-length (SNV/MNV) VCF records read from stdin.
#   - Emits the VARIANT_TYPE INFO header line once: before the first ##INFO
#     line when one exists, otherwise directly before the #CHROM line, so the
#     tag is always declared.
#   - Appends VARIANT_TYPE=SNV (REF and ALT both 1 bp) or VARIANT_TYPE=MNV to
#     the INFO column of every record.
#   - Collapses genotypes with more than two alleles to a diploid call, but
#     only when FORMAT actually starts with GT.
# Arguments: none
# Outputs:   the rewritten VCF text on stdout
# Returns:   the exit status of awk
#######################################
annotate_snv_mnv() {
  awk -F'\t' -v OFS='\t' '
    # Collapse a GT with more than two alleles to a diploid call and leave the
    # remaining colon-separated subfields untouched.
    #   sample:  one sample column whose first subfield is GT
    #   returns: the column unchanged when GT has at most two alleles;
    #            otherwise GT becomes 0<sep>1 if any allele is ALT,
    #            0<sep>0 if only REF alleles are present, else .<sep>.
    # The separator is "|" when GT contains one, "/" otherwise.
    # Names after the wide gap in the parameter list are locals.
    function fix_gt(sample,    nf, fields, sep, m, alleles, k, has_alt, has_ref, result) {
      nf = split(sample, fields, ":")
      sep = (index(fields[1], "|") > 0) ? "|" : "/"
      m = split(fields[1], alleles, "[/|]")
      if (m <= 2) return sample

      has_alt = 0; has_ref = 0
      for (k = 1; k <= m; k++) {
        if (alleles[k] != "." && alleles[k] + 0 > 0) has_alt++
        else if (alleles[k] == "0") has_ref++
      }

      if (has_alt > 0) fields[1] = "0" sep "1"
      else if (has_ref > 0) fields[1] = "0" sep "0"
      else fields[1] = "." sep "."

      result = fields[1]
      for (k = 2; k <= nf; k++) result = result ":" fields[k]
      return result
    }

    # Declare the new INFO tag exactly once. The #CHROM fallback covers
    # headers that carry no ##INFO line at all.
    (/^##INFO/ || /^#CHROM/) && !header_done {
      print "##INFO=<ID=VARIANT_TYPE,Number=1,Type=String,Description=\"Variant type: SNV or MNV\">"
      header_done = 1
    }

    # Header lines pass through unchanged.
    /^#/ { print; next }

    # Data records: tag the variant type, then normalise genotypes.
    {
      vt = (length($4) == 1 && length($5) == 1) ? "SNV" : "MNV"
      $8 = ($8 == "." ? "VARIANT_TYPE=" vt : $8 ";VARIANT_TYPE=" vt)
      # Only touch sample columns when their first subfield really is GT.
      if ($9 == "GT" || substr($9, 1, 3) == "GT:")
        for (s = 10; s <= NF; s++) $s = fix_gt($s)
      print
    }
  '
}

# Split multiallelic sites into biallelic records (the pre-split record is kept
# in the OLD_RECORD tag), keep only records whose REF and ALT have the same
# length, annotate them, and write a bgzipped VCF whose sample columns are
# still in the caller's order.
# NOTE(review): the length-equality filter would presumably also admit a
# spanning-deletion ALT "*" against a 1-bp REF and tag it as SNV; confirm
# whether the upstream caller can emit such records.
bcftools norm \
  -m -both \
  --old-rec-tag OLD_RECORD \
  --check-ref w \
  -f Homo_sapiens_assembly38.fasta \
  negative_somatic_control_2.tnseq.filtered.vcf.gz \
  | bcftools view -i 'STRLEN(REF)==STRLEN(ALT)' \
  | annotate_snv_mnv \
  | bcftools view -Oz -o negative_somatic_control_2.snv_indel.tnseq.snv.unsorted.vcf.gz
# Rewrite the sample columns in the canonical (tumor, normal) order so that a
# later concat never encounters mismatched sample columns.
printf '%s\n' 'Sig_18_Blood' 'reference-NA12878' > sample_order.txt
bcftools view \
  -S sample_order.txt \
  -Oz \
  -o negative_somatic_control_2.snv_indel.tnseq.snv.vcf.gz \
  negative_somatic_control_2.snv_indel.tnseq.snv.unsorted.vcf.gz
# The intermediate file with the caller's column order is no longer needed.
rm negative_somatic_control_2.snv_indel.tnseq.snv.unsorted.vcf.gz
# Index the final VCF.
tabix -p vcf negative_somatic_control_2.snv_indel.tnseq.snv.vcf.gz
# Record the bcftools version used by this task in versions.yml.
# The tool entry is indented with spaces so that it nests under the process
# name as a YAML mapping; "<<-" strips only leading tabs, so the spaces survive.
# The version is the first word after "bcftools " on the first line of
# `bcftools --version`.
cat <<-END_VERSIONS > versions.yml
"DAQ:CONTROL_VARIANT_CALLING:CONTROL_SOMATIC_VC:VCF_SOMATIC_SNV_INDEL:PREP_TNSEQ_SNV":
    bcftools: $(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*$//')
END_VERSIONS