#!/bin/bash
#
# Prepare TNseq somatic calls for the SNV/MNV branch:
#   1. split multiallelic records and check REF against the reference,
#   2. keep only records whose REF and ALT have the same length,
#   3. tag every record with INFO/VARIANT_TYPE (SNV or MNV) and collapse
#      genotypes that list more than two alleles,
#   4. write the samples in a fixed order, index the result and record the
#      bcftools version.
#
# Options are set here rather than on the shebang line so they also apply when
# the script is started as `bash script.sh`.
#   -C  noclobber: `>` refuses to overwrite an existing file
#   -e  exit on the first failing command
#   -u  treat unset variables as errors
#   -o pipefail  a pipeline fails when any of its stages fails
set -Ceuo pipefail

readonly reference_fasta='Homo_sapiens_assembly38.fasta'
readonly input_vcf='negative_somatic_control_2.tnseq.filtered.vcf.gz'
readonly unsorted_vcf='negative_somatic_control_2.snv_indel.tnseq.snv.unsorted.vcf.gz'
readonly final_vcf='negative_somatic_control_2.snv_indel.tnseq.snv.vcf.gz'
readonly sample_order_file='sample_order.txt'

bcftools norm \
  -m -both \
  --old-rec-tag OLD_RECORD \
  --check-ref w \
  -f "${reference_fasta}" \
  "${input_vcf}" \
  | bcftools view -i 'STRLEN(REF)==STRLEN(ALT)' \
  | awk -F'\t' -v OFS='\t' '
      # Collapse a genotype that lists more than two alleles into a
      # two-allele call, keeping the original separator and all remaining
      # FORMAT sub-fields:
      #   any ALT allele present  -> 0<sep>1
      #   only REF alleles called -> 0<sep>0
      #   nothing called          -> .<sep>.
      # Genotypes with one or two alleles are returned untouched.
      # Names after the wide gap in the parameter list are awk locals.
      function fix_gt(sample,    nf, fields, gt, sep, m, alleles, k, has_alt, has_ref, result) {
        nf = split(sample, fields, ":")
        gt = fields[1]
        sep = (index(gt, "|") > 0) ? "|" : "/"
        m = split(gt, alleles, "[/|]")
        if (m <= 2) return sample

        has_alt = 0
        has_ref = 0
        for (k = 1; k <= m; k++) {
          if (alleles[k] != "." && alleles[k] + 0 > 0) has_alt++
          else if (alleles[k] == "0") has_ref++
        }

        if (has_alt > 0) fields[1] = "0" sep "1"
        else if (has_ref > 0) fields[1] = "0" sep "0"
        else fields[1] = "." sep "."

        result = fields[1]
        for (k = 2; k <= nf; k++) result = result ":" fields[k]
        return result
      }

      # Declare VARIANT_TYPE exactly once: ahead of the first INFO header
      # line, or ahead of #CHROM when the header declares no INFO field.
      (/^##INFO/ || /^#CHROM/) && !h {
        print "##INFO=<ID=VARIANT_TYPE,Number=1,Type=String,Description=\"Variant type: SNV (single base) or MNV (multiple bases)\">"
        h = 1
      }

      # Header lines pass through unchanged.
      /^#/ { print; next }

      # Data lines: REF is column 4, ALT column 5, INFO column 8 and the
      # per-sample columns start at 10.
      {
        if (length($4) == 1 && length($5) == 1) vt = "SNV"
        else vt = "MNV"
        $8 = ($8 == "." ? "VARIANT_TYPE=" vt : $8 ";VARIANT_TYPE=" vt)
        for (s = 10; s <= NF; s++) $s = fix_gt($s)
        print
      }
    ' \
  | bcftools view -Oz -o "${unsorted_vcf}"

# Enforce canonical sample order (tumor, normal) so concat never sees mismatched columns
printf '%s\n%s\n' "Sig_18_Blood" "reference-NA12878" > "${sample_order_file}"
bcftools view -S "${sample_order_file}" -Oz -o "${final_vcf}" "${unsorted_vcf}"
rm -- "${unsorted_vcf}"

tabix -p vcf "${final_vcf}"

cat <<-END_VERSIONS > versions.yml
"DAQ:CONTROL_VARIANT_CALLING:CONTROL_SOMATIC_VC:VCF_SOMATIC_SNV_INDEL:PREP_TNSEQ_SNV":
    bcftools: $(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*$//')
END_VERSIONS