.

dpuiu · Apr 18, 2023 · 0c9a8cd · 0c9a8cd
1 parent 3ba4277
commit 0c9a8cd
Show file tree

Hide file tree

Showing 10 changed files with 107 additions and 206 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,9 @@
+# 2023/04/18 #
+
+* updated snpSort.sh, concat2merge.pl : speed up vcf sort and merge
+* updated init.sh ; use HP_DP as minimum depth instead of 100
+* update README.md
+
 # 2023/03/03 #
 
 * updated downsampleSam.sh, downsampleSam.pl, filterSam.pl : alignments are sorted by the reads name before downsampling

diff --git a/README.md b/README.md
@@ -394,3 +394,22 @@
       bwa_mem.sh fastq/sample bams/sample
 
       ls bams/
+
+### How to update the SNV filtering criteria ###
+
+* By default, the SNV with DP<100 or labeled as "strict_strand|strand_bias|base_qual|map_qual|weak_evidence|slippage|position|HP" are filtered out
+
+* To change the criteria, update the following 2 variables in the "init.sh" file
+
+      export HP_FRULE="egrep -v 'strict_strand|strand_bias|base_qual|map_qual|weak_evidence|slippage|position|HP'"
+      export HP_DP=100
+
+* Samples with multiple problems (failing haplocheck, multiple NUMTs, multiple heteroplasmies belonging to a different haplogroup, average coverage < HP_DP ...) are saved in "*.suspicious.{id,tab}" files
+
+* By default, suspicious samples are not removed; The users should check these files and maually remove the samples
+
+      
+
+
+
+
diff --git a/VERSION.md b/VERSION.md
@@ -1 +1 @@
-20230303
+20230418
diff --git a/scripts/concat2merge.pl b/scripts/concat2merge.pl
@@ -44,7 +44,10 @@
 	close(IN);
 
 	#######################################################
-	my ($key,$sample,@keys,%keys,%GT_DP_AF);
+
+	my ($pkey,$sample,@keys,%keys,%GT_DP_AF,$header);
+	my ($AC,$AN)=(0,0);
+
 	while(<>)
 	{
 		if(/^##/)
@@ -60,86 +63,66 @@
 		}
 		else
 		{
+			#0	1	2	3	4	
+			#chrM	42	.	T	TC		
+
+			if(!$header) { print join "\t",("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",@samples); print "\n"; $header=1}
+
 			my @F=split;
-			$F[2]=".";
-			$F[5]=".";
-			$F[6]=".";
+			@F[2,5,6]=(".",".",".");
+
+			if($F[-1]=~/^(.+):(.+):1$/)        { $F[-1]="1:$2:1"    }
+			elsif($F[-1]=~/^(.+):(.+):(.+)$/)  { $F[-1]="0/1:$2:$3" }
+
 
-			my $sample;;
-			if($F[7]=~/SM=(.+?);(.+)/)
-			{
-				$sample=$1;
-				$F[7]=$2;
-			}
-			elsif($F[7]=~/SM=(.+)/)
-                        {
-                                $sample=$1;
-				$F[7]=".";
-			}
+			my $sample;
+			if($F[7]=~/SM=(.+?);(.+)/) { ($sample,$F[7])=($1,"$2;");}
+			elsif($F[7]=~/SM=(.+)/)    { ($sample,$F[7])=($1,"");}
 
-			defined($samples{$sample}) or die "ERROR3: $_";
+			defined($samples{$sample}) or die "ERROR: sample not defined in $_";
 			my $key=join "\t",@F[0..7];
+
+			if($pkey and $key ne $pkey)
+			{
+ 				my @P=($pkey."AC=$AC;AN=$AN","GT:DP:AF");
 
-			#april 28 2022
-			$F[-1]="1:$2:1" if($F[-1]=~/^(.+):(.+):(.+)$/ and $3==1);
+                		foreach my  $sample (@samples)
+		                {
+		                        if($GT_DP_AF{$sample}) { push @P,$GT_DP_AF{$sample} ; }
+                	       	 	else                   { push @P,".:.:." }
+	                	}
 
-			$GT_DP_AF{$key}{$sample}=$F[-1];
+	        	        print join "\t",@P;  print "\n";
 
-			if(!$keys{$key})
-			{
-				$keys{$key}=1;
-				push @keys,$key;
+				%GT_DP_AF=();
+				($AC,$AN)=(0,0);
+
 			}
+
+			$pkey=$key;
+
+			$GT_DP_AF{$sample}=$F[-1];			
+			$AC++;
+                        $AN++;
+                        $AN++ if($GT_DP_AF{$sample}=~/\|/ or $GT_DP_AF{$sample}=~/\//)
+
 		}
 	}
 
-	print join "\t",("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",@samples); print "\n";
-	foreach my $key (@keys)
-	{
-		my @F=();
-		my @SF;
-		push @F,($key,"GT:DP:AF");
-
-		my $AC=0;
-		my $AN=0;
-		foreach my $i (1..@samples)
-		{
-			my $sample=$samples[$i-1];
+	#end
+	if($pkey)
+        {
+		my @P=($pkey."AC=$AC;AN=$AN","GT:DP:AF");
 
-			if($GT_DP_AF{$key}{$sample})
-			{
-				push @F,$GT_DP_AF{$key}{$sample} ;
-				push @SF,$i;
-
-				my $GT_DP_AF=$GT_DP_AF{$key}{$sample};
-				$GT_DP_AF=~/(.+?):/;
-				my $GT=$1;
-				my @GT=split /[|\/]/,$GT;
-
-				#orig
-				#if($GT[0]==1 or @GT>1 and $GT[1]==1) { $AC++; $AN+=2; }
-				#else                                 { $AN+=2;        }
-
-				#new; April 28 2022
-				$AC++; 
-				$AN++;
-				if($F[-1]=~/\|/ or $F[-1]=~/\//) 
-				{
-					$AN++;
-				}
-				else
-				{
-					$AN+=0;
-				}
-			}
-			else { push @F,".:.:." }
-		}
+		foreach my  $sample (@samples)
+                {
+			if($GT_DP_AF{$sample}) { push @P,$GT_DP_AF{$sample} ; }
+                        else                   { push @P,".:.:." }
+                }
 
-		if($F[0]=~/(.+)\t\.$/)       { $F[0]="$1\tAC=$AC;AN=$AN"    }
-		elsif($F[0]=~/(.+)\t(\S+)$/) { $F[0]="$1\tAC=$AC;AN=$AN;$2" }
-		print join "\t",@F; 
-		print "\n";
+               print join "\t",@P;  print "\n";
 	}
+
 	exit 0;
 }
 
diff --git a/scripts/filter.sh b/scripts/filter.sh
@@ -117,7 +117,7 @@ fi
 
 #if [ ! -s $O.count ]  ; then samtools idxstats $O.bam  -@ $HP_P | idxstats2count.pl -sample $S -chrM $HP_MT > $O.count ; fi
 if [ ! -s $O.cvg ]    ; then cat $O.bam  | bedtools bamtobed -cigar | grep "^$HP_MT" | bedtools genomecov -i - -g $HP_RDIR/$HP_MT.fa.fai  -d | tee $O.cvg  | cut -f3 | st.pl | perl -ane 'if($.==1) { print "Run\t$_" } else { print "$ENV{S}\t$_" }'  > $O.cvg.stat  ; fi
-if [ ! -s $OR.cvg ]   ; then cat $OR.bam | bedtools bamtobed -cigar | grep "^$HP_MT" | bedtools genomecov -i - -g $HP_RDIR/$HP_MTR.fa.fai -d | tee $OR.cvg | cut -f3 | st.pl | perl -ane 'if($.==1) { print "Run\t$_" } else { print "$ENV{S}\t$_" }'  > $OR.cvg.stat ; fi
+#if [ ! -s $OR.cvg ]   ; then cat $OR.bam | bedtools bamtobed -cigar | grep "^$HP_MT" | bedtools genomecov -i - -g $HP_RDIR/$HP_MTR.fa.fai -d | tee $OR.cvg | cut -f3 | st.pl | perl -ane 'if($.==1) { print "Run\t$_" } else { print "$ENV{S}\t$_" }'  > $OR.cvg.stat ; fi
 if [ ! -f $O.sa.bed ] ; then samtools view -h $O.bam  -@ $HP_P | sam2bedSA.pl | uniq.pl -i 3 | sort -k2,2n -k3,3n > $O.sa.bed ; fi
 
 #########################################################################################################################################
@@ -230,7 +230,7 @@ if  [ ! -s $OS.bam ] ; then
   rm -f $OS.*sam $OS.score $O.fq $ON.score
 fi
 
-rm -f $O.bam*
+rm -f $O.bam* $OR.bam*
 rm -f $OS.*idx $OS.*tsv $OS.*stats
 
 # exit if the number of iterations is set to 1
@@ -246,7 +246,7 @@ fi
 
 #if [ ! -s $OS.count ]  ; then samtools idxstats $OS.bam  -@ $HP_P | idxstats2count.pl -sample $S -chrM $S > $OS.count ; fi
 if [ ! -s $OS.cvg ]    ; then cat $OS.bam  | bedtools bamtobed -cigar | grep "^$S" | bedtools genomecov -i - -g $OS.fa.fai -d  | tee $OS.cvg  | cut -f3 | st.pl | perl -ane 'if($.==1) { print "Run\t$_" } else { print "$ENV{S}\t$_" }'  > $OS.cvg.stat  ; fi
-if [ ! -s $OSR.cvg ]   ; then cat $OSR.bam | bedtools bamtobed -cigar | grep "^$S" | bedtools genomecov -i - -g $OSR.fa.fai -d | tee $OSR.cvg | cut -f3 | st.pl | perl -ane 'if($.==1) { print "Run\t$_" } else { print "$ENV{S}\t$_" }'  > $OSR.cvg.stat ; fi
+#if [ ! -s $OSR.cvg ]   ; then cat $OSR.bam | bedtools bamtobed -cigar | grep "^$S" | bedtools genomecov -i - -g $OSR.fa.fai -d | tee $OSR.cvg | cut -f3 | st.pl | perl -ane 'if($.==1) { print "Run\t$_" } else { print "$ENV{S}\t$_" }'  > $OSR.cvg.stat ; fi
 if [ ! -f $OS.sa.bed ] ; then samtools view -h $OS.bam | sam2bedSA.pl | uniq.pl -i 3 | sort -k2,2n -k3,3n > $OS.sa.bed ; fi
 
 #########################################################################################################################################

diff --git a/scripts/getSummary.sh b/scripts/getSummary.sh
@@ -26,13 +26,13 @@ V=$HP_V
 
 #count,cvg
 awk '{print $3}' $HP_IN | sed "s|$|.cvg.stat|"  | xargs cat | uniq.pl -i 0  > $ODIR/cvg.tab
-awk '{print $3}' $HP_IN | sed "s|$|.$S.00.vcf|" | xargs cat | uniq.pl | bedtools sort -header | eval $HP_FRULE   > $ODIR/$S.00.concat.vcf
 
-#Sept 1 ; ARIC CDG only
-#reAnnotateVcf.sh  $ODIR/$S.00.concat.vcf  $ODIR/$S.00.concat.vcf2 ; mv  $ODIR/$S.00.concat.vcf2  $ODIR/$S.00.concat.vcf
+#March 7th 2023
+awk '{print $3}' $HP_IN | sed "s|$|.$S.00.vcf|" | xargs cat | grep -v "^##sample="  |   uniq.pl | bedtools sort -header | eval $HP_FRULE   > $ODIR/$S.00.concat.vcf
 
 snpSort.sh $ODIR/$S.00.concat
 cat $ODIR/$S.00.concat.vcf | grep -v "^#" | sed 's|:|\t|g'  | count.pl -i -1 -round 100| sort -n > $ODIR/$S.00.AF.histo
+
 if [ $V ] ; then awk '{print $3}' $HP_IN | sed "s|$|.$V.00.vcf|" | xargs cat | uniq.pl | bedtools sort -header  | eval $HP_FRULE > $ODIR/$V.00.concat.vcf ; fi
 
 #haplogroups
@@ -69,12 +69,7 @@ if [ $S == "mutserve" ] ; then exit 0 ; fi
 
 SS=$S.$S
 awk '{print $3}' $HP_IN | sed "s|$|.$S.cvg.stat|" | xargs cat | uniq.pl -i 0  > $ODIR/$S.cvg.tab
-awk '{print $3}' $HP_IN | sed "s|$|.$SS.00.vcf|"  | xargs cat | uniq.pl | bedtools sort -header  > $ODIR/$SS.00.concat.vcf  
-
-#Sept 1 2022 ; ARIC CDG only; March 1 2023 edited/commented the following 3 lines (were meant for an older project)
-#awk '{print $3}' $HP_IN | sed "s|$|.mutect2.max.vcf|" | xargs cat >  $ODIR/$S.max.vcf
-#intersectVcf.pl $ODIR/$S.00.concat.vcf $ODIR/$S.max.vcf -sm | cat $ODIR/$SS.00.concat.vcf  - | bedtools sort -header  > $ODIR/$SS.00.concat.vcf2; mv $ODIR/$SS.00.concat.vcf2 $ODIR/$SS.00.concat.vcf
-#reAnnotateVcf.sh  $ODIR/$SS.00.concat.vcf  $ODIR/$SS.00.concat.vcf2 ; mv  $ODIR/$SS.00.concat.vcf2  $ODIR/$SS.00.concat.vcf
+awk '{print $3}' $HP_IN | sed "s|$|.$SS.00.vcf|"  | xargs cat |  grep -v "^##sample=" |   grep -v "^##bcftools_annotateCommand" | uniq.pl | bedtools sort -header  | eval $HP_FRULE  > $ODIR/$SS.00.concat.vcf  
 
 snpSort.sh $ODIR/$SS.00.concat
 cat $ODIR/$SS.00.concat.vcf | grep -v "^#" | sed 's|:|\t|g'  | count.pl -i -1 -round 100| sort -n > $ODIR/$SS.00.AF.histo

diff --git a/scripts/init.sh b/scripts/init.sh
@@ -33,8 +33,8 @@ export PATH=$HP_SDIR:$HP_BDIR:$PATH
 #hs38DH(default)
 export HP_RNAME=hs38DH
 export HP_RMT=chrM
-export HP_RNUMT="chr1:629084-634672 chr17:22521208-22521639"										                          # 150bp reads
-#export HP_RNUMT="chr1:629080-634925 chr2:148881723-148881858 chr5:80651184-80651597 chr11:10508892-10509738 chr13:109424123-109424381 chr17:22521208-22521639"   # 100bp reads
+export HP_RNUMT="chr1:629084-634672 chr17:22521208-22521639"										                          # 150bp reads chr2:32916199-32916630
+#export HP_RNUMT="chr1:629080-634925 chr1:76971223-76971280  chr2:148881723-148881858 chr5:80651184-80651597 chr11:10508892-10509738 chr13:109424123-109424381 chr17:22521208-22521639"   # 100bp reads
 export HP_RCOUNT=3366																		  # 195(hs38DH-no_alt); 194 (hs38DH-no_alt_EBV)
 export HP_RURL=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa
 
@@ -52,6 +52,9 @@ export HP_RURL=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh
 #export HP_RCOUNT=24
 #export HP_RURL=https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chm13.draft_v1.1.fasta.gz
 
+#CHM13.v2; to compute NUMT regions
+#export HP_RURL=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna.gz
+
 #Mouse
 #export HP_RNAME=mm39
 #export HP_RMT=chrM
@@ -105,7 +108,7 @@ export HP_T3=10
 export HP_DP=100
 export HP_V=                     # SV caller: gridssexport HP_DP=100                 # minimum coverage: Ex 100
 
-export HP_FRULE="perl -ane 'print unless(/strict_strand|strand_bias|base_qual|map_qual|weak_evidence|slippage|position|Homopolymer/ and /:0\.[01234]\d+$/);' |  bcftools filter -e 'DP<100'"   # filter rule (or just "tee")
+export HP_FRULE="perl -ane 'print unless(/strict_strand|strand_bias|base_qual|map_qual|weak_evidence|slippage|position|Homopolymer/ and /:0\.[01234]\d+$/);' |  bcftools filter -e 'DP<$HP_DP'"   # filter rule (or just "tee")
 
 export HP_P=1				       		            # number of processors
 export HP_MM="3G"                                                   # maximum memory

diff --git a/scripts/sam2bedDA.pl b/scripts/sam2bedDA.pl