Skip to content

Commit

Permalink
.
Browse files Browse the repository at this point in the history
  • Loading branch information
dpuiu committed Apr 18, 2023
1 parent 3ba4277 commit 0c9a8cd
Show file tree
Hide file tree
Showing 10 changed files with 107 additions and 206 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# 2023/04/18 #

* updated snpSort.sh, concat2merge.pl : speed up vcf sort and merge
* updated init.sh ; use HP_DP as minimum depth instead of 100
* update README.md

# 2023/03/03 #

* updated downsampleSam.sh, downsampleSam.pl, filterSam.pl : alignments are sorted by the reads name before downsampling
Expand Down
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -394,3 +394,22 @@
bwa_mem.sh fastq/sample bams/sample

ls bams/

### How to update the SNV filtering criteria ###

* By default, the SNV with DP<100 or labeled as "strict_strand|strand_bias|base_qual|map_qual|weak_evidence|slippage|position|HP" are filtered out

* To change the criteria, update the following 2 variables in the "init.sh" file

export HP_FRULE="egrep -v 'strict_strand|strand_bias|base_qual|map_qual|weak_evidence|slippage|position|HP'"
export HP_DP=100

* Samples with multiple problems (failing haplocheck, multiple NUMTs, multiple heteroplasmies belonging to a different haplogroup, average coverage < HP_DP ...) are saved in "*.suspicious.{id,tab}" files

* By default, suspicious samples are not removed; The users should check these files and maually remove the samples





2 changes: 1 addition & 1 deletion VERSION.md
Original file line number Diff line number Diff line change
@@ -1 +1 @@
20230303
20230418
117 changes: 50 additions & 67 deletions scripts/concat2merge.pl
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,10 @@
close(IN);

#######################################################
my ($key,$sample,@keys,%keys,%GT_DP_AF);

my ($pkey,$sample,@keys,%keys,%GT_DP_AF,$header);
my ($AC,$AN)=(0,0);

while(<>)
{
if(/^##/)
Expand All @@ -60,86 +63,66 @@
}
else
{
#0 1 2 3 4
#chrM 42 . T TC

if(!$header) { print join "\t",("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",@samples); print "\n"; $header=1}

my @F=split;
$F[2]=".";
$F[5]=".";
$F[6]=".";
@F[2,5,6]=(".",".",".");

if($F[-1]=~/^(.+):(.+):1$/) { $F[-1]="1:$2:1" }
elsif($F[-1]=~/^(.+):(.+):(.+)$/) { $F[-1]="0/1:$2:$3" }


my $sample;;
if($F[7]=~/SM=(.+?);(.+)/)
{
$sample=$1;
$F[7]=$2;
}
elsif($F[7]=~/SM=(.+)/)
{
$sample=$1;
$F[7]=".";
}
my $sample;
if($F[7]=~/SM=(.+?);(.+)/) { ($sample,$F[7])=($1,"$2;");}
elsif($F[7]=~/SM=(.+)/) { ($sample,$F[7])=($1,"");}

defined($samples{$sample}) or die "ERROR3: $_";
defined($samples{$sample}) or die "ERROR: sample not defined in $_";
my $key=join "\t",@F[0..7];

if($pkey and $key ne $pkey)
{
my @P=($pkey."AC=$AC;AN=$AN","GT:DP:AF");

#april 28 2022
$F[-1]="1:$2:1" if($F[-1]=~/^(.+):(.+):(.+)$/ and $3==1);
foreach my $sample (@samples)
{
if($GT_DP_AF{$sample}) { push @P,$GT_DP_AF{$sample} ; }
else { push @P,".:.:." }
}

$GT_DP_AF{$key}{$sample}=$F[-1];
print join "\t",@P; print "\n";

if(!$keys{$key})
{
$keys{$key}=1;
push @keys,$key;
%GT_DP_AF=();
($AC,$AN)=(0,0);

}

$pkey=$key;

$GT_DP_AF{$sample}=$F[-1];
$AC++;
$AN++;
$AN++ if($GT_DP_AF{$sample}=~/\|/ or $GT_DP_AF{$sample}=~/\//)

}
}

print join "\t",("#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT",@samples); print "\n";
foreach my $key (@keys)
{
my @F=();
my @SF;
push @F,($key,"GT:DP:AF");

my $AC=0;
my $AN=0;
foreach my $i (1..@samples)
{
my $sample=$samples[$i-1];
#end
if($pkey)
{
my @P=($pkey."AC=$AC;AN=$AN","GT:DP:AF");

if($GT_DP_AF{$key}{$sample})
{
push @F,$GT_DP_AF{$key}{$sample} ;
push @SF,$i;

my $GT_DP_AF=$GT_DP_AF{$key}{$sample};
$GT_DP_AF=~/(.+?):/;
my $GT=$1;
my @GT=split /[|\/]/,$GT;

#orig
#if($GT[0]==1 or @GT>1 and $GT[1]==1) { $AC++; $AN+=2; }
#else { $AN+=2; }

#new; April 28 2022
$AC++;
$AN++;
if($F[-1]=~/\|/ or $F[-1]=~/\//)
{
$AN++;
}
else
{
$AN+=0;
}
}
else { push @F,".:.:." }
}
foreach my $sample (@samples)
{
if($GT_DP_AF{$sample}) { push @P,$GT_DP_AF{$sample} ; }
else { push @P,".:.:." }
}

if($F[0]=~/(.+)\t\.$/) { $F[0]="$1\tAC=$AC;AN=$AN" }
elsif($F[0]=~/(.+)\t(\S+)$/) { $F[0]="$1\tAC=$AC;AN=$AN;$2" }
print join "\t",@F;
print "\n";
print join "\t",@P; print "\n";
}

exit 0;
}

6 changes: 3 additions & 3 deletions scripts/filter.sh
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ fi

#if [ ! -s $O.count ] ; then samtools idxstats $O.bam -@ $HP_P | idxstats2count.pl -sample $S -chrM $HP_MT > $O.count ; fi
if [ ! -s $O.cvg ] ; then cat $O.bam | bedtools bamtobed -cigar | grep "^$HP_MT" | bedtools genomecov -i - -g $HP_RDIR/$HP_MT.fa.fai -d | tee $O.cvg | cut -f3 | st.pl | perl -ane 'if($.==1) { print "Run\t$_" } else { print "$ENV{S}\t$_" }' > $O.cvg.stat ; fi
if [ ! -s $OR.cvg ] ; then cat $OR.bam | bedtools bamtobed -cigar | grep "^$HP_MT" | bedtools genomecov -i - -g $HP_RDIR/$HP_MTR.fa.fai -d | tee $OR.cvg | cut -f3 | st.pl | perl -ane 'if($.==1) { print "Run\t$_" } else { print "$ENV{S}\t$_" }' > $OR.cvg.stat ; fi
#if [ ! -s $OR.cvg ] ; then cat $OR.bam | bedtools bamtobed -cigar | grep "^$HP_MT" | bedtools genomecov -i - -g $HP_RDIR/$HP_MTR.fa.fai -d | tee $OR.cvg | cut -f3 | st.pl | perl -ane 'if($.==1) { print "Run\t$_" } else { print "$ENV{S}\t$_" }' > $OR.cvg.stat ; fi
if [ ! -f $O.sa.bed ] ; then samtools view -h $O.bam -@ $HP_P | sam2bedSA.pl | uniq.pl -i 3 | sort -k2,2n -k3,3n > $O.sa.bed ; fi

#########################################################################################################################################
Expand Down Expand Up @@ -230,7 +230,7 @@ if [ ! -s $OS.bam ] ; then
rm -f $OS.*sam $OS.score $O.fq $ON.score
fi

rm -f $O.bam*
rm -f $O.bam* $OR.bam*
rm -f $OS.*idx $OS.*tsv $OS.*stats

# exit if the number of iterations is set to 1
Expand All @@ -246,7 +246,7 @@ fi

#if [ ! -s $OS.count ] ; then samtools idxstats $OS.bam -@ $HP_P | idxstats2count.pl -sample $S -chrM $S > $OS.count ; fi
if [ ! -s $OS.cvg ] ; then cat $OS.bam | bedtools bamtobed -cigar | grep "^$S" | bedtools genomecov -i - -g $OS.fa.fai -d | tee $OS.cvg | cut -f3 | st.pl | perl -ane 'if($.==1) { print "Run\t$_" } else { print "$ENV{S}\t$_" }' > $OS.cvg.stat ; fi
if [ ! -s $OSR.cvg ] ; then cat $OSR.bam | bedtools bamtobed -cigar | grep "^$S" | bedtools genomecov -i - -g $OSR.fa.fai -d | tee $OSR.cvg | cut -f3 | st.pl | perl -ane 'if($.==1) { print "Run\t$_" } else { print "$ENV{S}\t$_" }' > $OSR.cvg.stat ; fi
#if [ ! -s $OSR.cvg ] ; then cat $OSR.bam | bedtools bamtobed -cigar | grep "^$S" | bedtools genomecov -i - -g $OSR.fa.fai -d | tee $OSR.cvg | cut -f3 | st.pl | perl -ane 'if($.==1) { print "Run\t$_" } else { print "$ENV{S}\t$_" }' > $OSR.cvg.stat ; fi
if [ ! -f $OS.sa.bed ] ; then samtools view -h $OS.bam | sam2bedSA.pl | uniq.pl -i 3 | sort -k2,2n -k3,3n > $OS.sa.bed ; fi

#########################################################################################################################################
Expand Down
13 changes: 4 additions & 9 deletions scripts/getSummary.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ V=$HP_V

#count,cvg
awk '{print $3}' $HP_IN | sed "s|$|.cvg.stat|" | xargs cat | uniq.pl -i 0 > $ODIR/cvg.tab
awk '{print $3}' $HP_IN | sed "s|$|.$S.00.vcf|" | xargs cat | uniq.pl | bedtools sort -header | eval $HP_FRULE > $ODIR/$S.00.concat.vcf

#Sept 1 ; ARIC CDG only
#reAnnotateVcf.sh $ODIR/$S.00.concat.vcf $ODIR/$S.00.concat.vcf2 ; mv $ODIR/$S.00.concat.vcf2 $ODIR/$S.00.concat.vcf
#March 7th 2023
awk '{print $3}' $HP_IN | sed "s|$|.$S.00.vcf|" | xargs cat | grep -v "^##sample=" | uniq.pl | bedtools sort -header | eval $HP_FRULE > $ODIR/$S.00.concat.vcf

snpSort.sh $ODIR/$S.00.concat
cat $ODIR/$S.00.concat.vcf | grep -v "^#" | sed 's|:|\t|g' | count.pl -i -1 -round 100| sort -n > $ODIR/$S.00.AF.histo

if [ $V ] ; then awk '{print $3}' $HP_IN | sed "s|$|.$V.00.vcf|" | xargs cat | uniq.pl | bedtools sort -header | eval $HP_FRULE > $ODIR/$V.00.concat.vcf ; fi

#haplogroups
Expand Down Expand Up @@ -69,12 +69,7 @@ if [ $S == "mutserve" ] ; then exit 0 ; fi

SS=$S.$S
awk '{print $3}' $HP_IN | sed "s|$|.$S.cvg.stat|" | xargs cat | uniq.pl -i 0 > $ODIR/$S.cvg.tab
awk '{print $3}' $HP_IN | sed "s|$|.$SS.00.vcf|" | xargs cat | uniq.pl | bedtools sort -header > $ODIR/$SS.00.concat.vcf

#Sept 1 2022 ; ARIC CDG only; March 1 2023 edited/commented the following 3 lines (were meant for an older project)
#awk '{print $3}' $HP_IN | sed "s|$|.mutect2.max.vcf|" | xargs cat > $ODIR/$S.max.vcf
#intersectVcf.pl $ODIR/$S.00.concat.vcf $ODIR/$S.max.vcf -sm | cat $ODIR/$SS.00.concat.vcf - | bedtools sort -header > $ODIR/$SS.00.concat.vcf2; mv $ODIR/$SS.00.concat.vcf2 $ODIR/$SS.00.concat.vcf
#reAnnotateVcf.sh $ODIR/$SS.00.concat.vcf $ODIR/$SS.00.concat.vcf2 ; mv $ODIR/$SS.00.concat.vcf2 $ODIR/$SS.00.concat.vcf
awk '{print $3}' $HP_IN | sed "s|$|.$SS.00.vcf|" | xargs cat | grep -v "^##sample=" | grep -v "^##bcftools_annotateCommand" | uniq.pl | bedtools sort -header | eval $HP_FRULE > $ODIR/$SS.00.concat.vcf

snpSort.sh $ODIR/$SS.00.concat
cat $ODIR/$SS.00.concat.vcf | grep -v "^#" | sed 's|:|\t|g' | count.pl -i -1 -round 100| sort -n > $ODIR/$SS.00.AF.histo
Expand Down
9 changes: 6 additions & 3 deletions scripts/init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ export PATH=$HP_SDIR:$HP_BDIR:$PATH
#hs38DH(default)
export HP_RNAME=hs38DH
export HP_RMT=chrM
export HP_RNUMT="chr1:629084-634672 chr17:22521208-22521639" # 150bp reads
#export HP_RNUMT="chr1:629080-634925 chr2:148881723-148881858 chr5:80651184-80651597 chr11:10508892-10509738 chr13:109424123-109424381 chr17:22521208-22521639" # 100bp reads
export HP_RNUMT="chr1:629084-634672 chr17:22521208-22521639" # 150bp reads chr2:32916199-32916630
#export HP_RNUMT="chr1:629080-634925 chr1:76971223-76971280 chr2:148881723-148881858 chr5:80651184-80651597 chr11:10508892-10509738 chr13:109424123-109424381 chr17:22521208-22521639" # 100bp reads
export HP_RCOUNT=3366 # 195(hs38DH-no_alt); 194 (hs38DH-no_alt_EBV)
export HP_RURL=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh38_reference_genome/GRCh38_full_analysis_set_plus_decoy_hla.fa

Expand All @@ -52,6 +52,9 @@ export HP_RURL=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/GRCh
#export HP_RCOUNT=24
#export HP_RURL=https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/CHM13/assemblies/chm13.draft_v1.1.fasta.gz

#CHM13.v2; to compute NUMT regions
#export HP_RURL=https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.fna.gz

#Mouse
#export HP_RNAME=mm39
#export HP_RMT=chrM
Expand Down Expand Up @@ -105,7 +108,7 @@ export HP_T3=10
export HP_DP=100
export HP_V= # SV caller: gridssexport HP_DP=100 # minimum coverage: Ex 100

export HP_FRULE="perl -ane 'print unless(/strict_strand|strand_bias|base_qual|map_qual|weak_evidence|slippage|position|Homopolymer/ and /:0\.[01234]\d+$/);' | bcftools filter -e 'DP<100'" # filter rule (or just "tee")
export HP_FRULE="perl -ane 'print unless(/strict_strand|strand_bias|base_qual|map_qual|weak_evidence|slippage|position|Homopolymer/ and /:0\.[01234]\d+$/);' | bcftools filter -e 'DP<$HP_DP'" # filter rule (or just "tee")

export HP_P=1 # number of processors
export HP_MM="3G" # maximum memory
Expand Down
114 changes: 0 additions & 114 deletions scripts/sam2bedDA.pl

This file was deleted.

Loading

0 comments on commit 0c9a8cd

Please sign in to comment.