diff --git a/src/algorithms/pad_band.cpp b/src/algorithms/pad_band.cpp index 87af3fb14f8..8353d7572f2 100644 --- a/src/algorithms/pad_band.cpp +++ b/src/algorithms/pad_band.cpp @@ -11,27 +11,43 @@ namespace vg { namespace algorithms { -std::function pad_band_random_walk(double band_padding_multiplier, size_t band_padding_memo_size) { +std::function pad_band_random_walk_internal(double band_padding_multiplier, size_t band_padding_memo_size, size_t max_padding, + const std::function size_function) { // We're goign to capture this vector by value into the closure std::vector band_padding_memo; // Fill it in to initialize band_padding_memo.resize(band_padding_memo_size); for (size_t i = 0; i < band_padding_memo.size(); i++) { - band_padding_memo[i] = size_t(band_padding_multiplier * sqrt(i)) + 1; + band_padding_memo[i] = std::min(max_padding, size_t(band_padding_multiplier * sqrt(i)) + 1); } - - function choose_band_padding = [band_padding_multiplier, band_padding_memo](const Alignment& seq, const HandleGraph& graph) { - size_t read_length = seq.sequence().size(); - return read_length < band_padding_memo.size() ? band_padding_memo.at(read_length) - : size_t(band_padding_multiplier * sqrt(read_length)) + 1; + + function choose_band_padding = + [band_padding_multiplier, band_padding_memo, size_function, max_padding](const Alignment& seq, const HandleGraph& graph) { + size_t size = size_function(seq, graph); + return size < band_padding_memo.size() ? band_padding_memo.at(size) + : std::min(max_padding, size_t(band_padding_multiplier * sqrt(size)) + 1); }; // And return the closure which now owns the memo table. return choose_band_padding; } +std::function pad_band_random_walk(double band_padding_multiplier, size_t band_padding_memo_size, size_t max_padding) { + return pad_band_random_walk_internal(band_padding_multiplier, band_padding_memo_size, max_padding, + [](const Alignment& seq, const HandleGraph&) { + return seq.sequence().size(); + }); +} + +std::function pad_band_min_random_walk(double band_padding_multiplier, size_t band_padding_memo_size, size_t max_padding) { + return pad_band_random_walk_internal(band_padding_multiplier, band_padding_memo_size, max_padding, + [](const Alignment& seq, const HandleGraph& graph) { + return std::min(seq.sequence().size(), graph.get_total_length()); + }); +} + std::function pad_band_constant(size_t band_padding) { // don't dynamically choose band padding, shim constant value into a function type function constant_padding = [band_padding](const Alignment& seq, const HandleGraph& graph) { diff --git a/src/algorithms/pad_band.hpp b/src/algorithms/pad_band.hpp index 6ddf1ef3837..0573cabec31 100644 --- a/src/algorithms/pad_band.hpp +++ b/src/algorithms/pad_band.hpp @@ -9,15 +9,25 @@ #include "../handle.hpp" #include +#include namespace vg { namespace algorithms { using namespace std; -/// Get a band padding function that uses the expected distance of a random +/// Get a band padding function that scales with the expected distance of a random /// walk, memoized out to the given length. -std::function pad_band_random_walk(double band_padding_multiplier = 1.0, size_t band_padding_memo_size = 2000); +std::function pad_band_random_walk(double band_padding_multiplier = 1.0, + size_t band_padding_memo_size = 2000, + size_t max_padding = std::numeric_limits::max()); + +/// Get a band padding function that scales the expected distance of a random +/// walk, memoized out to the given length, using the minimum of graph size and +/// read size as the length. +std::function pad_band_min_random_walk(double band_padding_multiplier = 1.0, + size_t band_padding_memo_size = 2000, + size_t max_padding = std::numeric_limits::max()); /// Get a band padding function that uses a constant value. std::function pad_band_constant(size_t band_padding); diff --git a/src/aligner.cpp b/src/aligner.cpp index 808a4d189e5..f10c8512a27 100644 --- a/src/aligner.cpp +++ b/src/aligner.cpp @@ -1421,8 +1421,7 @@ void Aligner::align_pinned_multi(Alignment& alignment, vector& alt_al } void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g, - int32_t band_padding, bool permissive_banding, - const unordered_map* left_align_strand) const { + int32_t band_padding, bool permissive_banding) const { if (alignment.sequence().empty()) { // we can save time by using a specialized deletion aligner for empty strings @@ -1447,8 +1446,7 @@ void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g, g, band_padding, permissive_banding, - false, - left_align_strand); + false); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { @@ -1457,8 +1455,7 @@ void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g, g, band_padding, permissive_banding, - false, - left_align_strand); + false); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { @@ -1467,8 +1464,7 @@ void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g, g, band_padding, permissive_banding, - false, - left_align_strand); + false); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else { @@ -1477,16 +1473,14 @@ void Aligner::align_global_banded(Alignment& alignment, const HandleGraph& g, g, band_padding, permissive_banding, - false, - left_align_strand); + false); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } } void Aligner::align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, - int32_t max_alt_alns, int32_t band_padding, bool permissive_banding, - const unordered_map* left_align_strand) const { + int32_t max_alt_alns, int32_t band_padding, bool permissive_banding) const { if (alignment.sequence().empty()) { // we can save time by using a specialized deletion aligner for empty strings @@ -1511,8 +1505,7 @@ void Aligner::align_global_banded_multi(Alignment& alignment, vector& max_alt_alns, band_padding, permissive_banding, - false, - left_align_strand); + false); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { @@ -1523,8 +1516,7 @@ void Aligner::align_global_banded_multi(Alignment& alignment, vector& max_alt_alns, band_padding, permissive_banding, - false, - left_align_strand); + false); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { @@ -1535,8 +1527,7 @@ void Aligner::align_global_banded_multi(Alignment& alignment, vector& max_alt_alns, band_padding, permissive_banding, - false, - left_align_strand); + false); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else { @@ -1547,8 +1538,7 @@ void Aligner::align_global_banded_multi(Alignment& alignment, vector& max_alt_alns, band_padding, permissive_banding, - false, - left_align_strand); + false); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } @@ -2105,8 +2095,7 @@ void QualAdjAligner::align_pinned_multi(Alignment& alignment, vector& } void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph& g, - int32_t band_padding, bool permissive_banding, - const unordered_map* left_align_strand) const { + int32_t band_padding, bool permissive_banding) const { if (alignment.sequence().empty()) { // we can save time by using a specialized deletion aligner for empty strings @@ -2129,8 +2118,7 @@ void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph g, band_padding, permissive_banding, - true, - left_align_strand); + true); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { @@ -2139,8 +2127,7 @@ void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph g, band_padding, permissive_banding, - true, - left_align_strand); + true); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else if (best_score <= numeric_limits::max() && worst_score >= numeric_limits::min()) { @@ -2149,8 +2136,7 @@ void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph g, band_padding, permissive_banding, - true, - left_align_strand); + true); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } else { @@ -2159,16 +2145,14 @@ void QualAdjAligner::align_global_banded(Alignment& alignment, const HandleGraph g, band_padding, permissive_banding, - true, - left_align_strand); + true); band_graph.align(score_matrix, nt_table, gap_open, gap_extension); } } void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, - int32_t max_alt_alns, int32_t band_padding, bool permissive_banding, - const unordered_map* left_align_strand) const { + int32_t max_alt_alns, int32_t band_padding, bool permissive_banding) const { if (alignment.sequence().empty()) { // we can save time by using a specialized deletion aligner for empty strings @@ -2193,8 +2177,7 @@ void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector::max() && worst_score >= numeric_limits::min()) { @@ -2205,8 +2188,7 @@ void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector::max() && worst_score >= numeric_limits::min()) { @@ -2217,8 +2199,7 @@ void QualAdjAligner::align_global_banded_multi(Alignment& alignment, vector* left_align_strand = nullptr) const = 0; + int32_t band_padding = 0, bool permissive_banding = true) const = 0; /// store top scoring global alignments in the vector in descending score order up to a maximum number /// of alternate alignments (including the optimal alignment). if there are fewer than the maximum @@ -162,8 +161,7 @@ namespace vg { /// optimal alignment will be stored in both the vector and the original alignment object virtual void align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, int32_t max_alt_alns, int32_t band_padding = 0, - bool permissive_banding = true, - const unordered_map* left_align_strand = nullptr) const = 0; + bool permissive_banding = true) const = 0; /// xdrop aligner virtual void align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& mems, bool reverse_complemented, uint16_t max_gap_length = default_xdrop_max_gap_length) const = 0; @@ -365,16 +363,14 @@ namespace vg { /// permissive banding auto detects the width of band needed so that paths can travel /// through every node in the graph void align_global_banded(Alignment& alignment, const HandleGraph& g, - int32_t band_padding = 0, bool permissive_banding = true, - const unordered_map* left_align_strand = nullptr) const; + int32_t band_padding = 0, bool permissive_banding = true) const; /// store top scoring global alignments in the vector in descending score order up to a maximum number /// of alternate alignments (including the optimal alignment). if there are fewer than the maximum /// number of alignments in the return value, then the vector contains all possible alignments. the /// optimal alignment will be stored in both the vector and the original alignment object void align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, - int32_t max_alt_alns, int32_t band_padding = 0, bool permissive_banding = true, - const unordered_map* left_align_strand = nullptr) const; + int32_t max_alt_alns, int32_t band_padding = 0, bool permissive_banding = true) const; /// xdrop aligner void align_xdrop(Alignment& alignment, const HandleGraph& g, const vector& mems, @@ -437,13 +433,11 @@ namespace vg { void align(Alignment& alignment, const HandleGraph& g, bool traceback_aln) const; void align_global_banded(Alignment& alignment, const HandleGraph& g, - int32_t band_padding = 0, bool permissive_banding = true, - const unordered_map* left_align_strand = nullptr) const; + int32_t band_padding = 0, bool permissive_banding = true) const; void align_pinned(Alignment& alignment, const HandleGraph& g, bool pin_left, bool xdrop = false, uint16_t xdrop_max_gap_length = default_xdrop_max_gap_length) const; void align_global_banded_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, - int32_t max_alt_alns, int32_t band_padding = 0, bool permissive_banding = true, - const unordered_map* left_align_strand = nullptr) const; + int32_t max_alt_alns, int32_t band_padding = 0, bool permissive_banding = true) const; void align_pinned_multi(Alignment& alignment, vector& alt_alignments, const HandleGraph& g, bool pin_left, int32_t max_alt_alns) const; diff --git a/src/banded_global_aligner.cpp b/src/banded_global_aligner.cpp index 0cc667ad07f..825ee1a1bdf 100644 --- a/src/banded_global_aligner.cpp +++ b/src/banded_global_aligner.cpp @@ -211,21 +211,20 @@ void BandedGlobalAligner::BABuilder::finalize_alignment(const list BandedGlobalAligner::BAMatrix::BAMatrix(Alignment& alignment, handle_t node, int64_t top_diag, int64_t bottom_diag, - const vector& seeds, int64_t cumulative_seq_len, bool left_alignment_strand) : + const vector& seeds, int64_t cumulative_seq_len) : node(node), top_diag(top_diag), bottom_diag(bottom_diag), seeds(seeds), alignment(alignment), cumulative_seq_len(cumulative_seq_len), - left_alignment_strand(left_alignment_strand), match(nullptr), insert_col(nullptr), insert_row(nullptr) { // nothing to do #ifdef debug_banded_aligner_objects - cerr << "[BAMatrix]: constructor for node " << as_integer(node) << " and band from " << top_diag << " to " << bottom_diag << " with left alignment strand " << left_alignment_strand << endl;; + cerr << "[BAMatrix]: constructor for node " << as_integer(node) << " and band from " << top_diag << " to " << bottom_diag << endl;; #endif } @@ -826,9 +825,6 @@ void BandedGlobalAligner::BAMatrix::traceback(const HandleGraph& graph, } array prev_mats{Match, InsertCol, InsertRow}; - if (left_alignment_strand) { - std::swap(prev_mats[0], prev_mats[2]); - } // find optimal traceback idx = i * ncols + j; @@ -1432,9 +1428,6 @@ void BandedGlobalAligner::BAMatrix::traceback_over_edge(const HandleGra cerr << "[BAMatrix::traceback_over_edge] checking seed rectangular coordinates (" << seed_row << ", " << seed_col << "), with indices calculated from current diagonal " << curr_diag << " (top diag " << top_diag << " + offset " << i << "), seed top diagonal " << seed->top_diag << ", seed seq length " << seed_ncols << " with insert column offset " << (mat == InsertCol) << endl; #endif array prev_mats{Match, InsertCol, InsertRow}; - if (left_alignment_strand) { - std::swap(prev_mats[0], prev_mats[2]); - } switch (mat) { case Match: @@ -1930,14 +1923,12 @@ void BandedGlobalAligner::BAMatrix::print_band(const HandleGraph& graph template BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, int64_t band_padding, bool permissive_banding, - bool adjust_for_base_quality, - const unordered_map* left_align_strand) : + bool adjust_for_base_quality) : BandedGlobalAligner(alignment, g, nullptr, 1, band_padding, permissive_banding, - adjust_for_base_quality, - left_align_strand) + adjust_for_base_quality) { // nothing to do, just funnel into internal constructor } @@ -1947,15 +1938,13 @@ BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, const Ha vector& alt_alignments, int64_t max_multi_alns, int64_t band_padding, bool permissive_banding, - bool adjust_for_base_quality, - const unordered_map* left_align_strand) : + bool adjust_for_base_quality) : BandedGlobalAligner(alignment, g, &alt_alignments, max_multi_alns, band_padding, permissive_banding, - adjust_for_base_quality, - left_align_strand) + adjust_for_base_quality) { // check data integrity and funnel into internal constructor if (!alt_alignments.empty()) { @@ -1970,8 +1959,7 @@ BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, const Ha int64_t max_multi_alns, int64_t band_padding, bool permissive_banding, - bool adjust_for_base_quality, - const unordered_map* left_align_strand) : + bool adjust_for_base_quality) : graph(g), alignment(alignment), alt_alignments(alt_alignments), @@ -2055,20 +2043,12 @@ BandedGlobalAligner::BandedGlobalAligner(Alignment& alignment, const Ha seeds.push_back(banded_matrices[node_id_to_idx[graph.get_id(prev)]]); }); - bool strand = false; - if (left_align_strand) { - auto it = left_align_strand->find(node); - if (it != left_align_strand->end()) { - strand = it->second; - } - } banded_matrices[i] = new BAMatrix(alignment, node, band_ends[i].first, band_ends[i].second, std::move(seeds), - shortest_seqs[i], - strand); + shortest_seqs[i]); } } @@ -2502,9 +2482,6 @@ BandedGlobalAligner::AltTracebackStack::AltTracebackStack(const HandleG else { // let the insert routine figure out which one is the best and which ones to keep in the stack array mats{Match, InsertCol, InsertRow}; - if (band_matrix->left_alignment_strand) { - std::swap(mats[0], mats[2]); - } for (auto mat : mats) { switch (mat) { diff --git a/src/banded_global_aligner.hpp b/src/banded_global_aligner.hpp index 74ff2628ce7..7f7e9bda0c3 100644 --- a/src/banded_global_aligner.hpp +++ b/src/banded_global_aligner.hpp @@ -60,8 +60,7 @@ namespace vg { /// BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, int64_t band_padding, bool permissive_banding = false, - bool adjust_for_base_quality = false, - const unordered_map* left_align_strand = nullptr); + bool adjust_for_base_quality = false); /// Initializes banded multi-alignment, which computes the top scoring alternate alignments in addition @@ -79,8 +78,7 @@ namespace vg { BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, vector& alt_alignments, int64_t max_multi_alns, int64_t band_padding, bool permissive_banding = false, - bool adjust_for_base_quality = false, - const unordered_map* left_align_strand = nullptr); + bool adjust_for_base_quality = false); ~BandedGlobalAligner(); @@ -135,8 +133,7 @@ namespace vg { BandedGlobalAligner(Alignment& alignment, const HandleGraph& g, vector* alt_alignments, int64_t max_multi_alns, int64_t band_padding, bool permissive_banding = false, - bool adjust_for_base_quality = false, - const unordered_map* left_align_strand = nullptr); + bool adjust_for_base_quality = false); /// Traceback through dynamic programming matrices to compute alignment void traceback(int8_t* score_mat, int8_t* nt_table, int8_t gap_open, int8_t gap_extend, IntType min_inf); @@ -159,7 +156,7 @@ namespace vg { public: BAMatrix(Alignment& alignment, handle_t node, int64_t top_diag, int64_t bottom_diag, - const vector& seeds, int64_t cumulative_seq_len, bool left_alignment_strand); + const vector& seeds, int64_t cumulative_seq_len); ~BAMatrix(); /// Use DP to fill the band with alignment scores @@ -190,9 +187,7 @@ namespace vg { int64_t bottom_diag; handle_t node; - - bool left_alignment_strand; - + Alignment& alignment; /// Length of shortest sequence leading to matrix from a source node diff --git a/src/multipath_alignment_graph.cpp b/src/multipath_alignment_graph.cpp index 7189c08f53a..0d0a249c532 100644 --- a/src/multipath_alignment_graph.cpp +++ b/src/multipath_alignment_graph.cpp @@ -4,6 +4,7 @@ #include "multipath_alignment_graph.hpp" #include "sequence_complexity.hpp" +#include "reverse_graph.hpp" #include "structures/rank_pairing_heap.hpp" @@ -4230,8 +4231,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, const function(id_t)>* project, - - bool allow_negative_scores, unordered_map* left_align_strand) { + bool allow_negative_scores, bool align_in_reverse) { align(alignment, align_graph, @@ -4250,7 +4250,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap dist_index, project, allow_negative_scores, - left_align_strand); + align_in_reverse); } void MultipathAlignmentGraph::deduplicate_alt_alns(vector>& alt_alns, @@ -5187,7 +5187,7 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap const function& band_padding_function, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls, SnarlDistanceIndex* dist_index, const function(id_t)>* project, - bool allow_negative_scores, unordered_map* left_align_strand) { + bool allow_negative_scores, bool align_in_reverse) { // TODO: magic number // how many tails we need to have before we try the more complicated but @@ -5352,10 +5352,21 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap }); #endif + // possibly the reverse the sequence + HandleGraph* aln_connecting_graph = &connecting_graph; + ReverseGraph reverse_graph(&connecting_graph, false); + if (align_in_reverse) { + reverse_alignment(intervening_sequence); + aln_connecting_graph = &reverse_graph; + } vector alt_alignments; - aligner->align_global_banded_multi(intervening_sequence, alt_alignments, connecting_graph, num_alns_iter, - band_padding_function(intervening_sequence, connecting_graph), true, - left_align_strand); + aligner->align_global_banded_multi(intervening_sequence, alt_alignments, *aln_connecting_graph, num_alns_iter, + band_padding_function(intervening_sequence, connecting_graph), true); + if (align_in_reverse) { + for (auto& aln : alt_alignments) { + reverse_alignment(aln); + } + } // remove alignments with the same path deduplicated = convert_and_deduplicate(alt_alignments, false, false); @@ -6612,6 +6623,31 @@ void MultipathAlignmentGraph::align(const Alignment& alignment, const HandleGrap return to_return; } + + void MultipathAlignmentGraph::reverse_alignment(Alignment& aln) const { + + std::reverse(aln.mutable_sequence()->begin(), aln.mutable_sequence()->end()); + if (!aln.quality().empty()) { + std::reverse(aln.mutable_quality()->begin(), aln.mutable_quality()->end()); + } + + if (aln.path().mapping_size() != 0) { + auto path = aln.mutable_path(); + for (size_t i = 0, j = path->mapping_size() - 1; i < j; ++i, --j) { + path->mutable_mapping()->SwapElements(i, j); + } + for (size_t k = 0; k < path->mapping_size(); ++k) { + auto mapping = path->mutable_mapping(k); + for (size_t i = 0, j = mapping->edit_size() - 1; i < j; ++i, --j) { + mapping->mutable_edit()->SwapElements(i, j); + } + for (size_t i = 0; i < mapping->edit_size(); ++i) { + auto edit = mapping->mutable_edit(i); + std::reverse(edit->mutable_sequence()->begin(), edit->mutable_sequence()->end()); + } + } + } + } } diff --git a/src/multipath_alignment_graph.hpp b/src/multipath_alignment_graph.hpp index afe73c2cdf6..5b518fa9a69 100644 --- a/src/multipath_alignment_graph.hpp +++ b/src/multipath_alignment_graph.hpp @@ -197,8 +197,7 @@ namespace vg { size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length, bool simplify_topologies, size_t unmergeable_len, size_t band_padding, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr, - const function(id_t)>* project = nullptr, bool allow_negative_scores = false, - unordered_map* left_align_strand = nullptr); + const function(id_t)>* project = nullptr, bool allow_negative_scores = false, bool align_in_reverse = false); /// Do intervening and tail alignments between the anchoring paths and /// store the result in a multipath_alignment_t. Reachability edges must @@ -216,8 +215,7 @@ namespace vg { size_t max_alt_alns, bool dynamic_alt_alns, size_t max_gap, double pessimistic_tail_gap_multiplier, size_t max_tail_length, bool simplify_topologies, size_t unmergeable_len, const function& band_padding_function, multipath_alignment_t& multipath_aln_out, SnarlManager* cutting_snarls = nullptr, SnarlDistanceIndex* dist_index = nullptr, - const function(id_t)>* project = nullptr, bool allow_negative_scores = false, - unordered_map* left_align_strand = nullptr); + const function(id_t)>* project = nullptr, bool allow_negative_scores = false, bool align_in_reverse = false); /// Converts a MultipathAlignmentGraph to a GraphViz Dot representation, output to the given ostream. /// If given the Alignment query we are working on, can produce information about subpath iterators. @@ -337,6 +335,8 @@ namespace vg { const Alignment& alignment, const HandleGraph& align_graph, string::const_iterator begin, const GSSWAligner* aligner); + void reverse_alignment(Alignment& aln) const; + /// Identifies regions that are shared across all of the alternative alignments, and then /// splits those regions out into separate alignments, dividing the set of alternative /// alignments accordingly. Return value consists of a vector of the shared segments and diff --git a/src/subcommand/surject_main.cpp b/src/subcommand/surject_main.cpp index 9aa76352f36..e4bcd79f0cf 100644 --- a/src/subcommand/surject_main.cpp +++ b/src/subcommand/surject_main.cpp @@ -330,6 +330,7 @@ int main_surject(int argc, char** argv) { // We have an override surjector.max_subgraph_bases_per_read_base = *max_graph_scale; } + surjector.choose_band_padding = algorithms::pad_band_min_random_walk(1.0, 2000, 16); // Count our threads int thread_count = vg::get_thread_count(); diff --git a/src/surjector.cpp b/src/surjector.cpp index 8a78280b310..707fdb83580 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -34,7 +34,7 @@ namespace vg { using namespace std; - Surjector::Surjector(const PathPositionHandleGraph* graph) : graph(graph) { + Surjector::Surjector(const PathPositionHandleGraph* graph) : graph(graph), choose_band_padding(algorithms::pad_band_constant(1)) { if (!graph) { cerr << "error:[Surjector] Failed to provide an graph to the Surjector" << endl; } @@ -2982,13 +2982,6 @@ using namespace std; #endif } - // left align on forward strands and right align on reverse strands - unordered_map left_align_strand; - left_align_strand.reserve(aln_graph->get_node_count()); - aln_graph->for_each_handle([&](const handle_t& handle) { - left_align_strand[handle] = projection_trans(aln_graph->get_id(handle)).second; - }); - // align the intervening segments and store the result in a multipath alignment multipath_alignment_t mp_aln; mp_aln_graph.align(source, *aln_graph, get_aligner(), @@ -3000,13 +2993,13 @@ using namespace std; max_tail_length, // max length of tail to align false, // simplify topologies 0, // unmergeable len - 1, // band padding + choose_band_padding, // band padding mp_aln, // output nullptr, // snarl manager nullptr, // distance index nullptr, // projector allow_negative_scores, // subpath local - &left_align_strand); // strand to left align against + rev_strand); // left/right align topologically_order_subpaths(mp_aln); @@ -4057,17 +4050,17 @@ using namespace std; if ((anchor_lengths[i] <= max_low_complexity_anchor_prune || chunk.first.second - chunk.first.first <= max_low_complexity_anchor_prune)) { SeqComplexity<6> chunk_complexity(chunk.first.first, chunk.first.second); if (chunk.first.second - chunk.first.first < pad_suspicious_anchors_to_length) { - auto context_begin = max(sequence.begin(), chunk.first.first - (pad_suspicious_anchors_to_length - (chunk.first.second - chunk.first.first)) / 2); - auto context_end = min(sequence.end(), context_begin + pad_suspicious_anchors_to_length); - if (context_end == sequence.end()) { + auto read_context_begin = max(sequence.begin(), chunk.first.first - (pad_suspicious_anchors_to_length - (chunk.first.second - chunk.first.first)) / 2); + auto read_context_end = min(sequence.end(), read_context_begin + pad_suspicious_anchors_to_length); + if (read_context_end == sequence.end()) { // try to ensure enough bases if we're near the end of the read - context_begin = max(sequence.begin(), context_end - pad_suspicious_anchors_to_length); + read_context_begin = max(sequence.begin(), read_context_end - pad_suspicious_anchors_to_length); } - SeqComplexity<6> context_complexity(context_begin, context_end); - // TODO: repetetive + SeqComplexity<6> context_complexity(read_context_begin, read_context_end); + // TODO: repetitive for (int order = 1, max_order = 6; order <= max_order; ++order) { - if (context_complexity.p_value(order) < low_complexity_p_value && - (chunk_complexity.repetitiveness(order) > 0.5 || (chunk.first.second - chunk.first.first) <= order)) { + + if (context_complexity.p_value(order) < low_complexity_p_value) { #ifdef debug_anchored_surject cerr << "anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]) pruned being for having context with low complexity at order " << order << " with p-value " << context_complexity.p_value(order) << " and anchor repetitive fraction " << chunk_complexity.repetitiveness(order) << endl; #endif @@ -4216,11 +4209,35 @@ using namespace std; // pull the ref sequence std::string ref_seq; - auto step = ref_chunk.first; + bool path_rev = (graph->get_is_reverse(graph->get_handle_of_step(ref_chunk.first)) + != path_chunk.second.mapping().front().position().is_reverse()); + + // get the left-most step to iterate along + step_handle_t step; + if (left_end) { + step = ref_chunk.first; + } + else { + step = ref_chunk.second; + size_t to_walk = path_chunk.second.mapping_size() - (mapping_idx + 1); + if (path_rev) { + for (size_t j = 0; j < to_walk; ++j) { + step = graph->get_next_step(step); + } + } + else { + for (size_t j = 0; j < to_walk; ++j) { + step = graph->get_previous_step(step); + } + } + } + +#ifdef debug_anchored_surject + cerr << "extracting reference sequence starting on node " << graph->get_id(graph->get_handle_of_step(step)) << (graph->get_is_reverse(graph->get_handle_of_step(step)) ? "-" : "+") << " at position " << graph->get_position_of_step(step) << endl; +#endif + for (size_t m = left_end ? 0 : mapping_idx, n = left_end ? mapping_idx + 1 : path_chunk.second.mapping_size(); m < n; ++m) { const auto& mapping = path_chunk.second.mapping(m); - // TODO: a bit silly to do this on every mapping - bool path_rev = graph->get_is_reverse(graph->get_handle_of_step(step)) != mapping.position().is_reverse(); size_t walked_from_length = 0; for (size_t e = (!left_end && m == mapping_idx) ? edit_idx + 1 : 0, k = (left_end && m == mapping_idx) ? edit_idx : mapping.edit_size(); e < k; ++e) { @@ -4239,6 +4256,9 @@ using namespace std; ref_seq.append(graph->get_subsequence(handle, offset, walked_from_length)); step = path_rev ? graph->get_previous_step(step) : graph->get_next_step(step); } +#ifdef debug_anchored_surject + cerr << "got reference seqeunce " << ref_seq << endl; +#endif // is the ref sequence of this tail low complexity? SeqComplexity<6> trim_candidate_ref_complexity(ref_seq.begin(), ref_seq.end()); diff --git a/src/surjector.hpp b/src/surjector.hpp index 993b00b5180..40d5edbcfae 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -18,6 +18,7 @@ #include "path.hpp" #include #include "multipath_alignment.hpp" +#include "algorithms/pad_band.hpp" namespace vg { @@ -134,8 +135,11 @@ using namespace std; bool prune_suspicious_anchors = false; int64_t max_tail_anchor_prune = 4; double low_complexity_p_value = .0075; - int64_t max_low_complexity_anchor_prune = 32; - int64_t pad_suspicious_anchors_to_length = 10; + int64_t max_low_complexity_anchor_prune = 40; + int64_t pad_suspicious_anchors_to_length = 12; + + // A function for computing band padding + std::function choose_band_padding; /// How many anchors (per path) will we use when surjecting using /// anchors?