Mwaniki, Njagi Moses; Pisanti, Nadia
Optimal Sequence Alignment to ED-Strings Proceedings Article
In: Bansal, Mukul S.; Cai, Zhipeng; Mangul, Serghei (Ed.): Bioinformatics Research and Applications, pp. 204–216, Springer Nature, Germany, 2023, ISSN: 0302-9743.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Mwaniki2022b,
title = {Optimal Sequence Alignment to ED-Strings},
author = {Njagi Moses Mwaniki and Nadia Pisanti},
editor = {Mukul S. Bansal and Zhipeng Cai and Serghei Mangul},
doi = {10.1007/978-3-031-23198-8_19},
issn = {0302-9743},
year = {2023},
date = {2023-01-01},
urldate = {2022-01-01},
booktitle = {Bioinformatics Research and Applications},
volume = {13760},
pages = {204--216},
publisher = {Springer Nature},
address = {Germany},
abstract = {Partial Order Alignment (POA) was introduced by Lee et al. in 2002 to allow the alignment of a string to a graph-like structure representing a set of aligned strings (a Multiple Sequence Alignment, MSA). However, the POA edit transcript (the sequence of edit operations that describe the alignment) does not reflect the possible elasticity of the MSA (different gaps sizes in the aligned string), leaving room for possible misalignment and its propagation in progressive MSA. Elastic-Degenerate Strings (ED-strings) are strings that can represent the outcome of an MSA by highlighting gaps and variants as a list of strings that can differ in size and that can possibly include the empty string. In this paper, we define a method that optimally aligns a string to an ED-string, the latter compactly representing an MSA, overcoming the ambiguity in the POA edit transcript while maintaining its time and space complexity.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Rivals, Eric; Sweering, Michelle; Wang, Pengfei
Convergence of the number of period sets in strings Proceedings Article
In: Etessami, Kousha; Feige, Uriel; Puppis, Gabriele (Ed.): 50th International Colloquium on Automata, Languages, and Programming (ICALP 2023), pp. 100:1–100:14, Schloss Dagstuhl -- Leibniz-Zentrum f{"u}r Informatik, Dagstuhl, Germany, 2023, ISBN: 978-3-95977-278-5.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Rivals2022,
title = {Convergence of the number of period sets in strings},
author = {Eric Rivals and Michelle Sweering and Pengfei Wang},
editor = {Kousha Etessami and Uriel Feige and Gabriele Puppis},
doi = {10.4230/LIPIcs.ICALP.2023.100},
isbn = {978-3-95977-278-5},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
booktitle = {50th International Colloquium on Automata, Languages, and Programming (ICALP 2023)},
volume = {261},
pages = {100:1--100:14},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{"u}r Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {Consider words of length $n$. The set of all periods of a word of length $n$ is a subset of ${0, 1, 2, ldots , n-1}$. However, any subset of ${0, 1, 2, ldots , n-1}$ is not necessarily a valid set of periods. In a seminal paper in 1981, Guibas and Odlyzko have proposed to encode the set of periods of a word into an $n$ long binary string, called an autocorrelation, where a one at position $i$ denotes a period of $i$. They considered the question of recognizing a valid period set, and also studied the number of valid period sets for length $n$, denoted $kappa n$. They conjectured that $ln(kappa n)$ asymptotically converges to a constant times $ln^2(n)$. If improved lower bounds for $ln(kappa n)/ln^2(n)$ were proposed in 2001, the question of a tight upper bound has remained opened since Guibas and Odlyzko’s paper. Here, we exhibit an upper bound for this fraction, which implies its convergence and closes this long standing conjecture. Moreover, we extend our result to find similar bounds for the number of correlations: a generalization of autocorrelations which encodes the overlaps between two strings.},
howpublished = {arXiv},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Rizzo, Nicola; Cáceres, Manuel; Mäkinen, Veli
Chaining of maximal exact matches in graphs Proceedings Article
In: Nardini, Franco Maria; Pisanti, Nadia; Venturini, Rossano (Ed.): String Processing and Information Retrieval, pp. 353–366, Springer Nature Switzerland, Cham, 2023, ISBN: 978-3-031-43980-3.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Rizzo2023-ym,
title = {Chaining of maximal exact matches in graphs},
author = {Nicola Rizzo and Manuel Cáceres and Veli Mäkinen},
editor = {Nardini, Franco Maria and Pisanti, Nadia and Venturini, Rossano},
doi = {10.1007/978-3-031-43980-3_29},
isbn = {978-3-031-43980-3},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
booktitle = {String Processing and Information Retrieval},
volume = {14240},
pages = {353–366},
publisher = {Springer Nature Switzerland},
address = {Cham},
series = {Lecture notes in computer science},
abstract = {We show how to chain maximal exact matches (MEMs) between a query string Q and a labeled directed acyclic graph (DAG) $G=(V,E)$ to solve the longest common subsequence (LCS) problem between Q and G. We obtain our result via a new symmetric formulation of chaining in DAGs that we solve in $O(m+n+k^2|V| + |E| + kNlog N)$ time, where $m=|Q|$, n is the total length of node labels, k is the minimum number of paths covering the nodes of G and N is the number of MEMs between Q and node labels, which we show encode full MEMs.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Luo, Xiao; Kang, Xiongbin; Schönhuth, Alexander
VeChat: correcting errors in long reads using variation graphs Journal Article
In: Nat Commun, vol. 13, no. 1, pp. 6657, 2022, ISSN: 2041-1723.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Luo2022b,
title = {VeChat: correcting errors in long reads using variation graphs},
author = {Xiao Luo and Xiongbin Kang and Alexander Schönhuth},
doi = {10.1038/s41467-022-34381-8},
issn = {2041-1723},
year = {2022},
date = {2022-11-01},
urldate = {2022-11-01},
journal = {Nat Commun},
volume = {13},
number = {1},
pages = {6657},
abstract = {Error correction is the canonical first step in long-read sequencing data analysis. Current self-correction methods, however, are affected by consensus sequence induced biases that mask true variants in haplotypes of lower frequency showing in mixed samples. Unlike consensus sequence templates, graph-based reference systems are not affected by such biases, so do not mistakenly mask true variants as errors. We present VeChat, as an approach to implement this idea: VeChat is based on variation graphs, as a popular type of data structure for pangenome reference systems. Extensive benchmarking experiments demonstrate that long reads corrected by VeChat contain 4 to 15 (Pacific Biosciences) and 1 to 10 times (Oxford Nanopore Technologies) less errors than when being corrected by state of the art approaches. Further, using VeChat prior to long-read assembly significantly improves the haplotype awareness of the assemblies. VeChat is an easy-to-use open-source tool and publicly available at https://github.com/HaploKit/vechat .},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Blassel, Luc; Medvedev, Paul; Chikhi, Rayan
Mapping-friendly sequence reductions: Going beyond homopolymer compression Journal Article
In: iScience, vol. 25, no. 11, pp. 105305, 2022, ISSN: 2589-0042.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Blassel2022,
title = {Mapping-friendly sequence reductions: Going beyond homopolymer compression},
author = {Luc Blassel and Paul Medvedev and Rayan Chikhi},
doi = {10.1016/j.isci.2022.105305},
issn = {2589-0042},
year = {2022},
date = {2022-11-01},
urldate = {2022-11-01},
journal = {iScience},
volume = {25},
number = {11},
pages = {105305},
abstract = {Sequencing errors continue to pose algorithmic challenges to methods working with sequencing data. One of the simplest and most prevalent techniques for ameliorating the detrimental effects of homopolymer expansion/contraction errors present in long reads is homopolymer compression. It collapses runs of repeated nucleotides, to remove some sequencing errors and improve mapping sensitivity. Though our intuitive understanding justifies why homopolymer compression works, it in no way implies that it is the best transformation that can be done. In this paper, we explore if there are transformations that can be applied in the same pre-processing manner as homopolymer compression that would achieve better alignment sensitivity. We introduce a more general framework than homopolymer compression, called mapping-friendly sequence reductions. We transform the reference and the reads using these reductions and then apply an alignment algorithm. We demonstrate that some mapping-friendly sequence reductions lead to improved mapping accuracy, outperforming homopolymer compression.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Lemane, Téo; Chikhi, Rayan; Peterlongo, Pierre
kmdiff, large-scale and user-friendly differential k-mer analyses Journal Article
In: Bioinformatics, vol. 38, no. 24, pp. 5443-5445, 2022, ISSN: 1367-4803.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Lemane2022,
title = {kmdiff, large-scale and user-friendly differential k-mer analyses},
author = {Téo Lemane and Rayan Chikhi and Pierre Peterlongo},
url = {https://github.com/tlemane/kmdiff},
doi = {10.1093/bioinformatics/btac689},
issn = {1367-4803},
year = {2022},
date = {2022-10-01},
urldate = {2022-10-01},
journal = {Bioinformatics},
volume = {38},
number = {24},
pages = {5443-5445},
abstract = {Genome Wide Association Studies (GWAS) elucidate links between genotypes and phenotypes. Recent studies point out the interest of conducting such experiments using k-mers as the base signal instead of single-nucleotide polymorphisms. We propose a tool, kmdiff, that performs differential k-mer analyses on large sequencing cohorts in an order of magnitude less time and memory than previously possible.
AVAILABILITY: https://github.com/tlemane/kmdiff.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
AVAILABILITY: https://github.com/tlemane/kmdiff.
Kang, Xiongbin; Luo, Xiao; Schönhuth, Alexander
StrainXpress: strain aware metagenome assembly from short reads Journal Article
In: Nucleic Acids Res, vol. 50, no. 17, pp. e101, 2022, ISSN: 1362-4962.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Kang2022,
title = {StrainXpress: strain aware metagenome assembly from short reads},
author = {Xiongbin Kang and Xiao Luo and Alexander Schönhuth},
doi = {10.1093/nar/gkac543},
issn = {1362-4962},
year = {2022},
date = {2022-09-01},
urldate = {2022-09-01},
journal = {Nucleic Acids Res},
volume = {50},
number = {17},
pages = {e101},
abstract = {Next-generation sequencing-based metagenomics has enabled to identify microorganisms in characteristic habitats without the need for lengthy cultivation. Importantly, clinically relevant phenomena such as resistance to medication, virulence or interactions with the environment can vary already within species. Therefore, a major current challenge is to reconstruct individual genomes from the sequencing reads at the level of strains, and not just the level of species. However, strains of one species can differ only by minor amounts of variants, which makes it difficult to distinguish them. Despite considerable recent progress, related approaches have remained fragmentary so far. Here, we present StrainXpress, as a comprehensive solution to the problem of strain aware metagenome assembly from next-generation sequencing reads. In experiments, StrainXpress reconstructs strain-specific genomes from metagenomes that involve up to >1000 strains and proves to successfully deal with poorly covered strains. The amount of reconstructed strain-specific sequence exceeds that of the current state-of-the-art approaches by on average 26.75% across all data sets (first quartile: 18.51%, median: 26.60%, third quartile: 35.05%).},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Dufresne, Yoann; Lemane, Teo; Marijon, Pierre; Peterlongo, Pierre; Rahman, Amatur; Kokot, Marek; Medvedev, Paul; Deorowicz, Sebastian; Chikhi, Rayan
The K-mer File Format: a standardized and compact disk representation of sets of k-mers Journal Article
In: Bioinformatics, vol. 38, no. 18, pp. 4423–4425, 2022, ISSN: 1367-4811.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Dufresne2022,
title = {The K-mer File Format: a standardized and compact disk representation of sets of k-mers},
author = {Yoann Dufresne and Teo Lemane and Pierre Marijon and Pierre Peterlongo and Amatur Rahman and Marek Kokot and Paul Medvedev and Sebastian Deorowicz and Rayan Chikhi},
url = {https://github.com/Kmer-File-Format/},
doi = {10.1093/bioinformatics/btac528},
issn = {1367-4811},
year = {2022},
date = {2022-09-01},
urldate = {2022-09-01},
journal = {Bioinformatics},
volume = {38},
number = {18},
pages = {4423--4425},
abstract = {\textbf{Summary:} Bioinformatics applications increasingly rely on ad hoc disk storage of k-mer sets, e.g. for de Bruijn graphs
or alignment indexes. Here, we introduce the K-mer File Format as a general lossless framework for storing and
manipulating k-mer sets, realizing space savings of 3–5 compared to other formats, and bringing interoperability
across tools.
\textbf{Availability and implementation:} Format specification, Cþþ/Rust API, tools: <a href="https://github.com/Kmer-File-Format/">https://github.com/Kmer-File-Format/</a>.
\textbf{Supplementary Information:} Supplementary data are available at Bioinformatics online.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
or alignment indexes. Here, we introduce the K-mer File Format as a general lossless framework for storing and
manipulating k-mer sets, realizing space savings of 3–5 compared to other formats, and bringing interoperability
across tools.
Availability and implementation: Format specification, Cþþ/Rust API, tools: <a href="https://github.com/Kmer-File-Format/">https://github.com/Kmer-File-Format/</a>.
Supplementary Information: Supplementary data are available at Bioinformatics online.
Goga, Adrián; Baláž, Andrej
Prefix-free parsing for building large tunnelled Wheeler graphs Proceedings Article
In: Boucher, Christina; Rahmann, Sven (Ed.): 22nd International Workshop on Algorithms in Bioinformatics (WABI 2022), pp. 18:1–18:12, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISBN: 978-3-95977-243-3.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Goga2022-eb,
title = {Prefix-free parsing for building large tunnelled Wheeler graphs},
author = {Adrián Goga and Andrej Baláž},
editor = {Boucher, Christina and Rahmann, Sven},
doi = {10.4230/LIPIcs.WABI.2022.18},
isbn = {978-3-95977-243-3},
year = {2022},
date = {2022-08-01},
urldate = {2022-08-01},
booktitle = {22nd International Workshop on Algorithms in Bioinformatics (WABI 2022)},
volume = {242},
pages = {18:1--18:12},
publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {We propose a new technique for creating a space-efficient index for large repetitive text collections, such as pangenomic databases containing sequences of many individuals from the same species. We combine two recent techniques from this area: Wheeler graphs (Gagie et al., 2017) and prefix-free parsing (PFP, Boucher et al., 2019). Wheeler graphs are a general framework encompassing several indexes based on the Burrows-Wheeler transform (BWT), such as the FM-index. Wheeler graphs admit a succinct representation which can be further compacted by employing the idea of tunnelling, which exploits redundancies in the form of parallel, equally-labelled paths called blocks that can be merged into a single path. The problem of finding the optimal set of blocks for tunnelling, i.e. the one that minimizes the size of the resulting Wheeler graph, is known to be NP-complete and remains the most computationally challenging part of the tunnelling process. To find an adequate set of blocks in less time, we propose a new method based on the prefix-free parsing (PFP). The idea of PFP is to divide the input text into phrases of roughly equal sizes that overlap by a fixed number of characters. The phrases are then sorted lexicographically. The original text is represented by a sequence of phrase ranks (the parse) and a list of all used phrases (the dictionary). In repetitive texts, the PFP representation of the text is generally much shorter than the original since individual phrases are used many times in the parse, thus reducing the size of the dictionary. To speed up the block selection for tunnelling, we apply the PFP to obtain the parse and the dictionary of the original text, tunnel the Wheeler graph of the parse using existing heuristics and subsequently use this tunnelled parse to construct a compact Wheeler graph of the original text. Compared with constructing a Wheeler graph from the original text without PFP, our method is much faster and uses less memory on collections of pangenomic sequences. Therefore, our method enables the use of Wheeler graphs as a pangenomic reference for real-world pangenomic datasets.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Charalampopoulos, Panagiotis; Pissis, Solon P.; Radoszewski, Jakub
Longest Palindromic Substring in Sublinear Time Proceedings Article
In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 20:1–20:9, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@inproceedings{Charalampopoulos2022b,
title = {Longest Palindromic Substring in Sublinear Time},
author = {Charalampopoulos, Panagiotis and Pissis, Solon P. and Radoszewski, Jakub},
editor = {Bannai, Hideo and Holub, Jan},
doi = {10.4230/LIPIcs.CPM.2022.20},
issn = {1868-8969},
year = {2022},
date = {2022-06-22},
urldate = {2022-06-22},
booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},
volume = {223},
pages = {20:1--20:9},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {We revisit the classic algorithmic problem of computing a longest palidromic substring. This problem is solvable by a celebrated 𝒪(n)-time algorithm [Manacher, J. ACM 1975], where n is the length of the input string. For small alphabets, 𝒪(n) is not necessarily optimal in the word RAM model of computation: a string of length n over alphabet [0,σ) can be stored in 𝒪(n log σ/log n) space and read in 𝒪(n log σ/log n) time. We devise a simple 𝒪(n log σ/log n)-time algorithm for computing a longest palindromic substring. In particular, our algorithm works in sublinear time if σ = $2^{o(log n)}$. Our technique relies on periodicity and on the 𝒪(n log σ/log n)-time constructible data structure of Kempa and Kociumaka [STOC 2019] that answers longest common extension queries in 𝒪(1) time.},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Bernardini, Giulia; Conte, Alessio; Gabory, Esteban; Grossi, Roberto; Loukides, Grigorios; Pissis, Solon P.; Punzi, Giulia; Sweering, Michelle
On Strings Having the Same Length- k Substrings Proceedings Article
In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 16:1–16:17, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@inproceedings{Bernardini2022b,
title = {On Strings Having the Same Length- k Substrings},
author = {Bernardini, Giulia and Conte, Alessio and Gabory, Esteban and Grossi, Roberto and Loukides, Grigorios and Pissis, Solon P. and Punzi, Giulia and Sweering, Michelle},
editor = {Bannai, Hideo and Holub, Jan},
doi = {10.4230/LIPIcs.CPM.2022.16},
issn = {1868-8969},
year = {2022},
date = {2022-06-22},
urldate = {2022-06-22},
booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},
volume = {223},
pages = {16:1--16:17},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {Let Substr_k(X) denote the set of length-k substrings of a given string X for a given integer k > 0. We study the following basic string problem, called z-Shortest 𝒮_k-Equivalent Strings: Given a set 𝒮_k of n length-k strings and an integer z > 0, list z shortest distinct strings T₁,…,T_z such that Substr_k(T_i) = 𝒮_k, for all i ∈ [1,z]. The z-Shortest 𝒮_k-Equivalent Strings problem arises naturally as an encoding problem in many real-world applications; e.g., in data privacy, in data compression, and in bioinformatics. The 1-Shortest 𝒮_k-Equivalent Strings, referred to as Shortest 𝒮_k-Equivalent String, asks for a shortest string X such that Substr_k(X) = 𝒮_k.
Our main contributions are summarized below:
- Given a directed graph G(V,E), the Directed Chinese Postman (DCP) problem asks for a shortest closed walk that visits every edge of G at least once. DCP can be solved in 𝒪̃(|E||V|) time using an algorithm for min-cost flow. We show, via a non-trivial reduction, that if Shortest 𝒮_k-Equivalent String over a binary alphabet has a near-linear-time solution then so does DCP.
- We show that the length of a shortest string output by Shortest 𝒮_k-Equivalent String is in 𝒪(k+n²). We generalize this bound by showing that the total length of z shortest strings is in 𝒪(zk+zn²+z²n). We derive these upper bounds by showing (asymptotically tight) bounds on the total length of z shortest Eulerian walks in general directed graphs.
- We present an algorithm for solving z-Shortest 𝒮_k-Equivalent Strings in 𝒪(nk+n²log²n+zn²log n+|output|) time. If z = 1, the time becomes 𝒪(nk+n²log²n) by the fact that the size of the input is Θ(nk) and the size of the output is 𝒪(k+n²).},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Our main contributions are summarized below:
- Given a directed graph G(V,E), the Directed Chinese Postman (DCP) problem asks for a shortest closed walk that visits every edge of G at least once. DCP can be solved in 𝒪̃(|E||V|) time using an algorithm for min-cost flow. We show, via a non-trivial reduction, that if Shortest 𝒮_k-Equivalent String over a binary alphabet has a near-linear-time solution then so does DCP.
- We show that the length of a shortest string output by Shortest 𝒮_k-Equivalent String is in 𝒪(k+n²). We generalize this bound by showing that the total length of z shortest strings is in 𝒪(zk+zn²+z²n). We derive these upper bounds by showing (asymptotically tight) bounds on the total length of z shortest Eulerian walks in general directed graphs.
- We present an algorithm for solving z-Shortest 𝒮_k-Equivalent Strings in 𝒪(nk+n²log²n+zn²log n+|output|) time. If z = 1, the time becomes 𝒪(nk+n²log²n) by the fact that the size of the input is Θ(nk) and the size of the output is 𝒪(k+n²).
Bernardini, Giulia; Chen, Huiping; Loukides, Grigorios; Pissis, Solon P.; Stougie, Leen; Sweering, Michelle
Making de Bruijn Graphs Eulerian Proceedings Article
In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 12:1–12:18, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@inproceedings{Bernardini2022c,
title = {Making de Bruijn Graphs Eulerian},
author = {Bernardini, Giulia and Chen, Huiping and Loukides, Grigorios and Pissis, Solon P. and Stougie, Leen and Sweering, Michelle},
editor = {Bannai, Hideo and Holub, Jan},
doi = {10.4230/LIPIcs.CPM.2022.12},
issn = {1868-8969},
year = {2022},
date = {2022-06-06},
urldate = {2022-06-06},
booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},
volume = {223},
pages = {12:1--12:18},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {A directed multigraph is called Eulerian if it has a circuit which uses each edge exactly once. Euler’s theorem tells us that a weakly connected directed multigraph is Eulerian if and only if every node is balanced. Given a collection S of strings over an alphabet Σ, the de Bruijn graph (dBG) of order k of S is a directed multigraph $G_{S,k}(V,E)$, where V is the set of length-(k-1) substrings of the strings in S, and $G_{S,k}$ contains an edge (u,v) with multiplicity $m_{u,v}$, if and only if the string u[0]⋅ v is equal to the string u⋅ v[k-2] and this string occurs exactly $m_{u,v}$ times in total in strings in S. Let $G_{Σ,k}(V_{Σ,k},E_{Σ,k})$ be the complete dBG of $Σ^k$. The Eulerian Extension (EE) problem on $G_{S,k}$ asks to extend $G_{S,k}$ with a set $cal{B}$ of nodes from $V_{Σ,k}$ and a smallest multiset $cal{A}$ of edges from $E_{Σ,k}$ to make it Eulerian. Note that extending dBGs is algorithmically much more challenging than extending general directed multigraphs because some edges in dBGs are by definition forbidden. Extending dBGs lies at the heart of sequence assembly [Medvedev et al., WABI 2007], one of the most important tasks in bioinformatics. The novelty of our work with respect to existing works is that we allow not only to duplicate existing edges of $G_{S,k}$ but to also add novel edges and nodes, in an effort to (i) connect multiple components and (ii) reduce the total EE cost. It is easy to show that EE on $G_{S,k}$ is NP-hard via a reduction from shortest common superstring. We further show that EE remains NP-hard, even when we are not allowed to add new nodes, via a highly non-trivial reduction from 3-SAT. We thus investigate the following two problems underlying EE in dBGs:
1) When $G_{S,k}$ is not weakly connected, we are asked to connect its d > 1 components using a minimum-weight spanning tree, whose edges are paths on the underlying $G_{Σ,k}$ and weights are the corresponding path lengths. This way of connecting guarantees that no new unbalanced node is added. We show that this problem can be solved in $O(|V|klog d+|E|)$ time, which is nearly optimal, since the size of $G_{S,k}$ is $Θ(|V|k+|E|)$.
2) When $G_{S,k}$ is not balanced, we are asked to extend $G_{S,k}$ to $H_{S,k}(V cup cal{B},E cup cal{A})$ such that every node of $H_{S,k}$ is balanced and the total number $|cal{A}|$ of added edges is minimized. We show that this problem can be solved in the optimal $O(k|V| + |E|+ |cal{A}|)$ time. Let us stress that, although our main contributions are theoretical, the algorithms we design for the above two problems are practical. We combine the two algorithms in one method that makes any dBG Eulerian; and show experimentally that the cost of the obtained feasible solutions on real-world dBGs is substantially smaller than the corresponding cost obtained by existing greedy approaches.},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
1) When $G_{S,k}$ is not weakly connected, we are asked to connect its d > 1 components using a minimum-weight spanning tree, whose edges are paths on the underlying $G_{Σ,k}$ and weights are the corresponding path lengths. This way of connecting guarantees that no new unbalanced node is added. We show that this problem can be solved in $O(|V|klog d+|E|)$ time, which is nearly optimal, since the size of $G_{S,k}$ is $Θ(|V|k+|E|)$.
2) When $G_{S,k}$ is not balanced, we are asked to extend $G_{S,k}$ to $H_{S,k}(V cup cal{B},E cup cal{A})$ such that every node of $H_{S,k}$ is balanced and the total number $|cal{A}|$ of added edges is minimized. We show that this problem can be solved in the optimal $O(k|V| + |E|+ |cal{A}|)$ time. Let us stress that, although our main contributions are theoretical, the algorithms we design for the above two problems are practical. We combine the two algorithms in one method that makes any dBG Eulerian; and show experimentally that the cost of the obtained feasible solutions on real-world dBGs is substantially smaller than the corresponding cost obtained by existing greedy approaches.
Bernardini, Giulia; Gawrychowski, Paweł; Pisanti, Nadia; Pissis, Solon P.; Rosone, Giovanna
Elastic-Degenerate String Matching via Fast Matrix Multiplication Journal Article
In: SIAM Journal on Computing, vol. 51, no. 3, pp. 549–576, 2022.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Bernardini2022,
title = {Elastic-Degenerate String Matching via Fast Matrix Multiplication},
author = {Giulia Bernardini and Paweł Gawrychowski and Nadia Pisanti and Solon P. Pissis and Giovanna Rosone},
doi = {10.1137/20m1368033},
year = {2022},
date = {2022-05-01},
urldate = {2022-05-01},
journal = {SIAM Journal on Computing},
volume = {51},
number = {3},
pages = {549--576},
publisher = {Society for Industrial & Applied Mathematics (SIAM)},
abstract = {An elastic-degenerate (ED) string is a sequence of $n$ sets of strings of total length $N$ which was recently proposed to model a set of similar sequences. The ED string matching (EDSM) problem is to find all occurrences of a pattern of length $m$ in an ED text. The EDSM problem has recently received some attention in the combinatorial pattern matching community, and an $mathcal{O}(nm^{1.5}sqrt{log m} + N)$-time algorithm is known [Aoyama et al., CPM 2018]. The standard assumption in the prior work on this question is that $N$ is substantially larger than both $n$ and $m$, and thus we would like to have a linear dependency on the former. Under this assumption, the natural open problem is whether we can decrease the 1.5 exponent in the time complexity, similarly as in the related (but, to the best of our knowledge, not equivalent) word break problem [Backurs and Indyk, FOCS 2016]. Our starting point is a conditional lower bound for the EDSM problem. We use the popular combinatorial Boolean matrix multiplication (BMM) conjecture stating that there is no truly subcubic combinatorial algorithm for BMM [Abboud and Williams, FOCS 2014]. By designing an appropriate reduction, we show that a combinatorial algorithm solving the EDSM problem in $mathcal{O}(nm^{1.5-epsilon} + N)$ time, for any $epsilon>0$, refutes this conjecture. Our reduction should be understood as an indication that decreasing the exponent requires fast matrix multiplication. String periodicity and fast Fourier transform are two standard tools in string algorithms. Our main technical contribution is that we successfully combine these tools with fast matrix multiplication to design a noncombinatorial $tilde{mathcal{O}}(nm^{omega-1}+N)$-time algorithm for EDSM, where $omega$ denotes the matrix multiplication exponent and the $tilde{mathcal{O}}(cdot)$ notation suppresses polylog factors. To the best of our knowledge, we are the first to combine these tools. In particular, using the fact that $omega<2.373$ [Alman and Williams, SODA 2021; Le Gall, ISSAC 2014; Williams, STOC 2012], we obtain an $mathcal{O}(nm^{1.373} + N)$-time algorithm for EDSM. An important building block in our solution that might find applications in other problems is a method of selecting a small set of length-$ell$ substrings of the pattern, called anchors, so that any occurrence of a string from an ED text set contains at least one but not too many (on average) such anchors inside.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Meyer, Fernando; Fritz, Adrian; Deng, Zhi-Luo; Koslicki, David; Lesker, Till Robin; Gurevich, Alexey; Robertson, Gary; Alser, Mohammed; Antipov, Dmitry; Beghini, Francesco; Bertrand, Denis; Brito, Jaqueline J.; Brown, C. Titus; Buchmann, Jan; Buluç, Aydin; Chen, Bo; Chikhi, Rayan; Clausen, Philip T. L. C.; Cristian, Alexandru; Dabrowski, Piotr Wojciech; Darling, Aaron E.; Egan, Rob; Eskin, Eleazar; Georganas, Evangelos; Goltsman, Eugene; Gray, Melissa A.; Hansen, Lars Hestbjerg; Hofmeyr, Steven; Huang, Pingqin; Irber, Luiz; Jia, Huijue; Jørgensen, Tue Sparholt; Kieser, Silas D.; Klemetsen, Terje; Kola, Axel; Kolmogorov, Mikhail; Korobeynikov, Anton; Kwan, Jason; LaPierre, Nathan; Lemaitre, Claire; Li, Chenhao; Limasset, Antoine; Malcher-Miranda, Fabio; Mangul, Serghei; Marcelino, Vanessa R.; Marchet, Camille; Marijon, Pierre; Meleshko, Dmitry; Mende, Daniel R.; Milanese, Alessio; Nagarajan, Niranjan; Nissen, Jakob; Nurk, Sergey; Oliker, Leonid; Paoli, Lucas; Peterlongo, Pierre; Piro, Vitor C.; Porter, Jacob S.; Rasmussen, Simon; Rees, Evan R.; Reinert, Knut; Renard, Bernhard; Robertsen, Espen Mikal; Rosen, Gail L.; Ruscheweyh, Hans-Joachim; Sarwal, Varuni; Segata, Nicola; Seiler, Enrico; Shi, Lizhen; Sun, Fengzhu; Sunagawa, Shinichi; Sørensen, Søren Johannes; Thomas, Ashleigh; Tong, Chengxuan; Trajkovski, Mirko; Tremblay, Julien; Uritskiy, Gherman; Vicedomini, Riccardo; Wang, Zhengyang; Wang, Ziye; Wang, Zhong; Warren, Andrew; Willassen, Nils Peder; Yelick, Katherine; You, Ronghui; Zeller, Georg; Zhao, Zhengqiao; Zhu, Shanfeng; Zhu, Jie; Garrido-Oter, Ruben; Gastmeier, Petra; Hacquard, Stephane; Häußler, Susanne; Khaledi, Ariane; Maechler, Friederike; Mesny, Fantin; Radutoiu, Simona; Schulze-Lefert, Paul; Smit, Nathiana; Strowig, Till; Bremges, Andreas; Sczyrba, Alexander; McHardy, Alice Carolyn
Critical Assessment of Metagenome Interpretation: the second round of challenges Journal Article
In: Nature Methods, vol. 19, no. 4, pp. 429–440, 2022, ISSN: 1548-7105.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Meyer2022,
title = {Critical Assessment of Metagenome Interpretation: the second round of challenges},
author = {Fernando Meyer and Adrian Fritz and Zhi-Luo Deng and David Koslicki and Till Robin Lesker and Alexey Gurevich and Gary Robertson and Mohammed Alser and Dmitry Antipov and Francesco Beghini and Denis Bertrand and Jaqueline J. Brito and C. Titus Brown and Jan Buchmann and Aydin Buluç and Bo Chen and Rayan Chikhi and Philip T. L. C. Clausen and Alexandru Cristian and Piotr Wojciech Dabrowski and Aaron E. Darling and Rob Egan and Eleazar Eskin and Evangelos Georganas and Eugene Goltsman and Melissa A. Gray and Lars Hestbjerg Hansen and Steven Hofmeyr and Pingqin Huang and Luiz Irber and Huijue Jia and Tue Sparholt Jørgensen and Silas D. Kieser and Terje Klemetsen and Axel Kola and Mikhail Kolmogorov and Anton Korobeynikov and Jason Kwan and Nathan LaPierre and Claire Lemaitre and Chenhao Li and Antoine Limasset and Fabio Malcher-Miranda and Serghei Mangul and Vanessa R. Marcelino and Camille Marchet and Pierre Marijon and Dmitry Meleshko and Daniel R. Mende and Alessio Milanese and Niranjan Nagarajan and Jakob Nissen and Sergey Nurk and Leonid Oliker and Lucas Paoli and Pierre Peterlongo and Vitor C. Piro and Jacob S. Porter and Simon Rasmussen and Evan R. Rees and Knut Reinert and Bernhard Renard and Espen Mikal Robertsen and Gail L. Rosen and Hans-Joachim Ruscheweyh and Varuni Sarwal and Nicola Segata and Enrico Seiler and Lizhen Shi and Fengzhu Sun and Shinichi Sunagawa and Søren Johannes Sørensen and Ashleigh Thomas and Chengxuan Tong and Mirko Trajkovski and Julien Tremblay and Gherman Uritskiy and Riccardo Vicedomini and Zhengyang Wang and Ziye Wang and Zhong Wang and Andrew Warren and Nils Peder Willassen and Katherine Yelick and Ronghui You and Georg Zeller and Zhengqiao Zhao and Shanfeng Zhu and Jie Zhu and Ruben Garrido-Oter and Petra Gastmeier and Stephane Hacquard and Susanne Häußler and Ariane Khaledi and Friederike Maechler and Fantin Mesny and Simona Radutoiu and Paul Schulze-Lefert and Nathiana Smit and Till Strowig and Andreas Bremges and Alexander Sczyrba and Alice Carolyn McHardy},
doi = {10.1038/s41592-022-01431-4},
issn = {1548-7105},
year = {2022},
date = {2022-04-01},
urldate = {2022-04-01},
journal = {Nature Methods},
volume = {19},
number = {4},
pages = {429–440},
publisher = {Springer Science and Business Media LLC},
abstract = {Evaluating metagenomic software is key for optimizing metagenome interpretation and focus of the Initiative for the Critical Assessment of Metagenome Interpretation (CAMI). The CAMI II challenge engaged the community to assess methods on realistic and complex datasets with long- and short-read sequences, created computationally from around 1,700 new and known genomes, as well as 600 new plasmids and viruses. Here we analyze 5,002 results by 76 program versions. Substantial improvements were seen in assembly, some due to long-read data. Related strains still were challenging for assembly and genome recovery through binning, as was assembly quality for the latter. Profilers markedly matured, with taxon profilers and binners excelling at higher bacterial ranks, but underperforming for viruses and Archaea. Clinical pathogen detection results revealed a need to improve reproducibility. Runtime and memory usage analyses identified efficient programs, including top performers with other metrics. The results identify challenges and guide researchers in selecting methods for analyses.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Sarkar, Aditya; Giros, Augustin; Mockly, Louis; Moore, Jaden; Moore, Andrew; Nagareddy, Anish; Fu, Boyang; Fiscutean, Andrada; Chhugani, Karishma; Darci-Maher, Nicholas; Patel, Yesha M.; Sarwal, Varuni; Chang, Yutong; Ginjala, Srishti; Garmire, Lana X.; Bao, Riyue; Sankararaman, Sriram; Chikhi, Rayan; Mangul, Serghei
Unlocking the microblogging potential for science and medicine Unpublished
bioRxiv, 2022.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@unpublished{Sarkar2022,
title = {Unlocking the microblogging potential for science and medicine},
author = {Aditya Sarkar and Augustin Giros and Louis Mockly and Jaden Moore and Andrew Moore and Anish Nagareddy and Boyang Fu and Andrada Fiscutean and Karishma Chhugani and Nicholas Darci-Maher and Yesha M. Patel and Varuni Sarwal and Yutong Chang and Srishti Ginjala and Lana X. Garmire and Riyue Bao and Sriram Sankararaman and Rayan Chikhi and Serghei Mangul},
doi = {10.1101/2022.04.22.488804},
year = {2022},
date = {2022-04-01},
urldate = {2022-04-01},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Microblogging platform Twitter allows researchers to showcase their work, receive constructive feedback, find jobs, and build scientific collaborations. While existing literature has analyzed the benefits of Twitter in the development and distribution of scientific knowledge, most of the studies only took into account a limited number of researchers, which affected the generalizability of derived results. Our study analyzed the activity of 6,000 biomedical scientists on Twitter using data-driven approaches, a third of whom were female. Furthermore, we estimated that up to a quarter of the members of the scientific community are engaged on Twitter. While the number of male scientists joining the microblogging platform every year has decreased, the number of female scientists has remained roughly the same. Scientists are very selective in who they are following as compared to the general public. We also found that the type of tweets and retweets one posts may affect the number of followers, specifically, that a moderate to high level of professionalism and a high level of positivity is correlated with followers count. Moreover, female scientists send fewer negative tweets as compared to male scientists (31.1% for females and 34.7% for males). Our analysis could provide insights and launch a conversation on the advantages and limitations of using Twitter for disseminating scientific information and engaging in constructive discussion and collaborations within the scientific community.},
howpublished = {bioRxiv},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {unpublished}
}
Baaijens, Jasmijn A.; Bonizzoni, Paola; Boucher, Christina; Vedova, Gianluca Della; Pirola, Yuri; Rizzi, Raffaella; Sirén, Jouni
Computational graph pangenomics: a tutorial on data structures and their applications Journal Article
In: Natural Computing, vol. 21, no. 1, pp. 81–108, 2022, ISBN: 1572-9796.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Baaijens2022,
title = {Computational graph pangenomics: a tutorial on data structures and their applications},
author = {Jasmijn A. Baaijens and Paola Bonizzoni and Christina Boucher and Gianluca Della Vedova and Yuri Pirola and Raffaella Rizzi and Jouni Sirén},
doi = {10.1007/s11047-022-09882-6},
isbn = {1572-9796},
year = {2022},
date = {2022-03-01},
urldate = {2022-03-01},
journal = {Natural Computing},
volume = {21},
number = {1},
pages = {81--108},
abstract = {Computational pangenomics is an emerging research field that is changing the way computer scientists are facing challenges in biological sequence analysis. In past decades, contributions from combinatorics, stringology, graph theory and data structures were essential in the development of a plethora of software tools for the analysis of the human genome. These tools allowed computational biologists to approach ambitious projects at population scale, such as the 1000 Genomes Project. A major contribution of the 1000 Genomes Project is the characterization of a broad spectrum of genetic variations in the human genome, including the discovery of novel variations in the South Asian, African and European populations---thus enhancing the catalogue of variability within the reference genome. Currently, the need to take into account the high variability in population genomes as well as the specificity of an individual genome in a personalized approach to medicine is rapidly pushing the abandonment of the traditional paradigm of using a single reference genome. A graph-based representation of multiple genomes, or a graph pangenome, is replacing the linear reference genome. This means completely rethinking well-established procedures to analyze, store, and access information from genome representations. Properly addressing these challenges is crucial to face the computational tasks of ambitious healthcare projects aiming to characterize human diversity by sequencing 1M individuals (Stark et al. 2019). This tutorial aims to introduce readers to the most recent advances in the theory of data structures for the representation of graph pangenomes. We discuss efficient representations of haplotypes and the variability of genotypes in graph pangenomes, and highlight applications in solving computational problems in human and microbial (viral) pangenomes.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Luo, Xiao; Kang, Xiongbin; Schönhuth, Alexander
Strainline: full-length de novo viral haplotype reconstruction from noisy long reads Journal Article
In: Genome Biology, vol. 23, iss. 1, no. 29, pp. 1–27, 2022, ISSN: 1474-760X.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP3: Translational CPG
@article{Luo2022,
title = {Strainline: full-length de novo viral haplotype reconstruction from noisy long reads},
author = {Xiao Luo and Xiongbin Kang and Alexander Schönhuth},
doi = {10.1186/s13059-021-02587-6},
issn = {1474-760X},
year = {2022},
date = {2022-01-20},
urldate = {2022-01-20},
journal = {Genome Biology},
volume = {23},
number = {29},
issue = {1},
pages = {1--27},
publisher = {Springer Science and Business Media LLC},
abstract = {Haplotype-resolved de novo assembly of highly diverse virus genomes is critical in prevention, control and treatment of viral diseases. Current methods either can handle only relatively accurate short read data, or collapse haplotype-specific variations into consensus sequence. Here, we present Strainline, a novel approach to assemble viral haplotypes from noisy long reads without a reference genome. Strainline is the first approach to provide strain-resolved, full-length de novo assemblies of viral quasispecies from noisy third-generation sequencing data. Benchmarking on simulated and real datasets of varying complexity and diversity confirm this novelty and demonstrate the superiority of Strainline.},
keywords = {WP1: Primary CPG, WP3: Translational CPG},
pubstate = {published},
tppubtype = {article}
}
Rizzo, Nicola; Mäkinen, Veli
Linear Time Construction of Indexable Elastic Founder Graphs Proceedings Article
In: Bazgan, Cristina; Fernau, Henning (Ed.): Combinatorial Algorithms, pp. 480–493, Springer International Publishing, Cham, 2022, ISBN: 978-3-031-06678-8.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Rizzo2022,
title = {Linear Time Construction of Indexable Elastic Founder Graphs},
author = {Nicola Rizzo and Veli Mäkinen},
editor = {Cristina Bazgan and Henning Fernau},
url = {https://arxiv.org/abs/2201.06492},
doi = {10.1007/978-3-031-06678-8_35},
isbn = {978-3-031-06678-8},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {Combinatorial Algorithms},
pages = {480--493},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {Pattern matching on graphs has been widely studied lately due to its importance in genomics applications. Unfortunately, even the simplest problem of deciding if a string appears as a subpath of a graph admits a quadratic lower bound under the Orthogonal Vectors Hypothesis (Equi et al. ICALP 2019, SOFSEM 2021). To avoid this bottleneck, the research has shifted towards more specific graph classes, e.g. those induced from multiple sequence alignments (MSAs). Consider segmenting $mathsf{MSA}[1..m,1..n]$ into $b$ blocks $mathsf{MSA}[1..m,1..j_1]$, $mathsf{MSA}[1..m,j_1+1..j_2]$, $ldots$, $mathsf{MSA}[1..m,j_b-1+1..n]$. The distinct strings in the rows of the blocks, after the removal of gap symbols, form the nodes of an elastic founder graph (EFG) where the edges represent the original connections observed in the MSA. An EFG is called indexable if a node label occurs as a prefix of only those paths that start from a node of the same block. Equi et al. (ISAAC 2021) showed that such EFGs support fast pattern matching and gave an $O(mn log m)$-time algorithm for preprocessing the MSA in a way that allows the construction of indexable EFGs maximizing the number of blocks and, alternatively, minimizing the maximum length of a block, in $O(n)$ and $O(n loglog n)$ time respectively. Using the suffix tree and solving a novel ancestor problem on trees, we improve the preprocessing to $O(mn)$ time and the $O(n loglog n)$-time EFG construction to $O(n)$ time, thus showing that both types of indexable EFGs can be constructed in time linear in the input size.},
howpublished = {arXiv},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Rizzo, Nicola; Mäkinen, Veli
Indexable Elastic Founder Graphs of Minimum Height Proceedings Article
In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 19:1–19:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Rizzo2022b,
title = {Indexable Elastic Founder Graphs of Minimum Height},
author = {Nicola Rizzo and Veli Mäkinen},
editor = {Hideo Bannai and Jan Holub},
doi = {10.4230/LIPIcs.CPM.2022.19},
issn = {1868-8969},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},
volume = {223},
pages = {19:1--19:19},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {Indexable elastic founder graphs have been recently proposed as a data structure for genomics applications supporting fast pattern matching queries. Consider segmenting a multiple sequence alignment MSA[1..m,1..n] into b blocks MSA[1..m,1..j₁], MSA[1..m,j₁+1..j₂], …, MSA[1..m,j_{b-1}+1..n]. The resulting elastic founder graph (EFG) is obtained by merging in each block the strings that are equivalent after the removal of gap symbols, taking the strings as the nodes of the block and the original MSA connections as edges. We call an elastic founder graph indexable if a node label occurs as a prefix of only those paths that start from a node of the same block. Equi et al. (ISAAC 2021) showed that such EFGs support fast pattern matching and studied their construction maximizing the number of blocks and minimizing the maximum length of a block, but left open the case of minimizing the maximum number of distinct strings in a block that we call graph height. For the simplified gapless setting, we give an O(mn) time algorithm to find a segmentation of an MSA minimizing the height of the resulting indexable founder graph, by combining previous results in segmentation algorithms and founder graphs. For the general setting, the known techniques yield a linear-time parameterized solution on constant alphabet Σ, taking time O(m n² log|Σ|) in the worst case, so we study the refined measure of prefix-aware height, that omits counting strings that are prefixes of another considered string. The indexable EFG minimizing the maximum prefix-aware height provides a lower bound for the original height: by exploiting exploiting suffix trees built from the MSA rows and the data structure answering weighted ancestor queries in constant time of Belazzougui et al. (CPM 2021), we give an O(mn)-time algorithm for the optimal EFG under this alternative height. },
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Badkobeh, Golnaz; Charalampopoulos, Panagiotis; Kosolobov, Dmitry; Pissis, Solon P.
Internal shortest absent word queries in constant time and linear space Journal Article
In: Theoretical Computer Science, vol. 922, pp. 271-282, 2022, ISSN: 0304-3975.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@article{Badkobeh2022,
title = {Internal shortest absent word queries in constant time and linear space},
author = {Golnaz Badkobeh and Panagiotis Charalampopoulos and Dmitry Kosolobov and Solon P. Pissis},
url = {https://arxiv.org/abs/2106.01763},
doi = {https://doi.org/10.1016/j.tcs.2022.04.029},
issn = {0304-3975},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {Theoretical Computer Science},
volume = {922},
pages = {271-282},
abstract = {Given a string T of length n over an alphabet Σ⊂1,2,…,nO(1) of size σ, we are to preprocess T so that given a range [i,j], we can return a representation of a shortest string over Σ that is absent in the fragment T[i]⋯T[j] of T. We present an O(n)-space data structure that answers such queries in constant time and can be constructed in O(nlogσn) time.},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}