Goga, Adrián; Depuydt, Lore; Brown, Nathaniel K.; Fostier, Jan; Gagie, Travis; Navarro, Gonzalo
Faster Maximal Exact Matches with Lazy LCP Evaluation Proceedings Article
In: 2024 Data Compression Conference (DCC), pp. 123–132, IEEE, 2024.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Goga2024,
title = {Faster Maximal Exact Matches with Lazy LCP Evaluation},
author = {Adrián Goga and Lore Depuydt and Nathaniel K. Brown and Jan Fostier and Travis Gagie and Gonzalo Navarro},
doi = {10.1109/dcc58796.2024.00020},
year = {2024},
date = {2024-03-01},
booktitle = {2024 Data Compression Conference (DCC)},
pages = {123–132},
publisher = {IEEE},
abstract = {MONI (Rossi et al., JCB 2022) is a BWT-based compressed index for computing the matching statistics and maximal exact matches (MEMs) of a pattern (usually a DNA read) with respect to a highly repetitive text (usually a database of genomes) using two operations: LF-steps and longest common extension (LCE) queries on a grammar-compressed representation of the text. In practice, most of the operations are constant-time LF-steps but most of the time is spent evaluating LCE queries. In this paper we show how (a variant of) the latter can be evaluated lazily, so as to bound the total time MONI needs to process the pattern in terms of the number of MEMs between the pattern and the text, while maintaining logarithmic latency.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Lemane, Téo; Lezzoche, Nolan; Lecubin, Julien; Pelletier, Eric; Lescot, Magali; Chikhi, Rayan; Peterlongo, Pierre
Indexing and real-time user-friendly queries in terabyte-sized complex genomic datasets with kmindex and ORA Journal Article
In: Nature Computational Science, vol. 4, no. 2, pp. 104–109, 2024, ISSN: 2662-8457.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG, WP3: Translational CPG
@article{Lemane2024,
title = {Indexing and real-time user-friendly queries in terabyte-sized complex genomic datasets with kmindex and ORA},
author = {Téo Lemane and Nolan Lezzoche and Julien Lecubin and Eric Pelletier and Magali Lescot and Rayan Chikhi and Pierre Peterlongo},
doi = {10.1038/s43588-024-00596-6},
issn = {2662-8457},
year = {2024},
date = {2024-02-01},
journal = {Nature Computational Science},
volume = {4},
number = {2},
pages = {104–109},
publisher = {Springer Science and Business Media LLC},
abstract = {Public sequencing databases contain vast amounts of biological information, yet they are largely underutilized as it is challenging to efficiently search them for any sequence(s) of interest. We present kmindex, an approach that can index thousands of metagenomes and perform sequence searches in a fraction of a second. The index construction is an order of magnitude faster than previous methods, while search times are two orders of magnitude faster. With negligible false positive rates below 0.01%, kmindex outperforms the precision of existing approaches by four orders of magnitude. Here we demonstrate the scalability of kmindex by successfully indexing 1,393 marine seawater metagenome samples from the Tara Oceans project. Additionally, we introduce the publicly accessible web server Ocean Read Atlas, which enables real-time queries on the Tara Oceans dataset.},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG, WP3: Translational CPG},
pubstate = {published},
tppubtype = {article}
}
Rizzo, Nicola; Equi, Massimo; Norri, Tuukka; Mäkinen, Veli
Elastic founder graphs improved and enhanced Journal Article
In: Theoretical Computer Science, vol. 982, no. 114269, pp. 114269, 2024, ISSN: 0304-3975.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Rizzo2024-jy,
title = {Elastic founder graphs improved and enhanced},
author = {Nicola Rizzo and Massimo Equi and Tuukka Norri and Veli Mäkinen},
doi = {10.1016/j.tcs.2023.114269},
issn = {0304-3975},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
journal = {Theoretical Computer Science},
volume = {982},
number = {114269},
pages = {114269},
publisher = {Elsevier BV},
abstract = {Indexing labeled graphs for pattern matching is a central challenge of pangenomics. Equi et al. (2022) [14] developed the Elastic Founder Graph (EFG) representing an alignment of m sequences of length n, drawn from alphabet Σ plus the special gap character: the paths spell the original sequences or their recombination. By enforcing the semi-repeat-free property, the EFG admits a polynomial-space index for linear-time pattern matching, breaking through the conditional lower bounds on indexing labeled graphs (Equi et al. [13]). In this work, we improve the space of the EFG index answering pattern matching queries in linear time, from linear in the length of all strings spelled by three consecutive node labels, to linear in the size of the edge labels. Then, we develop linear-time construction algorithms optimizing for different metrics: we improve the existing linearithmic construction algorithms to O(mn), by solving the novel exclusive ancestor set problem on trees; we propose, for the simplified gapless setting, an O(mn)-time solution minimizing the maximum block height, that we generalize by substituting block height with prefix-aware height. Finally, to show the versatility of the framework, we develop a BWT-based EFG index and study how to encode and perform document listing queries on a set of paths of the graphs, reporting which paths present a given pattern as a substring. We propose the EFG framework as an improved and enhanced version of the framework for the gapless setting, along with construction methods that are valid in any setting concerned with the segmentation of aligned sequences.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Benoit, Gaëtan; Raguideau, Sébastien; James, Robert; Phillippy, Adam M.; Chikhi, Rayan; Quince, Christopher
High-quality metagenome assembly from long accurate reads with metaMDBG Journal Article
In: Nature Biotechnology, 2024, ISSN: 1546-1696.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Benoit2024,
title = {High-quality metagenome assembly from long accurate reads with metaMDBG},
author = {Gaëtan Benoit and Sébastien Raguideau and Robert James and Adam M. Phillippy and Rayan Chikhi and Christopher Quince},
doi = {10.1038/s41587-023-01983-6},
issn = {1546-1696},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
journal = {Nature Biotechnology},
publisher = {Springer Science and Business Media LLC},
abstract = {We introduce metaMDBG, a metagenomics assembler for PacBio HiFi reads. MetaMDBG combines a de Bruijn graph assembly in a minimizer space with an iterative assembly over sequences of minimizers to address variations in genome coverage depth and an abundance-based filtering strategy to simplify strain complexity. For complex communities, we obtained up to twice as many high-quality circularized prokaryotic metagenome-assembled genomes as existing methods and had better recovery of viruses and plasmids.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Cartes, Jorge Avila; Bonizzoni, Paola; Ciccolella, Simone; Vedova, Gianluca Della; Denti, Luca; Didelot, Xavier; Monti, Davide Cesare; Pirola, Yuri
RecGraph: recombination-aware alignment of sequences to variation graphs Journal Article
In: Bioinformatics, vol. 40, no. 5, pp. btae292, 2024, ISSN: 1367-4811.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{Avila2024,
title = {RecGraph: recombination-aware alignment of sequences to variation graphs},
author = {Jorge Avila Cartes and Paola Bonizzoni and Simone Ciccolella and Gianluca Della Vedova and Luca Denti and Xavier Didelot and Davide Cesare Monti and Yuri Pirola},
url = {https://github.com/AlgoLab/RecGraph},
doi = {10.1093/bioinformatics/btae292},
issn = {1367-4811},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
journal = {Bioinformatics},
volume = {40},
number = {5},
pages = {btae292},
abstract = {Bacterial genomes present more variability than human genomes, which requires important adjustments in computational tools that are developed for human data. In particular, bacteria exhibit a mosaic structure due to homologous recombinations, but this fact is not sufficiently captured by standard read mappers that align against linear reference genomes. The recent introduction of pangenomics provides some insights in that context, as a pangenome graph can represent the variability within a species. However, the concept of sequence-to-graph alignment that captures the presence of recombinations has not been previously investigated.In this paper, we present the extension of the notion of sequence-to-graph alignment to a variation graph that incorporates a recombination, so that the latter are explicitly represented and evaluated in an alignment. Moreover, we present a dynamic programming approach for the special case where there is at most a recombination—we implement this case as RecGraph. From a modelling point of view, a recombination corresponds to identifying a new path of the variation graph, where the new arc is composed of two halves, each extracted from an original path, possibly joined by a new arc. Our experiments show that RecGraph accurately aligns simulated recombinant bacterial sequences that have at most a recombination, providing evidence for the presence of recombination events.
Our implementation is open source and available at https://github.com/AlgoLab/RecGraph.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Our implementation is open source and available at https://github.com/AlgoLab/RecGraph.
Cillari, Nico; Neri, Giuseppe; Pisanti, Nadia; Milazzo, Paolo; Borello, Ugo
RettDb: the Rett syndrome omics database to navigate the Rett syndrome genomic landscape Journal Article
In: Database, vol. 2024, 2024, ISSN: 1758-0463.
Abstract | Links | BibTeX | Tags: WP3: Translational CPG
@article{Cillari2024,
title = {RettDb: the Rett syndrome omics database to navigate the Rett syndrome genomic landscape},
author = {Nico Cillari and Giuseppe Neri and Nadia Pisanti and Paolo Milazzo and Ugo Borello},
url = {https://biomedinfo.di.unipi.it/rett-database/},
doi = {10.1093/database/baae109},
issn = {1758-0463},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
journal = {Database},
volume = {2024},
publisher = {Oxford University Press (OUP)},
abstract = {Rett syndrome (RTT) is a neurodevelopmental disorder occurring almost exclusively in females and leading to a variety of impairments and disabilities from mild to severe. In >95% cases, RTT is due to mutations in the X-linked gene MECP2, but the molecular mechanisms determining RTT are unknown at present, and the complexity of the system is challenging. To facilitate and provide guidance to the unraveling of those mechanisms, we developed a database resource for the visualization and analysis of the genomic landscape in the context of wild-type or mutated Mecp2 gene in the mouse model. Our resource allows for the exploration of differential dynamics of gene expression and the prediction of new potential MECP2 target genes to decipher the RTT disorder molecular mechanisms.
Database URL: https://biomedinfo.di.unipi.it/rett-database/},
keywords = {WP3: Translational CPG},
pubstate = {published},
tppubtype = {article}
}
Database URL: https://biomedinfo.di.unipi.it/rett-database/
Bernardini, Giulia; Chen, Huiping; Conte, Alessio; Grossi, Roberto; Guerrini, Veronica; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon P.
Utility-Oriented String Mining Book Chapter
In: Proceedings of the 2024 SIAM International Conference on Data Mining (SDM), pp. 190–198, Society for Industrial and Applied Mathematics, 2024, ISBN: 9781611978032.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inbook{Bernardini2024,
title = {Utility-Oriented String Mining},
author = {Giulia Bernardini and Huiping Chen and Alessio Conte and Roberto Grossi and Veronica Guerrini and Grigorios Loukides and Nadia Pisanti and Solon P. Pissis},
doi = {10.1137/1.9781611978032.22},
isbn = {9781611978032},
year = {2024},
date = {2024-01-01},
booktitle = {Proceedings of the 2024 SIAM International Conference on Data Mining (SDM)},
pages = {190–198},
publisher = {Society for Industrial and Applied Mathematics},
abstract = {A string is often provided with numerical scores (utilities) which quantify the importance, interest, profit, or risk of the letters occurring at every position of the string. For example, every DNA fragment produced by modern sequencing machines comes with a confidence score per position. Motivated by the abundance of strings with utilities, we introduce Utility-oriented String Mining (USM), a natural generalization of the classic frequent substring mining problem. Given a string S of length n and a threshold 𝒱, USM asks for every string R whose utility U (R) is at least 𝒱, where U is a function that maps R to a utility score based on the utilities of all letters of every occurrence of R in S. In addition, our work makes the following contributions: (1) We identify a class 𝕌 of utility functions for which USM admits an 𝒪 (n2)-time algorithm. (2) We prove that no listing algorithm solves the USM problem in subquadratic time for every utility function, or even for every function in 𝕌. (3) We propose an 𝒪 (n log n)-time algorithm that solves USM for a class of monotone functions from 𝕌. (4) We design another 𝒪 (n log n)-time algorithm for the same problem that is comparable in runtime but offers drastic space savings in practice when, in addition, a lower bound on the length of the output strings is provided as input. (5) We demonstrate experimentally using publicly available, billion-letter datasets that our algorithms are many times more efficient, in terms of runtime and/or space, compared to an Apriori-like baseline which employs advanced string processing tools.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inbook}
}
Ascone, Rocco; Bernardini, Giulia; Conte, Alessio; Equi, Massimo; Gabory, Esteban; Grossi, Roberto; Pisanti, Nadia
A Unifying Taxonomy of Pattern Matching in Degenerate Strings and Founder Graphs Proceedings Article
In: Pissis, Solon P.; Sung, Wing-Kin (Ed.): pp. 14:1–14:21, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2024, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Ascone2024,
title = {A Unifying Taxonomy of Pattern Matching in Degenerate Strings and Founder Graphs},
author = {Rocco Ascone and Giulia Bernardini and Alessio Conte and Massimo Equi and Esteban Gabory and Roberto Grossi and Nadia Pisanti},
editor = {Solon P. Pissis and Wing-Kin Sung},
doi = {10.4230/LIPICS.WABI.2024.14},
issn = {1868-8969},
year = {2024},
date = {2024-01-01},
volume = {312},
pages = {14:1–14:21},
publisher = {Schloss Dagstuhl – Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
abstract = {Elastic Degenerate (ED) strings and Elastic Founder (EF) graphs are two versions of acyclic components of pangenomes. Both ED strings and EF graphs (which we collectively name variable strings) extend the well-known notion of indeterminate string. Recent work has extensively investigated algorithmic tasks over these structures, and over several other variable strings notions that they generalise. Among such tasks, the basic operation of matching a pattern into a text, which can serve as a toolkit for many pangenomic data analyses using these data structures, deserves special attention. In this paper we: (1) highlight a clear taxonomy within both ED strings and EF graphs ranging through variable strings of all types, from the linear string up to the most general one; (2) investigate the problem PvarT(X,Y) of matching a solid or variable pattern of type X into a variable text of type Y; (3) using as a reference the quadratic conditional lower bounds that are known for PvarT(solid,ED) and PvarT(solid,EF), for all possible types of variable strings X and Y we either prove the quadratic conditional lower bound for PvarT(X,Y), or provide non-trivial, often sub-quadratic, upper bounds, also exploiting the above-mentioned taxonomy.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Bille, Philip; Nekrich, Yakov; Pissis, Solon P.
Size-Constrained Weighted Ancestors with Applications Proceedings Article
In: 19th Scandinavian Symposium and Workshops on Algorithm Theory (SWAT 2024), pp. 14:1–14:12, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2024, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Bille2024,
title = {Size-Constrained Weighted Ancestors with Applications},
author = {Philip Bille and Yakov Nekrich and Solon P. Pissis},
doi = {10.4230/LIPICS.SWAT.2024.14},
issn = {1868-8969},
year = {2024},
date = {2024-01-01},
booktitle = {19th Scandinavian Symposium and Workshops on Algorithm Theory (SWAT 2024)},
volume = {294},
pages = {14:1–14:12},
publisher = {Schloss Dagstuhl – Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {The weighted ancestor problem on a rooted node-weighted tree T is a generalization of the classic predecessor problem: construct a data structure for a set of integers that supports fast predecessor queries. Both problems are known to require Ω(log log n) time for queries provided 𝒪(n poly log n) space is available, where n is the input size. The weighted ancestor problem has attracted a lot of attention by the combinatorial pattern matching community due to its direct application to suffix trees. In this formulation of the problem, the nodes are weighted by string depth. This research has culminated in a data structure for weighted ancestors in suffix trees with 𝒪(1) query time and an 𝒪(n)-time construction algorithm [Belazzougui et al., CPM 2021].
In this paper, we consider a different version of the weighted ancestor problem, where the nodes are weighted by any function weight that maps each node of T to a positive integer, such that weight(u) ≤ size(u) for any node u and weight(u₁) ≤ weight(u₂) if node u₁ is a descendant of node u₂, where size(u) is the number of nodes in the subtree rooted at u. In the size-constrained weighted ancestor (SWA) problem, for any node u of T and any integer k, we are asked to return the lowest ancestor w of u with weight at least k. We show that for any rooted tree with n nodes, we can locate node w in 𝒪(1) time after 𝒪(n)-time preprocessing. In particular, this implies a data structure for the SWA problem in suffix trees with 𝒪(1) query time and 𝒪(n)-time preprocessing, when the nodes are weighted by weight. We also show several string-processing applications of this result.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
In this paper, we consider a different version of the weighted ancestor problem, where the nodes are weighted by any function weight that maps each node of T to a positive integer, such that weight(u) ≤ size(u) for any node u and weight(u₁) ≤ weight(u₂) if node u₁ is a descendant of node u₂, where size(u) is the number of nodes in the subtree rooted at u. In the size-constrained weighted ancestor (SWA) problem, for any node u of T and any integer k, we are asked to return the lowest ancestor w of u with weight at least k. We show that for any rooted tree with n nodes, we can locate node w in 𝒪(1) time after 𝒪(n)-time preprocessing. In particular, this implies a data structure for the SWA problem in suffix trees with 𝒪(1) query time and 𝒪(n)-time preprocessing, when the nodes are weighted by weight. We also show several string-processing applications of this result.
Charalampopoulos, Panagiotis; Pissis, Solon P.; Radoszewski, Jakub; Rytter, Wojciech; Waleń, Tomasz; Zuba, Wiktor
Approximate Circular Pattern Matching Under Edit Distance Proceedings Article
In: Beyersdorff, Olaf; Kanté, Mamadou Moustapha; Kupferman, Orna; Lokshtanov, Daniel (Ed.): 41st International Symposium on Theoretical Aspects of Computer Science (STACS 2024), pp. 24:1–24:22, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2024, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Charalampopoulos2024a,
title = {Approximate Circular Pattern Matching Under Edit Distance},
author = {Panagiotis Charalampopoulos and Solon P. Pissis and Jakub Radoszewski and Wojciech Rytter and Tomasz Waleń and Wiktor Zuba},
editor = {Olaf Beyersdorff and Mamadou Moustapha Kanté and Orna Kupferman and Daniel Lokshtanov},
doi = {10.4230/LIPICS.STACS.2024.24},
issn = {1868-8969},
year = {2024},
date = {2024-01-01},
booktitle = {41st International Symposium on Theoretical Aspects of Computer Science (STACS 2024)},
volume = {289},
pages = {24:1–24:22},
publisher = {Schloss Dagstuhl – Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {In the k-Edit Circular Pattern Matching (k-Edit CPM) problem, we are given a length-n text T, a length-m pattern P, and a positive integer threshold k, and we are to report all starting positions of the substrings of T that are at edit distance at most k from some cyclic rotation of P. In the decision version of the problem, we are to check if any such substring exists. Very recently, Charalampopoulos et al. [ESA 2022] presented 𝒪(nk²)-time and 𝒪(nk log³ k)-time solutions for the reporting and decision versions of k-Edit CPM, respectively. Here, we show that the reporting and decision versions of k-Edit CPM can be solved in 𝒪(n+(n/m) k⁶) time and 𝒪(n+(n/m) k⁵ log³ k) time, respectively, thus obtaining the first algorithms with a complexity of the type 𝒪(n+(n/m) poly(k)) for this problem. Notably, our algorithms run in 𝒪(n) time when m = Ω(k⁶) and are superior to the previous respective solutions when m = ω(k⁴). We provide a meta-algorithm that yields efficient algorithms in several other interesting settings, such as when the strings are given in a compressed form (as straight-line programs), when the strings are dynamic, or when we have a quantum computer.
We obtain our solutions by exploiting the structure of approximate circular occurrences of P in T, when T is relatively short w.r.t. P. Roughly speaking, either the starting positions of approximate occurrences of rotations of P form 𝒪(k⁴) intervals that can be computed efficiently, or some rotation of P is almost periodic (is at a small edit distance from a string with small period). Dealing with the almost periodic case is the most technically demanding part of this work; we tackle it using properties of locked fragments (originating from [Cole and Hariharan, SICOMP 2002]).},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
We obtain our solutions by exploiting the structure of approximate circular occurrences of P in T, when T is relatively short w.r.t. P. Roughly speaking, either the starting positions of approximate occurrences of rotations of P form 𝒪(k⁴) intervals that can be computed efficiently, or some rotation of P is almost periodic (is at a small edit distance from a string with small period). Dealing with the almost periodic case is the most technically demanding part of this work; we tackle it using properties of locked fragments (originating from [Cole and Hariharan, SICOMP 2002]).
Bille, Philip; Gørtz, Inge Li; Lewenstein, Moshe; Pissis, Solon P.; Rotenberg, Eva; Steiner, Teresa Anna
Gapped String Indexing in Subquadratic Space and Sublinear Query Time Proceedings Article
In: Beyersdorff, Olaf; Kanté, Mamadou Moustapha; Kupferman, Orna; Lokshtanov, Daniel (Ed.): 41st International Symposium on Theoretical Aspects of Computer Science (STACS 2024), pp. 16:1–16:21, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2024, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Bille2024a,
title = {Gapped String Indexing in Subquadratic Space and Sublinear Query Time},
author = {Philip Bille and Inge Li Gørtz and Moshe Lewenstein and Solon P. Pissis and Eva Rotenberg and Teresa Anna Steiner},
editor = {Olaf Beyersdorff and Mamadou Moustapha Kanté and Orna Kupferman and Daniel Lokshtanov},
doi = {10.4230/LIPICS.STACS.2024.16},
issn = {1868-8969},
year = {2024},
date = {2024-01-01},
booktitle = {41st International Symposium on Theoretical Aspects of Computer Science (STACS 2024)},
volume = {289},
pages = {16:1–16:21},
publisher = {Schloss Dagstuhl – Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {In Gapped String Indexing, the goal is to compactly represent a string S of length n such that for any query consisting of two strings P₁ and P₂, called patterns, and an integer interval [α, β], called gap range, we can quickly find occurrences of P₁ and P₂ in S with distance in [α, β]. Gapped String Indexing is a central problem in computational biology and text mining and has thus received significant research interest, including parameterized and heuristic approaches. Despite this interest, the best-known time-space trade-offs for Gapped String Indexing are the straightforward 𝒪(n) space and 𝒪(n+ occ) query time or Ω(n²) space and Õ(|P₁| + |P₂| + occ) query time.
We break through this barrier obtaining the first interesting trade-offs with polynomially subquadratic space and polynomially sublinear query time. In particular, we show that, for every 0 ≤ δ ≤ 1, there is a data structure for Gapped String Indexing with either Õ(n^2-δ/3) or Õ(n^3-2δ) space and Õ(|P₁| + |P₂| + n^δ⋅ (occ+1)) query time, where occ is the number of reported occurrences. As a new fundamental tool towards obtaining our main result, we introduce the Shifted Set Intersection problem: preprocess a collection of sets S₁, …, S_k of integers such that for any query consisting of three integers i,j,s, we can quickly output YES if and only if there exist a ∈ S_i and b ∈ S_j with a+s = b. We start by showing that the Shifted Set Intersection problem is equivalent to the indexing variant of 3SUM (3SUM Indexing) [Golovnev et al., STOC 2020]. We then give a data structure for Shifted Set Intersection with gaps, which entails a solution to the Gapped String Indexing problem. Furthermore, we enhance our data structure for deciding Shifted Set Intersection, so that we can support the reporting variant of the problem, i.e., outputting all certificates in the affirmative case. Via the obtained equivalence to 3SUM Indexing, we thus give new improved data structures for the reporting variant of 3SUM Indexing, and we show how this improves upon the state-of-the-art solution for Jumbled Indexing [Chan and Lewenstein, STOC 2015] for any alphabet of constant size σ > 5.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
We break through this barrier obtaining the first interesting trade-offs with polynomially subquadratic space and polynomially sublinear query time. In particular, we show that, for every 0 ≤ δ ≤ 1, there is a data structure for Gapped String Indexing with either Õ(n^2-δ/3) or Õ(n^3-2δ) space and Õ(|P₁| + |P₂| + n^δ⋅ (occ+1)) query time, where occ is the number of reported occurrences. As a new fundamental tool towards obtaining our main result, we introduce the Shifted Set Intersection problem: preprocess a collection of sets S₁, …, S_k of integers such that for any query consisting of three integers i,j,s, we can quickly output YES if and only if there exist a ∈ S_i and b ∈ S_j with a+s = b. We start by showing that the Shifted Set Intersection problem is equivalent to the indexing variant of 3SUM (3SUM Indexing) [Golovnev et al., STOC 2020]. We then give a data structure for Shifted Set Intersection with gaps, which entails a solution to the Gapped String Indexing problem. Furthermore, we enhance our data structure for deciding Shifted Set Intersection, so that we can support the reporting variant of the problem, i.e., outputting all certificates in the affirmative case. Via the obtained equivalence to 3SUM Indexing, we thus give new improved data structures for the reporting variant of 3SUM Indexing, and we show how this improves upon the state-of-the-art solution for Jumbled Indexing [Chan and Lewenstein, STOC 2015] for any alphabet of constant size σ > 5.
Zuba, Wiktor; Loukides, Grigorios; Pissis, Solon P.; Thankachan, Sharma V.
Approximate Suffix-Prefix Dictionary Queries Proceedings Article
In: Královič, Rastislav; Kučera, Antonín (Ed.): 49th International Symposium on Mathematical Foundations of Computer Science (MFCS 2024), pp. 85:1–85:18, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2024, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Zuba2024,
title = {Approximate Suffix-Prefix Dictionary Queries},
author = {Wiktor Zuba and Grigorios Loukides and Solon P. Pissis and Sharma V. Thankachan},
editor = {Rastislav Královič and Antonín Kučera},
doi = {10.4230/LIPICS.MFCS.2024.85},
issn = {1868-8969},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
booktitle = {49th International Symposium on Mathematical Foundations of Computer Science (MFCS 2024)},
volume = {306},
pages = {85:1–85:18},
publisher = {Schloss Dagstuhl – Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {In the all-pairs suffix-prefix (APSP) problem [Gusfield et al., Inf. Process. Lett. 1992], we are given a dictionary R of r strings, S₁,…,S_r, of total length n, and we are asked to find the length SPL_i,j of the longest string that is both a suffix of S_i and a prefix of S_j, for all i,j ∈ [1..r]. APSP is a classic problem in string algorithms with applications in bioinformatics, especially in sequence assembly. Since r = |R| is typically very large in real-world applications, considering all r² pairs of strings explicitly is prohibitive. This is when the data structure variant of APSP makes sense; in the same spirit as distance oracles computing shortest paths between any two vertices given online. We show how to quickly locate k-approximate matches (under the Hamming or the edit distance) in R using a version of the k-errata tree [Cole et al., STOC 2004] that we introduce. Let SPL^k_i,j be the length of the longest suffix of S_i that is at distance at most k from a prefix of S_j. In particular, for any k = 𝒪(1), we show an 𝒪(nlog^k n)-sized data structure to support the following queries:
- One-to-One^k(i,j): output SPL^k_i,j in 𝒪(log^k nlog log n) time.
- Report^k(i,d): output all j ∈ [1..r], such that SPL^k_i,j ≥ d, in 𝒪(log^kn(log n/log log n+output)) time, where output denotes the size of the output. In fact, our algorithms work for any value of k not just for k = 𝒪(1), but the formulas bounding the complexities get much more complicated for larger values of k.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
- One-to-One^k(i,j): output SPL^k_i,j in 𝒪(log^k nlog log n) time.
- Report^k(i,d): output all j ∈ [1..r], such that SPL^k_i,j ≥ d, in 𝒪(log^kn(log n/log log n+output)) time, where output denotes the size of the output. In fact, our algorithms work for any value of k not just for k = 𝒪(1), but the formulas bounding the complexities get much more complicated for larger values of k.
Verbeek, Hilde; Ayad, Lorraine A. K.; Loukides, Grigorios; Pissis, Solon P.
Minimizing the Minimizers via Alphabet Reordering Proceedings Article
In: Inenaga, Shunsuke; Puglisi, Simon J. (Ed.): 35th Annual Symposium on Combinatorial Pattern Matching (CPM 2024), pp. 28:1–28:13, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2024, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Verbeek2024,
title = {Minimizing the Minimizers via Alphabet Reordering},
author = {Hilde Verbeek and Lorraine A. K. Ayad and Grigorios Loukides and Solon P. Pissis},
editor = {Shunsuke Inenaga and Simon J. Puglisi},
doi = {10.4230/LIPICS.CPM.2024.28},
issn = {1868-8969},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
booktitle = {35th Annual Symposium on Combinatorial Pattern Matching (CPM 2024)},
volume = {296},
pages = {28:1–28:13},
publisher = {Schloss Dagstuhl – Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {Minimizers sampling is one of the most widely-used mechanisms for sampling strings [Roberts et al., Bioinformatics 2004]. Let S = S[1]… S[n] be a string over a totally ordered alphabet Σ. Further let w ≥ 2 and k ≥ 1 be two integers. The minimizer of S[i..i+w+k-2] is the smallest position in [i,i+w-1] where the lexicographically smallest length-k substring of S[i..i+w+k-2] starts. The set of minimizers over all i ∈ [1,n-w-k+2] is the set ℳ_w,k(S) of the minimizers of S.
We consider the following basic problem:
Given S, w, and k, can we efficiently compute a total order on Σ that minimizes |ℳ_w,k(S)|?
We show that this is unlikely by proving that the problem is NP-hard for any w ≥ 3 and k ≥ 1. Our result provides theoretical justification as to why there exist no exact algorithms for minimizing the minimizers samples, while there exists a plethora of heuristics for the same purpose.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
We consider the following basic problem:
Given S, w, and k, can we efficiently compute a total order on Σ that minimizes |ℳ_w,k(S)|?
We show that this is unlikely by proving that the problem is NP-hard for any w ≥ 3 and k ≥ 1. Our result provides theoretical justification as to why there exist no exact algorithms for minimizing the minimizers samples, while there exists a plethora of heuristics for the same purpose.
Bernardini, Giulia; Chen, Huiping; Gørtz, Inge Li; Krogh, Christoffer; Loukides, Grigorios; Pissis, Solon P.; Stougie, Leen; Sweering, Michelle
Connecting de Bruijn Graphs Proceedings Article
In: Inenaga, Shunsuke; Puglisi, Simon J. (Ed.): 35th Annual Symposium on Combinatorial Pattern Matching (CPM 2024), pp. 6:1–6:16, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2024, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Bernardini2024a,
title = {Connecting de Bruijn Graphs},
author = {Giulia Bernardini and Huiping Chen and Inge Li Gørtz and Christoffer Krogh and Grigorios Loukides and Solon P. Pissis and Leen Stougie and Michelle Sweering},
editor = {Shunsuke Inenaga and Simon J. Puglisi},
doi = {10.4230/LIPICS.CPM.2024.6},
issn = {1868-8969},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
booktitle = {35th Annual Symposium on Combinatorial Pattern Matching (CPM 2024)},
volume = {296},
pages = {6:1–6:16},
publisher = {Schloss Dagstuhl – Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {We study the problem of making a de Bruijn graph (dBG), constructed from a collection of strings, weakly connected while minimizing the total cost of edge additions. The input graph is a dBG that can be made weakly connected by adding edges (along with extra nodes if needed) from the underlying complete dBG. The problem arises from genome reconstruction, where the dBG is constructed from a set of sequences generated from a genome sample by a sequencing experiment. Due to sequencing errors, the dBG is never Eulerian in practice and is often not even weakly connected. We show the following results for a dBG G(V,E) of order k consisting of d weakly connected components:
1) Making G weakly connected by adding a set of edges of minimal total cost is NP-hard.
2) No PTAS exists for making G weakly connected by adding a set of edges of minimal total cost (unless the unique games conjecture fails). We complement this result by showing that there does exist a polynomial-time (2-2/d)-approximation algorithm for the problem.
3) We consider a restricted version of the above problem, where we are asked to make G weakly connected by only adding directed paths between pairs of components. We show that making G weakly connected by adding d-1 such paths of minimal total cost can be done in 𝒪(k|V|α(|V|)+|E|) time, where α(⋅) is the inverse Ackermann function. This improves on the 𝒪(k|V|log(|V|)+|E|)-time algorithm proposed by Bernardini et al. [CPM 2022] for the same restricted problem.
4) An ILP formulation of polynomial size for making G Eulerian with minimal total cost.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
1) Making G weakly connected by adding a set of edges of minimal total cost is NP-hard.
2) No PTAS exists for making G weakly connected by adding a set of edges of minimal total cost (unless the unique games conjecture fails). We complement this result by showing that there does exist a polynomial-time (2-2/d)-approximation algorithm for the problem.
3) We consider a restricted version of the above problem, where we are asked to make G weakly connected by only adding directed paths between pairs of components. We show that making G weakly connected by adding d-1 such paths of minimal total cost can be done in 𝒪(k|V|α(|V|)+|E|) time, where α(⋅) is the inverse Ackermann function. This improves on the 𝒪(k|V|log(|V|)+|E|)-time algorithm proposed by Bernardini et al. [CPM 2022] for the same restricted problem.
4) An ILP formulation of polynomial size for making G Eulerian with minimal total cost.
Ayad, Lorraine A. K.; Loukides, Grigorios; Pissis, Solon P.; Verbeek, Hilde
Sparse Suffix and LCP Array: Simple, Direct, Small, and Fast Proceedings Article
In: LATIN 2024: Theoretical Informatics, pp. 162–177, Springer Nature Switzerland, Cham, 2024, ISSN: 1611-3349.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Ayad2024,
title = {Sparse Suffix and LCP Array: Simple, Direct, Small, and Fast},
author = {Lorraine A. K. Ayad and Grigorios Loukides and Solon P. Pissis and Hilde Verbeek},
url = {https://arxiv.org/abs/2310.09023},
doi = {10.1007/978-3-031-55598-5_11},
issn = {1611-3349},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
booktitle = {LATIN 2024: Theoretical Informatics},
pages = {162–177},
publisher = {Springer Nature Switzerland},
address = {Cham},
abstract = {Sparse suffix sorting is the problem of sorting $b=o(n)$ suffixes of a string of length n. Efficient sparse suffix sorting algorithms have existed for more than a decade. Despite the multitude of works and their justified claims for applications in text indexing, the existing algorithms have not been employed by practitioners. Arguably this is because there are no simple, direct, and efficient algorithms for sparse suffix array construction. We provide two new algorithms for constructing the sparse suffix and LCP arrays that are simultaneously simple, direct, small, and fast. In particular, our algorithms are: simple in the sense that they can be implemented using only basic data structures; direct in the sense that the output arrays are not a byproduct of constructing the sparse suffix tree or an LCE data structure; fast in the sense that they run in $mathcal{O}(nlog b)$ time, in the worst case, or in $mathcal{O}(n)$ time, when the total number of suffixes with an LCP value greater than $2^{lfloor log frac{n}{b} rfloor + 1}-1$ is in $mathcal{O}(b/log b)$, matching the time of optimal yet much more complicated algorithms [Gawrychowski and Kociumaka, SODA 2017; Birenzwige et al., SODA 2020]; and small in the sense that they can be implemented using only $8b+o(b)$ machine words. We also show that our second algorithm can be trivially amended to work in $mathcal{O}(n)$ time for any uniformly random string. Our algorithms are non-trivial space-efficient adaptations of the Monte Carlo algorithm by I et al. for constructing the sparse suffix tree in $mathcal{O}(nlog b)$ time [STACS 2014].},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Schulz, Tizian; Parmigiani, Luca; Rempel, Andreas; Stoye, Jens
Methods for Pangenomic Core Detection Book Chapter
In: Setubal, João Carlos; Stadler, Peter F.; Stoye, Jens (Ed.): Comparative Genomics: Methods and Protocols, pp. 73–106, Springer US, New York, NY, 2024, ISSN: 1940-6029.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@inbook{Schulz2024,
title = {Methods for Pangenomic Core Detection},
author = {Tizian Schulz and Luca Parmigiani and Andreas Rempel and Jens Stoye},
editor = {João Carlos Setubal and Peter F. Stadler and Jens Stoye},
doi = {10.1007/978-1-0716-3838-5_4},
issn = {1940-6029},
year = {2024},
date = {2024-01-01},
booktitle = {Comparative Genomics: Methods and Protocols},
pages = {73–106},
publisher = {Springer US},
address = {New York, NY},
abstract = {Computational pangenomics deals with the joint analysis of all genomic sequences of a species. It has already been successfully applied to various tasks in many research areas. Further advances in DNA sequencing technologies constantly let more and more genomic sequences become available for many species, leading to an increasing attractiveness of pangenomic studies. At the same time, larger datasets also pose new challenges for data structures and algorithms that are needed to handle the data. Efficient methods oftentimes make use of the concept of k-mers.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inbook}
}
Lipták, Zsuzsanna; Parmigiani, Luca
A BWT-Based Algorithm for Random de Bruijn Sequence Construction Proceedings Article
In: LATIN 2024: Theoretical Informatics, pp. 130–145, Springer Nature Switzerland, Cham, 2024, ISSN: 1611-3349.
Abstract | Links | BibTeX | Tags: Stringology
@inproceedings{Liptak2024,
title = {A BWT-Based Algorithm for Random de Bruijn Sequence Construction},
author = {Zsuzsanna Lipták and Luca Parmigiani},
url = {https://github.com/lucaparmigiani/rnd_dbseq},
doi = {10.1007/978-3-031-55598-5_9},
issn = {1611-3349},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
booktitle = {LATIN 2024: Theoretical Informatics},
pages = {130–145},
publisher = {Springer Nature Switzerland},
address = {Cham},
abstract = {A binary de Bruijn sequence (dB sequence) of order k is a circular binary string that contains each k-length word exactly once as a substring. Most existing algorithms construct a specific dB sequence, or members of a specific class of dB sequences, representing only a tiny fraction of the complete set. The only algorithms capable of generating all dB sequences are based on finding Euler cycles in de Bruijn graphs. Here, we present an algorithm for constructing random binary dB sequences which uses the extended Burrows-Wheeler Transform. Our method is simple to implement (less than 120 lines of C++ code) and can produce random dB sequences of any order. Even though it does not output dB sequences uniformly at random, it provably outputs each dB sequence with positive probability. The algorithm runs in linear space and near-linear time in the length of the dB sequence and needs less than one second on a laptop computer for orders up to 23, including outputting the sequence. It can be straightforwardly extended to any constant-size alphabet. To the best of our knowledge, this is the first practical algorithm for generating random dB sequences which is capable of producing all dB sequences. Apart from its immediate usefulness in contexts where it is desirable to use a dB sequence that cannot be guessed easily, we also demonstrate our algorithm's potential in theoretical studies, giving hitherto unknown estimates of the average discrepancy of binary dB sequences. The code is available (in C++ and python) at https://github.com/lucaparmigiani/rnd_dbseq.},
keywords = {Stringology},
pubstate = {published},
tppubtype = {inproceedings}
}
Baláž, Andrej; Gagie, Travis; Goga, Adrián; Heumos, Simon; Navarro, Gonzalo; Petescia, Alessia; Sirén, Jouni
Wheeler Maps Proceedings Article
In: Soto, José A.; Wiese, Andreas (Ed.): LATIN 2024: Theoretical Informatics, pp. 178–192, Springer Nature Switzerland, Cham, 2024, ISSN: 1611-3349.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@inproceedings{Balaz2024,
title = {Wheeler Maps},
author = {Andrej Baláž and Travis Gagie and Adrián Goga and Simon Heumos and Gonzalo Navarro and Alessia Petescia and Jouni Sirén},
editor = {José A. Soto and Andreas Wiese},
doi = {10.1007/978-3-031-55598-5_12},
issn = {1611-3349},
year = {2024},
date = {2024-01-01},
booktitle = {LATIN 2024: Theoretical Informatics},
pages = {178–192},
publisher = {Springer Nature Switzerland},
address = {Cham},
abstract = {Motivated by challenges in pangenomic read alignment, we propose a generalization of Wheeler graphs that we call Wheeler maps. A Wheeler map stores a text T[1..n] and an assignment of tags to the characters of T such that we can preprocess a pattern P[1..m] and then, given i and j, quickly return all the distinct tags labeling the first characters of the occurrences of P[i..j] in T. For the applications that most interest us, characters with long common contexts are likely to have the same tag, so we consider the number t of runs in the list of tags sorted by their characters’ positions in the Burrows-Wheeler Transform (BWT) of T. We show how, given a straight-line program with g rules for T, we can build an O(g+r+t)-space Wheeler map, where r is the number of runs in the BWT of T, with which we can preprocess a pattern P[1..m] in $O(m łog n)$ time and then return the k distinct tags for P[i..j] in optimal O(k) time for any given i and j.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Sauer, Lena M.; Cánovas, Rodrigo; Roche, Daniel Barry; Shams-Eldin, Hosam; Ravel, Patrice; Colinge, Jacques; Schwarz, Ralph T.; Mamoun, Choukri Ben; Rivals, Eric; Cornillot, Emmanuel
In: Malaria Journal, vol. 22, no. 1, pp. 27-46, 2023, ISBN: 1475-2875.
Abstract | Links | BibTeX | Tags: Misc
@article{Sauer2023,
title = {FT-GPI, a highly sensitive and accurate predictor of GPI-anchored proteins, reveals the composition and evolution of the GPI proteome in Plasmodium species},
author = {Lena M. Sauer and Rodrigo Cánovas and Daniel Barry Roche and Hosam Shams-Eldin and Patrice Ravel and Jacques Colinge and Ralph T. Schwarz and Choukri Ben Mamoun and Eric Rivals and Emmanuel Cornillot},
doi = {10.1186/s12936-022-04430-0},
isbn = {1475-2875},
year = {2023},
date = {2023-12-01},
urldate = {2023-12-01},
journal = {Malaria Journal},
volume = {22},
number = {1},
pages = {27-46},
publisher = {BioMed Central},
abstract = {Protozoan parasites are known to attach specific and diverse group of proteins to their plasma membrane via a GPI anchor. In malaria parasites, GPI-anchored proteins (GPI-APs) have been shown to play an important role in host–pathogen interactions and a key function in host cell invasion and immune evasion. Because of their immunogenic properties, some of these proteins have been considered as malaria vaccine candidates. However, identification of all possible GPI-APs encoded by these parasites remains challenging due to their sequence diversity and limitations of the tools used for their characterization.},
keywords = {Misc},
pubstate = {published},
tppubtype = {article}
}
Kang, Xiongbin; Xu, Jialu; Luo, Xiao; Schönhuth, Alexander
Hybrid-hybrid correction of errors in long reads with HERO Journal Article
In: Genome Biol., vol. 24, no. 1, pp. 275, 2023, ISBN: 1474-760X.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Kang2023-jk,
title = {Hybrid-hybrid correction of errors in long reads with HERO},
author = {Xiongbin Kang and Jialu Xu and Xiao Luo and Alexander Schönhuth},
doi = {10.1186/s13059-023-03112-7},
isbn = {1474-760X},
year = {2023},
date = {2023-12-01},
urldate = {2023-12-01},
journal = {Genome Biol.},
volume = {24},
number = {1},
pages = {275},
abstract = {Although generally superior, hybrid approaches for correcting errors in third-generation sequencing (TGS) reads, using next-generation sequencing (NGS) reads, mistake haplotype-specific variants for errors in polyploid and mixed samples. We suggest HERO, as the first “hybrid-hybrid”approach, to make use of both de Bruijn graphs and overlap graphs for optimal catering to the particular strengths of NGS and TGS reads. Extensive benchmarking experiments demonstrate that HERO improves indel and mismatch error rates by on average 65% (27~95%) and 20% (4~61%). Using HERO prior to genome assembly significantly improves the assemblies in the majority of the relevant categories.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}