Rizzo, Nicola; Mäkinen, Veli
Linear Time Construction of Indexable Elastic Founder Graphs Proceedings Article
In: Bazgan, Cristina; Fernau, Henning (Ed.): Combinatorial Algorithms, pp. 480–493, Springer International Publishing, Cham, 2022, ISBN: 978-3-031-06678-8.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Rizzo2022,
title = {Linear Time Construction of Indexable Elastic Founder Graphs},
author = {Nicola Rizzo and Veli Mäkinen},
editor = {Cristina Bazgan and Henning Fernau},
url = {https://arxiv.org/abs/2201.06492},
doi = {10.1007/978-3-031-06678-8_35},
isbn = {978-3-031-06678-8},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {Combinatorial Algorithms},
pages = {480--493},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {Pattern matching on graphs has been widely studied lately due to its importance in genomics applications. Unfortunately, even the simplest problem of deciding if a string appears as a subpath of a graph admits a quadratic lower bound under the Orthogonal Vectors Hypothesis (Equi et al. ICALP 2019, SOFSEM 2021). To avoid this bottleneck, the research has shifted towards more specific graph classes, e.g. those induced from multiple sequence alignments (MSAs). Consider segmenting $mathsf{MSA}[1..m,1..n]$ into $b$ blocks $mathsf{MSA}[1..m,1..j_1]$, $mathsf{MSA}[1..m,j_1+1..j_2]$, $ldots$, $mathsf{MSA}[1..m,j_b-1+1..n]$. The distinct strings in the rows of the blocks, after the removal of gap symbols, form the nodes of an elastic founder graph (EFG) where the edges represent the original connections observed in the MSA. An EFG is called indexable if a node label occurs as a prefix of only those paths that start from a node of the same block. Equi et al. (ISAAC 2021) showed that such EFGs support fast pattern matching and gave an $O(mn log m)$-time algorithm for preprocessing the MSA in a way that allows the construction of indexable EFGs maximizing the number of blocks and, alternatively, minimizing the maximum length of a block, in $O(n)$ and $O(n loglog n)$ time respectively. Using the suffix tree and solving a novel ancestor problem on trees, we improve the preprocessing to $O(mn)$ time and the $O(n loglog n)$-time EFG construction to $O(n)$ time, thus showing that both types of indexable EFGs can be constructed in time linear in the input size.},
howpublished = {arXiv},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Mwaniki, Njagi Moses; Garrison, Erik; Pisanti, Nadia
Fast Exact String to D-Texts Alignments Unpublished
arXiv, 2022.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@unpublished{Mwaniki2022,
title = {Fast Exact String to D-Texts Alignments},
author = {Njagi Moses Mwaniki and Erik Garrison and Nadia Pisanti},
doi = {10.48550/ARXIV.2206.03242},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
publisher = {arXiv},
abstract = {In recent years, aligning a sequence to a pangenome has become a central problem in genomics and pangenomics. A fast and accurate solution to this problem can serve as a toolkit to many crucial tasks such as read-correction, Multiple Sequences Alignment (MSA), genome assemblies, variant calling, just to name a few. In this paper we propose a new, fast and exact method to align a string to a D-string, the latter possibly representing an MSA, a pan-genome or a partial assembly. An implementation of our tool dsa is publicly available at https://github.com/urbanslug/dsa.},
howpublished = {arXiv},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {unpublished}
}
Zhong, Haodi; Loukides, Grigorios; Pissis, Solon P.
Clustering sequence graphs Journal Article
In: Data & Knowledge Engineering, vol. 138, pp. 101981, 2022, ISSN: 0169-023X.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{Zhong2022,
title = {Clustering sequence graphs},
author = {Haodi Zhong and Grigorios Loukides and Solon P. Pissis},
doi = {https://doi.org/10.1016/j.datak.2022.101981},
issn = {0169-023X},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {Data & Knowledge Engineering},
volume = {138},
pages = {101981},
abstract = {In application domains ranging from social networks to e-commerce, it is important to cluster users with respect to both their relationships (e.g., friendship or trust) and their actions (e.g., visited locations or rated products). Motivated by these applications, we introduce here the task of clustering the nodes of a sequence graph, i.e., a graph whose nodes are labeled with strings (e.g., sequences of users' visited locations or rated products). Both string clustering algorithms and graph clustering algorithms are inappropriate to deal with this task, as they do not consider the structure of strings and graph simultaneously. Moreover, attributed graph clustering algorithms generally construct poor solutions because they need to represent a string as a vector of attributes, which inevitably loses information and may harm clustering quality. We thus introduce the problem of clustering a sequence graph. We first propose two pairwise distance measures for sequence graphs, one based on edit distance and shortest path distance and another one based on SimRank. We then formalize the problem under each measure, showing also that it is NP-hard. In addition, we design a polynomial-time 2-approximation algorithm, as well as a heuristic for the problem. Experiments using real datasets and a case study demonstrate the effectiveness and efficiency of our methods.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Cartes, Jorge Avila; Anand, Santosh; Ciccolella, Simone; Bonizzoni, Paola; Vedova, Gianluca Della
Accurate and fast clade assignment via deep learning and frequency chaos game representation Journal Article
In: GigaScience, vol. 12, 2022, ISSN: 2047-217X, (giac119).
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{Cartes2022,
title = {Accurate and fast clade assignment via deep learning and frequency chaos game representation},
author = {Jorge Avila Cartes and Santosh Anand and Simone Ciccolella and Paola Bonizzoni and Gianluca Della Vedova},
url = {https://github.com/AlgoLab/CouGaR-g},
doi = {10.1093/gigascience/giac119},
issn = {2047-217X},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {GigaScience},
volume = {12},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Since the beginning of the coronavirus disease 2019 pandemic, there has been an explosion of sequencing of the severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) virus, making it the most widely sequenced virus in the history. Several databases and tools have been created to keep track of genome sequences and variants of the virus; most notably, the GISAID platform hosts millions of complete genome sequences, and it is continuously expanding every day. A challenging task is the development of fast and accurate tools that are able to distinguish between the different SARS-CoV-2 variants and assign them to a clade.In this article, we leverage the frequency chaos game representation (FCGR) and convolutional neural networks (CNNs) to develop an original method that learns how to classify genome sequences that we implement into CouGaR-g, a tool for the clade assignment problem on SARS-CoV-2 sequences. On a testing subset of the GISAID, CouGaR-g achieved an $96.29%$ overall accuracy, while a similar tool, Covidex, obtained a $77,12%$ overall accuracy. As far as we know, our method is the first using deep learning and FCGR for intraspecies classification. Furthermore, by using some feature importance methods, CouGaR-g allows to identify k-mers that match SARS-CoV-2 marker variants.By combining FCGR and CNNs, we develop a method that achieves a better accuracy than Covidex (which is based on random forest) for clade assignment of SARS-CoV-2 genome sequences, also thanks to our training on a much larger dataset, with comparable running times. Our method implemented in CouGaR-g is able to detect k-mers that capture relevant biological information that distinguishes the clades, known as marker variants.The trained models can be tested online providing a FASTA file (with 1 or multiple sequences) at https://huggingface.co/spaces/BIASLab/sars-cov-2-classification-fcgr. CouGaR-g is also available at https://github.com/AlgoLab/CouGaR-g under the GPL.},
note = {giac119},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Rizzo, Nicola; Mäkinen, Veli
Indexable Elastic Founder Graphs of Minimum Height Proceedings Article
In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 19:1–19:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Rizzo2022b,
title = {Indexable Elastic Founder Graphs of Minimum Height},
author = {Nicola Rizzo and Veli Mäkinen},
editor = {Hideo Bannai and Jan Holub},
doi = {10.4230/LIPIcs.CPM.2022.19},
issn = {1868-8969},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},
volume = {223},
pages = {19:1--19:19},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {Indexable elastic founder graphs have been recently proposed as a data structure for genomics applications supporting fast pattern matching queries. Consider segmenting a multiple sequence alignment MSA[1..m,1..n] into b blocks MSA[1..m,1..j₁], MSA[1..m,j₁+1..j₂], …, MSA[1..m,j_{b-1}+1..n]. The resulting elastic founder graph (EFG) is obtained by merging in each block the strings that are equivalent after the removal of gap symbols, taking the strings as the nodes of the block and the original MSA connections as edges. We call an elastic founder graph indexable if a node label occurs as a prefix of only those paths that start from a node of the same block. Equi et al. (ISAAC 2021) showed that such EFGs support fast pattern matching and studied their construction maximizing the number of blocks and minimizing the maximum length of a block, but left open the case of minimizing the maximum number of distinct strings in a block that we call graph height. For the simplified gapless setting, we give an O(mn) time algorithm to find a segmentation of an MSA minimizing the height of the resulting indexable founder graph, by combining previous results in segmentation algorithms and founder graphs. For the general setting, the known techniques yield a linear-time parameterized solution on constant alphabet Σ, taking time O(m n² log|Σ|) in the worst case, so we study the refined measure of prefix-aware height, that omits counting strings that are prefixes of another considered string. The indexable EFG minimizing the maximum prefix-aware height provides a lower bound for the original height: by exploiting exploiting suffix trees built from the MSA rows and the data structure answering weighted ancestor queries in constant time of Belazzougui et al. (CPM 2021), we give an O(mn)-time algorithm for the optimal EFG under this alternative height. },
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Badkobeh, Golnaz; Charalampopoulos, Panagiotis; Kosolobov, Dmitry; Pissis, Solon P.
Internal shortest absent word queries in constant time and linear space Journal Article
In: Theoretical Computer Science, vol. 922, pp. 271-282, 2022, ISSN: 0304-3975.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@article{Badkobeh2022,
title = {Internal shortest absent word queries in constant time and linear space},
author = {Golnaz Badkobeh and Panagiotis Charalampopoulos and Dmitry Kosolobov and Solon P. Pissis},
url = {https://arxiv.org/abs/2106.01763},
doi = {https://doi.org/10.1016/j.tcs.2022.04.029},
issn = {0304-3975},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {Theoretical Computer Science},
volume = {922},
pages = {271-282},
abstract = {Given a string T of length n over an alphabet Σ⊂1,2,…,nO(1) of size σ, we are to preprocess T so that given a range [i,j], we can return a representation of a shortest string over Σ that is absent in the fragment T[i]⋯T[j] of T. We present an O(n)-space data structure that answers such queries in constant time and can be constructed in O(nlogσn) time.},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Bonnet, Konstantinn; Marschall, Tobias; Doerr, Daniel
Constructing Founder Sets Under Allelic and Non-Allelic Homologous Recombination Proceedings Article
In: Boucher, Christina; Rahmann, Sven (Ed.): 22nd International Workshop on Algorithms in Bioinformatics (WABI 2022), pp. 6:1–6:23, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@inproceedings{Bonnet2022,
title = {Constructing Founder Sets Under Allelic and Non-Allelic Homologous Recombination},
author = {Konstantinn Bonnet and Tobias Marschall and Daniel Doerr},
editor = {Christina Boucher and Sven Rahmann},
doi = {10.4230/LIPIcs.WABI.2022.6},
issn = {1868-8969},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {22nd International Workshop on Algorithms in Bioinformatics (WABI 2022)},
volume = {242},
pages = {6:1--6:23},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {Homologous recombination between the maternal and paternal copies of a chromosome is a key mechanism for human inheritance and shapes population genetic properties of our species. However, a similar mechanism can also act between different copies of the same sequence, then called non-allelic homologous recombination (NAHR). This process can result in genomic rearrangements - including deletion, duplication, and inversion - and is underlying many genomic disorders. Despite its importance for genome evolution and disease, there is a lack of computational models to study genomic loci prone to NAHR.
In this work, we propose such a computational model, providing a unified framework for both (allelic) homologous recombination and NAHR. Our model represents a set of genomes as a graph, where human haplotypes correspond to walks through this graph. We formulate two founder set problems under our recombination model, provide flow-based algorithms for their solution, and demonstrate scalability to problem instances arising in practice.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
In this work, we propose such a computational model, providing a unified framework for both (allelic) homologous recombination and NAHR. Our model represents a set of genomes as a graph, where human haplotypes correspond to walks through this graph. We formulate two founder set problems under our recombination model, provide flow-based algorithms for their solution, and demonstrate scalability to problem instances arising in practice.
Loukides, Grigorios; Pissis, Solon P.
All-pairs suffix/prefix in optimal time using Aho-Corasick space Journal Article
In: Information Processing Letters, vol. 178, pp. 106275, 2022, ISSN: 0020-0190.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Loukides2022,
title = {All-pairs suffix/prefix in optimal time using Aho-Corasick space},
author = {Grigorios Loukides and Solon P. Pissis},
doi = {https://doi.org/10.1016/j.ipl.2022.106275},
issn = {0020-0190},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {Information Processing Letters},
volume = {178},
pages = {106275},
abstract = {The all-pairs suffix/prefix (APSP) problem is a classic problem in computer science with many applications in bioinformatics. Given a set ${S_1,ldots,S_k}$ of $k$ strings of total length $n$, we are asked to find, for each string $S_i, iin[1,k]$, its longest suffix that is a prefix of string $S_j$, for all j≠i, j∈[1,k]. Several algorithms running in the optimal $O(n+k^2)$ time for solving APSP are known. All of these algorithms are based on suffix sorting and thus require space Ω(n) in any case. We consider the parameterized version of the APSP problem, denoted by $ell$-APSP, in which we are asked to output only the pairs whose suffix/prefix overlap is of length at least $ell$. We give an algorithm for solving $ell$-APSP that runs in the optimal $O(n+|text{OUTPUT}_ℓ|)$ time using O(n) space, where $text{OUTPUT}_ℓ$ is the set of output pairs. Our algorithm is thus optimal for the APSP problem as well by setting $ell=0$. Notably, our algorithm is fundamentally different from all optimal algorithms solving the APSP problem: it does not rely on sorting the suffixes of all input strings but on a novel traversal of the Aho-Corasick machine, and it thus requires space linear in the size of the machine.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Charalampopoulos, Panagiotis; Kociumaka, Tomasz; Radoszewski, Jakub; Pissis, Solon P.; Rytter, Wojciech; Waleń, Tomasz; Zuba, Wiktor
Approximate Circular Pattern Matching Proceedings Article
In: Chechik, Shiri; Navarro, Gonzalo; Rotenberg, Eva; Herman, Grzegorz (Ed.): 30th Annual European Symposium on Algorithms (ESA 2022), pp. 35:1–35:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Charalampopoulos2022c,
title = {Approximate Circular Pattern Matching},
author = {Panagiotis Charalampopoulos and Tomasz Kociumaka and Jakub Radoszewski and Solon P. Pissis and Wojciech Rytter and Tomasz Waleń and Wiktor Zuba},
editor = {Shiri Chechik and Gonzalo Navarro and Eva Rotenberg and Grzegorz Herman},
doi = {10.4230/LIPIcs.ESA.2022.35},
issn = {1868-8969},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {30th Annual European Symposium on Algorithms (ESA 2022)},
volume = {244},
pages = {35:1--35:19},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {We investigate the complexity of approximate circular pattern matching (CPM, in short) under the Hamming and edit distance. Under each of these two basic metrics, we are given a length-n text T, a length-m pattern P, and a positive integer threshold k, and we are to report all starting positions (called occurrences) of fragments of T that are at distance at most k from some cyclic rotation of P. In the decision version of the problem, we are to check if there is any such occurrence. All previous results for approximate CPM were either average-case upper bounds or heuristics, with the exception of the work of Charalampopoulos et al. [$text{CKP}^+$, JCSS'21], who considered only the Hamming distance. For the reporting version of the approximate CPM problem, under the Hamming distance we improve upon the main algorithm of [$text{CKP}^+$, JCSS'21] from 𝒪(n+(n/m) ⋅ k⁴) to 𝒪(n+(n/m) ⋅ k³ log log k) time; for the edit distance, we give an 𝒪(nk²)-time algorithm. Notably, for the decision versions and wide parameter-ranges, we give algorithms whose complexities are almost identical to the state-of-the-art for standard (i.e., non-circular) approximate pattern matching:
- For the decision version of the approximate CPM problem under the Hamming distance, we obtain an 𝒪(n+(n/m) ⋅ k² log k / log log k)-time algorithm, which works in 𝒪(n) time whenever k = 𝒪(√{m log log m / log m}). In comparison, the fastest algorithm for the standard counterpart of the problem, by Chan et al. [CGKKP, STOC’20], runs in 𝒪(n) time only for k = 𝒪(√m). We achieve this result via a reduction to a geometric problem by building on ideas from [$text{CKP}^+$, JCSS'21] and Charalampopoulos et al. [CKW, FOCS'20].
- For the decision version of the approximate CPM problem under the edit distance, the 𝒪(nklog³ k) runtime of our algorithm near matches the 𝒪(nk) runtime of the Landau-Vishkin algorithm [LV, J. Algorithms'89] for approximate pattern matching under edit distance; the latter algorithm remains the fastest known for $k = Ω(m^{2/5})$. As a stepping stone, we propose an 𝒪(nklog³ k)-time algorithm for solving the Longest Prefix k'-Approximate Match problem, proposed by Landau et al. [LMS, SICOMP'98], for all k' ∈ {1,…,k}. Our algorithm is based on Tiskin’s theory of seaweeds [Tiskin, Math. Comput. Sci.'08], with recent advancements (see Charalampopoulos et al. [CKW, FOCS'22]), and on exploiting the seaweeds' relation to Monge matrices.
In contrast, we obtain a conditional lower bound that suggests a polynomial separation between approximate CPM under the Hamming distance over the binary alphabet and its non-circular counterpart. We also show that a strongly subquadratic-time algorithm for the decision version of approximate CPM under edit distance would refute the Strong Exponential Time Hypothesis.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
- For the decision version of the approximate CPM problem under the Hamming distance, we obtain an 𝒪(n+(n/m) ⋅ k² log k / log log k)-time algorithm, which works in 𝒪(n) time whenever k = 𝒪(√{m log log m / log m}). In comparison, the fastest algorithm for the standard counterpart of the problem, by Chan et al. [CGKKP, STOC’20], runs in 𝒪(n) time only for k = 𝒪(√m). We achieve this result via a reduction to a geometric problem by building on ideas from [$text{CKP}^+$, JCSS'21] and Charalampopoulos et al. [CKW, FOCS'20].
- For the decision version of the approximate CPM problem under the edit distance, the 𝒪(nklog³ k) runtime of our algorithm near matches the 𝒪(nk) runtime of the Landau-Vishkin algorithm [LV, J. Algorithms'89] for approximate pattern matching under edit distance; the latter algorithm remains the fastest known for $k = Ω(m^{2/5})$. As a stepping stone, we propose an 𝒪(nklog³ k)-time algorithm for solving the Longest Prefix k'-Approximate Match problem, proposed by Landau et al. [LMS, SICOMP'98], for all k' ∈ {1,…,k}. Our algorithm is based on Tiskin’s theory of seaweeds [Tiskin, Math. Comput. Sci.'08], with recent advancements (see Charalampopoulos et al. [CKW, FOCS'22]), and on exploiting the seaweeds' relation to Monge matrices.
In contrast, we obtain a conditional lower bound that suggests a polynomial separation between approximate CPM under the Hamming distance over the binary alphabet and its non-circular counterpart. We also show that a strongly subquadratic-time algorithm for the decision version of approximate CPM under edit distance would refute the Strong Exponential Time Hypothesis.
Bernardini, Giulia; Gabory, Esteban; Pissis, Solon P.; Stougie, Leen; Sweering, Michelle; Zuba, Wiktor
Elastic-Degenerate String Matching with 1 Error Proceedings Article
In: Castañeda, Armando; Rodr'iguez-Henr'iquez, Francisco (Ed.): LATIN 2022: Theoretical Informatics, pp. 20–37, Springer International Publishing, Cham, 2022, ISBN: 978-3-031-20624-5.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Bernardini2022d,
title = {Elastic-Degenerate String Matching with 1 Error},
author = {Giulia Bernardini and Esteban Gabory and Solon P. Pissis and Leen Stougie and Michelle Sweering and Wiktor Zuba},
editor = {Armando Castañeda and Francisco Rodr'iguez-Henr'iquez},
url = {https://arxiv.org/abs/2209.01095},
doi = {10.1007/978-3-031-20624-5_2},
isbn = {978-3-031-20624-5},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {LATIN 2022: Theoretical Informatics},
volume = {13568},
pages = {20--37},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {An elastic-degenerate (ED) string is a sequence of n finite sets of strings of total length N, introduced to represent a set of related DNA sequences, also known as a pangenome. The ED string matching (EDSM) problem consists in reporting all occurrences of a pattern of length m in an ED text. The EDSM problem has recently received some attention by the combinatorial pattern matching community, culminating in an $mathcal{tilde O}(nm^{omega -1})+mathcal{O}(N)$-time algorithm [Bernardini et al., SIAM J. Comput. 2022], where $omega$ denotes the matrix multiplication exponent and the $mathcal{tilde O}(cdot)$ notation suppresses polylog factors. In the k-EDSM problem, the approximate version of EDSM, we are asked to report all pattern occurrences with at most k errors. k-EDSM can be solved in $mathcal{O}(k^2mG+kN)$ time under edit distance, where G denotes the total number of strings in the ED text [Bernardini et al., Theor. Comput. Sci. 2020]. Unfortunately, G is only bounded by N, and so even for $k=1$, the existing algorithm runs in $varOmega(mN)$ time in the worst case. Here we make progress in this direction. We show that 1-EDSM can be solved in $mathcal{O}((nm^2 + N)log m)$ or $mathcal{O}(nm^3 + N)$ time under edit distance. For the decision version of the problem, we present a faster $mathcal{O}(nm^2sqrt{log m} + Nloglog m)-time algorithm. Our algorithms rely on non-trivial reductions from 1-EDSM to special instances of classic computational geometry problems (2d rectangle stabbing or range emptiness), which we show how to solve efficiently.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Lemane, Téo; Medvedev, Paul; Chikhi, Rayan; Peterlongo, Pierre
kmtricks: efficient and flexible construction of Bloom filters for large sequencing data collections Journal Article
In: Bioinformatics Advances, vol. 2, no. 1, 2022, ISSN: 2635-0041.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Lemane2022b,
title = {kmtricks: efficient and flexible construction of Bloom filters for large sequencing data collections},
author = {Téo Lemane and Paul Medvedev and Rayan Chikhi and Pierre Peterlongo},
editor = {Thomas Lengauer},
url = {https://github.com/tlemane/kmtricks},
doi = {10.1093/bioadv/vbac029},
issn = {2635-0041},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {Bioinformatics Advances},
volume = {2},
number = {1},
publisher = {Oxford University Press (OUP)},
abstract = {When indexing large collections of short-read sequencing data, a common operation that has now been implemented in several tools (Sequence Bloom Trees and variants, BIGSI) is to construct a collection of Bloom filters, one per sample. Each Bloom filter is used to represent a set of k-mers which approximates the desired set of all the non-erroneous k-mers present in the sample. However, this approximation is imperfect, especially in the case of metagenomics data. Erroneous but abundant k-mers are wrongly included, and non-erroneous but low-abundant ones are wrongly discarded. We propose kmtricks, a novel approach for generating Bloom filters from terabase-sized collections of sequencing data. Our main contributions are (i) an efficient method for jointly counting k-mers across multiple samples, including a streamlined Bloom filter construction by directly counting, partitioning and sorting hashes instead of k-mers, which is approximately four times faster than state-of-the-art tools; (ii) a novel technique that takes advantage of joint counting to preserve low-abundant k-mers present in several samples, improving the recovery of non-erroneous k-mers. Our experiments highlight that this technique preserves around 8× more k-mers than the usual yet crude filtering of low-abundance k-mers in a large metagenomics dataset.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Bernardini, Giulia; Conte, Alessio; Gourdel, Garance; Grossi, Roberto; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon; Punzi, Giulia; Stougie, Leen; Sweering, Michelle
Hide and Mine in Strings: Hardness, Algorithms, and Experiments Journal Article
In: IEEE Transactions on Knowledge and Data Engineering, pp. 1–1, 2022, ISSN: 2326-3865.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Bernardini2022a,
title = {Hide and Mine in Strings: Hardness, Algorithms, and Experiments},
author = {Giulia Bernardini and Alessio Conte and Garance Gourdel and Roberto Grossi and Grigorios Loukides and Nadia Pisanti and Solon Pissis and Giulia Punzi and Leen Stougie and Michelle Sweering},
doi = {10.1109/tkde.2022.3158063},
issn = {2326-3865},
year = {2022},
date = {2022-01-01},
journal = {IEEE Transactions on Knowledge and Data Engineering},
pages = {1–1},
publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
abstract = {Data sanitization and frequent pattern mining are two well-studied topics in data mining. Data sanitization is the process of disguising (hiding) confidential information in a given dataset. Typically, this process incurs some utility loss that should be minimized. Frequent pattern mining is the process of obtaining all patterns occurring frequently enough in a given dataset. Our work initiates a study on the fundamental relation between data sanitization and frequent pattern mining in the context of sequential (string) data. Current methods for string sanitization hide confidential patterns. This, however, may lead to spurious patterns that harm the utility of frequent pattern mining. The main computational problem is to minimize this harm. Our contribution here is as follows. First, we present several hardness results, for different variants of this problem, essentially showing that these variants cannot be solved or even be approximated in polynomial time. Second, we propose integer linear programming formulations for these variants and algorithms to solve them, which work in polynomial time under realistic assumptions on the input parameters. We also complement the integer linear programming algorithms with a greedy heuristic. Third, we present an extensive experimental study, using both synthetic and real-world datasets, that demonstrates the effectiveness and efficiency of our methods. Beyond sanitization, the process of missing value replacement may also lead to spurious patterns. Interestingly, our results apply in this context as well. We show that, unlike popular approaches, our methods can fill missing values in genomic sequences, while preserving the accuracy of frequent pattern mining.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Luo, Xiao; Kang, Xiongbin; Schönhuth, Alexander
phasebook: haplotype-aware de novo assembly of diploid genomes from long reads Journal Article
In: Genome biology, vol. 22, no. 299, pp. 1–26, 2021, ISSN: 1474-760X.
Abstract | Links | BibTeX | Tags: WP3: Translational CPG
@article{luo2021phasebook,
title = {phasebook: haplotype-aware de novo assembly of diploid genomes from long reads},
author = {Xiao Luo and Xiongbin Kang and Alexander Schönhuth},
doi = {https://doi.org/10.1186/s13059-021-02512-x},
issn = {1474-760X},
year = {2021},
date = {2021-10-27},
urldate = {2021-10-27},
journal = {Genome biology},
volume = {22},
number = {299},
pages = {1--26},
publisher = {BioMed Central},
abstract = {Haplotype-aware diploid genome assembly is crucial in genomics, precision medicine, and many other disciplines. Long-read sequencing technologies have greatly improved genome assembly. However, current long-read assemblers are either reference based, so introduce biases, or fail to capture the haplotype diversity of diploid genomes. We present phasebook, a de novo approach for reconstructing the haplotypes of diploid genomes from long reads. phasebook outperforms other approaches in terms of haplotype coverage by large margins, in addition to achieving competitive performance in terms of assembly errors and assembly contiguity.},
keywords = {WP3: Translational CPG},
pubstate = {published},
tppubtype = {article}
}
Gafurov, Askar; Vinař, Tomáš; Brejová, Broňa
Probabilistic Models of k-mer Frequencies Proceedings Article
In: Mol, Liesbeth De; Weiermann, Andreas; Manea, Florin; Fernández-Duque, David (Ed.): Connecting with Computability, pp. 227–236, Springer International Publishing, Cham, 2021, ISSN: 1611-3349.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@inproceedings{Gafurov2021,
title = {Probabilistic Models of k-mer Frequencies},
author = {Askar Gafurov and Tomáš Vinař and Broňa Brejová},
editor = {Liesbeth De Mol and Andreas Weiermann and Florin Manea and David Fernández-Duque},
doi = {10.1007/978-3-030-80049-9_21},
issn = {1611-3349},
year = {2021},
date = {2021-07-02},
urldate = {2021-01-01},
booktitle = {Connecting with Computability},
pages = {227–236},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {In this article, we review existing probabilistic models for modeling abundance of fixed-length strings (k-mers) in DNA sequencing data. These models capture dependence of the abundance on various phenomena, such as the size and repeat content of the genome, heterozygosity levels, and sequencing error rate. This in turn allows to estimate these properties from k-mer abundance histograms observed in real data. We also briefly discuss the issue of comparing k-mer abundance between related sequencing samples and meaningfully summarizing the results.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Khorsand, Parsoa; Denti, Luca; Consortium, Human Genome Structural Variant; Bonizzoni, Paola; Chikhi, Rayan; Hormozdiari, Fereydoun
Comparative genome analysis using sample-specific string detection in accurate long reads Journal Article
In: Bioinform. Adv., vol. 1, no. 1, 2021, ISSN: 2635-0041.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{Khorsand2021-uq,
title = {Comparative genome analysis using sample-specific string detection in accurate long reads},
author = {Parsoa Khorsand and Luca Denti and Human Genome Structural Variant Consortium and Paola Bonizzoni and Rayan Chikhi and Fereydoun Hormozdiari},
url = {https://github.com/Parsoa/PingPong},
doi = {10.1093/bioadv/vbab005},
issn = {2635-0041},
year = {2021},
date = {2021-05-01},
urldate = {2021-05-01},
journal = {Bioinform. Adv.},
volume = {1},
number = {1},
publisher = {Oxford University Press (OUP)},
abstract = {Comparative genome analysis of two or more whole-genome sequenced (WGS) samples is at the core of most applications in genomics. These include the discovery of genomic differences segregating in populations, case-control analysis in common diseases and diagnosing rare disorders. With the current progress of accurate long-read sequencing technologies (e.g. circular consensus sequencing from PacBio sequencers), we can dive into studying repeat regions of the genome (e.g. segmental duplications) and hard-to-detect variants (e.g. complex structural variants).We propose a novel framework for comparative genome analysis through the discovery of strings that are specific to one genome (‘samples-specific’ strings). We have developed a novel, accurate and efficient computational method for the discovery of sample-specific strings between two groups of WGS samples. The proposed approach will give us the ability to perform comparative genome analysis without the need to map the reads and is not hindered by shortcomings of the reference genome and mapping algorithms. We show that the proposed approach is capable of accurately finding sample-specific strings representing nearly all variation (>98%) reported across pairs or trios of WGS samples using accurate long reads (e.g. PacBio HiFi data).Data, code and instructions for reproducing the results presented in this manuscript are publicly available at https://github.com/Parsoa/PingPong.Supplementary data are available at Bioinformatics Advances online.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Charalampopoulos, Panagiotis; Chen, Huiping; Christen, Peter; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon P.; Radoszewski, Jakub
Pattern Masking for Dictionary Matching Proceedings Article
In: Ahn, Hee-Kap; Sadakane, Kunihiko (Ed.): 32nd International Symposium on Algorithms and Computation (ISAAC 2021), pp. 65:1–65:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Charalampopoulos2021,
title = {Pattern Masking for Dictionary Matching},
author = {Panagiotis Charalampopoulos and Huiping Chen and Peter Christen and Grigorios Loukides and Nadia Pisanti and Solon P. Pissis and Jakub Radoszewski},
editor = {Hee-Kap Ahn and Kunihiko Sadakane},
doi = {10.4230/LIPIcs.ISAAC.2021.65},
issn = {1868-8969},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {32nd International Symposium on Algorithms and Computation (ISAAC 2021)},
volume = {212},
pages = {65:1--65:19},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {Data masking is a common technique for sanitizing sensitive data maintained in database systems, and it is also becoming increasingly important in various application areas, such as in record linkage of personal data. This work formalizes the Pattern Masking for Dictionary Matching (PMDM) problem. In PMDM, we are given a dictionary 𝒟 of d strings, each of length 𝓁, a query string q of length 𝓁, and a positive integer z, and we are asked to compute a smallest set K ⊆ {1,…,𝓁}, so that if q[i] is replaced by a wildcard for all i ∈ K, then q matches at least z strings from 𝒟. Solving PMDM allows providing data utility guarantees as opposed to existing approaches.
We first show, through a reduction from the well-known k-Clique problem, that a decision version of the PMDM problem is NP-complete, even for strings over a binary alphabet. We thus approach the problem from a more practical perspective. We show a combinatorial 𝒪((d𝓁)^{|K|/3}+d𝓁)-time and 𝒪(d𝓁)-space algorithm for PMDM for |K| = 𝒪(1). In fact, we show that we cannot hope for a faster combinatorial algorithm, unless the combinatorial k-Clique hypothesis fails [Abboud et al., SIAM J. Comput. 2018; Lincoln et al., SODA 2018]. We also generalize this algorithm for the problem of masking multiple query strings simultaneously so that every string has at least z matches in 𝒟.
Note that PMDM can be viewed as a generalization of the decision version of the dictionary matching with mismatches problem: by querying a PMDM data structure with string q and z = 1, one obtains the minimal number of mismatches of q with any string from 𝒟. The query time or space of all known data structures for the more restricted problem of dictionary matching with at most k mismatches incurs some exponential factor with respect to k. A simple exact algorithm for PMDM runs in time 𝒪(2^𝓁 d). We present a data structure for PMDM that answers queries over 𝒟 in time 𝒪(2^{𝓁/2}(2^{𝓁/2}+τ)𝓁) and requires space 𝒪(2^𝓁 d²/τ²+2^{𝓁/2}d), for any parameter τ ∈ [1,d].
We complement our results by showing a two-way polynomial-time reduction between PMDM and the Minimum Union problem [Chlamtáč et al., SODA 2017]. This gives a polynomial-time 𝒪(d^{1/4+ε})-approximation algorithm for PMDM, which is tight under a plausible complexity conjecture. },
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
We first show, through a reduction from the well-known k-Clique problem, that a decision version of the PMDM problem is NP-complete, even for strings over a binary alphabet. We thus approach the problem from a more practical perspective. We show a combinatorial 𝒪((d𝓁)^{|K|/3}+d𝓁)-time and 𝒪(d𝓁)-space algorithm for PMDM for |K| = 𝒪(1). In fact, we show that we cannot hope for a faster combinatorial algorithm, unless the combinatorial k-Clique hypothesis fails [Abboud et al., SIAM J. Comput. 2018; Lincoln et al., SODA 2018]. We also generalize this algorithm for the problem of masking multiple query strings simultaneously so that every string has at least z matches in 𝒟.
Note that PMDM can be viewed as a generalization of the decision version of the dictionary matching with mismatches problem: by querying a PMDM data structure with string q and z = 1, one obtains the minimal number of mismatches of q with any string from 𝒟. The query time or space of all known data structures for the more restricted problem of dictionary matching with at most k mismatches incurs some exponential factor with respect to k. A simple exact algorithm for PMDM runs in time 𝒪(2^𝓁 d). We present a data structure for PMDM that answers queries over 𝒟 in time 𝒪(2^{𝓁/2}(2^{𝓁/2}+τ)𝓁) and requires space 𝒪(2^𝓁 d²/τ²+2^{𝓁/2}d), for any parameter τ ∈ [1,d].
We complement our results by showing a two-way polynomial-time reduction between PMDM and the Minimum Union problem [Chlamtáč et al., SODA 2017]. This gives a polynomial-time 𝒪(d^{1/4+ε})-approximation algorithm for PMDM, which is tight under a plausible complexity conjecture.
Pisanti, Nadia
On-Line Pattern Matching on D-Texts (Invited Talk) Proceedings Article
In: Gawrychowski, Paweł; Starikovskaya, Tatiana (Ed.): 32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021), pp. 3:1–3:2, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@inproceedings{Pisanti2021,
title = {On-Line Pattern Matching on D-Texts (Invited Talk)},
author = {Nadia Pisanti},
editor = {Paweł Gawrychowski and Tatiana Starikovskaya},
doi = {10.4230/LIPIcs.CPM.2021.3},
issn = {1868-8969},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021)},
volume = {191},
pages = {3:1--3:2},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {The Elastic Degenerate String Matching (EDSM) problem is defined as that of finding an occurrence of a pattern P of length m in an ED-text T. A D-text (Degenerate text) is a string that actually represents a set of similar and aligned strings (e.g. a pan-genome [The Computational Pan-Genomics Consortium, 2018]) by collapsing common fragments into a standard string, and representing variants with sets of alternative substrings. When such substrings are not bound to have the same size, then we talk about elastic D-strings (ED-strings). In [R.Grossi et al., 2017] we gave an O(nm²+N) time on-line algorithm for EDSM, where n is the length of T and N is its size, defined as the total number of letters. A fundamental toolkit of our algorithm is the O(m²+N) time solution of the later called Active Prefixes problem (AP). In [K.Aoyama et al., 2018], a O(m^{1.5} √{log m}+N) solution for AP was shown, leading to a O(nm^{1.5} √{log m}+N) time solution for EDSM. The natural open problem was thus whether the 1.5 exponent could furtherly be decreased. In [G.Bernardini et al., 2019], we prove several properties that answer this and other questions: we give a conditional O(nm^{1.5}+N) lower bound for EDSM, proving that a combinatorial algorithm solving EDSM in O(nm^{1.5-ε} +N) time would break the Boolean Matrix Multiplication (BMM) conjecture; we use this result as a hint to devise a non-combinatorial algorithm that solves EDSM in O(nm^{1.381}+N) time; we do so by successfully combining Fast Fourier Transform and properties of string periodicity. In my talk I will overview the results above, as well as some interesting side results: the extension to a dictionary rather than a single pattern [S.P.Pissis and A.Retha, 2018], the introduction of errors [G.Bernardini et al., 2020], and a notion of matching among D-strings with its linear time solution [M.Alzamel et al., 2020].},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Conte, Alessio; Grossi, Roberto; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon P.; Punzi, Giulia
Beyond the BEST Theorem: Fast Assessment of Eulerian Trails Proceedings Article
In: Bampis, Evripidis; Pagourtzis, Aris (Ed.): Fundamentals of Computation Theory, pp. 162–175, Springer International Publishing, Cham, 2021, ISBN: 978-3-030-86593-1.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Conte2021,
title = {Beyond the BEST Theorem: Fast Assessment of Eulerian Trails},
author = {Alessio Conte and Roberto Grossi and Grigorios Loukides and Nadia Pisanti and Solon P. Pissis and Giulia Punzi},
editor = {Evripidis Bampis and Aris Pagourtzis},
url = {https://kclpure.kcl.ac.uk/portal/files/159578921/Estimating_Eulerian_Trails_FCT.pdf, Full PDF on King’s Research Portal},
doi = {10.1007/978-3-030-86593-1_11},
isbn = {978-3-030-86593-1},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {Fundamentals of Computation Theory},
pages = {162--175},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {Given a directed multigraph $G=(V,E)$, with $|V|=n$ nodes and $|E|=m$ edges, and an integer $z$, we are asked to assess whether the number $#ET(G)$ of node-distinct Eulerian trails of $G$ is at least $z$; two trails are called node-distinct if their node sequences are different. This problem has been formalized by Bernardini et al. [ALENEX 2020] as it is the core computational problem in several string processing applications. It can be solved in $mathcal O(n^omega)$ arithmetic operations by applying the well-known BEST theorem, where $omega <2.373$ denotes the matrix multiplication exponent. The algorithmic challenge is: Can we solve this problem faster for certain values of $m$ and $z$? Namely, we want to design a combinatorial algorithm for assessing whether $#ET(G) ge z$, which does not resort to the BEST theorem and has a predictably bounded cost as a function of $m$ and $z$. We address this challenge here by providing a combinatorial algorithm requiring $mathcal O(mcdot min {z, #ET(G)})$ time.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Loukides, Grigorios; Pissis, Solon P.
Bidirectional String Anchors: A New String Sampling Mechanism Proceedings Article
In: Mutzel, Petra; Pagh, Rasmus; Herman, Grzegorz (Ed.): 29th Annual European Symposium on Algorithms (ESA 2021), pp. 64:1–64:21, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@inproceedings{Loukides2021,
title = {Bidirectional String Anchors: A New String Sampling Mechanism},
author = {Grigorios Loukides and Solon P. Pissis},
editor = {Petra Mutzel and Rasmus Pagh and Grzegorz Herman},
doi = {10.4230/LIPIcs.ESA.2021.64},
issn = {1868-8969},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {29th Annual European Symposium on Algorithms (ESA 2021)},
volume = {204},
pages = {64:1--64:21},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = { The minimizers sampling mechanism is a popular mechanism for string sampling introduced independently by Schleimer et al. [SIGMOD 2003] and by Roberts et al. [Bioinf. 2004]. Given two positive integers w and k, it selects the lexicographically smallest length-k substring in every fragment of w consecutive length-k substrings (in every sliding window of length w+k-1). Minimizers samples are approximately uniform, locally consistent, and computable in linear time. Although they do not have good worst-case guarantees on their size, they are often small in practice. They thus have been successfully employed in several string processing applications. Two main disadvantages of minimizers sampling mechanisms are: first, they also do not have good guarantees on the expected size of their samples for every combination of w and k; and, second, indexes that are constructed over their samples do not have good worst-case guarantees for on-line pattern searches.
To alleviate these disadvantages, we introduce bidirectional string anchors (bd-anchors), a new string sampling mechanism. Given a positive integer 𝓁, our mechanism selects the lexicographically smallest rotation in every length-𝓁 fragment (in every sliding window of length 𝓁). We show that bd-anchors samples are also approximately uniform, locally consistent, and computable in linear time. In addition, our experiments using several datasets demonstrate that the bd-anchors sample sizes decrease proportionally to 𝓁; and that these sizes are competitive to or smaller than the minimizers sample sizes using the analogous sampling parameters. We provide theoretical justification for these results by analyzing the expected size of bd-anchors samples.
We also show that by using any bd-anchors sample, we can construct, in near-linear time, an index which requires linear (extra) space in the size of the sample and answers on-line pattern searches in near-optimal time. We further show, using several datasets, that a simple implementation of our index is consistently faster for on-line pattern searches than an analogous implementation of a minimizers-based index [Grabowski and Raniszewski, Softw. Pract. Exp. 2017].
Finally, we highlight the applicability of bd-anchors by developing an efficient and effective heuristic for top-K similarity search under edit distance. We show, using synthetic datasets, that our heuristic is more accurate and more than one order of magnitude faster in top-K similarity searches than the state-of-the-art tool for the same purpose [Zhang and Zhang, KDD 2020]. },
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
To alleviate these disadvantages, we introduce bidirectional string anchors (bd-anchors), a new string sampling mechanism. Given a positive integer 𝓁, our mechanism selects the lexicographically smallest rotation in every length-𝓁 fragment (in every sliding window of length 𝓁). We show that bd-anchors samples are also approximately uniform, locally consistent, and computable in linear time. In addition, our experiments using several datasets demonstrate that the bd-anchors sample sizes decrease proportionally to 𝓁; and that these sizes are competitive to or smaller than the minimizers sample sizes using the analogous sampling parameters. We provide theoretical justification for these results by analyzing the expected size of bd-anchors samples.
We also show that by using any bd-anchors sample, we can construct, in near-linear time, an index which requires linear (extra) space in the size of the sample and answers on-line pattern searches in near-optimal time. We further show, using several datasets, that a simple implementation of our index is consistently faster for on-line pattern searches than an analogous implementation of a minimizers-based index [Grabowski and Raniszewski, Softw. Pract. Exp. 2017].
Finally, we highlight the applicability of bd-anchors by developing an efficient and effective heuristic for top-K similarity search under edit distance. We show, using synthetic datasets, that our heuristic is more accurate and more than one order of magnitude faster in top-K similarity searches than the state-of-the-art tool for the same purpose [Zhang and Zhang, KDD 2020].
Charalampopoulos, Panagiotis; Kociumaka, Tomasz; Pissis, Solon P.; Radoszewski, Jakub
Faster Algorithms for Longest Common Substring Proceedings Article
In: Mutzel, Petra; Pagh, Rasmus; Herman, Grzegorz (Ed.): 29th Annual European Symposium on Algorithms (ESA 2021), pp. 30:1–30:17, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@inproceedings{Charalampopoulos2021b,
title = {Faster Algorithms for Longest Common Substring},
author = {Panagiotis Charalampopoulos and Tomasz Kociumaka and Solon P. Pissis and Jakub Radoszewski},
editor = {Petra Mutzel and Rasmus Pagh and Grzegorz Herman},
doi = {10.4230/LIPIcs.ESA.2021.30},
issn = {1868-8969},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {29th Annual European Symposium on Algorithms (ESA 2021)},
volume = {204},
pages = {30:1--30:17},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = { In the classic longest common substring (LCS) problem, we are given two strings S and T, each of length at most n, over an alphabet of size σ, and we are asked to find a longest string occurring as a fragment of both S and T. Weiner, in his seminal paper that introduced the suffix tree, presented an 𝒪(n log σ)-time algorithm for this problem [SWAT 1973]. For polynomially-bounded integer alphabets, the linear-time construction of suffix trees by Farach yielded an 𝒪(n)-time algorithm for the LCS problem [FOCS 1997]. However, for small alphabets, this is not necessarily optimal for the LCS problem in the word RAM model of computation, in which the strings can be stored in 𝒪(n log σ/log n) space and read in 𝒪(n log σ/log n) time. We show that, in this model, we can compute an LCS in time 𝒪(n log σ / √{log n}), which is sublinear in n if σ = 2^{o(√{log n})} (in particular, if σ = 𝒪(1)), using optimal space 𝒪(n log σ/log n).
We then lift our ideas to the problem of computing a k-mismatch LCS, which has received considerable attention in recent years. In this problem, the aim is to compute a longest substring of S that occurs in T with at most k mismatches. Flouri et al. showed how to compute a 1-mismatch LCS in 𝒪(n log n) time [IPL 2015]. Thankachan et al. extended this result to computing a k-mismatch LCS in 𝒪(n log^k n) time for k = 𝒪(1) [J. Comput. Biol. 2016]. We show an 𝒪(n log^{k-1/2} n)-time algorithm, for any constant integer k > 0 and irrespective of the alphabet size, using 𝒪(n) space as the previous approaches. We thus notably break through the well-known n log^k n barrier, which stems from a recursive heavy-path decomposition technique that was first introduced in the seminal paper of Cole et al. [STOC 2004] for string indexing with k errors. },
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
We then lift our ideas to the problem of computing a k-mismatch LCS, which has received considerable attention in recent years. In this problem, the aim is to compute a longest substring of S that occurs in T with at most k mismatches. Flouri et al. showed how to compute a 1-mismatch LCS in 𝒪(n log n) time [IPL 2015]. Thankachan et al. extended this result to computing a k-mismatch LCS in 𝒪(n log^k n) time for k = 𝒪(1) [J. Comput. Biol. 2016]. We show an 𝒪(n log^{k-1/2} n)-time algorithm, for any constant integer k > 0 and irrespective of the alphabet size, using 𝒪(n) space as the previous approaches. We thus notably break through the well-known n log^k n barrier, which stems from a recursive heavy-path decomposition technique that was first introduced in the seminal paper of Cole et al. [STOC 2004] for string indexing with k errors.
