Cartes, Jorge Avila; Anand, Santosh; Ciccolella, Simone; Bonizzoni, Paola; Vedova, Gianluca Della

Accurate and fast clade assignment via deep learning and frequency chaos game representation Journal Article

In: GigaScience, vol. 12, 2022, ISSN: 2047-217X, (giac119).

Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG

@article{Cartes2022,

title = {Accurate and fast clade assignment via deep learning and frequency chaos game representation},

author = {Jorge Avila Cartes and Santosh Anand and Simone Ciccolella and Paola Bonizzoni and Gianluca Della Vedova},

url = {https://github.com/AlgoLab/CouGaR-g},

doi = {10.1093/gigascience/giac119},

issn = {2047-217X},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

journal = {GigaScience},

volume = {12},

publisher = {Cold Spring Harbor Laboratory},

abstract = {Since the beginning of the coronavirus disease 2019 pandemic, there has been an explosion of sequencing of the severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) virus, making it the most widely sequenced virus in the history. Several databases and tools have been created to keep track of genome sequences and variants of the virus; most notably, the GISAID platform hosts millions of complete genome sequences, and it is continuously expanding every day. A challenging task is the development of fast and accurate tools that are able to distinguish between the different SARS-CoV-2 variants and assign them to a clade.In this article, we leverage the frequency chaos game representation (FCGR) and convolutional neural networks (CNNs) to develop an original method that learns how to classify genome sequences that we implement into CouGaR-g, a tool for the clade assignment problem on SARS-CoV-2 sequences. On a testing subset of the GISAID, CouGaR-g achieved an $96.29%$ overall accuracy, while a similar tool, Covidex, obtained a $77,12%$ overall accuracy. As far as we know, our method is the first using deep learning and FCGR for intraspecies classification. Furthermore, by using some feature importance methods, CouGaR-g allows to identify k-mers that match SARS-CoV-2 marker variants.By combining FCGR and CNNs, we develop a method that achieves a better accuracy than Covidex (which is based on random forest) for clade assignment of SARS-CoV-2 genome sequences, also thanks to our training on a much larger dataset, with comparable running times. Our method implemented in CouGaR-g is able to detect k-mers that capture relevant biological information that distinguishes the clades, known as marker variants.The trained models can be tested online providing a FASTA file (with 1 or multiple sequences) at https://huggingface.co/spaces/BIASLab/sars-cov-2-classification-fcgr. CouGaR-g is also available at https://github.com/AlgoLab/CouGaR-g under the GPL.},

note = {giac119},

keywords = {WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {article}

}

Rizzo, Nicola; Mäkinen, Veli

Indexable Elastic Founder Graphs of Minimum Height Proceedings Article

In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 19:1–19:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Rizzo2022b,

title = {Indexable Elastic Founder Graphs of Minimum Height},

author = {Nicola Rizzo and Veli Mäkinen},

editor = {Hideo Bannai and Jan Holub},

doi = {10.4230/LIPIcs.CPM.2022.19},

issn = {1868-8969},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},

volume = {223},

pages = {19:1--19:19},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {Indexable elastic founder graphs have been recently proposed as a data structure for genomics applications supporting fast pattern matching queries. Consider segmenting a multiple sequence alignment MSA[1..m,1..n] into b blocks MSA[1..m,1..j₁], MSA[1..m,j₁+1..j₂], …, MSA[1..m,j_{b-1}+1..n]. The resulting elastic founder graph (EFG) is obtained by merging in each block the strings that are equivalent after the removal of gap symbols, taking the strings as the nodes of the block and the original MSA connections as edges. We call an elastic founder graph indexable if a node label occurs as a prefix of only those paths that start from a node of the same block. Equi et al. (ISAAC 2021) showed that such EFGs support fast pattern matching and studied their construction maximizing the number of blocks and minimizing the maximum length of a block, but left open the case of minimizing the maximum number of distinct strings in a block that we call graph height. For the simplified gapless setting, we give an O(mn) time algorithm to find a segmentation of an MSA minimizing the height of the resulting indexable founder graph, by combining previous results in segmentation algorithms and founder graphs. For the general setting, the known techniques yield a linear-time parameterized solution on constant alphabet Σ, taking time O(m n² log|Σ|) in the worst case, so we study the refined measure of prefix-aware height, that omits counting strings that are prefixes of another considered string. The indexable EFG minimizing the maximum prefix-aware height provides a lower bound for the original height: by exploiting exploiting suffix trees built from the MSA rows and the data structure answering weighted ancestor queries in constant time of Belazzougui et al. (CPM 2021), we give an O(mn)-time algorithm for the optimal EFG under this alternative height. },

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Badkobeh, Golnaz; Charalampopoulos, Panagiotis; Kosolobov, Dmitry; Pissis, Solon P.

Internal shortest absent word queries in constant time and linear space Journal Article

In: Theoretical Computer Science, vol. 922, pp. 271-282, 2022, ISSN: 0304-3975.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG

@article{Badkobeh2022,

title = {Internal shortest absent word queries in constant time and linear space},

author = {Golnaz Badkobeh and Panagiotis Charalampopoulos and Dmitry Kosolobov and Solon P. Pissis},

url = {https://arxiv.org/abs/2106.01763},

doi = {https://doi.org/10.1016/j.tcs.2022.04.029},

issn = {0304-3975},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

journal = {Theoretical Computer Science},

volume = {922},

pages = {271-282},

abstract = {Given a string T of length n over an alphabet Σ⊂1,2,…,nO(1) of size σ, we are to preprocess T so that given a range [i,j], we can return a representation of a shortest string over Σ that is absent in the fragment T[i]⋯T[j] of T. We present an O(n)-space data structure that answers such queries in constant time and can be constructed in O(nlogσn) time.},

keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {article}

}

Bonnet, Konstantinn; Marschall, Tobias; Doerr, Daniel

Constructing Founder Sets Under Allelic and Non-Allelic Homologous Recombination Proceedings Article

In: Boucher, Christina; Rahmann, Sven (Ed.): 22nd International Workshop on Algorithms in Bioinformatics (WABI 2022), pp. 6:1–6:23, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG

@inproceedings{Bonnet2022,

title = {Constructing Founder Sets Under Allelic and Non-Allelic Homologous Recombination},

author = {Konstantinn Bonnet and Tobias Marschall and Daniel Doerr},

editor = {Christina Boucher and Sven Rahmann},

doi = {10.4230/LIPIcs.WABI.2022.6},

issn = {1868-8969},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

booktitle = {22nd International Workshop on Algorithms in Bioinformatics (WABI 2022)},

volume = {242},

pages = {6:1--6:23},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {Homologous recombination between the maternal and paternal copies of a chromosome is a key mechanism for human inheritance and shapes population genetic properties of our species. However, a similar mechanism can also act between different copies of the same sequence, then called non-allelic homologous recombination (NAHR). This process can result in genomic rearrangements - including deletion, duplication, and inversion - and is underlying many genomic disorders. Despite its importance for genome evolution and disease, there is a lack of computational models to study genomic loci prone to NAHR.

In this work, we propose such a computational model, providing a unified framework for both (allelic) homologous recombination and NAHR. Our model represents a set of genomes as a graph, where human haplotypes correspond to walks through this graph. We formulate two founder set problems under our recombination model, provide flow-based algorithms for their solution, and demonstrate scalability to problem instances arising in practice.},

keywords = {WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

In this work, we propose such a computational model, providing a unified framework for both (allelic) homologous recombination and NAHR. Our model represents a set of genomes as a graph, where human haplotypes correspond to walks through this graph. We formulate two founder set problems under our recombination model, provide flow-based algorithms for their solution, and demonstrate scalability to problem instances arising in practice.

Loukides, Grigorios; Pissis, Solon P.

All-pairs suffix/prefix in optimal time using Aho-Corasick space Journal Article

In: Information Processing Letters, vol. 178, pp. 106275, 2022, ISSN: 0020-0190.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Loukides2022,

title = {All-pairs suffix/prefix in optimal time using Aho-Corasick space},

author = {Grigorios Loukides and Solon P. Pissis},

doi = {https://doi.org/10.1016/j.ipl.2022.106275},

issn = {0020-0190},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

journal = {Information Processing Letters},

volume = {178},

pages = {106275},

abstract = {The all-pairs suffix/prefix (APSP) problem is a classic problem in computer science with many applications in bioinformatics. Given a set ${S_1,ldots,S_k}$ of $k$ strings of total length $n$, we are asked to find, for each string $S_i, iin[1,k]$, its longest suffix that is a prefix of string $S_j$, for all j≠i, j∈[1,k]. Several algorithms running in the optimal $O(n+k^2)$ time for solving APSP are known. All of these algorithms are based on suffix sorting and thus require space Ω(n) in any case. We consider the parameterized version of the APSP problem, denoted by $ell$-APSP, in which we are asked to output only the pairs whose suffix/prefix overlap is of length at least $ell$. We give an algorithm for solving $ell$-APSP that runs in the optimal $O(n+|text{OUTPUT}_ℓ|)$ time using O(n) space, where $text{OUTPUT}_ℓ$ is the set of output pairs. Our algorithm is thus optimal for the APSP problem as well by setting $ell=0$. Notably, our algorithm is fundamentally different from all optimal algorithms solving the APSP problem: it does not rely on sorting the suffixes of all input strings but on a novel traversal of the Aho-Corasick machine, and it thus requires space linear in the size of the machine.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Charalampopoulos, Panagiotis; Kociumaka, Tomasz; Radoszewski, Jakub; Pissis, Solon P.; Rytter, Wojciech; Waleń, Tomasz; Zuba, Wiktor

Approximate Circular Pattern Matching Proceedings Article

In: Chechik, Shiri; Navarro, Gonzalo; Rotenberg, Eva; Herman, Grzegorz (Ed.): 30th Annual European Symposium on Algorithms (ESA 2022), pp. 35:1–35:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Charalampopoulos2022c,

title = {Approximate Circular Pattern Matching},

author = {Panagiotis Charalampopoulos and Tomasz Kociumaka and Jakub Radoszewski and Solon P. Pissis and Wojciech Rytter and Tomasz Waleń and Wiktor Zuba},

editor = {Shiri Chechik and Gonzalo Navarro and Eva Rotenberg and Grzegorz Herman},

doi = {10.4230/LIPIcs.ESA.2022.35},

issn = {1868-8969},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

booktitle = {30th Annual European Symposium on Algorithms (ESA 2022)},

volume = {244},

pages = {35:1--35:19},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {We investigate the complexity of approximate circular pattern matching (CPM, in short) under the Hamming and edit distance. Under each of these two basic metrics, we are given a length-n text T, a length-m pattern P, and a positive integer threshold k, and we are to report all starting positions (called occurrences) of fragments of T that are at distance at most k from some cyclic rotation of P. In the decision version of the problem, we are to check if there is any such occurrence. All previous results for approximate CPM were either average-case upper bounds or heuristics, with the exception of the work of Charalampopoulos et al. [$text{CKP}^+$, JCSS'21], who considered only the Hamming distance. For the reporting version of the approximate CPM problem, under the Hamming distance we improve upon the main algorithm of [$text{CKP}^+$, JCSS'21] from 𝒪(n+(n/m) ⋅ k⁴) to 𝒪(n+(n/m) ⋅ k³ log log k) time; for the edit distance, we give an 𝒪(nk²)-time algorithm. Notably, for the decision versions and wide parameter-ranges, we give algorithms whose complexities are almost identical to the state-of-the-art for standard (i.e., non-circular) approximate pattern matching:

- For the decision version of the approximate CPM problem under the Hamming distance, we obtain an 𝒪(n+(n/m) ⋅ k² log k / log log k)-time algorithm, which works in 𝒪(n) time whenever k = 𝒪(√{m log log m / log m}). In comparison, the fastest algorithm for the standard counterpart of the problem, by Chan et al. [CGKKP, STOC’20], runs in 𝒪(n) time only for k = 𝒪(√m). We achieve this result via a reduction to a geometric problem by building on ideas from [$text{CKP}^+$, JCSS'21] and Charalampopoulos et al. [CKW, FOCS'20].

- For the decision version of the approximate CPM problem under the edit distance, the 𝒪(nklog³ k) runtime of our algorithm near matches the 𝒪(nk) runtime of the Landau-Vishkin algorithm [LV, J. Algorithms'89] for approximate pattern matching under edit distance; the latter algorithm remains the fastest known for $k = Ω(m^{2/5})$. As a stepping stone, we propose an 𝒪(nklog³ k)-time algorithm for solving the Longest Prefix k'-Approximate Match problem, proposed by Landau et al. [LMS, SICOMP'98], for all k' ∈ {1,…,k}. Our algorithm is based on Tiskin’s theory of seaweeds [Tiskin, Math. Comput. Sci.'08], with recent advancements (see Charalampopoulos et al. [CKW, FOCS'22]), and on exploiting the seaweeds' relation to Monge matrices.

In contrast, we obtain a conditional lower bound that suggests a polynomial separation between approximate CPM under the Hamming distance over the binary alphabet and its non-circular counterpart. We also show that a strongly subquadratic-time algorithm for the decision version of approximate CPM under edit distance would refute the Strong Exponential Time Hypothesis.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

- For the decision version of the approximate CPM problem under the Hamming distance, we obtain an 𝒪(n+(n/m) ⋅ k² log k / log log k)-time algorithm, which works in 𝒪(n) time whenever k = 𝒪(√{m log log m / log m}). In comparison, the fastest algorithm for the standard counterpart of the problem, by Chan et al. [CGKKP, STOC’20], runs in 𝒪(n) time only for k = 𝒪(√m). We achieve this result via a reduction to a geometric problem by building on ideas from [$text{CKP}^+$, JCSS'21] and Charalampopoulos et al. [CKW, FOCS'20].

- For the decision version of the approximate CPM problem under the edit distance, the 𝒪(nklog³ k) runtime of our algorithm near matches the 𝒪(nk) runtime of the Landau-Vishkin algorithm [LV, J. Algorithms'89] for approximate pattern matching under edit distance; the latter algorithm remains the fastest known for $k = Ω(m^{2/5})$. As a stepping stone, we propose an 𝒪(nklog³ k)-time algorithm for solving the Longest Prefix k'-Approximate Match problem, proposed by Landau et al. [LMS, SICOMP'98], for all k' ∈ {1,…,k}. Our algorithm is based on Tiskin’s theory of seaweeds [Tiskin, Math. Comput. Sci.'08], with recent advancements (see Charalampopoulos et al. [CKW, FOCS'22]), and on exploiting the seaweeds' relation to Monge matrices.

In contrast, we obtain a conditional lower bound that suggests a polynomial separation between approximate CPM under the Hamming distance over the binary alphabet and its non-circular counterpart. We also show that a strongly subquadratic-time algorithm for the decision version of approximate CPM under edit distance would refute the Strong Exponential Time Hypothesis.

Bernardini, Giulia; Gabory, Esteban; Pissis, Solon P.; Stougie, Leen; Sweering, Michelle; Zuba, Wiktor

Elastic-Degenerate String Matching with 1 Error Proceedings Article

In: Castañeda, Armando; Rodr'iguez-Henr'iquez, Francisco (Ed.): LATIN 2022: Theoretical Informatics, pp. 20–37, Springer International Publishing, Cham, 2022, ISBN: 978-3-031-20624-5.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Bernardini2022d,

title = {Elastic-Degenerate String Matching with 1 Error},

author = {Giulia Bernardini and Esteban Gabory and Solon P. Pissis and Leen Stougie and Michelle Sweering and Wiktor Zuba},

editor = {Armando Castañeda and Francisco Rodr'iguez-Henr'iquez},

url = {https://arxiv.org/abs/2209.01095},

doi = {10.1007/978-3-031-20624-5_2},

isbn = {978-3-031-20624-5},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

booktitle = {LATIN 2022: Theoretical Informatics},

volume = {13568},

pages = {20--37},

publisher = {Springer International Publishing},

address = {Cham},

abstract = {An elastic-degenerate (ED) string is a sequence of n finite sets of strings of total length N, introduced to represent a set of related DNA sequences, also known as a pangenome. The ED string matching (EDSM) problem consists in reporting all occurrences of a pattern of length m in an ED text. The EDSM problem has recently received some attention by the combinatorial pattern matching community, culminating in an $mathcal{tilde O}(nm^{omega -1})+mathcal{O}(N)$-time algorithm [Bernardini et al., SIAM J. Comput. 2022], where $omega$ denotes the matrix multiplication exponent and the $mathcal{tilde O}(cdot)$ notation suppresses polylog factors. In the k-EDSM problem, the approximate version of EDSM, we are asked to report all pattern occurrences with at most k errors. k-EDSM can be solved in $mathcal{O}(k^2mG+kN)$ time under edit distance, where G denotes the total number of strings in the ED text [Bernardini et al., Theor. Comput. Sci. 2020]. Unfortunately, G is only bounded by N, and so even for $k=1$, the existing algorithm runs in $varOmega(mN)$ time in the worst case. Here we make progress in this direction. We show that 1-EDSM can be solved in $mathcal{O}((nm^2 + N)log m)$ or $mathcal{O}(nm^3 + N)$ time under edit distance. For the decision version of the problem, we present a faster $mathcal{O}(nm^2sqrt{log m} + Nloglog m)-time algorithm. Our algorithms rely on non-trivial reductions from 1-EDSM to special instances of classic computational geometry problems (2d rectangle stabbing or range emptiness), which we show how to solve efficiently.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Parmigiani, Luca; Wittler, Roland; Stoye, Jens

Revisiting pangenome openness with k-mers Unpublished

bioRxiv, 2022.

Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG

@unpublished{Parmigiani2022,

title = {Revisiting pangenome openness with k-mers},

author = {Luca Parmigiani and Roland Wittler and Jens Stoye},

doi = {10.1101/2022.11.15.516472},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

journal = {bioRxiv},

publisher = {Cold Spring Harbor Laboratory},

abstract = {Pangenomics is the study of related genomes collectively, usually from the same species or closely related taxa. Originally, pangenomes were defined for bacterial species. After the concept was extended to eukaryotic genomes, two definitions of pangenome evolved in parallel: the gene-based approach, which defines the pangenome as the union of all genes, and the sequence-based approach, which defines the pangenome as the set of all nonredundant genomic sequences.Estimating the total size of the pangenome for a given species has been subject of study since the very first mention of pangenomes. Traditionally, this is performed predicting the ratio at which new genes are discovered, referred to as the openness of the species.Here, we abstract each genome as a set of items, which is entirely agnostic of the two approaches (gene-based, sequence-based). Genes are a viable option for items, but also other possibilities are feasible, e.g., genome sequence substrings of fixed length k (k-mers).In the present study, we investigate the use of k-mers to estimate the openness as an alternative to genes, and compare the results. An efficient implementation is also provided.Competing Interest StatementThe authors have declared no competing interest.},

howpublished = {bioRxiv},

keywords = {WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {unpublished}

}

Rivals, Eric; Sweering, Michelle; Wang, Pengfei

Convergence of the number of period sets in strings Unpublished

arXiv, 2022.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@unpublished{Rivals2022,

title = {Convergence of the number of period sets in strings},

author = {Eric Rivals and Michelle Sweering and Pengfei Wang},

doi = {10.48550/ARXIV.2209.08926},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

publisher = {arXiv},

abstract = {Consider words of length $n$. The set of all periods of a word of length $n$ is a subset of ${0, 1, 2, ldots , n-1}$. However, any subset of ${0, 1, 2, ldots , n-1}$ is not necessarily a valid set of periods. In a seminal paper in 1981, Guibas and Odlyzko have proposed to encode the set of periods of a word into an $n$ long binary string, called an autocorrelation, where a one at position $i$ denotes a period of $i$. They considered the question of recognizing a valid period set, and also studied the number of valid period sets for length $n$, denoted $kappa n$. They conjectured that $ln(kappa n)$ asymptotically converges to a constant times $ln^2(n)$. If improved lower bounds for $ln(kappa n)/ln^2(n)$ were proposed in 2001, the question of a tight upper bound has remained opened since Guibas and Odlyzko’s paper. Here, we exhibit an upper bound for this fraction, which implies its convergence and closes this long standing conjecture. Moreover, we extend our result to find similar bounds for the number of correlations: a generalization of autocorrelations which encodes the overlaps between two strings.},

howpublished = {arXiv},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {unpublished}

}

Luo, Xiao; Kang, Xiongbin; Schönhuth, Alexander

phasebook: haplotype-aware de novo assembly of diploid genomes from long reads Journal Article

In: Genome biology, vol. 22, no. 299, pp. 1–26, 2021, ISSN: 1474-760X.

Abstract | Links | BibTeX | Tags: WP3: Translational CPG

@article{luo2021phasebook,

title = {phasebook: haplotype-aware de novo assembly of diploid genomes from long reads},

author = {Xiao Luo and Xiongbin Kang and Alexander Schönhuth},

doi = {https://doi.org/10.1186/s13059-021-02512-x},

issn = {1474-760X},

year = {2021},

date = {2021-10-27},

urldate = {2021-10-27},

journal = {Genome biology},

volume = {22},

number = {299},

pages = {1--26},

publisher = {BioMed Central},

abstract = {Haplotype-aware diploid genome assembly is crucial in genomics, precision medicine, and many other disciplines. Long-read sequencing technologies have greatly improved genome assembly. However, current long-read assemblers are either reference based, so introduce biases, or fail to capture the haplotype diversity of diploid genomes. We present phasebook, a de novo approach for reconstructing the haplotypes of diploid genomes from long reads. phasebook outperforms other approaches in terms of haplotype coverage by large margins, in addition to achieving competitive performance in terms of assembly errors and assembly contiguity.},

keywords = {WP3: Translational CPG},

pubstate = {published},

tppubtype = {article}

}

Charalampopoulos, Panagiotis; Chen, Huiping; Christen, Peter; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon P.; Radoszewski, Jakub

Pattern Masking for Dictionary Matching Proceedings Article

In: Ahn, Hee-Kap; Sadakane, Kunihiko (Ed.): 32nd International Symposium on Algorithms and Computation (ISAAC 2021), pp. 65:1–65:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Charalampopoulos2021,

title = {Pattern Masking for Dictionary Matching},

author = {Panagiotis Charalampopoulos and Huiping Chen and Peter Christen and Grigorios Loukides and Nadia Pisanti and Solon P. Pissis and Jakub Radoszewski},

editor = {Hee-Kap Ahn and Kunihiko Sadakane},

doi = {10.4230/LIPIcs.ISAAC.2021.65},

issn = {1868-8969},

year = {2021},

date = {2021-01-01},

urldate = {2021-01-01},

booktitle = {32nd International Symposium on Algorithms and Computation (ISAAC 2021)},

volume = {212},

pages = {65:1--65:19},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {Data masking is a common technique for sanitizing sensitive data maintained in database systems, and it is also becoming increasingly important in various application areas, such as in record linkage of personal data. This work formalizes the Pattern Masking for Dictionary Matching (PMDM) problem. In PMDM, we are given a dictionary 𝒟 of d strings, each of length 𝓁, a query string q of length 𝓁, and a positive integer z, and we are asked to compute a smallest set K ⊆ {1,…,𝓁}, so that if q[i] is replaced by a wildcard for all i ∈ K, then q matches at least z strings from 𝒟. Solving PMDM allows providing data utility guarantees as opposed to existing approaches.

We first show, through a reduction from the well-known k-Clique problem, that a decision version of the PMDM problem is NP-complete, even for strings over a binary alphabet. We thus approach the problem from a more practical perspective. We show a combinatorial 𝒪((d𝓁)^{|K|/3}+d𝓁)-time and 𝒪(d𝓁)-space algorithm for PMDM for |K| = 𝒪(1). In fact, we show that we cannot hope for a faster combinatorial algorithm, unless the combinatorial k-Clique hypothesis fails [Abboud et al., SIAM J. Comput. 2018; Lincoln et al., SODA 2018]. We also generalize this algorithm for the problem of masking multiple query strings simultaneously so that every string has at least z matches in 𝒟.

Note that PMDM can be viewed as a generalization of the decision version of the dictionary matching with mismatches problem: by querying a PMDM data structure with string q and z = 1, one obtains the minimal number of mismatches of q with any string from 𝒟. The query time or space of all known data structures for the more restricted problem of dictionary matching with at most k mismatches incurs some exponential factor with respect to k. A simple exact algorithm for PMDM runs in time 𝒪(2^𝓁 d). We present a data structure for PMDM that answers queries over 𝒟 in time 𝒪(2^{𝓁/2}(2^{𝓁/2}+τ)𝓁) and requires space 𝒪(2^𝓁 d²/τ²+2^{𝓁/2}d), for any parameter τ ∈ [1,d].

We complement our results by showing a two-way polynomial-time reduction between PMDM and the Minimum Union problem [Chlamtáč et al., SODA 2017]. This gives a polynomial-time 𝒪(d^{1/4+ε})-approximation algorithm for PMDM, which is tight under a plausible complexity conjecture. },

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

We first show, through a reduction from the well-known k-Clique problem, that a decision version of the PMDM problem is NP-complete, even for strings over a binary alphabet. We thus approach the problem from a more practical perspective. We show a combinatorial 𝒪((d𝓁)^{|K|/3}+d𝓁)-time and 𝒪(d𝓁)-space algorithm for PMDM for |K| = 𝒪(1). In fact, we show that we cannot hope for a faster combinatorial algorithm, unless the combinatorial k-Clique hypothesis fails [Abboud et al., SIAM J. Comput. 2018; Lincoln et al., SODA 2018]. We also generalize this algorithm for the problem of masking multiple query strings simultaneously so that every string has at least z matches in 𝒟.

Note that PMDM can be viewed as a generalization of the decision version of the dictionary matching with mismatches problem: by querying a PMDM data structure with string q and z = 1, one obtains the minimal number of mismatches of q with any string from 𝒟. The query time or space of all known data structures for the more restricted problem of dictionary matching with at most k mismatches incurs some exponential factor with respect to k. A simple exact algorithm for PMDM runs in time 𝒪(2^𝓁 d). We present a data structure for PMDM that answers queries over 𝒟 in time 𝒪(2^{𝓁/2}(2^{𝓁/2}+τ)𝓁) and requires space 𝒪(2^𝓁 d²/τ²+2^{𝓁/2}d), for any parameter τ ∈ [1,d].

We complement our results by showing a two-way polynomial-time reduction between PMDM and the Minimum Union problem [Chlamtáč et al., SODA 2017]. This gives a polynomial-time 𝒪(d^{1/4+ε})-approximation algorithm for PMDM, which is tight under a plausible complexity conjecture.

Pisanti, Nadia

On-Line Pattern Matching on D-Texts (Invited Talk) Proceedings Article

In: Gawrychowski, Paweł; Starikovskaya, Tatiana (Ed.): 32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021), pp. 3:1–3:2, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG

@inproceedings{Pisanti2021,

title = {On-Line Pattern Matching on D-Texts (Invited Talk)},

author = {Nadia Pisanti},

editor = {Paweł Gawrychowski and Tatiana Starikovskaya},

doi = {10.4230/LIPIcs.CPM.2021.3},

issn = {1868-8969},

year = {2021},

date = {2021-01-01},

urldate = {2021-01-01},

booktitle = {32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021)},

volume = {191},

pages = {3:1--3:2},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {The Elastic Degenerate String Matching (EDSM) problem is defined as that of finding an occurrence of a pattern P of length m in an ED-text T. A D-text (Degenerate text) is a string that actually represents a set of similar and aligned strings (e.g. a pan-genome [The Computational Pan-Genomics Consortium, 2018]) by collapsing common fragments into a standard string, and representing variants with sets of alternative substrings. When such substrings are not bound to have the same size, then we talk about elastic D-strings (ED-strings). In [R.Grossi et al., 2017] we gave an O(nm²+N) time on-line algorithm for EDSM, where n is the length of T and N is its size, defined as the total number of letters. A fundamental toolkit of our algorithm is the O(m²+N) time solution of the later called Active Prefixes problem (AP). In [K.Aoyama et al., 2018], a O(m^{1.5} √{log m}+N) solution for AP was shown, leading to a O(nm^{1.5} √{log m}+N) time solution for EDSM. The natural open problem was thus whether the 1.5 exponent could furtherly be decreased. In [G.Bernardini et al., 2019], we prove several properties that answer this and other questions: we give a conditional O(nm^{1.5}+N) lower bound for EDSM, proving that a combinatorial algorithm solving EDSM in O(nm^{1.5-ε} +N) time would break the Boolean Matrix Multiplication (BMM) conjecture; we use this result as a hint to devise a non-combinatorial algorithm that solves EDSM in O(nm^{1.381}+N) time; we do so by successfully combining Fast Fourier Transform and properties of string periodicity. In my talk I will overview the results above, as well as some interesting side results: the extension to a dictionary rather than a single pattern [S.P.Pissis and A.Retha, 2018], the introduction of errors [G.Bernardini et al., 2020], and a notion of matching among D-strings with its linear time solution [M.Alzamel et al., 2020].},

keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Conte, Alessio; Grossi, Roberto; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon P.; Punzi, Giulia

Beyond the BEST Theorem: Fast Assessment of Eulerian Trails Proceedings Article

In: Bampis, Evripidis; Pagourtzis, Aris (Ed.): Fundamentals of Computation Theory, pp. 162–175, Springer International Publishing, Cham, 2021, ISBN: 978-3-030-86593-1.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Conte2021,

title = {Beyond the BEST Theorem: Fast Assessment of Eulerian Trails},

author = {Alessio Conte and Roberto Grossi and Grigorios Loukides and Nadia Pisanti and Solon P. Pissis and Giulia Punzi},

editor = {Evripidis Bampis and Aris Pagourtzis},

url = {https://kclpure.kcl.ac.uk/portal/files/159578921/Estimating_Eulerian_Trails_FCT.pdf, Full PDF on King’s Research Portal},

doi = {10.1007/978-3-030-86593-1_11},

isbn = {978-3-030-86593-1},

year = {2021},

date = {2021-01-01},

urldate = {2021-01-01},

booktitle = {Fundamentals of Computation Theory},

pages = {162--175},

publisher = {Springer International Publishing},

address = {Cham},

abstract = {Given a directed multigraph $G=(V,E)$, with $|V|=n$ nodes and $|E|=m$ edges, and an integer $z$, we are asked to assess whether the number $#ET(G)$ of node-distinct Eulerian trails of $G$ is at least $z$; two trails are called node-distinct if their node sequences are different. This problem has been formalized by Bernardini et al. [ALENEX 2020] as it is the core computational problem in several string processing applications. It can be solved in $mathcal O(n^omega)$ arithmetic operations by applying the well-known BEST theorem, where $omega <2.373$ denotes the matrix multiplication exponent. The algorithmic challenge is: Can we solve this problem faster for certain values of $m$ and $z$? Namely, we want to design a combinatorial algorithm for assessing whether $#ET(G) ge z$, which does not resort to the BEST theorem and has a predictably bounded cost as a function of $m$ and $z$. We address this challenge here by providing a combinatorial algorithm requiring $mathcal O(mcdot min {z, #ET(G)})$ time.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Loukides, Grigorios; Pissis, Solon P.

Bidirectional String Anchors: A New String Sampling Mechanism Proceedings Article

In: Mutzel, Petra; Pagh, Rasmus; Herman, Grzegorz (Ed.): 29th Annual European Symposium on Algorithms (ESA 2021), pp. 64:1–64:21, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG

@inproceedings{Loukides2021,

title = {Bidirectional String Anchors: A New String Sampling Mechanism},

author = {Grigorios Loukides and Solon P. Pissis},

editor = {Petra Mutzel and Rasmus Pagh and Grzegorz Herman},

doi = {10.4230/LIPIcs.ESA.2021.64},

issn = {1868-8969},

year = {2021},

date = {2021-01-01},

urldate = {2021-01-01},

booktitle = {29th Annual European Symposium on Algorithms (ESA 2021)},

volume = {204},

pages = {64:1--64:21},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = { The minimizers sampling mechanism is a popular mechanism for string sampling introduced independently by Schleimer et al. [SIGMOD 2003] and by Roberts et al. [Bioinf. 2004]. Given two positive integers w and k, it selects the lexicographically smallest length-k substring in every fragment of w consecutive length-k substrings (in every sliding window of length w+k-1). Minimizers samples are approximately uniform, locally consistent, and computable in linear time. Although they do not have good worst-case guarantees on their size, they are often small in practice. They thus have been successfully employed in several string processing applications. Two main disadvantages of minimizers sampling mechanisms are: first, they also do not have good guarantees on the expected size of their samples for every combination of w and k; and, second, indexes that are constructed over their samples do not have good worst-case guarantees for on-line pattern searches.

To alleviate these disadvantages, we introduce bidirectional string anchors (bd-anchors), a new string sampling mechanism. Given a positive integer 𝓁, our mechanism selects the lexicographically smallest rotation in every length-𝓁 fragment (in every sliding window of length 𝓁). We show that bd-anchors samples are also approximately uniform, locally consistent, and computable in linear time. In addition, our experiments using several datasets demonstrate that the bd-anchors sample sizes decrease proportionally to 𝓁; and that these sizes are competitive to or smaller than the minimizers sample sizes using the analogous sampling parameters. We provide theoretical justification for these results by analyzing the expected size of bd-anchors samples.

We also show that by using any bd-anchors sample, we can construct, in near-linear time, an index which requires linear (extra) space in the size of the sample and answers on-line pattern searches in near-optimal time. We further show, using several datasets, that a simple implementation of our index is consistently faster for on-line pattern searches than an analogous implementation of a minimizers-based index [Grabowski and Raniszewski, Softw. Pract. Exp. 2017].

Finally, we highlight the applicability of bd-anchors by developing an efficient and effective heuristic for top-K similarity search under edit distance. We show, using synthetic datasets, that our heuristic is more accurate and more than one order of magnitude faster in top-K similarity searches than the state-of-the-art tool for the same purpose [Zhang and Zhang, KDD 2020]. },

keywords = {WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

To alleviate these disadvantages, we introduce bidirectional string anchors (bd-anchors), a new string sampling mechanism. Given a positive integer 𝓁, our mechanism selects the lexicographically smallest rotation in every length-𝓁 fragment (in every sliding window of length 𝓁). We show that bd-anchors samples are also approximately uniform, locally consistent, and computable in linear time. In addition, our experiments using several datasets demonstrate that the bd-anchors sample sizes decrease proportionally to 𝓁; and that these sizes are competitive to or smaller than the minimizers sample sizes using the analogous sampling parameters. We provide theoretical justification for these results by analyzing the expected size of bd-anchors samples.

We also show that by using any bd-anchors sample, we can construct, in near-linear time, an index which requires linear (extra) space in the size of the sample and answers on-line pattern searches in near-optimal time. We further show, using several datasets, that a simple implementation of our index is consistently faster for on-line pattern searches than an analogous implementation of a minimizers-based index [Grabowski and Raniszewski, Softw. Pract. Exp. 2017].

Finally, we highlight the applicability of bd-anchors by developing an efficient and effective heuristic for top-K similarity search under edit distance. We show, using synthetic datasets, that our heuristic is more accurate and more than one order of magnitude faster in top-K similarity searches than the state-of-the-art tool for the same purpose [Zhang and Zhang, KDD 2020].

Charalampopoulos, Panagiotis; Kociumaka, Tomasz; Pissis, Solon P.; Radoszewski, Jakub

Faster Algorithms for Longest Common Substring Proceedings Article

In: Mutzel, Petra; Pagh, Rasmus; Herman, Grzegorz (Ed.): 29th Annual European Symposium on Algorithms (ESA 2021), pp. 30:1–30:17, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG

@inproceedings{Charalampopoulos2021b,

title = {Faster Algorithms for Longest Common Substring},

author = {Panagiotis Charalampopoulos and Tomasz Kociumaka and Solon P. Pissis and Jakub Radoszewski},

editor = {Petra Mutzel and Rasmus Pagh and Grzegorz Herman},

doi = {10.4230/LIPIcs.ESA.2021.30},

issn = {1868-8969},

year = {2021},

date = {2021-01-01},

urldate = {2021-01-01},

booktitle = {29th Annual European Symposium on Algorithms (ESA 2021)},

volume = {204},

pages = {30:1--30:17},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = { In the classic longest common substring (LCS) problem, we are given two strings S and T, each of length at most n, over an alphabet of size σ, and we are asked to find a longest string occurring as a fragment of both S and T. Weiner, in his seminal paper that introduced the suffix tree, presented an 𝒪(n log σ)-time algorithm for this problem [SWAT 1973]. For polynomially-bounded integer alphabets, the linear-time construction of suffix trees by Farach yielded an 𝒪(n)-time algorithm for the LCS problem [FOCS 1997]. However, for small alphabets, this is not necessarily optimal for the LCS problem in the word RAM model of computation, in which the strings can be stored in 𝒪(n log σ/log n) space and read in 𝒪(n log σ/log n) time. We show that, in this model, we can compute an LCS in time 𝒪(n log σ / √{log n}), which is sublinear in n if σ = 2^{o(√{log n})} (in particular, if σ = 𝒪(1)), using optimal space 𝒪(n log σ/log n).

We then lift our ideas to the problem of computing a k-mismatch LCS, which has received considerable attention in recent years. In this problem, the aim is to compute a longest substring of S that occurs in T with at most k mismatches. Flouri et al. showed how to compute a 1-mismatch LCS in 𝒪(n log n) time [IPL 2015]. Thankachan et al. extended this result to computing a k-mismatch LCS in 𝒪(n log^k n) time for k = 𝒪(1) [J. Comput. Biol. 2016]. We show an 𝒪(n log^{k-1/2} n)-time algorithm, for any constant integer k > 0 and irrespective of the alphabet size, using 𝒪(n) space as the previous approaches. We thus notably break through the well-known n log^k n barrier, which stems from a recursive heavy-path decomposition technique that was first introduced in the seminal paper of Cole et al. [STOC 2004] for string indexing with k errors. },

keywords = {WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

We then lift our ideas to the problem of computing a k-mismatch LCS, which has received considerable attention in recent years. In this problem, the aim is to compute a longest substring of S that occurs in T with at most k mismatches. Flouri et al. showed how to compute a 1-mismatch LCS in 𝒪(n log n) time [IPL 2015]. Thankachan et al. extended this result to computing a k-mismatch LCS in 𝒪(n log^k n) time for k = 𝒪(1) [J. Comput. Biol. 2016]. We show an 𝒪(n log^{k-1/2} n)-time algorithm, for any constant integer k > 0 and irrespective of the alphabet size, using 𝒪(n) space as the previous approaches. We thus notably break through the well-known n log^k n barrier, which stems from a recursive heavy-path decomposition technique that was first introduced in the seminal paper of Cole et al. [STOC 2004] for string indexing with k errors.

Park, Sangsoo; Park, Sung Gwan; Cazaux, Bastien; Park, Kunsoo; Rivals, Eric

A Linear Time Algorithm for Constructing Hierarchical Overlap Graphs Proceedings Article

In: Gawrychowski, Paweł; Starikovskaya, Tatiana (Ed.): 32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021), pp. 22:1–22:9, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Park2021,

title = {A Linear Time Algorithm for Constructing Hierarchical Overlap Graphs},

author = {Sangsoo Park and Sung Gwan Park and Bastien Cazaux and Kunsoo Park and Eric Rivals},

editor = {Paweł Gawrychowski and Tatiana Starikovskaya},

doi = {10.4230/LIPIcs.CPM.2021.22},

issn = {1868-8969},

year = {2021},

date = {2021-01-01},

urldate = {2021-01-01},

booktitle = {32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021)},

volume = {191},

pages = {22:1--22:9},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {The hierarchical overlap graph (HOG) is a graph that encodes overlaps from a given set P of n strings, as the overlap graph does. A best known algorithm constructs HOG in O(||P|| log n) time and O(||P||) space, where ||P|| is the sum of lengths of the strings in P. In this paper we present a new algorithm to construct HOG in O(||P||) time and space. Hence, the construction time and space of HOG are better than those of the overlap graph, which are O(||P|| + n²). },

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Cazaux, Bastien; Rivals, Eric

Greedy-reduction from Shortest Linear Superstring to Shortest Circular Superstring Unpublished

arXiv, 2020.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@unpublished{Cazaux2020,

title = {Greedy-reduction from Shortest Linear Superstring to Shortest Circular Superstring},

author = {Bastien Cazaux and Eric Rivals},

doi = {10.48550/ARXIV.2012.08878},

year = {2020},

date = {2020-01-01},

urldate = {2020-01-01},

publisher = {arXiv},

abstract = {A superstring of a set of strings correspond to a string which contains all the other strings as substrings. The problem of finding the Shortest Linear Superstring is a well-know and well-studied problem in stringology. We present here a variant of this problem, the Shortest Circular Superstring problem where the sought superstring is a circular string. We show a strong link between these two problems and prove that the Shortest Circular Superstring problem is NP-complete. Moreover, we propose a new conjecture on the approximation ratio of the Shortest Circular Superstring problem. },

howpublished = {arXiv},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {unpublished}

}