Loukides, Grigorios; Pissis, Solon P.
All-pairs suffix/prefix in optimal time using Aho-Corasick space Journal Article
In: Information Processing Letters, vol. 178, pp. 106275, 2022, ISSN: 0020-0190.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Loukides2022,
title = {All-pairs suffix/prefix in optimal time using Aho-Corasick space},
author = {Grigorios Loukides and Solon P. Pissis},
doi = {https://doi.org/10.1016/j.ipl.2022.106275},
issn = {0020-0190},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {Information Processing Letters},
volume = {178},
pages = {106275},
abstract = {The all-pairs suffix/prefix (APSP) problem is a classic problem in computer science with many applications in bioinformatics. Given a set ${S_1,ldots,S_k}$ of $k$ strings of total length $n$, we are asked to find, for each string $S_i, iin[1,k]$, its longest suffix that is a prefix of string $S_j$, for all j≠i, j∈[1,k]. Several algorithms running in the optimal $O(n+k^2)$ time for solving APSP are known. All of these algorithms are based on suffix sorting and thus require space Ω(n) in any case. We consider the parameterized version of the APSP problem, denoted by $ell$-APSP, in which we are asked to output only the pairs whose suffix/prefix overlap is of length at least $ell$. We give an algorithm for solving $ell$-APSP that runs in the optimal $O(n+|text{OUTPUT}_ℓ|)$ time using O(n) space, where $text{OUTPUT}_ℓ$ is the set of output pairs. Our algorithm is thus optimal for the APSP problem as well by setting $ell=0$. Notably, our algorithm is fundamentally different from all optimal algorithms solving the APSP problem: it does not rely on sorting the suffixes of all input strings but on a novel traversal of the Aho-Corasick machine, and it thus requires space linear in the size of the machine.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Charalampopoulos, Panagiotis; Kociumaka, Tomasz; Radoszewski, Jakub; Pissis, Solon P.; Rytter, Wojciech; Waleń, Tomasz; Zuba, Wiktor
Approximate Circular Pattern Matching Proceedings Article
In: Chechik, Shiri; Navarro, Gonzalo; Rotenberg, Eva; Herman, Grzegorz (Ed.): 30th Annual European Symposium on Algorithms (ESA 2022), pp. 35:1–35:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Charalampopoulos2022c,
title = {Approximate Circular Pattern Matching},
author = {Panagiotis Charalampopoulos and Tomasz Kociumaka and Jakub Radoszewski and Solon P. Pissis and Wojciech Rytter and Tomasz Waleń and Wiktor Zuba},
editor = {Shiri Chechik and Gonzalo Navarro and Eva Rotenberg and Grzegorz Herman},
doi = {10.4230/LIPIcs.ESA.2022.35},
issn = {1868-8969},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {30th Annual European Symposium on Algorithms (ESA 2022)},
volume = {244},
pages = {35:1--35:19},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {We investigate the complexity of approximate circular pattern matching (CPM, in short) under the Hamming and edit distance. Under each of these two basic metrics, we are given a length-n text T, a length-m pattern P, and a positive integer threshold k, and we are to report all starting positions (called occurrences) of fragments of T that are at distance at most k from some cyclic rotation of P. In the decision version of the problem, we are to check if there is any such occurrence. All previous results for approximate CPM were either average-case upper bounds or heuristics, with the exception of the work of Charalampopoulos et al. [$text{CKP}^+$, JCSS'21], who considered only the Hamming distance. For the reporting version of the approximate CPM problem, under the Hamming distance we improve upon the main algorithm of [$text{CKP}^+$, JCSS'21] from 𝒪(n+(n/m) ⋅ k⁴) to 𝒪(n+(n/m) ⋅ k³ log log k) time; for the edit distance, we give an 𝒪(nk²)-time algorithm. Notably, for the decision versions and wide parameter-ranges, we give algorithms whose complexities are almost identical to the state-of-the-art for standard (i.e., non-circular) approximate pattern matching:
- For the decision version of the approximate CPM problem under the Hamming distance, we obtain an 𝒪(n+(n/m) ⋅ k² log k / log log k)-time algorithm, which works in 𝒪(n) time whenever k = 𝒪(√{m log log m / log m}). In comparison, the fastest algorithm for the standard counterpart of the problem, by Chan et al. [CGKKP, STOC’20], runs in 𝒪(n) time only for k = 𝒪(√m). We achieve this result via a reduction to a geometric problem by building on ideas from [$text{CKP}^+$, JCSS'21] and Charalampopoulos et al. [CKW, FOCS'20].
- For the decision version of the approximate CPM problem under the edit distance, the 𝒪(nklog³ k) runtime of our algorithm near matches the 𝒪(nk) runtime of the Landau-Vishkin algorithm [LV, J. Algorithms'89] for approximate pattern matching under edit distance; the latter algorithm remains the fastest known for $k = Ω(m^{2/5})$. As a stepping stone, we propose an 𝒪(nklog³ k)-time algorithm for solving the Longest Prefix k'-Approximate Match problem, proposed by Landau et al. [LMS, SICOMP'98], for all k' ∈ {1,…,k}. Our algorithm is based on Tiskin’s theory of seaweeds [Tiskin, Math. Comput. Sci.'08], with recent advancements (see Charalampopoulos et al. [CKW, FOCS'22]), and on exploiting the seaweeds' relation to Monge matrices.
In contrast, we obtain a conditional lower bound that suggests a polynomial separation between approximate CPM under the Hamming distance over the binary alphabet and its non-circular counterpart. We also show that a strongly subquadratic-time algorithm for the decision version of approximate CPM under edit distance would refute the Strong Exponential Time Hypothesis.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
- For the decision version of the approximate CPM problem under the Hamming distance, we obtain an 𝒪(n+(n/m) ⋅ k² log k / log log k)-time algorithm, which works in 𝒪(n) time whenever k = 𝒪(√{m log log m / log m}). In comparison, the fastest algorithm for the standard counterpart of the problem, by Chan et al. [CGKKP, STOC’20], runs in 𝒪(n) time only for k = 𝒪(√m). We achieve this result via a reduction to a geometric problem by building on ideas from [$text{CKP}^+$, JCSS'21] and Charalampopoulos et al. [CKW, FOCS'20].
- For the decision version of the approximate CPM problem under the edit distance, the 𝒪(nklog³ k) runtime of our algorithm near matches the 𝒪(nk) runtime of the Landau-Vishkin algorithm [LV, J. Algorithms'89] for approximate pattern matching under edit distance; the latter algorithm remains the fastest known for $k = Ω(m^{2/5})$. As a stepping stone, we propose an 𝒪(nklog³ k)-time algorithm for solving the Longest Prefix k'-Approximate Match problem, proposed by Landau et al. [LMS, SICOMP'98], for all k' ∈ {1,…,k}. Our algorithm is based on Tiskin’s theory of seaweeds [Tiskin, Math. Comput. Sci.'08], with recent advancements (see Charalampopoulos et al. [CKW, FOCS'22]), and on exploiting the seaweeds' relation to Monge matrices.
In contrast, we obtain a conditional lower bound that suggests a polynomial separation between approximate CPM under the Hamming distance over the binary alphabet and its non-circular counterpart. We also show that a strongly subquadratic-time algorithm for the decision version of approximate CPM under edit distance would refute the Strong Exponential Time Hypothesis.
Bernardini, Giulia; Gabory, Esteban; Pissis, Solon P.; Stougie, Leen; Sweering, Michelle; Zuba, Wiktor
Elastic-Degenerate String Matching with 1 Error Proceedings Article
In: Castañeda, Armando; Rodr'iguez-Henr'iquez, Francisco (Ed.): LATIN 2022: Theoretical Informatics, pp. 20–37, Springer International Publishing, Cham, 2022, ISBN: 978-3-031-20624-5.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Bernardini2022d,
title = {Elastic-Degenerate String Matching with 1 Error},
author = {Giulia Bernardini and Esteban Gabory and Solon P. Pissis and Leen Stougie and Michelle Sweering and Wiktor Zuba},
editor = {Armando Castañeda and Francisco Rodr'iguez-Henr'iquez},
url = {https://arxiv.org/abs/2209.01095},
doi = {10.1007/978-3-031-20624-5_2},
isbn = {978-3-031-20624-5},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {LATIN 2022: Theoretical Informatics},
volume = {13568},
pages = {20--37},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {An elastic-degenerate (ED) string is a sequence of n finite sets of strings of total length N, introduced to represent a set of related DNA sequences, also known as a pangenome. The ED string matching (EDSM) problem consists in reporting all occurrences of a pattern of length m in an ED text. The EDSM problem has recently received some attention by the combinatorial pattern matching community, culminating in an $mathcal{tilde O}(nm^{omega -1})+mathcal{O}(N)$-time algorithm [Bernardini et al., SIAM J. Comput. 2022], where $omega$ denotes the matrix multiplication exponent and the $mathcal{tilde O}(cdot)$ notation suppresses polylog factors. In the k-EDSM problem, the approximate version of EDSM, we are asked to report all pattern occurrences with at most k errors. k-EDSM can be solved in $mathcal{O}(k^2mG+kN)$ time under edit distance, where G denotes the total number of strings in the ED text [Bernardini et al., Theor. Comput. Sci. 2020]. Unfortunately, G is only bounded by N, and so even for $k=1$, the existing algorithm runs in $varOmega(mN)$ time in the worst case. Here we make progress in this direction. We show that 1-EDSM can be solved in $mathcal{O}((nm^2 + N)log m)$ or $mathcal{O}(nm^3 + N)$ time under edit distance. For the decision version of the problem, we present a faster $mathcal{O}(nm^2sqrt{log m} + Nloglog m)-time algorithm. Our algorithms rely on non-trivial reductions from 1-EDSM to special instances of classic computational geometry problems (2d rectangle stabbing or range emptiness), which we show how to solve efficiently.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Mwaniki, Njagi Moses; Garrison, Erik; Pisanti, Nadia
Fast Exact String to D-Texts Alignments Unpublished
arXiv, 2022.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@unpublished{Mwaniki2022,
title = {Fast Exact String to D-Texts Alignments},
author = {Njagi Moses Mwaniki and Erik Garrison and Nadia Pisanti},
doi = {10.48550/ARXIV.2206.03242},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
publisher = {arXiv},
abstract = {In recent years, aligning a sequence to a pangenome has become a central problem in genomics and pangenomics. A fast and accurate solution to this problem can serve as a toolkit to many crucial tasks such as read-correction, Multiple Sequences Alignment (MSA), genome assemblies, variant calling, just to name a few. In this paper we propose a new, fast and exact method to align a string to a D-string, the latter possibly representing an MSA, a pan-genome or a partial assembly. An implementation of our tool dsa is publicly available at https://github.com/urbanslug/dsa.},
howpublished = {arXiv},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {unpublished}
}
Lemane, Téo; Medvedev, Paul; Chikhi, Rayan; Peterlongo, Pierre
kmtricks: efficient and flexible construction of Bloom filters for large sequencing data collections Journal Article
In: Bioinformatics Advances, vol. 2, no. 1, 2022, ISSN: 2635-0041.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Lemane2022b,
title = {kmtricks: efficient and flexible construction of Bloom filters for large sequencing data collections},
author = {Téo Lemane and Paul Medvedev and Rayan Chikhi and Pierre Peterlongo},
editor = {Thomas Lengauer},
url = {https://github.com/tlemane/kmtricks},
doi = {10.1093/bioadv/vbac029},
issn = {2635-0041},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {Bioinformatics Advances},
volume = {2},
number = {1},
publisher = {Oxford University Press (OUP)},
abstract = {When indexing large collections of short-read sequencing data, a common operation that has now been implemented in several tools (Sequence Bloom Trees and variants, BIGSI) is to construct a collection of Bloom filters, one per sample. Each Bloom filter is used to represent a set of k-mers which approximates the desired set of all the non-erroneous k-mers present in the sample. However, this approximation is imperfect, especially in the case of metagenomics data. Erroneous but abundant k-mers are wrongly included, and non-erroneous but low-abundant ones are wrongly discarded. We propose kmtricks, a novel approach for generating Bloom filters from terabase-sized collections of sequencing data. Our main contributions are (i) an efficient method for jointly counting k-mers across multiple samples, including a streamlined Bloom filter construction by directly counting, partitioning and sorting hashes instead of k-mers, which is approximately four times faster than state-of-the-art tools; (ii) a novel technique that takes advantage of joint counting to preserve low-abundant k-mers present in several samples, improving the recovery of non-erroneous k-mers. Our experiments highlight that this technique preserves around 8× more k-mers than the usual yet crude filtering of low-abundance k-mers in a large metagenomics dataset.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Pisanti, Nadia
On-Line Pattern Matching on D-Texts (Invited Talk) Proceedings Article
In: Gawrychowski, Paweł; Starikovskaya, Tatiana (Ed.): 32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021), pp. 3:1–3:2, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@inproceedings{Pisanti2021,
title = {On-Line Pattern Matching on D-Texts (Invited Talk)},
author = {Nadia Pisanti},
editor = {Paweł Gawrychowski and Tatiana Starikovskaya},
doi = {10.4230/LIPIcs.CPM.2021.3},
issn = {1868-8969},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021)},
volume = {191},
pages = {3:1--3:2},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {The Elastic Degenerate String Matching (EDSM) problem is defined as that of finding an occurrence of a pattern P of length m in an ED-text T. A D-text (Degenerate text) is a string that actually represents a set of similar and aligned strings (e.g. a pan-genome [The Computational Pan-Genomics Consortium, 2018]) by collapsing common fragments into a standard string, and representing variants with sets of alternative substrings. When such substrings are not bound to have the same size, then we talk about elastic D-strings (ED-strings). In [R.Grossi et al., 2017] we gave an O(nm²+N) time on-line algorithm for EDSM, where n is the length of T and N is its size, defined as the total number of letters. A fundamental toolkit of our algorithm is the O(m²+N) time solution of the later called Active Prefixes problem (AP). In [K.Aoyama et al., 2018], a O(m^{1.5} √{log m}+N) solution for AP was shown, leading to a O(nm^{1.5} √{log m}+N) time solution for EDSM. The natural open problem was thus whether the 1.5 exponent could furtherly be decreased. In [G.Bernardini et al., 2019], we prove several properties that answer this and other questions: we give a conditional O(nm^{1.5}+N) lower bound for EDSM, proving that a combinatorial algorithm solving EDSM in O(nm^{1.5-ε} +N) time would break the Boolean Matrix Multiplication (BMM) conjecture; we use this result as a hint to devise a non-combinatorial algorithm that solves EDSM in O(nm^{1.381}+N) time; we do so by successfully combining Fast Fourier Transform and properties of string periodicity. In my talk I will overview the results above, as well as some interesting side results: the extension to a dictionary rather than a single pattern [S.P.Pissis and A.Retha, 2018], the introduction of errors [G.Bernardini et al., 2020], and a notion of matching among D-strings with its linear time solution [M.Alzamel et al., 2020].},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Conte, Alessio; Grossi, Roberto; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon P.; Punzi, Giulia
Beyond the BEST Theorem: Fast Assessment of Eulerian Trails Proceedings Article
In: Bampis, Evripidis; Pagourtzis, Aris (Ed.): Fundamentals of Computation Theory, pp. 162–175, Springer International Publishing, Cham, 2021, ISBN: 978-3-030-86593-1.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Conte2021,
title = {Beyond the BEST Theorem: Fast Assessment of Eulerian Trails},
author = {Alessio Conte and Roberto Grossi and Grigorios Loukides and Nadia Pisanti and Solon P. Pissis and Giulia Punzi},
editor = {Evripidis Bampis and Aris Pagourtzis},
url = {https://kclpure.kcl.ac.uk/portal/files/159578921/Estimating_Eulerian_Trails_FCT.pdf, Full PDF on King’s Research Portal},
doi = {10.1007/978-3-030-86593-1_11},
isbn = {978-3-030-86593-1},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {Fundamentals of Computation Theory},
pages = {162--175},
publisher = {Springer International Publishing},
address = {Cham},
abstract = {Given a directed multigraph $G=(V,E)$, with $|V|=n$ nodes and $|E|=m$ edges, and an integer $z$, we are asked to assess whether the number $#ET(G)$ of node-distinct Eulerian trails of $G$ is at least $z$; two trails are called node-distinct if their node sequences are different. This problem has been formalized by Bernardini et al. [ALENEX 2020] as it is the core computational problem in several string processing applications. It can be solved in $mathcal O(n^omega)$ arithmetic operations by applying the well-known BEST theorem, where $omega <2.373$ denotes the matrix multiplication exponent. The algorithmic challenge is: Can we solve this problem faster for certain values of $m$ and $z$? Namely, we want to design a combinatorial algorithm for assessing whether $#ET(G) ge z$, which does not resort to the BEST theorem and has a predictably bounded cost as a function of $m$ and $z$. We address this challenge here by providing a combinatorial algorithm requiring $mathcal O(mcdot min {z, #ET(G)})$ time.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Charalampopoulos, Panagiotis; Chen, Huiping; Christen, Peter; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon P.; Radoszewski, Jakub
Pattern Masking for Dictionary Matching Proceedings Article
In: Ahn, Hee-Kap; Sadakane, Kunihiko (Ed.): 32nd International Symposium on Algorithms and Computation (ISAAC 2021), pp. 65:1–65:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Charalampopoulos2021,
title = {Pattern Masking for Dictionary Matching},
author = {Panagiotis Charalampopoulos and Huiping Chen and Peter Christen and Grigorios Loukides and Nadia Pisanti and Solon P. Pissis and Jakub Radoszewski},
editor = {Hee-Kap Ahn and Kunihiko Sadakane},
doi = {10.4230/LIPIcs.ISAAC.2021.65},
issn = {1868-8969},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {32nd International Symposium on Algorithms and Computation (ISAAC 2021)},
volume = {212},
pages = {65:1--65:19},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {Data masking is a common technique for sanitizing sensitive data maintained in database systems, and it is also becoming increasingly important in various application areas, such as in record linkage of personal data. This work formalizes the Pattern Masking for Dictionary Matching (PMDM) problem. In PMDM, we are given a dictionary 𝒟 of d strings, each of length 𝓁, a query string q of length 𝓁, and a positive integer z, and we are asked to compute a smallest set K ⊆ {1,…,𝓁}, so that if q[i] is replaced by a wildcard for all i ∈ K, then q matches at least z strings from 𝒟. Solving PMDM allows providing data utility guarantees as opposed to existing approaches.
We first show, through a reduction from the well-known k-Clique problem, that a decision version of the PMDM problem is NP-complete, even for strings over a binary alphabet. We thus approach the problem from a more practical perspective. We show a combinatorial 𝒪((d𝓁)^{|K|/3}+d𝓁)-time and 𝒪(d𝓁)-space algorithm for PMDM for |K| = 𝒪(1). In fact, we show that we cannot hope for a faster combinatorial algorithm, unless the combinatorial k-Clique hypothesis fails [Abboud et al., SIAM J. Comput. 2018; Lincoln et al., SODA 2018]. We also generalize this algorithm for the problem of masking multiple query strings simultaneously so that every string has at least z matches in 𝒟.
Note that PMDM can be viewed as a generalization of the decision version of the dictionary matching with mismatches problem: by querying a PMDM data structure with string q and z = 1, one obtains the minimal number of mismatches of q with any string from 𝒟. The query time or space of all known data structures for the more restricted problem of dictionary matching with at most k mismatches incurs some exponential factor with respect to k. A simple exact algorithm for PMDM runs in time 𝒪(2^𝓁 d). We present a data structure for PMDM that answers queries over 𝒟 in time 𝒪(2^{𝓁/2}(2^{𝓁/2}+τ)𝓁) and requires space 𝒪(2^𝓁 d²/τ²+2^{𝓁/2}d), for any parameter τ ∈ [1,d].
We complement our results by showing a two-way polynomial-time reduction between PMDM and the Minimum Union problem [Chlamtáč et al., SODA 2017]. This gives a polynomial-time 𝒪(d^{1/4+ε})-approximation algorithm for PMDM, which is tight under a plausible complexity conjecture. },
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
We first show, through a reduction from the well-known k-Clique problem, that a decision version of the PMDM problem is NP-complete, even for strings over a binary alphabet. We thus approach the problem from a more practical perspective. We show a combinatorial 𝒪((d𝓁)^{|K|/3}+d𝓁)-time and 𝒪(d𝓁)-space algorithm for PMDM for |K| = 𝒪(1). In fact, we show that we cannot hope for a faster combinatorial algorithm, unless the combinatorial k-Clique hypothesis fails [Abboud et al., SIAM J. Comput. 2018; Lincoln et al., SODA 2018]. We also generalize this algorithm for the problem of masking multiple query strings simultaneously so that every string has at least z matches in 𝒟.
Note that PMDM can be viewed as a generalization of the decision version of the dictionary matching with mismatches problem: by querying a PMDM data structure with string q and z = 1, one obtains the minimal number of mismatches of q with any string from 𝒟. The query time or space of all known data structures for the more restricted problem of dictionary matching with at most k mismatches incurs some exponential factor with respect to k. A simple exact algorithm for PMDM runs in time 𝒪(2^𝓁 d). We present a data structure for PMDM that answers queries over 𝒟 in time 𝒪(2^{𝓁/2}(2^{𝓁/2}+τ)𝓁) and requires space 𝒪(2^𝓁 d²/τ²+2^{𝓁/2}d), for any parameter τ ∈ [1,d].
We complement our results by showing a two-way polynomial-time reduction between PMDM and the Minimum Union problem [Chlamtáč et al., SODA 2017]. This gives a polynomial-time 𝒪(d^{1/4+ε})-approximation algorithm for PMDM, which is tight under a plausible complexity conjecture.
Park, Sangsoo; Park, Sung Gwan; Cazaux, Bastien; Park, Kunsoo; Rivals, Eric
A Linear Time Algorithm for Constructing Hierarchical Overlap Graphs Proceedings Article
In: Gawrychowski, Paweł; Starikovskaya, Tatiana (Ed.): 32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021), pp. 22:1–22:9, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Park2021,
title = {A Linear Time Algorithm for Constructing Hierarchical Overlap Graphs},
author = {Sangsoo Park and Sung Gwan Park and Bastien Cazaux and Kunsoo Park and Eric Rivals},
editor = {Paweł Gawrychowski and Tatiana Starikovskaya},
doi = {10.4230/LIPIcs.CPM.2021.22},
issn = {1868-8969},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021)},
volume = {191},
pages = {22:1--22:9},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {The hierarchical overlap graph (HOG) is a graph that encodes overlaps from a given set P of n strings, as the overlap graph does. A best known algorithm constructs HOG in O(||P|| log n) time and O(||P||) space, where ||P|| is the sum of lengths of the strings in P. In this paper we present a new algorithm to construct HOG in O(||P||) time and space. Hence, the construction time and space of HOG are better than those of the overlap graph, which are O(||P|| + n²). },
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Cazaux, Bastien; Rivals, Eric
Greedy-reduction from Shortest Linear Superstring to Shortest Circular Superstring Unpublished
arXiv, 2020.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@unpublished{Cazaux2020,
title = {Greedy-reduction from Shortest Linear Superstring to Shortest Circular Superstring},
author = {Bastien Cazaux and Eric Rivals},
doi = {10.48550/ARXIV.2012.08878},
year = {2020},
date = {2020-01-01},
urldate = {2020-01-01},
publisher = {arXiv},
abstract = {A superstring of a set of strings correspond to a string which contains all the other strings as substrings. The problem of finding the Shortest Linear Superstring is a well-know and well-studied problem in stringology. We present here a variant of this problem, the Shortest Circular Superstring problem where the sought superstring is a circular string. We show a strong link between these two problems and prove that the Shortest Circular Superstring problem is NP-complete. Moreover, we propose a new conjecture on the approximation ratio of the Shortest Circular Superstring problem. },
howpublished = {arXiv},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {unpublished}
}