Loukides, Grigorios; Pissis, Solon P.
Bidirectional String Anchors: A New String Sampling Mechanism Proceedings Article
In: Mutzel, Petra; Pagh, Rasmus; Herman, Grzegorz (Ed.): 29th Annual European Symposium on Algorithms (ESA 2021), pp. 64:1โ64:21, Schloss Dagstuhl -- Leibniz-Zentrum fรผr Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@inproceedings{Loukides2021,
title = {Bidirectional String Anchors: A New String Sampling Mechanism},
author = {Grigorios Loukides and Solon P. Pissis},
editor = {Petra Mutzel and Rasmus Pagh and Grzegorz Herman},
doi = {10.4230/LIPIcs.ESA.2021.64},
issn = {1868-8969},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {29th Annual European Symposium on Algorithms (ESA 2021)},
volume = {204},
pages = {64:1--64:21},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum fรผr Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = { The minimizers sampling mechanism is a popular mechanism for string sampling introduced independently by Schleimer et al. [SIGMOD 2003] and by Roberts et al. [Bioinf. 2004]. Given two positive integers w and k, it selects the lexicographically smallest length-k substring in every fragment of w consecutive length-k substrings (in every sliding window of length w+k-1). Minimizers samples are approximately uniform, locally consistent, and computable in linear time. Although they do not have good worst-case guarantees on their size, they are often small in practice. They thus have been successfully employed in several string processing applications. Two main disadvantages of minimizers sampling mechanisms are: first, they also do not have good guarantees on the expected size of their samples for every combination of w and k; and, second, indexes that are constructed over their samples do not have good worst-case guarantees for on-line pattern searches.
To alleviate these disadvantages, we introduce bidirectional string anchors (bd-anchors), a new string sampling mechanism. Given a positive integer ๐, our mechanism selects the lexicographically smallest rotation in every length-๐ fragment (in every sliding window of length ๐). We show that bd-anchors samples are also approximately uniform, locally consistent, and computable in linear time. In addition, our experiments using several datasets demonstrate that the bd-anchors sample sizes decrease proportionally to ๐; and that these sizes are competitive to or smaller than the minimizers sample sizes using the analogous sampling parameters. We provide theoretical justification for these results by analyzing the expected size of bd-anchors samples.
We also show that by using any bd-anchors sample, we can construct, in near-linear time, an index which requires linear (extra) space in the size of the sample and answers on-line pattern searches in near-optimal time. We further show, using several datasets, that a simple implementation of our index is consistently faster for on-line pattern searches than an analogous implementation of a minimizers-based index [Grabowski and Raniszewski, Softw. Pract. Exp. 2017].
Finally, we highlight the applicability of bd-anchors by developing an efficient and effective heuristic for top-K similarity search under edit distance. We show, using synthetic datasets, that our heuristic is more accurate and more than one order of magnitude faster in top-K similarity searches than the state-of-the-art tool for the same purpose [Zhang and Zhang, KDD 2020]. },
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
To alleviate these disadvantages, we introduce bidirectional string anchors (bd-anchors), a new string sampling mechanism. Given a positive integer ๐, our mechanism selects the lexicographically smallest rotation in every length-๐ fragment (in every sliding window of length ๐). We show that bd-anchors samples are also approximately uniform, locally consistent, and computable in linear time. In addition, our experiments using several datasets demonstrate that the bd-anchors sample sizes decrease proportionally to ๐; and that these sizes are competitive to or smaller than the minimizers sample sizes using the analogous sampling parameters. We provide theoretical justification for these results by analyzing the expected size of bd-anchors samples.
We also show that by using any bd-anchors sample, we can construct, in near-linear time, an index which requires linear (extra) space in the size of the sample and answers on-line pattern searches in near-optimal time. We further show, using several datasets, that a simple implementation of our index is consistently faster for on-line pattern searches than an analogous implementation of a minimizers-based index [Grabowski and Raniszewski, Softw. Pract. Exp. 2017].
Finally, we highlight the applicability of bd-anchors by developing an efficient and effective heuristic for top-K similarity search under edit distance. We show, using synthetic datasets, that our heuristic is more accurate and more than one order of magnitude faster in top-K similarity searches than the state-of-the-art tool for the same purpose [Zhang and Zhang, KDD 2020].
Charalampopoulos, Panagiotis; Kociumaka, Tomasz; Pissis, Solon P.; Radoszewski, Jakub
Faster Algorithms for Longest Common Substring Proceedings Article
In: Mutzel, Petra; Pagh, Rasmus; Herman, Grzegorz (Ed.): 29th Annual European Symposium on Algorithms (ESA 2021), pp. 30:1โ30:17, Schloss Dagstuhl -- Leibniz-Zentrum fรผr Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@inproceedings{Charalampopoulos2021b,
title = {Faster Algorithms for Longest Common Substring},
author = {Panagiotis Charalampopoulos and Tomasz Kociumaka and Solon P. Pissis and Jakub Radoszewski},
editor = {Petra Mutzel and Rasmus Pagh and Grzegorz Herman},
doi = {10.4230/LIPIcs.ESA.2021.30},
issn = {1868-8969},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {29th Annual European Symposium on Algorithms (ESA 2021)},
volume = {204},
pages = {30:1--30:17},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum fรผr Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = { In the classic longest common substring (LCS) problem, we are given two strings S and T, each of length at most n, over an alphabet of size ฯ, and we are asked to find a longest string occurring as a fragment of both S and T. Weiner, in his seminal paper that introduced the suffix tree, presented an ๐ช(n log ฯ)-time algorithm for this problem [SWAT 1973]. For polynomially-bounded integer alphabets, the linear-time construction of suffix trees by Farach yielded an ๐ช(n)-time algorithm for the LCS problem [FOCS 1997]. However, for small alphabets, this is not necessarily optimal for the LCS problem in the word RAM model of computation, in which the strings can be stored in ๐ช(n log ฯ/log n) space and read in ๐ช(n log ฯ/log n) time. We show that, in this model, we can compute an LCS in time ๐ช(n log ฯ / โ{log n}), which is sublinear in n if ฯ = 2^{o(โ{log n})} (in particular, if ฯ = ๐ช(1)), using optimal space ๐ช(n log ฯ/log n).
We then lift our ideas to the problem of computing a k-mismatch LCS, which has received considerable attention in recent years. In this problem, the aim is to compute a longest substring of S that occurs in T with at most k mismatches. Flouri et al. showed how to compute a 1-mismatch LCS in ๐ช(n log n) time [IPL 2015]. Thankachan et al. extended this result to computing a k-mismatch LCS in ๐ช(n log^k n) time for k = ๐ช(1) [J. Comput. Biol. 2016]. We show an ๐ช(n log^{k-1/2} n)-time algorithm, for any constant integer k > 0 and irrespective of the alphabet size, using ๐ช(n) space as the previous approaches. We thus notably break through the well-known n log^k n barrier, which stems from a recursive heavy-path decomposition technique that was first introduced in the seminal paper of Cole et al. [STOC 2004] for string indexing with k errors. },
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
We then lift our ideas to the problem of computing a k-mismatch LCS, which has received considerable attention in recent years. In this problem, the aim is to compute a longest substring of S that occurs in T with at most k mismatches. Flouri et al. showed how to compute a 1-mismatch LCS in ๐ช(n log n) time [IPL 2015]. Thankachan et al. extended this result to computing a k-mismatch LCS in ๐ช(n log^k n) time for k = ๐ช(1) [J. Comput. Biol. 2016]. We show an ๐ช(n log^{k-1/2} n)-time algorithm, for any constant integer k > 0 and irrespective of the alphabet size, using ๐ช(n) space as the previous approaches. We thus notably break through the well-known n log^k n barrier, which stems from a recursive heavy-path decomposition technique that was first introduced in the seminal paper of Cole et al. [STOC 2004] for string indexing with k errors.
Park, Sangsoo; Park, Sung Gwan; Cazaux, Bastien; Park, Kunsoo; Rivals, Eric
A Linear Time Algorithm for Constructing Hierarchical Overlap Graphs Proceedings Article
In: Gawrychowski, Paweล; Starikovskaya, Tatiana (Ed.): 32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021), pp. 22:1โ22:9, Schloss Dagstuhl -- Leibniz-Zentrum fรผr Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Park2021,
title = {A Linear Time Algorithm for Constructing Hierarchical Overlap Graphs},
author = {Sangsoo Park and Sung Gwan Park and Bastien Cazaux and Kunsoo Park and Eric Rivals},
editor = {Paweล Gawrychowski and Tatiana Starikovskaya},
doi = {10.4230/LIPIcs.CPM.2021.22},
issn = {1868-8969},
year = {2021},
date = {2021-01-01},
urldate = {2021-01-01},
booktitle = {32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021)},
volume = {191},
pages = {22:1--22:9},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum fรผr Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {The hierarchical overlap graph (HOG) is a graph that encodes overlaps from a given set P of n strings, as the overlap graph does. A best known algorithm constructs HOG in O(||P|| log n) time and O(||P||) space, where ||P|| is the sum of lengths of the strings in P. In this paper we present a new algorithm to construct HOG in O(||P||) time and space. Hence, the construction time and space of HOG are better than those of the overlap graph, which are O(||P|| + nยฒ). },
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Cazaux, Bastien; Rivals, Eric
Greedy-reduction from Shortest Linear Superstring to Shortest Circular Superstring Unpublished
arXiv, 2020.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@unpublished{Cazaux2020,
title = {Greedy-reduction from Shortest Linear Superstring to Shortest Circular Superstring},
author = {Bastien Cazaux and Eric Rivals},
doi = {10.48550/ARXIV.2012.08878},
year = {2020},
date = {2020-01-01},
urldate = {2020-01-01},
publisher = {arXiv},
abstract = {A superstring of a set of strings correspond to a string which contains all the other strings as substrings. The problem of finding the Shortest Linear Superstring is a well-know and well-studied problem in stringology. We present here a variant of this problem, the Shortest Circular Superstring problem where the sought superstring is a circular string. We show a strong link between these two problems and prove that the Shortest Circular Superstring problem is NP-complete. Moreover, we propose a new conjecture on the approximation ratio of the Shortest Circular Superstring problem. },
howpublished = {arXiv},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {unpublished}
}