Teramo, Antonella; Binatti, Andrea; Ciabatti, Elena; Schiavoni, Gianluca; Tarrini, Giulia; Baril`a, Gregorio; Calabretto, Giulia; Vicenzetto, Cristina; Gasparini, Vanessa Rebecca; Facco, Monica; Petrini, Iacopo; Grossi, Roberto; Pisanti, Nadia; Bortoluzzi, Stefania; Falini, Brunangelo; Tiacci, Enrico; Galimberti, Sara; Semenzato, Gianpietro; Zambello, Renato

Defining TCRγδlymphoproliferative disorders by combined immunophenotypic and molecular evaluation Journal Article

In: Nature Communications, vol. 13, no. 1, pp. 3298, 2022, ISBN: 2041-1723.

@article{Teramo2022,

title = {Defining TCRγδlymphoproliferative disorders by combined immunophenotypic and molecular evaluation},

author = {Antonella Teramo and Andrea Binatti and Elena Ciabatti and Gianluca Schiavoni and Giulia Tarrini and Gregorio Baril`a and Giulia Calabretto and Cristina Vicenzetto and Vanessa Rebecca Gasparini and Monica Facco and Iacopo Petrini and Roberto Grossi and Nadia Pisanti and Stefania Bortoluzzi and Brunangelo Falini and Enrico Tiacci and Sara Galimberti and Gianpietro Semenzato and Renato Zambello},

doi = {10.1038/s41467-022-31015-x},

isbn = {2041-1723},

year = {2022},

date = {2022-06-08},

urldate = {2022-06-08},

journal = {Nature Communications},

volume = {13},

number = {1},

pages = {3298},

abstract = {Tγδlarge granular lymphocyte leukemia (TγδLGLL) is a rare lymphoproliferative disease, scantily described in literature. A deep-analysis, in an initial cohort of 9 TγδLGLL compared to 23 healthy controls, shows that TγδLGLL dominant clonotypes are mainly public and exhibit different V-(D)-J γ/δusage between patients with symptomatic and indolent Tγδneoplasm. Moreover, some clonotypes share the same rearranged sequence. Data obtained in an enlarged cohort (n = 36) indicate the importance of a combined evaluation of immunophenotype and STAT mutational profile for the correct management of patients with Tγδcell expansions. In fact, we observe an association between Vδ2/Vγ9 clonality and indolent course, while Vδ2/Vγ9 negativity correlates with symptomatic disease. Moreover, the 7 patients with STAT3 mutations have neutropenia and a CD56-/Vδ2- phenotype, and the 3 cases with STAT5B mutations display an asymptomatic clinical course and CD56/Vδ2 expression. All these data indicate that biological characterization is needed for Tγδ-cell neoplasm definition.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Bonnet, Konstantinn; Marschall, Tobias; Doerr, Daniel

Constructing founder sets under allelic and non-allelic homologous recombination Unpublished

bioRxiv, 2022.

@unpublished{Bonnet2022,

title = {Constructing founder sets under allelic and non-allelic homologous recombination},

author = {Konstantinn Bonnet and Tobias Marschall and Daniel Doerr},

doi = {10.1101/2022.05.27.493721},

year = {2022},

date = {2022-05-29},

urldate = {2022-05-01},

publisher = {Cold Spring Harbor Laboratory},

abstract = {Homologous recombination between the maternal and paternal copies of a chromosome is a key mechanism for human inheritance and shapes population genetic properties of our species. However, a similar mechanism can also act between different copies of the same sequence, then called non-allelic homologous recombination (NAHR). This process can result in genomic rearrangements—including deletion, duplication, and inversion—and is underlying many genomic disorders. Despite its importance for genome evolution and disease, there is a lack of computational models to study genomic loci prone to NAHR. In this work, we propose such a computational model, providing a unified framework for both (allelic) homologous recombination and NAHR. Our model represents a set of genomes as a graph, where human haplotypes correspond to walks through this graph. We formulate two founder set problems under our recombination model, provide flow-based algorithms for their solution, and demonstrate scalability to problem instances arising in practice.},

howpublished = {bioRxiv},

keywords = {},

pubstate = {published},

tppubtype = {unpublished}

}

Bernardini, Giulia; Gawrychowski, Paweł; Pisanti, Nadia; Pissis, Solon P.; Rosone, Giovanna

Elastic-Degenerate String Matching via Fast Matrix Multiplication Journal Article

In: SIAM Journal on Computing, vol. 51, no. 3, pp. 549–576, 2022.

@article{Bernardini2022,

title = {Elastic-Degenerate String Matching via Fast Matrix Multiplication},

author = {Giulia Bernardini and Paweł Gawrychowski and Nadia Pisanti and Solon P. Pissis and Giovanna Rosone},

doi = {10.1137/20m1368033},

year = {2022},

date = {2022-05-01},

urldate = {2022-05-01},

journal = {SIAM Journal on Computing},

volume = {51},

number = {3},

pages = {549--576},

publisher = {Society for Industrial & Applied Mathematics (SIAM)},

abstract = {An elastic-degenerate (ED) string is a sequence of $n$ sets of strings of total length $N$ which was recently proposed to model a set of similar sequences. The ED string matching (EDSM) problem is to find all occurrences of a pattern of length $m$ in an ED text. The EDSM problem has recently received some attention in the combinatorial pattern matching community, and an $\mathcal{O}(nm^{1.5}\sqrt{\log m} + N)$-time algorithm is known [Aoyama et al., CPM 2018]. The standard assumption in the prior work on this question is that $N$ is substantially larger than both $n$ and $m$, and thus we would like to have a linear dependency on the former. Under this assumption, the natural open problem is whether we can decrease the 1.5 exponent in the time complexity, similarly as in the related (but, to the best of our knowledge, not equivalent) word break problem [Backurs and Indyk, FOCS 2016]. Our starting point is a conditional lower bound for the EDSM problem. We use the popular combinatorial Boolean matrix multiplication (BMM) conjecture stating that there is no truly subcubic combinatorial algorithm for BMM [Abboud and Williams, FOCS 2014]. By designing an appropriate reduction, we show that a combinatorial algorithm solving the EDSM problem in $\mathcal{O}(nm^{1.5-\epsilon} + N)$ time, for any $\epsilon>0$, refutes this conjecture. Our reduction should be understood as an indication that decreasing the exponent requires fast matrix multiplication. String periodicity and fast Fourier transform are two standard tools in string algorithms. Our main technical contribution is that we successfully combine these tools with fast matrix multiplication to design a noncombinatorial $\tilde{\mathcal{O}}(nm^{\omega-1}+N)$-time algorithm for EDSM, where $\omega$ denotes the matrix multiplication exponent and the $\tilde{\mathcal{O}}(\cdot)$ notation suppresses polylog factors. To the best of our knowledge, we are the first to combine these tools. In particular, using the fact that $\omega<2.373$ [Alman and Williams, SODA 2021; Le Gall, ISSAC 2014; Williams, STOC 2012], we obtain an $\mathcal{O}(nm^{1.373} + N)$-time algorithm for EDSM. An important building block in our solution that might find applications in other problems is a method of selecting a small set of length-$\ell$ substrings of the pattern, called anchors, so that any occurrence of a string from an ED text set contains at least one but not too many (on average) such anchors inside.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Charalampopoulos, Panagiotis; Iliopoulos, Costas S.; Kociumaka, Tomasz; Pissis, Solon P.; Radoszewski, Jakub; Straszy'nski, Juliusz

Efficient Computation of Sequence Mappability Journal Article

In: Algorithmica, vol. 84, no. 5, pp. 1418–1440, 2022, ISBN: 1432-0541.

@article{Charalampopoulos2022,

title = {Efficient Computation of Sequence Mappability},

author = {Panagiotis Charalampopoulos and Costas S. Iliopoulos and Tomasz Kociumaka and Solon P. Pissis and Jakub Radoszewski and Juliusz Straszy'nski},

doi = {10.1007/s00453-022-00934-y},

isbn = {1432-0541},

year = {2022},

date = {2022-05-01},

urldate = {2022-05-01},

journal = {Algorithmica},

volume = {84},

number = {5},

pages = {1418--1440},

abstract = {Sequence mappability is an important task in genome resequencing. In the (k, m)-mappability problem, for a given sequence T of length n, the goal is to compute a table whose ith entry is the number of indices $j ne i$ such that the length-m substrings of T starting at positions i and j have at most k mismatches. Previous works on this problem focused on heuristics computing a rough approximation of the result or on the case of $k=1$. We present several efficient algorithms for the general case of the problem. Our main result is an algorithm that, for $k=O(1)$, works in $O(n)$space and, with high probability, in $O(n cdot min {m^k,log ^k n})$ time. Our algorithm requires a careful adaptation of the k-errata trees of Cole et al. [STOC 2004] to avoid multiple counting of pairs of substrings. Our technique can also be applied to solve the all-pairs Hamming distance problem introduced by Crochemore et al. [WABI 2017]. We further develop $O(n^2)$-time algorithms to compute all (k, m)-mappability tables for a fixed m and all $k in {0,ldots ,m}$or a fixed k and all $m in {k,ldots ,n}$. Finally, we show that, for $k,m = \Theta (\log n)$, the (k, m)-mappability problem cannot be solved in strongly subquadratic time unless the Strong Exponential Time Hypothesis fails. This is an improved and extended version of a paper presented at SPIRE 2018.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Luo, Xiao; Kang, Xiongbin; Schönhuth, Alexander

Strainline: full-length de novo viral haplotype reconstruction from noisy long reads Journal Article

In: Genome Biology, vol. 23, no. 1, pp. 29, 2022, ISSN: 1474-760X.

@article{Luo2022,

title = {Strainline: full-length de novo viral haplotype reconstruction from noisy long reads},

author = {Xiao Luo and Xiongbin Kang and Alexander Schönhuth},

doi = {10.1186/s13059-021-02587-6},

issn = {1474-760X},

year = {2022},

date = {2022-01-20},

urldate = {2022-01-01},

journal = {Genome Biology},

volume = {23},

number = {1},

pages = {29},

publisher = {Springer Science and Business Media LLC},

abstract = {Haplotype-resolved de novo assembly of highly diverse virus genomes is critical in prevention, control and treatment of viral diseases. Current methods either can handle only relatively accurate short read data, or collapse haplotype-specific variations into consensus sequence. Here, we present Strainline, a novel approach to assemble viral haplotypes from noisy long reads without a reference genome. Strainline is the first approach to provide strain-resolved, full-length de novo assemblies of viral quasispecies from noisy third-generation sequencing data. Benchmarking on simulated and real datasets of varying complexity and diversity confirm this novelty and demonstrate the superiority of Strainline.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Gažiová, M.; Sládeček, T.; Pös, O.; Števko, M.; Krampl, W.; Pös, Z.; Hekel, R.; Hlavačka, M.; Kucharík, M.; Radvánszky, J.; Budiš, J.; Szemes, T.

Automated prediction of the clinical impact of structural copy number variations Journal Article

In: Scientific Reports, vol. 12, no. 1, pp. 555, 2022, ISSN: 2045-2322.

@article{Gaziova2022,

title = {Automated prediction of the clinical impact of structural copy number variations},

author = {M. Gažiová and T. Sládeček and O. Pös and M. Števko and W. Krampl and Z. Pös and R. Hekel and M. Hlavačka and M. Kucharík and J. Radvánszky and J. Budiš and T. Szemes},

doi = {10.1038/s41598-021-04505-z},

issn = {2045-2322},

year = {2022},

date = {2022-01-11},

urldate = {2022-01-01},

journal = {Scientific Reports},

volume = {12},

number = {1},

pages = {555},

publisher = {Springer Science and Business Media LLC},

abstract = {Copy number variants (CNVs) play an important role in many biological processes, including the development of genetic diseases, making them attractive targets for genetic analyses. The interpretation of the effect of these structural variants is a challenging problem due to highly variable numbers of gene, regulatory, or other genomic elements affected by the CNV. This led to the demand for the interpretation tools that would relieve researchers, laboratory diagnosticians, genetic counselors, and clinical geneticists from the laborious process of annotation and classification of CNVs. We designed and validated a prediction method (ISV; Interpretation of Structural Variants) that is based on boosted trees which takes into account annotations of CNVs from several publicly available databases. The presented approach achieved more than 98{%} prediction accuracy on both copy number loss and copy number gain variants while also allowing CNVs being assigned “uncertain”significance in predictions. We believe that ISV’s prediction capability and explainability have a great potential to guide users to more precise interpretations and classifications of CNVs.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Rizzo, Nicola; Mäkinen, Veli

Linear Time Construction of Indexable Elastic Founder Graphs Inproceedings

In: Bazgan, Cristina; Fernau, Henning (Ed.): Combinatorial Algorithms, pp. 480–493, Springer International Publishing, Cham, 2022, ISBN: 978-3-031-06678-8.

@inproceedings{Rizzo2022,

title = {Linear Time Construction of Indexable Elastic Founder Graphs},

author = {Nicola Rizzo and Veli Mäkinen},

editor = {Cristina Bazgan and Henning Fernau},

url = {https://arxiv.org/abs/2201.06492},

doi = {10.1007/978-3-031-06678-8_35},

isbn = {978-3-031-06678-8},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

booktitle = {Combinatorial Algorithms},

pages = {480--493},

publisher = {Springer International Publishing},

address = {Cham},

abstract = {Pattern matching on graphs has been widely studied lately due to its importance in genomics applications. Unfortunately, even the simplest problem of deciding if a string appears as a subpath of a graph admits a quadratic lower bound under the Orthogonal Vectors Hypothesis (Equi et al. ICALP 2019, SOFSEM 2021). To avoid this bottleneck, the research has shifted towards more specific graph classes, e.g. those induced from multiple sequence alignments (MSAs). Consider segmenting $mathsf{MSA}[1..m,1..n]$ into $b$ blocks $mathsf{MSA}[1..m,1..j_1]$, $mathsf{MSA}[1..m,j_1+1..j_2]$, $ldots$, $mathsf{MSA}[1..m,j_b-1+1..n]$. The distinct strings in the rows of the blocks, after the removal of gap symbols, form the nodes of an elastic founder graph (EFG) where the edges represent the original connections observed in the MSA. An EFG is called indexable if a node label occurs as a prefix of only those paths that start from a node of the same block. Equi et al. (ISAAC 2021) showed that such EFGs support fast pattern matching and gave an $O(mn log m)$-time algorithm for preprocessing the MSA in a way that allows the construction of indexable EFGs maximizing the number of blocks and, alternatively, minimizing the maximum length of a block, in $O(n)$ and $O(n loglog n)$ time respectively. Using the suffix tree and solving a novel ancestor problem on trees, we improve the preprocessing to $O(mn)$ time and the $O(n loglog n)$-time EFG construction to $O(n)$ time, thus showing that both types of indexable EFGs can be constructed in time linear in the input size.},

howpublished = {arXiv},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Mwaniki, Njagi Moses; Garrison, Erik; Pisanti, Nadia

Fast Exact String to D-Texts Alignments Unpublished

arXiv, 2022.

@unpublished{Mwaniki2022,

title = {Fast Exact String to D-Texts Alignments},

author = {Njagi Moses Mwaniki and Erik Garrison and Nadia Pisanti},

doi = {10.48550/ARXIV.2206.03242},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

publisher = {arXiv},

abstract = {In recent years, aligning a sequence to a pangenome has become a central problem in genomics and pangenomics. A fast and accurate solution to this problem can serve as a toolkit to many crucial tasks such as read-correction, Multiple Sequences Alignment (MSA), genome assemblies, variant calling, just to name a few. In this paper we propose a new, fast and exact method to align a string to a D-string, the latter possibly representing an MSA, a pan-genome or a partial assembly. An implementation of our tool dsa is publicly available at https://github.com/urbanslug/dsa.},

howpublished = {arXiv},

keywords = {},

pubstate = {published},

tppubtype = {unpublished}

}

Zhong, Haodi; Loukides, Grigorios; Pissis, Solon P.

Clustering sequence graphs Journal Article

In: Data & Knowledge Engineering, vol. 138, pp. 101981, 2022, ISSN: 0169-023X.

@article{Zhong2022,

title = {Clustering sequence graphs},

author = {Haodi Zhong and Grigorios Loukides and Solon P. Pissis},

doi = {https://doi.org/10.1016/j.datak.2022.101981},

issn = {0169-023X},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

journal = {Data & Knowledge Engineering},

volume = {138},

pages = {101981},

abstract = {In application domains ranging from social networks to e-commerce, it is important to cluster users with respect to both their relationships (e.g., friendship or trust) and their actions (e.g., visited locations or rated products). Motivated by these applications, we introduce here the task of clustering the nodes of a sequence graph, i.e., a graph whose nodes are labeled with strings (e.g., sequences of users' visited locations or rated products). Both string clustering algorithms and graph clustering algorithms are inappropriate to deal with this task, as they do not consider the structure of strings and graph simultaneously. Moreover, attributed graph clustering algorithms generally construct poor solutions because they need to represent a string as a vector of attributes, which inevitably loses information and may harm clustering quality. We thus introduce the problem of clustering a sequence graph. We first propose two pairwise distance measures for sequence graphs, one based on edit distance and shortest path distance and another one based on SimRank. We then formalize the problem under each measure, showing also that it is NP-hard. In addition, we design a polynomial-time 2-approximation algorithm, as well as a heuristic for the problem. Experiments using real datasets and a case study demonstrate the effectiveness and efficiency of our methods.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Cartes, Jorge Avila; Anand, Santosh; Ciccolella, Simone; Bonizzoni, Paola; Vedova, Gianluca Della

Accurate and Fast Clade Assignment via Deep Learning and Frequency Chaos Game Representation Journal Article

In: bioRxiv, 2022.

@article{Cartes2022,

title = {Accurate and Fast Clade Assignment via Deep Learning and Frequency Chaos Game Representation},

author = {Jorge Avila Cartes and Santosh Anand and Simone Ciccolella and Paola Bonizzoni and Gianluca Della Vedova},

doi = {10.1101/2022.06.13.495912},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

journal = {bioRxiv},

publisher = {Cold Spring Harbor Laboratory},

abstract = {Background Since the beginning of the COVID-19 pandemic there has been an explosion of sequencing of the SARS-CoV-2 virus, making it the most widely sequenced virus in the history. Several databases and tools have been created to keep track of genome sequences and variants of the virus, most notably the GISAID platform hosts millions of complete genome sequences, and it is continuously expanding every day. A challenging task is the development of fast and accurate tools that are able to distinguish between the different SARS-CoV-2 variants and assign them to a clade.Results In this paper, we leverage the Frequency Chaos Game Representation (FCGR) and Convolutional Neural Networks (CNNs) to develop an original method that learns how to classify genome sequences that we implement into CouGaR-g, a tool for the clade assignment problem on SARS-CoV-2 sequences. On a testing subset of the GISAID, CouGaR-g achieves an 96.29% overall accuracy, while a similar tool, Covidex, obtained a 77, 12% overall accuracy. As far as we know, our method is the first using Deep Learning and FCGR for intra-species classification. Furthermore, by using some feature importance methods CouGaR-g allows to identify k-mers that matches SARS-CoV-2 marker variants.Conclusions By combining FCGR and CNNs, we develop a method that achieves a better accuracy than Covidex (which is based on Random Forest) for clade assignment of SARS-CoV-2 genome sequences, also thanks to our training on a much larger dataset, with comparable running times. Our method implemented in CouGaR-g is able to detect k-mers that capture relevant biological information that distinguishes the clades, known as marker variants.Availability The trained models can be tested online providing a FASTA file (with one or multiple sequences) at https://huggingface.co/spaces/BIASLab/sars-cov-2-classification-fcgr. CouGaR-g is also available at https://github.com/AlgoLab/CouGaR-g under the GPL.Competing Interest StatementThe authors have declared no competing interest.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Rizzo, Nicola; Mäkinen, Veli

Indexable Elastic Founder Graphs of Minimum Height Inproceedings

In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 19:1–19:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.

@inproceedings{Rizzo2022b,

title = {Indexable Elastic Founder Graphs of Minimum Height},

author = {Nicola Rizzo and Veli Mäkinen},

editor = {Hideo Bannai and Jan Holub},

doi = {10.4230/LIPIcs.CPM.2022.19},

issn = {1868-8969},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},

volume = {223},

pages = {19:1--19:19},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {Indexable elastic founder graphs have been recently proposed as a data structure for genomics applications supporting fast pattern matching queries. Consider segmenting a multiple sequence alignment MSA[1..m,1..n] into b blocks MSA[1..m,1..j₁], MSA[1..m,j₁+1..j₂], …, MSA[1..m,j_{b-1}+1..n]. The resulting elastic founder graph (EFG) is obtained by merging in each block the strings that are equivalent after the removal of gap symbols, taking the strings as the nodes of the block and the original MSA connections as edges. We call an elastic founder graph indexable if a node label occurs as a prefix of only those paths that start from a node of the same block. Equi et al. (ISAAC 2021) showed that such EFGs support fast pattern matching and studied their construction maximizing the number of blocks and minimizing the maximum length of a block, but left open the case of minimizing the maximum number of distinct strings in a block that we call graph height. For the simplified gapless setting, we give an O(mn) time algorithm to find a segmentation of an MSA minimizing the height of the resulting indexable founder graph, by combining previous results in segmentation algorithms and founder graphs. For the general setting, the known techniques yield a linear-time parameterized solution on constant alphabet Σ, taking time O(m n² log|Σ|) in the worst case, so we study the refined measure of prefix-aware height, that omits counting strings that are prefixes of another considered string. The indexable EFG minimizing the maximum prefix-aware height provides a lower bound for the original height: by exploiting exploiting suffix trees built from the MSA rows and the data structure answering weighted ancestor queries in constant time of Belazzougui et al. (CPM 2021), we give an O(mn)-time algorithm for the optimal EFG under this alternative height. },

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Charalampopoulos, Panagiotis; Chen, Huiping; Christen, Peter; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon P.; Radoszewski, Jakub

Pattern Masking for Dictionary Matching Inproceedings

In: Ahn, Hee-Kap; Sadakane, Kunihiko (Ed.): 32nd International Symposium on Algorithms and Computation (ISAAC 2021), pp. 65:1–65:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.

@inproceedings{Charalampopoulos2021,

title = {Pattern Masking for Dictionary Matching},

author = {Panagiotis Charalampopoulos and Huiping Chen and Peter Christen and Grigorios Loukides and Nadia Pisanti and Solon P. Pissis and Jakub Radoszewski},

editor = {Hee-Kap Ahn and Kunihiko Sadakane},

doi = {10.4230/LIPIcs.ISAAC.2021.65},

issn = {1868-8969},

year = {2021},

date = {2021-01-01},

urldate = {2021-01-01},

booktitle = {32nd International Symposium on Algorithms and Computation (ISAAC 2021)},

volume = {212},

pages = {65:1--65:19},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Pisanti, Nadia

On-Line Pattern Matching on D-Texts (Invited Talk) Inproceedings

In: Gawrychowski, Paweł; Starikovskaya, Tatiana (Ed.): 32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021), pp. 3:1–3:2, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.

@inproceedings{Pisanti2021,

title = {On-Line Pattern Matching on D-Texts (Invited Talk)},

author = {Nadia Pisanti},

editor = {Paweł Gawrychowski and Tatiana Starikovskaya},

doi = {10.4230/LIPIcs.CPM.2021.3},

issn = {1868-8969},

year = {2021},

date = {2021-01-01},

urldate = {2021-01-01},

booktitle = {32nd Annual Symposium on Combinatorial Pattern Matching (CPM 2021)},

volume = {191},

pages = {3:1--3:2},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {The Elastic Degenerate String Matching (EDSM) problem is defined as that of finding an occurrence of a pattern P of length m in an ED-text T. A D-text (Degenerate text) is a string that actually represents a set of similar and aligned strings (e.g. a pan-genome [The Computational Pan-Genomics Consortium, 2018]) by collapsing common fragments into a standard string, and representing variants with sets of alternative substrings. When such substrings are not bound to have the same size, then we talk about elastic D-strings (ED-strings). In [R.Grossi et al., 2017] we gave an O(nm²+N) time on-line algorithm for EDSM, where n is the length of T and N is its size, defined as the total number of letters. A fundamental toolkit of our algorithm is the O(m²+N) time solution of the later called Active Prefixes problem (AP). In [K.Aoyama et al., 2018], a O(m^{1.5} √{log m}+N) solution for AP was shown, leading to a O(nm^{1.5} √{log m}+N) time solution for EDSM. The natural open problem was thus whether the 1.5 exponent could furtherly be decreased. In [G.Bernardini et al., 2019], we prove several properties that answer this and other questions: we give a conditional O(nm^{1.5}+N) lower bound for EDSM, proving that a combinatorial algorithm solving EDSM in O(nm^{1.5-ε} +N) time would break the Boolean Matrix Multiplication (BMM) conjecture; we use this result as a hint to devise a non-combinatorial algorithm that solves EDSM in O(nm^{1.381}+N) time; we do so by successfully combining Fast Fourier Transform and properties of string periodicity. In my talk I will overview the results above, as well as some interesting side results: the extension to a dictionary rather than a single pattern [S.P.Pissis and A.Retha, 2018], the introduction of errors [G.Bernardini et al., 2020], and a notion of matching among D-strings with its linear time solution [M.Alzamel et al., 2020].},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Conte, Alessio; Grossi, Roberto; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon P.; Punzi, Giulia

Beyond the BEST Theorem: Fast Assessment of Eulerian Trails Inproceedings

In: Bampis, Evripidis; Pagourtzis, Aris (Ed.): Fundamentals of Computation Theory, pp. 162–175, Springer International Publishing, Cham, 2021, ISBN: 978-3-030-86593-1.

@inproceedings{Conte2021,

title = {Beyond the BEST Theorem: Fast Assessment of Eulerian Trails},

author = {Alessio Conte and Roberto Grossi and Grigorios Loukides and Nadia Pisanti and Solon P. Pissis and Giulia Punzi},

editor = {Evripidis Bampis and Aris Pagourtzis},

url = {https://kclpure.kcl.ac.uk/portal/files/159578921/Estimating_Eulerian_Trails_FCT.pdf, Full PDF on King’s Research Portal},

doi = {10.1007/978-3-030-86593-1_11},

isbn = {978-3-030-86593-1},

year = {2021},

date = {2021-01-01},

urldate = {2021-01-01},

booktitle = {Fundamentals of Computation Theory},

pages = {162--175},

publisher = {Springer International Publishing},

address = {Cham},

abstract = {Given a directed multigraph $G=(V,E)$, with $|V|=n$ nodes and $|E|=m$ edges, and an integer $z$, we are asked to assess whether the number $#ET(G)$ of node-distinct Eulerian trails of $G$ is at least $z$; two trails are called node-distinct if their node sequences are different. This problem has been formalized by Bernardini et al. [ALENEX 2020] as it is the core computational problem in several string processing applications. It can be solved in $mathcal O(n^omega)$ arithmetic operations by applying the well-known BEST theorem, where $omega <2.373$ denotes the matrix multiplication exponent. The algorithmic challenge is: Can we solve this problem faster for certain values of $m$ and $z$? Namely, we want to design a combinatorial algorithm for assessing whether $#ET(G) ge z$, which does not resort to the BEST theorem and has a predictably bounded cost as a function of $m$ and $z$. We address this challenge here by providing a combinatorial algorithm requiring $mathcal O(mcdot min {z, #ET(G)})$ time.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Loukides, Grigorios; Pissis, Solon P.

Bidirectional String Anchors: A New String Sampling Mechanism Inproceedings

In: Mutzel, Petra; Pagh, Rasmus; Herman, Grzegorz (Ed.): 29th Annual European Symposium on Algorithms (ESA 2021), pp. 64:1–64:21, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.

@inproceedings{Loukides2021,

title = {Bidirectional String Anchors: A New String Sampling Mechanism},

author = {Grigorios Loukides and Solon P. Pissis},

editor = {Petra Mutzel and Rasmus Pagh and Grzegorz Herman},

doi = {10.4230/LIPIcs.ESA.2021.64},

issn = {1868-8969},

year = {2021},

date = {2021-01-01},

urldate = {2021-01-01},

booktitle = {29th Annual European Symposium on Algorithms (ESA 2021)},

volume = {204},

pages = {64:1--64:21},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = { The minimizers sampling mechanism is a popular mechanism for string sampling introduced independently by Schleimer et al. [SIGMOD 2003] and by Roberts et al. [Bioinf. 2004]. Given two positive integers w and k, it selects the lexicographically smallest length-k substring in every fragment of w consecutive length-k substrings (in every sliding window of length w+k-1). Minimizers samples are approximately uniform, locally consistent, and computable in linear time. Although they do not have good worst-case guarantees on their size, they are often small in practice. They thus have been successfully employed in several string processing applications. Two main disadvantages of minimizers sampling mechanisms are: first, they also do not have good guarantees on the expected size of their samples for every combination of w and k; and, second, indexes that are constructed over their samples do not have good worst-case guarantees for on-line pattern searches.

To alleviate these disadvantages, we introduce bidirectional string anchors (bd-anchors), a new string sampling mechanism. Given a positive integer 𝓁, our mechanism selects the lexicographically smallest rotation in every length-𝓁 fragment (in every sliding window of length 𝓁). We show that bd-anchors samples are also approximately uniform, locally consistent, and computable in linear time. In addition, our experiments using several datasets demonstrate that the bd-anchors sample sizes decrease proportionally to 𝓁; and that these sizes are competitive to or smaller than the minimizers sample sizes using the analogous sampling parameters. We provide theoretical justification for these results by analyzing the expected size of bd-anchors samples.

We also show that by using any bd-anchors sample, we can construct, in near-linear time, an index which requires linear (extra) space in the size of the sample and answers on-line pattern searches in near-optimal time. We further show, using several datasets, that a simple implementation of our index is consistently faster for on-line pattern searches than an analogous implementation of a minimizers-based index [Grabowski and Raniszewski, Softw. Pract. Exp. 2017].

Finally, we highlight the applicability of bd-anchors by developing an efficient and effective heuristic for top-K similarity search under edit distance. We show, using synthetic datasets, that our heuristic is more accurate and more than one order of magnitude faster in top-K similarity searches than the state-of-the-art tool for the same purpose [Zhang and Zhang, KDD 2020]. },

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

To alleviate these disadvantages, we introduce bidirectional string anchors (bd-anchors), a new string sampling mechanism. Given a positive integer 𝓁, our mechanism selects the lexicographically smallest rotation in every length-𝓁 fragment (in every sliding window of length 𝓁). We show that bd-anchors samples are also approximately uniform, locally consistent, and computable in linear time. In addition, our experiments using several datasets demonstrate that the bd-anchors sample sizes decrease proportionally to 𝓁; and that these sizes are competitive to or smaller than the minimizers sample sizes using the analogous sampling parameters. We provide theoretical justification for these results by analyzing the expected size of bd-anchors samples.

We also show that by using any bd-anchors sample, we can construct, in near-linear time, an index which requires linear (extra) space in the size of the sample and answers on-line pattern searches in near-optimal time. We further show, using several datasets, that a simple implementation of our index is consistently faster for on-line pattern searches than an analogous implementation of a minimizers-based index [Grabowski and Raniszewski, Softw. Pract. Exp. 2017].

Finally, we highlight the applicability of bd-anchors by developing an efficient and effective heuristic for top-K similarity search under edit distance. We show, using synthetic datasets, that our heuristic is more accurate and more than one order of magnitude faster in top-K similarity searches than the state-of-the-art tool for the same purpose [Zhang and Zhang, KDD 2020].

Charalampopoulos, Panagiotis; Kociumaka, Tomasz; Pissis, Solon P.; Radoszewski, Jakub

Faster Algorithms for Longest Common Substring Inproceedings

In: Mutzel, Petra; Pagh, Rasmus; Herman, Grzegorz (Ed.): 29th Annual European Symposium on Algorithms (ESA 2021), pp. 30:1–30:17, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.

@inproceedings{Charalampopoulos2021b,

title = {Faster Algorithms for Longest Common Substring},

author = {Panagiotis Charalampopoulos and Tomasz Kociumaka and Solon P. Pissis and Jakub Radoszewski},

editor = {Petra Mutzel and Rasmus Pagh and Grzegorz Herman},

doi = {10.4230/LIPIcs.ESA.2021.30},

issn = {1868-8969},

year = {2021},

date = {2021-01-01},

urldate = {2021-01-01},

booktitle = {29th Annual European Symposium on Algorithms (ESA 2021)},

volume = {204},

pages = {30:1--30:17},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = { In the classic longest common substring (LCS) problem, we are given two strings S and T, each of length at most n, over an alphabet of size σ, and we are asked to find a longest string occurring as a fragment of both S and T. Weiner, in his seminal paper that introduced the suffix tree, presented an 𝒪(n log σ)-time algorithm for this problem [SWAT 1973]. For polynomially-bounded integer alphabets, the linear-time construction of suffix trees by Farach yielded an 𝒪(n)-time algorithm for the LCS problem [FOCS 1997]. However, for small alphabets, this is not necessarily optimal for the LCS problem in the word RAM model of computation, in which the strings can be stored in 𝒪(n log σ/log n) space and read in 𝒪(n log σ/log n) time. We show that, in this model, we can compute an LCS in time 𝒪(n log σ / √{log n}), which is sublinear in n if σ = 2^{o(√{log n})} (in particular, if σ = 𝒪(1)), using optimal space 𝒪(n log σ/log n).

We then lift our ideas to the problem of computing a k-mismatch LCS, which has received considerable attention in recent years. In this problem, the aim is to compute a longest substring of S that occurs in T with at most k mismatches. Flouri et al. showed how to compute a 1-mismatch LCS in 𝒪(n log n) time [IPL 2015]. Thankachan et al. extended this result to computing a k-mismatch LCS in 𝒪(n log^k n) time for k = 𝒪(1) [J. Comput. Biol. 2016]. We show an 𝒪(n log^{k-1/2} n)-time algorithm, for any constant integer k > 0 and irrespective of the alphabet size, using 𝒪(n) space as the previous approaches. We thus notably break through the well-known n log^k n barrier, which stems from a recursive heavy-path decomposition technique that was first introduced in the seminal paper of Cole et al. [STOC 2004] for string indexing with k errors. },

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

We then lift our ideas to the problem of computing a k-mismatch LCS, which has received considerable attention in recent years. In this problem, the aim is to compute a longest substring of S that occurs in T with at most k mismatches. Flouri et al. showed how to compute a 1-mismatch LCS in 𝒪(n log n) time [IPL 2015]. Thankachan et al. extended this result to computing a k-mismatch LCS in 𝒪(n log^k n) time for k = 𝒪(1) [J. Comput. Biol. 2016]. We show an 𝒪(n log^{k-1/2} n)-time algorithm, for any constant integer k > 0 and irrespective of the alphabet size, using 𝒪(n) space as the previous approaches. We thus notably break through the well-known n log^k n barrier, which stems from a recursive heavy-path decomposition technique that was first introduced in the seminal paper of Cole et al. [STOC 2004] for string indexing with k errors.