Publications – ALPACA ITN Project

76 entries « ‹ 1 of 4 › »

Ghaffaari, Ali; Schönhuth, Alexander; Marschall, Tobias

DiVerG: Scalable Distance Index for Validation of Paired-End Alignments in Sequence Graphs Proceedings Article

In: Brejová, Broňa; Patro, Rob (Ed.): 25th International Conference on Algorithms for Bioinformatics (WABI 2025), pp. 10:1–10:24, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2025, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{ghaffaari_et_al:LIPIcs.WABI.2025.10,

title = {DiVerG: Scalable Distance Index for Validation of Paired-End Alignments in Sequence Graphs},

author = {Ali Ghaffaari and Alexander Schönhuth and Tobias Marschall},

editor = {Broňa Brejová and Rob Patro},

url = {https://github.com/cartoonist/diverg},

doi = {10.4230/LIPIcs.WABI.2025.10},

issn = {1868-8969},

year  = {2025},

date = {2025-01-01},

urldate = {2025-01-01},

booktitle = {25th International Conference on Algorithms for Bioinformatics (WABI 2025)},

volume = {344},

pages = {10:1–10:24},

publisher = {Schloss Dagstuhl – Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {Determining the distance between two loci within a genomic region is a recurrent operation in various tasks in computational genomics. A notable example of this task arises in paired-end read mapping as a form of validation of distances between multiple alignments. While straightforward for a single genome, graph-based reference structures render the operation considerably more involved. Given the sheer number of such queries in a typical read mapping experiment, an efficient algorithm for answering distance queries is crucial. In this paper, we introduce DiVerG, a compact data structure as well as a fast and scalable algorithm, for constructing distance indexes for general sequence graphs on multi-core CPU and many-core GPU architectures. DiVerG is based on PairG, but overcomes the limitations of PairG by exploiting the extensive potential for improvements in terms of scalability and space efficiency. As a consequence, DiVerG can process substantially larger datasets, such as whole human genomes, which are unmanageable by PairG. DiVerG offers faster index construction time and consistently faster query time with gains proportional to the size of the underlying compact data structure. We demonstrate that our method performs favorably on multiple real datasets at various scales. DiVerG achieves superior performance over PairG; e.g. resulting to 2.5--4x speed-up in query time, 44--340x smaller index size, and 3--50x faster construction time for the genome graph of the MHC region, as a particularly variable region of the human genome.The implementation is available at: https://github.com/cartoonist/diverg},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Zhang, Wenhai; Liu, Yuansheng; Xu, Jialu; Chen, Enlian; Schönhuth, Alexander; Luo, Xiao

PanTax: Strain-level taxonomic classification of metagenomic data using pangenome graphs Unpublished

2024.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@unpublished{Zhang2024,

title = {PanTax: Strain-level taxonomic classification of metagenomic data using pangenome graphs},

author = {Wenhai Zhang and Yuansheng Liu and Jialu Xu and Enlian Chen and Alexander Schönhuth and Xiao Luo},

url = {https://github.com/LuoGroup2023/PanTax},

doi = {10.1101/2024.11.15.623887},

year  = {2024},

date = {2024-11-01},

urldate = {2024-11-01},

publisher = {Cold Spring Harbor Laboratory},

abstract = {Microbes are omnipresent, thriving in a range of habitats from oceans to soils and even within our gastrointestinal tracts. They play a vital role in maintaining ecological equilibrium and promoting the health of their hosts. Consequently, understanding the strain diversity within microbial communities is crucial, as variations between strains can lead to distinct phenotypic expressions or diverse biological functions. However, current methods for taxonomic classification from metagenomic sequencing data have several limitations, including their reliance solely on species resolution, support for either short or long reads, or their confinement to a given single species. Most notably, the majority of existing taxonomic classifiers rely solely on a single linear representative genome as a reference, which fails to capture the strain diversity, thereby introducing single-reference biases. 

Here, we present PanTax, a pangenome graph-based taxonomic classification method that overcomes the shortcomings of single-reference genome-based approaches, because pangenome graphs possess the capability to depict the genetic variability present across multiple evolutionarily or environmentally related genomes. PanTax provides a comprehensive solution to taxonomic classification for strain resolution, compatibility with both short and long reads, and compatibility with single or multiple species. Extensive benchmarking results demonstrate that PanTax drastically outperforms state-of-the-art approaches, primarily evidenced by its significantly higher precision or recall (at both species and strain levels), while maintaining comparable or better performance in other aspects across various datasets. PanTax is a user-friendly open-source tool that is publicly accessible at https://github.com/LuoGroup2023/PanTax.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {unpublished}

}

Kang, Xiongbin; Zhang, Wenhai; Li, Yichen; Luo, Xiao; Schönhuth, Alexander

HyLight: Strain aware assembly of low coverage metagenomes Journal Article

In: Nature Communications, vol. 15, no. 1, pp. 8665, 2024, ISSN: 2041-1723.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Kang2024,

title = {HyLight: Strain aware assembly of low coverage metagenomes},

author = {Xiongbin Kang and Wenhai Zhang and Yichen Li and Xiao Luo and Alexander Schönhuth},

doi = {10.1038/s41467-024-52907-0},

issn = {2041-1723},

year  = {2024},

date = {2024-10-01},

journal = {Nature Communications},

volume = {15},

number = {1},

pages = {8665},

publisher = {Springer Science and Business Media LLC},

abstract = {Different strains of identical species can vary substantially in terms of their spectrum of biomedically relevant phenotypes. Reconstructing the genomes of microbial communities at the level of their strains poses significant challenges, because sequencing errors can obscure strain-specific variants. Next-generation sequencing (NGS) reads are too short to resolve complex genomic regions. Third-generation sequencing (TGS) reads, although longer, are prone to higher error rates or substantially more expensive. Limiting TGS coverage to reduce costs compromises the accuracy of the assemblies. This explains why prior approaches agree on losses in strain awareness, accuracy, tendentially excessive costs, or combinations thereof. We introduce HyLight, a metagenome assembly approach that addresses these challenges by implementing the complementary strengths of TGS and NGS data. HyLight employs strain-resolved overlap graphs (OG) to accurately reconstruct individual strains within microbial communities. Our experiments demonstrate that HyLight produces strain-aware and contiguous assemblies at minimal error content, while significantly reducing costs because utilizing low-coverage TGS data. HyLight achieves an average improvement of 19.05% in preserving strain identity and demonstrates near-complete strain awareness across diverse datasets. In summary, HyLight offers considerable advances in metagenome assembly, insofar as it delivers significantly enhanced strain awareness, contiguity, and accuracy without the typical compromises observed in existing approaches.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Rivals, Eric

Incremental computation of the set of period sets Unpublished

2024.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

Bernardini, Giulia; Gabory, Esteban; Pissis, Solon P.; Stougie, Leen; Sweering, Michelle; Zuba, Wiktor

Elastic-Degenerate String Matching with 1 Error or Mismatch Journal Article

In: Theory of Computing Systems, vol. 68, no. 5, pp. 1442–1467, 2024, ISSN: 1433-0490.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Bernardini2024b,

title = {Elastic-Degenerate String Matching with 1 Error or Mismatch},

author = {Giulia Bernardini and Esteban Gabory and Solon P. Pissis and Leen Stougie and Michelle Sweering and Wiktor Zuba},

doi = {10.1007/s00224-024-10194-8},

issn = {1433-0490},

year  = {2024},

date = {2024-09-01},

urldate = {2024-09-01},

journal = {Theory of Computing Systems},

volume = {68},

number = {5},

pages = {1442–1467},

publisher = {Springer Science and Business Media LLC},

abstract = {An elastic-degenerate (ED) string is a sequence of n finite sets of strings of total length N, introduced to represent a set of related DNA sequences, also known as a pangenome. The ED string matching (EDSM) problem consists in reporting all occurrences of a pattern of length m in an ED text. The EDSM problem has recently received some attention by the combinatorial pattern matching community, culminating in an $tilde{mathcal{O}}(nw^{w-1})+mathcal{O}(N)$-time algorithm [Bernardini et al., SIAM J. Comput. 2022], where $omega$ denotes the matrix multiplication exponent and the $tilde{mathcal{O}}(cdot)$ notation suppresses polylog factors. In the k-EDSM problem, the approximate version of EDSM, we are asked to report all pattern occurrences with at most k errors. k-EDSM can be solved in $mathcal{O}(k^2mG+kN)$ time, under edit distance, or $mathcal{O}(kmG+kN)$ time, under Hamming distance, where G denotes the total number of strings in the ED text [Bernardini et al., Theor. Comput. Sci. 2020]. Unfortunately, G is only bounded by N, and so even for k = 1, the existing algorithms run in $Omega(mN)$ time in the worst case. In this paper we make progress in this direction. We show that 1-EDSM can be solved in $mathcal{O}((nm^2+N)log m)$ or $mathcal{O}(nm^3+N)$ time under edit distance. For the decision version of the problem, we present a faster $mathcal{O}(nm^2sqrt{log m} + Nlog log m)$-time algorithm. We also show that 1-EDSM can be solved in $mathcal{O}(nm^2+N log m)$ time under Hamming distance. Our algorithms for edit distance rely on non-trivial reductions from 1-EDSM to special instances of classic computational geometry problems (2d rectangle stabbing or 2d range emptiness), which we show how to solve efficiently. In order to obtain an even faster algorithm for Hamming distance, we rely on employing and adapting the k-errata trees for indexing with errors [Cole et al., STOC 2004]. This is an extended version of a paper presented at LATIN 2022.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

An elastic-degenerate (ED) string is a sequence of n finite sets of strings of total length N, introduced to represent a set of related DNA sequences, also known as a pangenome. The ED string matching (EDSM) problem consists in reporting all occurrences of a pattern of length m in an ED text. The EDSM problem has recently received some attention by the combinatorial pattern matching community, culminating in an $\tilde{\mathcal{O}}(nw^{w-1})+\mathcal{O}(N)$-time algorithm [Bernardini et al., SIAM J. Comput. 2022], where $\omega$ denotes the matrix multiplication exponent and the $\tilde{\mathcal{O}}(\cdot)$ notation suppresses polylog factors. In the k-EDSM problem, the approximate version of EDSM, we are asked to report all pattern occurrences with at most k errors. k-EDSM can be solved in $\mathcal{O}(k^2mG+kN)$ time, under edit distance, or $\mathcal{O}(kmG+kN)$ time, under Hamming distance, where G denotes the total number of strings in the ED text [Bernardini et al., Theor. Comput. Sci. 2020]. Unfortunately, G is only bounded by N, and so even for k = 1, the existing algorithms run in $\Omega(mN)$ time in the worst case. In this paper we make progress in this direction. We show that 1-EDSM can be solved in $\mathcal{O}((nm^2+N)\log m)$ or $\mathcal{O}(nm^3+N)$ time under edit distance. For the decision version of the problem, we present a faster $\mathcal{O}(nm^2\sqrt{\log m} + N\log \log m)$-time algorithm. We also show that 1-EDSM can be solved in $\mathcal{O}(nm^2+N \log m)$ time under Hamming distance. Our algorithms for edit distance rely on non-trivial reductions from 1-EDSM to special instances of classic computational geometry problems (2d rectangle stabbing or 2d range emptiness), which we show how to solve efficiently. In order to obtain an even faster algorithm for Hamming distance, we rely on employing and adapting the k-errata trees for indexing with errors [Cole et al., STOC 2004]. This is an extended version of a paper presented at LATIN 2022.

Gabory, Esteban; Liu, Chang; Loukides, Grigorios; Pissis, Solon P.; Zuba, Wiktor

Space-Efficient Indexes for Uncertain Strings Proceedings Article

In: 2024 IEEE 40th International Conference on Data Engineering (ICDE), pp. 4828–4842, IEEE, 2024.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Gabory2024a,

title = {Space-Efficient Indexes for Uncertain Strings},

author = {Esteban Gabory and Chang Liu and Grigorios Loukides and Solon P. Pissis and Wiktor Zuba},

url = {https://arxiv.org/abs/2403.14256},

doi = {10.1109/icde60146.2024.00367},

year  = {2024},

date = {2024-05-01},

urldate = {2024-05-01},

booktitle = {2024 IEEE 40th International Conference on Data Engineering (ICDE)},

pages = {4828–4842},

publisher = {IEEE},

abstract = {Strings in the real world are often encoded with some level of uncertainty, for example, due to: unreliable data measurements; flexible sequence modeling; or noise introduced for privacy protection. In the character-level uncertainty model, an uncertain string X of length $n$ on an alphabet Σ is a sequence of $n$ probability distributions over Σ. Given an uncertain string $X$ and a weight threshold $frac{1}{z}in(0,1]$, we say that pattern $P$ occurs in $X$ at position $i$, if the product of probabilities of the letters of $P$ at positions $i,…, i+ vert Pvert-1$ is at least $frac{1}{z}$. While indexing standard strings for online pattern searches can be performed in linear time and space, indexing uncertain strings is much more challenging. Specifically, the state-of-the-art index for uncertain strings has $O(nz)$ size, requires $O(nz)$ time and $O(nz)$ space to be constructed, and answers pattern matching queries in the optimal $O(m+ vert Occ vert)$ time, where $m$ is the length of $P$ and $vert Occvert$ is the total number of occurrences of $P$ in $X$ . For large $n$ and (moderate) $z$ values, this index is completely impractical to construct, which outweighs the benefit of the supported optimal pattern matching queries. We were thus motivated to design a space-efficient index at the expense of slower yet competitive pattern matching queries. We show that when we have at hand a lower bound ℓ on the length of the supported pattern queries, as is often the case in real-world applications, we can slash the index size and the construction space roughly by ℓ. In particular, we propose an index of $mathcal{O}(frac{nz}{l}log z)$ expected size, which can be constructed using $mathcal{O}(frac{nz}{l}log z)$ expected space, and supports very fast pattern matching queries in expectation, for patterns of length m ≥ ℓ. We have implemented and evaluated several versions of our index. The best-performing version of our index is up to two orders of magnitude smaller than the state of the art in terms of both index size and construction space, while offering faster or very competitive query and construction times.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Strings in the real world are often encoded with some level of uncertainty, for example, due to: unreliable data measurements; flexible sequence modeling; or noise introduced for privacy protection. In the character-level uncertainty model, an uncertain string X of length $n$ on an alphabet Σ is a sequence of $n$ probability distributions over Σ. Given an uncertain string $X$ and a weight threshold $\frac{1}{z}\in(0,1]$, we say that pattern $P$ occurs in $X$ at position $i$, if the product of probabilities of the letters of $P$ at positions $i,…, i+ \vert P\vert-1$ is at least $\frac{1}{z}$. While indexing standard strings for online pattern searches can be performed in linear time and space, indexing uncertain strings is much more challenging. Specifically, the state-of-the-art index for uncertain strings has $O(nz)$ size, requires $O(nz)$ time and $O(nz)$ space to be constructed, and answers pattern matching queries in the optimal $O(m+ \vert Occ \vert)$ time, where $m$ is the length of $P$ and $\vert Occ\vert$ is the total number of occurrences of $P$ in $X$ . For large $n$ and (moderate) $z$ values, this index is completely impractical to construct, which outweighs the benefit of the supported optimal pattern matching queries. We were thus motivated to design a space-efficient index at the expense of slower yet competitive pattern matching queries. We show that when we have at hand a lower bound ℓ on the length of the supported pattern queries, as is often the case in real-world applications, we can slash the index size and the construction space roughly by ℓ. In particular, we propose an index of $\mathcal{O}(\frac{nz}{l}\log z)$ expected size, which can be constructed using $\mathcal{O}(\frac{nz}{l}\log z)$ expected space, and supports very fast pattern matching queries in expectation, for patterns of length m ≥ ℓ. We have implemented and evaluated several versions of our index. The best-performing version of our index is up to two orders of magnitude smaller than the state of the art in terms of both index size and construction space, while offering faster or very competitive query and construction times.

Rivals, Eric; Wang, Pengfei

Counting overlapping pairs of strings Unpublished

2024.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

Hannoush, Khodor; Marchet, Camille; Peterlongo, Pierre

Cdbgtricks: Strategies to update a compacted de Bruijn graph Unpublished

2024.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

Charalampopoulos, Panagiotis; Chen, Huiping; Christen, Peter; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon P.; Radoszewski, Jakub

Pattern Masking for Dictionary Matching: Theory and Practice Journal Article

In: Algorithmica, vol. 86, no. 6, pp. 1948–1978, 2024, ISSN: 1432-0541.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Charalampopoulos2024,

title = {Pattern Masking for Dictionary Matching: Theory and Practice},

author = {Panagiotis Charalampopoulos and Huiping Chen and Peter Christen and Grigorios Loukides and Nadia Pisanti and Solon P. Pissis and Jakub Radoszewski},

doi = {10.1007/s00453-024-01213-8},

issn = {1432-0541},

year  = {2024},

date = {2024-03-01},

journal = {Algorithmica},

volume = {86},

number = {6},

pages = {1948–1978},

publisher = {Springer Science and Business Media LLC},

abstract = {Data masking is a common technique for sanitizing sensitive data maintained in database systems which is becoming increasingly important in various application areas, such as in record linkage of personal data. This work formalizes the Pattern Masking for Dictionary Matching (PMDM) problem: given a dictionary D of d strings, each of length l, a query string q of length l, and a positive integer z, we are asked to compute a smallest set K ⊆ 1, ... , l, so that if q[i] is replaced by a wildcard for all i ∈ K , then q matches at least z strings from D. Solving PMDM allows providing data utility guarantees as opposed to existing approaches. We first show, through a reduction from the well-known k-Clique problem, that a decision version of the PMDM problem is NP-complete, even for binary strings. We thus approach the problem from a more practical perspective. We show a combinatorial $$O((dl)^|K|/3 + dl)$$-time and O(dl)-space algorithm for PMDM for |K| = O(1). In fact, we show that we cannot hope for a faster combinatorial algorithm, unless the combinatorial k-Clique hypothesis fails (Abboud et al. in SIAM J Comput 47:2527–2555, 2018; Lincoln et al., in: 29th ACM-SIAM Symposium on Discrete Algorithms (SODA), 2018). Our combinatorial algorithm, executed with small |K|, is the backbone of a greedy heuristic that we propose. Our experiments on real-world and synthetic datasets show that our heuristic finds nearly-optimal solutions in practice and is also very efficient. We also generalize this algorithm for the problem of masking multiple query strings simultaneously so that every string has at least z matches in D. PMDM can be viewed as a generalization of the decision version of the dictionary matching with mismatches problem: by querying a PMDM data structure with string q and z = 1, one obtains the minimal number of mismatches of q with any string from D. The query time or space of all known data structures for the more restricted problem of dictionary matching with at most k mismatches incurs some exponential factor with respect to k. A simple exact algorithm for PMDM runs in time $$O(2^ld)$$. We present a data structure for PMDM that answers queries over D in time $$O(2^l/2(2^l/2 + τ )l)$$ and requires space $$O(2^ld^2/τ^2 + 2^l/2d)$$, for any parameter τ ∈ [1, d]. We complement our results by showing a two-way polynomial-time reduction between PMDM and the Minimum Union problem [Chlamtáˇc et al., ACM-SIAM Symposium on Discrete Algorithms (SODA) 2017]. This gives a polynomial-time $$O(d^1/4+epsilon)$$-approximation algorithm for PMDM, which is tight under a plausible complexity conjecture. This is an extended version of a paper that was presented at International Symposium on Algorithms and Computation (ISAAC) 2021.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Data masking is a common technique for sanitizing sensitive data maintained in database systems which is becoming increasingly important in various application areas, such as in record linkage of personal data. This work formalizes the Pattern Masking for Dictionary Matching (PMDM) problem: given a dictionary D of d strings, each of length l, a query string q of length l, and a positive integer z, we are asked to compute a smallest set K ⊆ 1, ... , l, so that if q[i] is replaced by a wildcard for all i ∈ K , then q matches at least z strings from D. Solving PMDM allows providing data utility guarantees as opposed to existing approaches. We first show, through a reduction from the well-known k-Clique problem, that a decision version of the PMDM problem is NP-complete, even for binary strings. We thus approach the problem from a more practical perspective. We show a combinatorial $$O((dl)^|K|/3 + dl)$$-time and O(dl)-space algorithm for PMDM for |K| = O(1). In fact, we show that we cannot hope for a faster combinatorial algorithm, unless the combinatorial k-Clique hypothesis fails (Abboud et al. in SIAM J Comput 47:2527–2555, 2018; Lincoln et al., in: 29th ACM-SIAM Symposium on Discrete Algorithms (SODA), 2018). Our combinatorial algorithm, executed with small |K|, is the backbone of a greedy heuristic that we propose. Our experiments on real-world and synthetic datasets show that our heuristic finds nearly-optimal solutions in practice and is also very efficient. We also generalize this algorithm for the problem of masking multiple query strings simultaneously so that every string has at least z matches in D. PMDM can be viewed as a generalization of the decision version of the dictionary matching with mismatches problem: by querying a PMDM data structure with string q and z = 1, one obtains the minimal number of mismatches of q with any string from D. The query time or space of all known data structures for the more restricted problem of dictionary matching with at most k mismatches incurs some exponential factor with respect to k. A simple exact algorithm for PMDM runs in time $$O(2^ld)$$. We present a data structure for PMDM that answers queries over D in time $$O(2^l/2(2^l/2 + τ )l)$$ and requires space $$O(2^ld^2/τ^2 + 2^l/2d)$$, for any parameter τ ∈ [1, d]. We complement our results by showing a two-way polynomial-time reduction between PMDM and the Minimum Union problem [Chlamtáˇc et al., ACM-SIAM Symposium on Discrete Algorithms (SODA) 2017]. This gives a polynomial-time $$O(d^1/4+epsilon)$$-approximation algorithm for PMDM, which is tight under a plausible complexity conjecture. This is an extended version of a paper that was presented at International Symposium on Algorithms and Computation (ISAAC) 2021.

10.

Rizzo, Nicola; Cáceres, Manuel; Mäkinen, Veli

Finding maximal exact matches in graphs Journal Article

In: Algorithms for Molecular Biology, vol. 19, no. 1, pp. 10, 2024, ISSN: 1748-7188.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Rizzo2024,

title = {Finding maximal exact matches in graphs},

author = {Nicola Rizzo and Manuel Cáceres and Veli Mäkinen},

url = {https://github.com/algbio/efg-mems},

doi = {10.1186/s13015-024-00255-5},

issn = {1748-7188},

year  = {2024},

date = {2024-03-01},

urldate = {2024-03-01},

journal = {Algorithms for Molecular Biology},

volume = {19},

number = {1},

pages = {10},

publisher = {Springer Science and Business Media LLC},

abstract = {Background: We study the problem of finding maximal exact matches (MEMs) between a query string Q and a labeled graph G. MEMs are an important class of seeds, often used in seed-chain-extend type of practical alignment methods because of their strong connections to classical metrics. A principled way to speed up chaining is to limit the number of MEMs by considering only MEMs of length at least $kappa$($kappa$-MEMs). However, on arbitrary input graphs, the problem of finding MEMs cannot be solved in truly sub-quadratic time under SETH (Equi et al., TALG 2023) even on acyclic graphs.



Results: In this paper we show an $O(n cdot L cdot d^{L-1} + m + M_{kappa , L})$-time algorithm finding all $kappa$-MEMs between Q and G spanning exactly L nodes in G, where n is the total length of node labels, d is the maximum degree of a node in G, $m = |Q|$, and $M_{kappa ,L}$ is the number of output MEMs. We use this algorithm to develop a $kappa$-MEM finding solution on indexable Elastic Founder Graphs (Equi et al., Algorithmica 2022) running in time $O(nH^2 + m + M_{kappa})$, where H is the maximum number of nodes in a block, and $M_{kappa}$ is the total number of $kappa$-MEMs. Our results generalize to the analysis of multiple query strings (MEMs between G and any of the strings). Additionally, we provide some experimental results showing that the number of graph MEMs is an order of magnitude smaller than the number of string MEMs of the corresponding concatenated collection. 



Conclusions: We show that seed-chain-extend type of alignment methods can be implemented on top of indexable Elastic Founder Graphs by providing an efficient way to produce the seeds between a set of queries and the graph. The code is available in https://github.com/algbio/efg-mems.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

11.

Goga, Adrián; Depuydt, Lore; Brown, Nathaniel K.; Fostier, Jan; Gagie, Travis; Navarro, Gonzalo

Faster Maximal Exact Matches with Lazy LCP Evaluation Proceedings Article

In: 2024 Data Compression Conference (DCC), pp. 123–132, IEEE, 2024.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

12.

Lemane, Téo; Lezzoche, Nolan; Lecubin, Julien; Pelletier, Eric; Lescot, Magali; Chikhi, Rayan; Peterlongo, Pierre

Indexing and real-time user-friendly queries in terabyte-sized complex genomic datasets with kmindex and ORA Journal Article

In: Nature Computational Science, vol. 4, no. 2, pp. 104–109, 2024, ISSN: 2662-8457.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG, WP3: Translational CPG

13.

Rizzo, Nicola; Equi, Massimo; Norri, Tuukka; Mäkinen, Veli

Elastic founder graphs improved and enhanced Journal Article

In: Theoretical Computer Science, vol. 982, no. 114269, pp. 114269, 2024, ISSN: 0304-3975.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Rizzo2024-jy,

title = {Elastic founder graphs improved and enhanced},

author = {Nicola Rizzo and Massimo Equi and Tuukka Norri and Veli Mäkinen},

doi = {10.1016/j.tcs.2023.114269},

issn = {0304-3975},

year  = {2024},

date = {2024-01-01},

urldate = {2024-01-01},

journal = {Theoretical Computer Science},

volume = {982},

number = {114269},

pages = {114269},

publisher = {Elsevier BV},

abstract = {Indexing labeled graphs for pattern matching is a central challenge of pangenomics. Equi et al. (2022) [14] developed the Elastic Founder Graph (EFG) representing an alignment of m sequences of length n, drawn from alphabet Σ plus the special gap character: the paths spell the original sequences or their recombination. By enforcing the semi-repeat-free property, the EFG admits a polynomial-space index for linear-time pattern matching, breaking through the conditional lower bounds on indexing labeled graphs (Equi et al. [13]). In this work, we improve the space of the EFG index answering pattern matching queries in linear time, from linear in the length of all strings spelled by three consecutive node labels, to linear in the size of the edge labels. Then, we develop linear-time construction algorithms optimizing for different metrics: we improve the existing linearithmic construction algorithms to O(mn), by solving the novel exclusive ancestor set problem on trees; we propose, for the simplified gapless setting, an O(mn)-time solution minimizing the maximum block height, that we generalize by substituting block height with prefix-aware height. Finally, to show the versatility of the framework, we develop a BWT-based EFG index and study how to encode and perform document listing queries on a set of paths of the graphs, reporting which paths present a given pattern as a substring. We propose the EFG framework as an improved and enhanced version of the framework for the gapless setting, along with construction methods that are valid in any setting concerned with the segmentation of aligned sequences.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

14.

Benoit, Gaëtan; Raguideau, Sébastien; James, Robert; Phillippy, Adam M.; Chikhi, Rayan; Quince, Christopher

High-quality metagenome assembly from long accurate reads with metaMDBG Journal Article

In: Nature Biotechnology, 2024, ISSN: 1546-1696.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

15.

Bernardini, Giulia; Chen, Huiping; Conte, Alessio; Grossi, Roberto; Guerrini, Veronica; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon P.

Utility-Oriented String Mining Book Chapter

In: Proceedings of the 2024 SIAM International Conference on Data Mining (SDM), pp. 190–198, Society for Industrial and Applied Mathematics, 2024, ISBN: 9781611978032.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inbook{Bernardini2024,

title = {Utility-Oriented String Mining},

author = {Giulia Bernardini and Huiping Chen and Alessio Conte and Roberto Grossi and Veronica Guerrini and Grigorios Loukides and Nadia Pisanti and Solon P. Pissis},

doi = {10.1137/1.9781611978032.22},

isbn = {9781611978032},

year  = {2024},

date = {2024-01-01},

booktitle = {Proceedings of the 2024 SIAM International Conference on Data Mining (SDM)},

pages = {190–198},

publisher = {Society for Industrial and Applied Mathematics},

abstract = {A string is often provided with numerical scores (utilities) which quantify the importance, interest, profit, or risk of the letters occurring at every position of the string. For example, every DNA fragment produced by modern sequencing machines comes with a confidence score per position. Motivated by the abundance of strings with utilities, we introduce Utility-oriented String Mining (USM), a natural generalization of the classic frequent substring mining problem. Given a string S of length n and a threshold 𝒱, USM asks for every string R whose utility U (R) is at least 𝒱, where U is a function that maps R to a utility score based on the utilities of all letters of every occurrence of R in S. In addition, our work makes the following contributions: (1) We identify a class 𝕌 of utility functions for which USM admits an 𝒪 (n2)-time algorithm. (2) We prove that no listing algorithm solves the USM problem in subquadratic time for every utility function, or even for every function in 𝕌. (3) We propose an 𝒪 (n log n)-time algorithm that solves USM for a class of monotone functions from 𝕌. (4) We design another 𝒪 (n log n)-time algorithm for the same problem that is comparable in runtime but offers drastic space savings in practice when, in addition, a lower bound on the length of the output strings is provided as input. (5) We demonstrate experimentally using publicly available, billion-letter datasets that our algorithms are many times more efficient, in terms of runtime and/or space, compared to an Apriori-like baseline which employs advanced string processing tools.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inbook}

}

16.

Ascone, Rocco; Bernardini, Giulia; Conte, Alessio; Equi, Massimo; Gabory, Esteban; Grossi, Roberto; Pisanti, Nadia

A Unifying Taxonomy of Pattern Matching in Degenerate Strings and Founder Graphs Proceedings Article

In: Pissis, Solon P.; Sung, Wing-Kin (Ed.): pp. 14:1–14:21, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2024, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

17.

Bille, Philip; Nekrich, Yakov; Pissis, Solon P.

Size-Constrained Weighted Ancestors with Applications Proceedings Article

In: 19th Scandinavian Symposium and Workshops on Algorithm Theory (SWAT 2024), pp. 14:1–14:12, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2024, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Bille2024,

title = {Size-Constrained Weighted Ancestors with Applications},

author = {Philip Bille and Yakov Nekrich and Solon P. Pissis},

doi = {10.4230/LIPICS.SWAT.2024.14},

issn = {1868-8969},

year  = {2024},

date = {2024-01-01},

booktitle = {19th Scandinavian Symposium and Workshops on Algorithm Theory (SWAT 2024)},

volume = {294},

pages = {14:1–14:12},

publisher = {Schloss Dagstuhl – Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {The weighted ancestor problem on a rooted node-weighted tree T is a generalization of the classic predecessor problem: construct a data structure for a set of integers that supports fast predecessor queries. Both problems are known to require Ω(log log n) time for queries provided 𝒪(n poly log n) space is available, where n is the input size. The weighted ancestor problem has attracted a lot of attention by the combinatorial pattern matching community due to its direct application to suffix trees. In this formulation of the problem, the nodes are weighted by string depth. This research has culminated in a data structure for weighted ancestors in suffix trees with 𝒪(1) query time and an 𝒪(n)-time construction algorithm [Belazzougui et al., CPM 2021]. 

In this paper, we consider a different version of the weighted ancestor problem, where the nodes are weighted by any function weight that maps each node of T to a positive integer, such that weight(u) ≤ size(u) for any node u and weight(u₁) ≤ weight(u₂) if node u₁ is a descendant of node u₂, where size(u) is the number of nodes in the subtree rooted at u. In the size-constrained weighted ancestor (SWA) problem, for any node u of T and any integer k, we are asked to return the lowest ancestor w of u with weight at least k. We show that for any rooted tree with n nodes, we can locate node w in 𝒪(1) time after 𝒪(n)-time preprocessing. In particular, this implies a data structure for the SWA problem in suffix trees with 𝒪(1) query time and 𝒪(n)-time preprocessing, when the nodes are weighted by weight. We also show several string-processing applications of this result.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

18.

Charalampopoulos, Panagiotis; Pissis, Solon P.; Radoszewski, Jakub; Rytter, Wojciech; Waleń, Tomasz; Zuba, Wiktor

Approximate Circular Pattern Matching Under Edit Distance Proceedings Article

In: Beyersdorff, Olaf; Kanté, Mamadou Moustapha; Kupferman, Orna; Lokshtanov, Daniel (Ed.): 41st International Symposium on Theoretical Aspects of Computer Science (STACS 2024), pp. 24:1–24:22, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2024, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Charalampopoulos2024a,

title = {Approximate Circular Pattern Matching Under Edit Distance},

author = {Panagiotis Charalampopoulos and Solon P. Pissis and Jakub Radoszewski and Wojciech Rytter and Tomasz Waleń and Wiktor Zuba},

editor = {Olaf Beyersdorff and Mamadou Moustapha Kanté and Orna Kupferman and Daniel Lokshtanov},

doi = {10.4230/LIPICS.STACS.2024.24},

issn = {1868-8969},

year  = {2024},

date = {2024-01-01},

booktitle = {41st International Symposium on Theoretical Aspects of Computer Science (STACS 2024)},

volume = {289},

pages = {24:1–24:22},

publisher = {Schloss Dagstuhl – Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {In the k-Edit Circular Pattern Matching (k-Edit CPM) problem, we are given a length-n text T, a length-m pattern P, and a positive integer threshold k, and we are to report all starting positions of the substrings of T that are at edit distance at most k from some cyclic rotation of P. In the decision version of the problem, we are to check if any such substring exists. Very recently, Charalampopoulos et al. [ESA 2022] presented 𝒪(nk²)-time and 𝒪(nk log³ k)-time solutions for the reporting and decision versions of k-Edit CPM, respectively. Here, we show that the reporting and decision versions of k-Edit CPM can be solved in 𝒪(n+(n/m) k⁶) time and 𝒪(n+(n/m) k⁵ log³ k) time, respectively, thus obtaining the first algorithms with a complexity of the type 𝒪(n+(n/m) poly(k)) for this problem. Notably, our algorithms run in 𝒪(n) time when m = Ω(k⁶) and are superior to the previous respective solutions when m = ω(k⁴). We provide a meta-algorithm that yields efficient algorithms in several other interesting settings, such as when the strings are given in a compressed form (as straight-line programs), when the strings are dynamic, or when we have a quantum computer. 

We obtain our solutions by exploiting the structure of approximate circular occurrences of P in T, when T is relatively short w.r.t. P. Roughly speaking, either the starting positions of approximate occurrences of rotations of P form 𝒪(k⁴) intervals that can be computed efficiently, or some rotation of P is almost periodic (is at a small edit distance from a string with small period). Dealing with the almost periodic case is the most technically demanding part of this work; we tackle it using properties of locked fragments (originating from [Cole and Hariharan, SICOMP 2002]).},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

19.

Bille, Philip; Gørtz, Inge Li; Lewenstein, Moshe; Pissis, Solon P.; Rotenberg, Eva; Steiner, Teresa Anna

Gapped String Indexing in Subquadratic Space and Sublinear Query Time Proceedings Article

In: Beyersdorff, Olaf; Kanté, Mamadou Moustapha; Kupferman, Orna; Lokshtanov, Daniel (Ed.): 41st International Symposium on Theoretical Aspects of Computer Science (STACS 2024), pp. 16:1–16:21, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2024, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Bille2024a,

title = {Gapped String Indexing in Subquadratic Space and Sublinear Query Time},

author = {Philip Bille and Inge Li Gørtz and Moshe Lewenstein and Solon P. Pissis and Eva Rotenberg and Teresa Anna Steiner},

editor = {Olaf Beyersdorff and Mamadou Moustapha Kanté and Orna Kupferman and Daniel Lokshtanov},

doi = {10.4230/LIPICS.STACS.2024.16},

issn = {1868-8969},

year  = {2024},

date = {2024-01-01},

booktitle = {41st International Symposium on Theoretical Aspects of Computer Science (STACS 2024)},

volume = {289},

pages = {16:1–16:21},

publisher = {Schloss Dagstuhl – Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {In Gapped String Indexing, the goal is to compactly represent a string S of length n such that for any query consisting of two strings P₁ and P₂, called patterns, and an integer interval [α, β], called gap range, we can quickly find occurrences of P₁ and P₂ in S with distance in [α, β]. Gapped String Indexing is a central problem in computational biology and text mining and has thus received significant research interest, including parameterized and heuristic approaches. Despite this interest, the best-known time-space trade-offs for Gapped String Indexing are the straightforward 𝒪(n) space and 𝒪(n+ occ) query time or Ω(n²) space and Õ(|P₁| + |P₂| + occ) query time. 

We break through this barrier obtaining the first interesting trade-offs with polynomially subquadratic space and polynomially sublinear query time. In particular, we show that, for every 0 ≤ δ ≤ 1, there is a data structure for Gapped String Indexing with either Õ(n^2-δ/3) or Õ(n^3-2δ) space and Õ(|P₁| + |P₂| + n^δ⋅ (occ+1)) query time, where occ is the number of reported occurrences. As a new fundamental tool towards obtaining our main result, we introduce the Shifted Set Intersection problem: preprocess a collection of sets S₁, …, S_k of integers such that for any query consisting of three integers i,j,s, we can quickly output YES if and only if there exist a ∈ S_i and b ∈ S_j with a+s = b. We start by showing that the Shifted Set Intersection problem is equivalent to the indexing variant of 3SUM (3SUM Indexing) [Golovnev et al., STOC 2020]. We then give a data structure for Shifted Set Intersection with gaps, which entails a solution to the Gapped String Indexing problem. Furthermore, we enhance our data structure for deciding Shifted Set Intersection, so that we can support the reporting variant of the problem, i.e., outputting all certificates in the affirmative case. Via the obtained equivalence to 3SUM Indexing, we thus give new improved data structures for the reporting variant of 3SUM Indexing, and we show how this improves upon the state-of-the-art solution for Jumbled Indexing [Chan and Lewenstein, STOC 2015] for any alphabet of constant size σ > 5.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

In Gapped String Indexing, the goal is to compactly represent a string S of length n such that for any query consisting of two strings P₁ and P₂, called patterns, and an integer interval [α, β], called gap range, we can quickly find occurrences of P₁ and P₂ in S with distance in [α, β]. Gapped String Indexing is a central problem in computational biology and text mining and has thus received significant research interest, including parameterized and heuristic approaches. Despite this interest, the best-known time-space trade-offs for Gapped String Indexing are the straightforward 𝒪(n) space and 𝒪(n+ occ) query time or Ω(n²) space and Õ(|P₁| + |P₂| + occ) query time.
We break through this barrier obtaining the first interesting trade-offs with polynomially subquadratic space and polynomially sublinear query time. In particular, we show that, for every 0 ≤ δ ≤ 1, there is a data structure for Gapped String Indexing with either Õ(n^2-δ/3) or Õ(n^3-2δ) space and Õ(|P₁| + |P₂| + n^δ⋅ (occ+1)) query time, where occ is the number of reported occurrences. As a new fundamental tool towards obtaining our main result, we introduce the Shifted Set Intersection problem: preprocess a collection of sets S₁, …, S_k of integers such that for any query consisting of three integers i,j,s, we can quickly output YES if and only if there exist a ∈ S_i and b ∈ S_j with a+s = b. We start by showing that the Shifted Set Intersection problem is equivalent to the indexing variant of 3SUM (3SUM Indexing) [Golovnev et al., STOC 2020]. We then give a data structure for Shifted Set Intersection with gaps, which entails a solution to the Gapped String Indexing problem. Furthermore, we enhance our data structure for deciding Shifted Set Intersection, so that we can support the reporting variant of the problem, i.e., outputting all certificates in the affirmative case. Via the obtained equivalence to 3SUM Indexing, we thus give new improved data structures for the reporting variant of 3SUM Indexing, and we show how this improves upon the state-of-the-art solution for Jumbled Indexing [Chan and Lewenstein, STOC 2015] for any alphabet of constant size σ > 5.

20.

Zuba, Wiktor; Loukides, Grigorios; Pissis, Solon P.; Thankachan, Sharma V.

Approximate Suffix-Prefix Dictionary Queries Proceedings Article

In: Královič, Rastislav; Kučera, Antonín (Ed.): 49th International Symposium on Mathematical Foundations of Computer Science (MFCS 2024), pp. 85:1–85:18, Schloss Dagstuhl – Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2024, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Zuba2024,

title = {Approximate Suffix-Prefix Dictionary Queries},

author = {Wiktor Zuba and Grigorios Loukides and Solon P. Pissis and Sharma V. Thankachan},

editor = {Rastislav Královič and Antonín Kučera},

doi = {10.4230/LIPICS.MFCS.2024.85},

issn = {1868-8969},

year  = {2024},

date = {2024-01-01},

urldate = {2024-01-01},

booktitle = {49th International Symposium on Mathematical Foundations of Computer Science (MFCS 2024)},

volume = {306},

pages = {85:1–85:18},

publisher = {Schloss Dagstuhl – Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {In the all-pairs suffix-prefix (APSP) problem [Gusfield et al., Inf. Process. Lett. 1992], we are given a dictionary R of r strings, S₁,…,S_r, of total length n, and we are asked to find the length SPL_i,j of the longest string that is both a suffix of S_i and a prefix of S_j, for all i,j ∈ [1..r]. APSP is a classic problem in string algorithms with applications in bioinformatics, especially in sequence assembly. Since r = |R| is typically very large in real-world applications, considering all r² pairs of strings explicitly is prohibitive. This is when the data structure variant of APSP makes sense; in the same spirit as distance oracles computing shortest paths between any two vertices given online. We show how to quickly locate k-approximate matches (under the Hamming or the edit distance) in R using a version of the k-errata tree [Cole et al., STOC 2004] that we introduce. Let SPL^k_i,j be the length of the longest suffix of S_i that is at distance at most k from a prefix of S_j. In particular, for any k = 𝒪(1), we show an 𝒪(nlog^k n)-sized data structure to support the following queries: 

- One-to-One^k(i,j): output SPL^k_i,j in 𝒪(log^k nlog log n) time. 

- Report^k(i,d): output all j ∈ [1..r], such that SPL^k_i,j ≥ d, in 𝒪(log^kn(log n/log log n+output)) time, where output denotes the size of the output. In fact, our algorithms work for any value of k not just for k = 𝒪(1), but the formulas bounding the complexities get much more complicated for larger values of k.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

76 entries « ‹ 1 of 4 › »