Rizzo, Nicola; Equi, Massimo; Norri, Tuukka; Mäkinen, Veli
Elastic founder graphs improved and enhanced Journal Article
In: Theoretical Computer Science, vol. 982, no. 114269, pp. 114269, 2024, ISSN: 0304-3975.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Rizzo2024-jy,
title = {Elastic founder graphs improved and enhanced},
author = {Nicola Rizzo and Massimo Equi and Tuukka Norri and Veli Mäkinen},
doi = {10.1016/j.tcs.2023.114269},
issn = {0304-3975},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
journal = {Theoretical Computer Science},
volume = {982},
number = {114269},
pages = {114269},
publisher = {Elsevier BV},
abstract = {Indexing labeled graphs for pattern matching is a central challenge of pangenomics. Equi et al. (2022) [14] developed the Elastic Founder Graph (EFG) representing an alignment of m sequences of length n, drawn from alphabet Σ plus the special gap character: the paths spell the original sequences or their recombination. By enforcing the semi-repeat-free property, the EFG admits a polynomial-space index for linear-time pattern matching, breaking through the conditional lower bounds on indexing labeled graphs (Equi et al. [13]). In this work, we improve the space of the EFG index answering pattern matching queries in linear time, from linear in the length of all strings spelled by three consecutive node labels, to linear in the size of the edge labels. Then, we develop linear-time construction algorithms optimizing for different metrics: we improve the existing linearithmic construction algorithms to O(mn), by solving the novel exclusive ancestor set problem on trees; we propose, for the simplified gapless setting, an O(mn)-time solution minimizing the maximum block height, that we generalize by substituting block height with prefix-aware height. Finally, to show the versatility of the framework, we develop a BWT-based EFG index and study how to encode and perform document listing queries on a set of paths of the graphs, reporting which paths present a given pattern as a substring. We propose the EFG framework as an improved and enhanced version of the framework for the gapless setting, along with construction methods that are valid in any setting concerned with the segmentation of aligned sequences.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Benoit, Gaëtan; Raguideau, Sébastien; James, Robert; Phillippy, Adam M.; Chikhi, Rayan; Quince, Christopher
High-quality metagenome assembly from long accurate reads with metaMDBG Journal Article
In: Nature Biotechnology, 2024, ISSN: 1546-1696.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Benoit2024,
title = {High-quality metagenome assembly from long accurate reads with metaMDBG},
author = {Gaëtan Benoit and Sébastien Raguideau and Robert James and Adam M. Phillippy and Rayan Chikhi and Christopher Quince},
doi = {10.1038/s41587-023-01983-6},
issn = {1546-1696},
year = {2024},
date = {2024-01-01},
urldate = {2024-01-01},
journal = {Nature Biotechnology},
publisher = {Springer Science and Business Media LLC},
abstract = {We introduce metaMDBG, a metagenomics assembler for PacBio HiFi reads. MetaMDBG combines a de Bruijn graph assembly in a minimizer space with an iterative assembly over sequences of minimizers to address variations in genome coverage depth and an abundance-based filtering strategy to simplify strain complexity. For complex communities, we obtained up to twice as many high-quality circularized prokaryotic metagenome-assembled genomes as existing methods and had better recovery of viruses and plasmids.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Kang, Xiongbin; Xu, Jialu; Luo, Xiao; Schönhuth, Alexander
Hybrid-hybrid correction of errors in long reads with HERO Journal Article
In: Genome Biol., vol. 24, no. 1, pp. 275, 2023, ISBN: 1474-760X.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Kang2023-jk,
title = {Hybrid-hybrid correction of errors in long reads with HERO},
author = {Xiongbin Kang and Jialu Xu and Xiao Luo and Alexander Schönhuth},
doi = {10.1186/s13059-023-03112-7},
isbn = {1474-760X},
year = {2023},
date = {2023-12-01},
urldate = {2023-12-01},
journal = {Genome Biol.},
volume = {24},
number = {1},
pages = {275},
abstract = {Although generally superior, hybrid approaches for correcting errors in third-generation sequencing (TGS) reads, using next-generation sequencing (NGS) reads, mistake haplotype-specific variants for errors in polyploid and mixed samples. We suggest HERO, as the first “hybrid-hybrid”approach, to make use of both de Bruijn graphs and overlap graphs for optimal catering to the particular strengths of NGS and TGS reads. Extensive benchmarking experiments demonstrate that HERO improves indel and mismatch error rates by on average 65% (27~95%) and 20% (4~61%). Using HERO prior to genome assembly significantly improves the assemblies in the majority of the relevant categories.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Loukides, Grigorios; Pissis, Solon P; Sweering, Michelle
Bidirectional string anchors for improved text indexing and top-$K$ similarity search Journal Article
In: IEEE Trans. Knowl. Data Eng., vol. 35, no. 11, pp. 11093–11111, 2023, ISBN: 1558-2191.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Loukides2023-qt,
title = {Bidirectional string anchors for improved text indexing and top-$K$ similarity search},
author = {Grigorios Loukides and Solon P Pissis and Michelle Sweering},
doi = {10.1109/TKDE.2022.3231780},
isbn = {1558-2191},
year = {2023},
date = {2023-11-01},
urldate = {2023-11-01},
journal = {IEEE Trans. Knowl. Data Eng.},
volume = {35},
number = {11},
pages = {11093–11111},
publisher = {Institute of Electrical and Electronics Engineers (IEEE)},
abstract = {The minimizers sampling mechanism is a popular mechanism for string sampling. However, minimizers sampling mechanisms lack good guarantees on the expected size of their samples for different combinations of their input parameters. Furthermore, indexes constructed over minimizers samples lack good worst-case guarantees for on-line pattern searches. In response, we propose bidirectional string anchors (bd-anchors), a new string sampling mechanism. Given an integer ℓ , our mechanism selects the lexicographically smallest rotation in every length-ℓ fragment. We show that, like minimizers samples, bd-anchors samples are approximately uniform, locally consistent, and computable in linear time. Furthermore, our experiments demonstrate that the bd-anchors sample sizes decrease proportionally to ℓ ; and that these sizes are competitive to or smaller than the minimizers sample sizes. We theoretically justify these results by analyzing the expected size of bd-anchors samples. We also prove that computing a total order on the input alphabet which minimizes the bd-anchors sample size is NP-hard. We next highlight the benefits of bd-anchors in two important applications: text indexing and top-K similarity search. For the first application, we develop an index for performing on-line pattern searches in near-optimal time, and show experimentally that a simple implementation of our index is consistently faster for on-line pattern searches than an analogous implementation of a minimizers-based index; we also show that it is substantially faster than two classic text indexes. For the second application, we develop a heuristic for top-K similarity search under edit distance, and show experimentally that it is generally as accurate as the state-of-the-art tool for the same purpose but more than one order of magnitude faster.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Bernardini, Giulia; Fici, Gabriele; Gawrychowski, Paweł; Pissis, Solon P
Substring Complexity in Sublinear Space Proceedings Article
In: Iwata, Satoru; Kakimura, Naonori (Ed.): 34th International Symposium on Algorithms and Computation (ISAAC 2023), pp. 12:1–12:19, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2023, ISBN: 978-3-95977-289-1.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Bernardini2023-nr,
title = {Substring Complexity in Sublinear Space},
author = {Giulia Bernardini and Gabriele Fici and Paweł Gawrychowski and Solon P Pissis},
editor = {Iwata, Satoru and Kakimura, Naonori},
url = {https://arxiv.org/abs/2112.10376},
doi = {10.4230/LIPICS.ISAAC.2023.12},
isbn = {978-3-95977-289-1},
year = {2023},
date = {2023-11-01},
urldate = {2023-11-01},
booktitle = {34th International Symposium on Algorithms and Computation (ISAAC 2023)},
volume = {283},
pages = {12:1--12:19},
publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {Shannon’s entropy is a definitive lower bound for statistical compression. Unfortunately, no such clear measure exists for the compressibility of repetitive strings. Thus, ad hoc measures are employed to estimate the repetitiveness of strings, e.g., the size z of the Lempel–Ziv parse or the number r of equal-letter runs of the Burrows-Wheeler transform. A more recent one is the size γ of a smallest string attractor. Let T be a string of length n. A string attractor of T is a set of positions of T capturing the occurrences of all the substrings of T. Unfortunately, Kempa and Prezza [STOC 2018] showed that computing γ is NP-hard. Kociumaka et al. [LATIN 2020] considered a new measure of compressibility that is based on the function S_T(k) counting the number of distinct substrings of length k of T, also known as the substring complexity of T. This new measure is defined as δ = sup{S_T(k)/k, k ≥ 1} and lower bounds all the relevant ad hoc measures previously considered. In particular, δ ≤ γ always holds and δ can be computed in 𝒪(n) time using Θ(n) working space. Kociumaka et al. showed that one can construct an 𝒪(δ log n/(δ))-sized representation of T supporting efficient direct access and efficient pattern matching queries on T. Given that for highly compressible strings, δ is significantly smaller than n, it is natural to pose the following question: Can we compute δ efficiently using sublinear working space? It is straightforward to show that in the comparison model, any algorithm computing δ using 𝒪(b) space requires Ω(n^{2-o(1)}/b) time through a reduction from the element distinctness problem [Yao, SIAM J. Comput. 1994]. We thus wanted to investigate whether we can indeed match this lower bound. We address this algorithmic challenge by showing the following bounds to compute δ: - 𝒪((n³log b)/b²) time using 𝒪(b) space, for any b ∈ [1,n], in the comparison model. - 𝒪̃(n²/b) time using 𝒪̃(b) space, for any b ∈ [√n,n], in the word RAM model. This gives an 𝒪̃(n^{1+ε})-time and 𝒪̃(n^{1-ε})-space algorithm to compute δ, for any 0 < ε ≤ 1/2. Let us remark that our algorithms compute S_T(k), for all k, within the same complexities.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Romashchenko, Nikolai; Linard, Benjamin; Pardi, Fabio; Rivals, Eric
EPIK: Precise and scalable evolutionary placement with informative textitk-mers Journal Article
In: Bioinformatics, vol. 39, no. 12, 2023, ISSN: 1367-4811.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Romashchenko2023-kq,
title = {EPIK: Precise and scalable evolutionary placement with informative textitk-mers},
author = {Nikolai Romashchenko and Benjamin Linard and Fabio Pardi and Eric Rivals},
url = {https://github.com/phylo42/IPK
https://github.com/phylo42/EPIK},
doi = {10.1093/bioinformatics/btad692},
issn = {1367-4811},
year = {2023},
date = {2023-11-01},
urldate = {2023-11-01},
journal = {Bioinformatics},
volume = {39},
number = {12},
publisher = {Oxford University Press (OUP)},
abstract = {Phylogenetic placement enables phylogenetic analysis of massive collections of newly sequenced DNA, when de novo tree inference is too unreliable or inefficient. Assuming that a high-quality reference tree is available, the idea is to seek the correct placement of the new sequences in that tree. Recently, alignment-free approaches to phylogenetic placement have emerged, both to circumvent the need to align the new sequences and to avoid the calculations that typically follow the alignment step. A promising approach is based on the inference of k-mers that can be potentially related to the reference sequences, also called phylo-k-mers. However, its usage is limited by the time and memory-consuming stage of reference data preprocessing and the large numbers of k-mers to consider.
We suggest a filtering method for selecting informative phylo-k-mers based on mutual information, which can significantly improve the efficiency of placement, at the cost of a small loss in placement accuracy. This method is implemented in IPK, a new tool for computing phylo-k-mers that significantly outperforms the software previously available. We also present EPIK, a new software for phylogenetic placement, supporting filtered phylo-k-mer databases. Our experiments on real-world data show that EPIK is the fastest phylogenetic placement tool available, when placing hundreds of thousands and millions of queries while still providing accurate placements.
IPK and EPIK are freely available at https://github.com/phylo42/IPK and https://github.com/phylo42/EPIK. Both are implemented in C++ and Python and supported on Linux and MacOS.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
We suggest a filtering method for selecting informative phylo-k-mers based on mutual information, which can significantly improve the efficiency of placement, at the cost of a small loss in placement accuracy. This method is implemented in IPK, a new tool for computing phylo-k-mers that significantly outperforms the software previously available. We also present EPIK, a new software for phylogenetic placement, supporting filtered phylo-k-mer databases. Our experiments on real-world data show that EPIK is the fastest phylogenetic placement tool available, when placing hundreds of thousands and millions of queries while still providing accurate placements.
IPK and EPIK are freely available at https://github.com/phylo42/IPK and https://github.com/phylo42/EPIK. Both are implemented in C++ and Python and supported on Linux and MacOS.
Andreace, Francesco; Lechat, Pierre; Dufresne, Yoann; Chikhi, Rayan
Comparing methods for constructing and representing human pangenome graphs Journal Article
In: Genome Biology, vol. 24, no. 1, pp. 274, 2023, ISSN: 1474-760X.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Andreace2023,
title = {Comparing methods for constructing and representing human pangenome graphs},
author = {Francesco Andreace and Pierre Lechat and Yoann Dufresne and Rayan Chikhi},
doi = {10.1186/s13059-023-03098-2},
issn = {1474-760X},
year = {2023},
date = {2023-11-01},
urldate = {2023-11-01},
journal = {Genome Biology},
volume = {24},
number = {1},
pages = {274},
publisher = {Springer Science and Business Media LLC},
abstract = {As a single reference genome cannot possibly represent all the variation present across human individuals, pangenome graphs have been introduced to incorporate population diversity within a wide range of genomic analyses. Several data structures have been proposed for representing collections of genomes as pangenomes, in particular graphs.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
González, Camila Duitama; Rangavittal, Samarth; Vicedomini, Riccardo; Chikhi, Rayan; Richard, Hugues
aKmerBroom: Ancient oral DNA decontamination using Bloom filters on k-mer sets Journal Article
In: iScience, vol. 26, no. 11, pp. 108057, 2023, ISSN: 2589-0042.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{DuitamaGonzalez2023,
title = {aKmerBroom: Ancient oral DNA decontamination using Bloom filters on k-mer sets},
author = {Camila Duitama González and Samarth Rangavittal and Riccardo Vicedomini and Rayan Chikhi and Hugues Richard},
doi = {10.1016/j.isci.2023.108057},
issn = {2589-0042},
year = {2023},
date = {2023-11-01},
urldate = {2023-11-01},
journal = {iScience},
volume = {26},
number = {11},
pages = {108057},
publisher = {Elsevier BV},
abstract = {Dental calculus samples are modeled as a mixture of DNA coming from dental plaque and contaminants. Current computational decontamination methods such as Recentrifuge and DeconSeq require either a reference database or sequenced negative controls, and therefore have limited use cases. We present a reference-free decontamination tool tailored for the removal of contaminant DNA of ancient oral sample called aKmerBroom. Our tool builds a Bloom filter of known ancient and modern oral k-mers, then scans an input set of ancient metagenomic reads using multiple passes to iteratively retain reads likely to be of oral origin. On synthetic data, aKmerBroom achieves over 89.53% sensitivity and 94.00% specificity. On real datasets, aKmerBroom shows higher read retainment (+60% on average) than other methods. We anticipate aKmerBroom will be a valuable tool for the processing of ancient oral samples as it will prevent contaminated datasets from being completely discarded in downstream analyses.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Romashchenko, Nikolai; Linard, Benjamin; Rivals, Eric; Pardi, Fabio
Computing Phylo- k-mers Journal Article
In: IEEE/ACM Trans. Comput. Biol. Bioinform., vol. 20, no. 5, pp. 2889–2897, 2023, ISBN: 1557-9964.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Romashchenko2023-hj,
title = {Computing Phylo- k-mers},
author = {Nikolai Romashchenko and Benjamin Linard and Eric Rivals and Fabio Pardi},
url = {https://hal-lirmm.ccsd.cnrs.fr/lirmm-03778953v2},
doi = {10.1109/TCBB.2023.3278049},
isbn = {1557-9964},
year = {2023},
date = {2023-09-01},
urldate = {2023-09-01},
journal = {IEEE/ACM Trans. Comput. Biol. Bioinform.},
volume = {20},
number = {5},
pages = {2889–2897},
abstract = {Finding the correct position of new sequences within an established phylogenetic tree is an increasingly relevant problem in evolutionary bioinformatics and metagenomics. Recently, alignment-free approaches for this task have been proposed. One such approach is based on the concept of phylogenetically-informative k-mers or phylo- k-mers for short. In practice, phylo- k-mers are inferred from a set of related reference sequences and are equipped with scores expressing the probability of their appearance in different locations within the input reference phylogeny. Computing phylo- k-mers, however, represents a computational bottleneck to their applicability in real-world problems such as the phylogenetic analysis of metabarcoding reads and the detection of novel recombinant viruses. Here we consider the problem of phylo- k-mer computation: how can we efficiently find all k-mers whose probability lies above a given threshold for a given tree node? We describe and analyze algorithms for this problem, relying on branch-and-bound and divide-and-conquer techniques. We exploit the redundancy of adjacent windows of the alignment to save on computation. Besides computational complexity analyses, we provide an empirical evaluation of the relative performance of their implementations on simulated and real-world data. The divide-and-conquer algorithms are found to surpass the branch-and-bound approach, especially when many phylo- k-mers are found.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Bonnet, Konstantinn; Marschall, Tobias; Doerr, Daniel
Constructing founder sets under allelic and non-allelic homologous recombination Journal Article
In: Algorithms Mol. Biol., vol. 18, no. 1, pp. 15, 2023, ISBN: 1748-7188.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Bonnet2023-gs,
title = {Constructing founder sets under allelic and non-allelic homologous recombination},
author = {Konstantinn Bonnet and Tobias Marschall and Daniel Doerr},
doi = {10.1186/s13015-023-00241-3},
isbn = {1748-7188},
year = {2023},
date = {2023-09-01},
urldate = {2023-09-01},
journal = {Algorithms Mol. Biol.},
volume = {18},
number = {1},
pages = {15},
abstract = {Homologous recombination between the maternal and paternal copies of a chromosome is a key mechanism for human inheritance and shapes population genetic properties of our species. However, a similar mechanism can also act between different copies of the same sequence, then called non-allelic homologous recombination (NAHR). This process can result in genomic rearrangements—including deletion, duplication, and inversion—and is underlying many genomic disorders. Despite its importance for genome evolution and disease, there is a lack of computational models to study genomic loci prone to NAHR. In this work, we propose such a computational model, providing a unified framework for both (allelic) homologous recombination and NAHR. Our model represents a set of genomes as a graph, where haplotypes correspond to walks through this graph. We formulate two founder set problems under our recombination model, provide flow-based algorithms for their solution, describe exact methods to characterize the number of recombinations, and demonstrate scalability to problem instances arising in practice.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Rizzo, Nicola; Cáceres, Manuel; Mäkinen, Veli
Finding Maximal Exact Matches in Graphs Proceedings Article
In: Belazzougui, Djamal; Ouangraoua, A"ida (Ed.): 23rd International Workshop on Algorithms in Bioinformatics (WABI 2023), pp. 10:1–10:17, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2023, ISBN: 978-3-95977-294-5.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Rizzo2023-hg,
title = {Finding Maximal Exact Matches in Graphs},
author = {Nicola Rizzo and Manuel Cáceres and Veli Mäkinen},
editor = {Belazzougui, Djamal and Ouangraoua, A"{i}da},
doi = {10.4230/LIPIcs.WABI.2023.10},
isbn = {978-3-95977-294-5},
year = {2023},
date = {2023-08-01},
urldate = {2023-08-01},
booktitle = {23rd International Workshop on Algorithms in Bioinformatics (WABI 2023)},
volume = {273},
pages = {10:1--10:17},
publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {We study the problem of finding maximal exact matches (MEMs) between a query string Q and a labeled graph G. MEMs are an important class of seeds, often used in seed-chain-extend type of practical alignment methods because of their strong connections to classical metrics. A principled way to speed up chaining is to limit the number of MEMs by considering only MEMs of length at least κ (κ-MEMs). However, on arbitrary input graphs, the problem of finding MEMs cannot be solved in truly sub-quadratic time under SETH (Equi et al., ICALP 2019) even on acyclic graphs. In this paper we show an O(n⋅ L ⋅ d^{L-1} + m + M_{κ,L})-time algorithm finding all κ-MEMs between Q and G spanning exactly L nodes in G, where n is the total length of node labels, d is the maximum degree of a node in G, m = |Q|, and M_{κ,L} is the number of output MEMs. We use this algorithm to develop a κ-MEM finding solution on indexable Elastic Founder Graphs (Equi et al., Algorithmica 2022) running in time O(nH² + m + M_κ), where H is the maximum number of nodes in a block, and M_κ is the total number of κ-MEMs. Our results generalize to the analysis of multiple query strings (MEMs between G and any of the strings). Additionally, we provide some preliminary experimental results showing that the number of graph MEMs is an order of magnitude smaller than the number of string MEMs of the corresponding concatenated collection.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Ayad, Lorraine A K; Chikhi, Rayan; Pissis, Solon P
Seedability: optimizing alignment parameters for sensitive sequence comparison Journal Article
In: Bioinform. Adv., vol. 3, no. 1, 2023, ISSN: 2635-0041.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@article{Ayad2023-li,
title = {Seedability: optimizing alignment parameters for sensitive sequence comparison},
author = {Lorraine A K Ayad and Rayan Chikhi and Solon P Pissis},
url = {https://github.com/lorrainea/Seedability},
doi = {10.1093/bioadv/vbad108},
issn = {2635-0041},
year = {2023},
date = {2023-08-01},
urldate = {2023-08-01},
journal = {Bioinform. Adv.},
volume = {3},
number = {1},
publisher = {Oxford University Press (OUP)},
abstract = {Most sequence alignment techniques make use of exact k-mer hits, called seeds, as anchors to optimize alignment speed. A large number of bioinformatics tools employing seed-based alignment techniques, such as Minimap2, use a single value of k per sequencing technology, without a strong guarantee that this is the best possible value. Given the ubiquity of sequence alignment, identifying values of k that lead to more sensitive alignments is thus an important task. To aid this, we present Seedability, a seed-based alignment framework designed for estimating an optimal seed k-mer length (as well as a minimal number of shared seeds) based on a given alignment identity threshold. In particular, we were motivated to make Minimap2 more sensitive in the pairwise alignment of short sequences.
The experimental results herein show improved alignments of short and divergent sequences when using the parameter values determined by Seedability in comparison to the default values of Minimap2. We also show several cases of pairs of real divergent sequences, where the default parameter values of Minimap2 yield no output alignments, but the values output by Seedability produce plausible alignments.
https://github.com/lorrainea/Seedability (distributed under GPL v3.0).},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
The experimental results herein show improved alignments of short and divergent sequences when using the parameter values determined by Seedability in comparison to the default values of Minimap2. We also show several cases of pairs of real divergent sequences, where the default parameter values of Minimap2 yield no output alignments, but the values output by Seedability produce plausible alignments.
https://github.com/lorrainea/Seedability (distributed under GPL v3.0).
Loukides, Grigorios; Pissis, Solon P.; Thankachan, Sharma V.; Zuba, Wiktor
Suffix-prefix queries on a dictionary Proceedings Article
In: Bulteau, Laurent; Lipt'ak, Zsuzsanna (Ed.): 34th Annual Symposium on Combinatorial Pattern Matching (CPM 2023), pp. 21:1–21:20, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2023, ISBN: 978-3-95977-276-1.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Loukides2023-rk,
title = {Suffix-prefix queries on a dictionary},
author = {Grigorios Loukides and Solon P. Pissis and Sharma V. Thankachan and Wiktor Zuba},
editor = {Bulteau, Laurent and Lipt'{a}k, Zsuzsanna},
doi = {10.4230/LIPIcs.CPM.2023.21},
isbn = {978-3-95977-276-1},
year = {2023},
date = {2023-06-01},
urldate = {2023-06-01},
booktitle = {34th Annual Symposium on Combinatorial Pattern Matching (CPM 2023)},
volume = {259},
pages = {21:1--21:20},
publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {In the all-pairs suffix-prefix (APSP) problem, we are given a dictionary R of k strings, S_1,…,S_k, of total length n, and we are asked to find the length SPL_{i,j} of the longest string that is both a suffix of S_i and a prefix of S_j, for all i,j ∈ [1,k]. APSP is a classic problem in string algorithms with many applications in bioinformatics. When all strings of the dictionary are over an integer alphabet of size σ ≤ n^𝒪(1), APSP can be solved in the optimal 𝒪(n+k²) time with the use of the generalized suffix tree of the dictionary [Gusfield et al., Inf. Process. Lett. 1992]. In many bioinformatics applications, such as in sequence assembly, the size k of dictionary R is very large. In particular, k² usually dominates n, and thus the k² factor is the bottleneck both in the time and in the space complexity of such applications. We thus initiate a holistic study on several data structure variants of APSP. In particular, we consider the following types of queries: - One-to-One(i,j): output SPL_{i,j}. - One-to-All(i): output SPL_{i,j} for every j ∈ [1,k]. - Report(i,𝓁): output all distinct j ∈ [1,k] such that SPL_{i,j} ≥ 𝓁, where 𝓁 ≥ 0 is an integer. - Count(i,𝓁): output the number of distinct j ∈ [1,k] such that SPL_{i,j} ≥ 𝓁, where 𝓁 ≥ 0 is an integer. - Top(i,K): output K distinct j ∈ [1,k] with the highest values of SPL_{i,j} breaking ties arbitrarily. We assume the standard word RAM model of computation with word size w = Ω(log n) and an integer alphabet of size σ ≤ n^𝒪(1). We show the following upper bounds: Query | Space (words) | Query time | Note One-to-One(i,j) | 𝒪(n) | 𝒪(log log k) | Theorem 11 One-to-All(i) | 𝒪(n) | 𝒪(k) | Theorem 14 Report(i,𝓁) | 𝒪(n) | 𝒪(log n/log log n+output) | Theorem 19(i) Count(i,𝓁) | 𝒪(n) | 𝒪(log n/log log n) | Theorem 19(ii) Top(i,K) | 𝒪(n) | 𝒪(log² n/log log n+K) | Theorem 22 We also present efficient algorithms for constructing these data structures.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Gabory, Esteban; Mwaniki, Moses Njagi; Pisanti, Nadia; Pissis, Solon P; Radoszewski, Jakub; Sweering, Michelle; Zuba, Wiktor
Comparing elastic-degenerate strings: Algorithms, lower bounds, and applications Proceedings Article
In: Bulteau, Laurent; Lipt'ak, Zsuzsanna (Ed.): 34th Annual Symposium on Combinatorial Pattern Matching (CPM 2023), pp. 11:1–11:20, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2023, ISBN: 978-3-95977-276-1.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@inproceedings{Gabory2023-vh,
title = {Comparing elastic-degenerate strings: Algorithms, lower bounds, and applications},
author = {Esteban Gabory and Moses Njagi Mwaniki and Nadia Pisanti and Solon P Pissis and Jakub Radoszewski and Michelle Sweering and Wiktor Zuba},
editor = {Bulteau, Laurent and Lipt'{a}k, Zsuzsanna},
doi = {10.4230/LIPIcs.CPM.2023.11},
isbn = {978-3-95977-276-1},
year = {2023},
date = {2023-06-01},
urldate = {2023-06-01},
booktitle = {34th Annual Symposium on Combinatorial Pattern Matching (CPM 2023)},
volume = {259},
pages = {11:1--11:20},
publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {An elastic-degenerate (ED) string T is a sequence of n sets T[1],…,T[n] containing m strings in total whose cumulative length is N. We call n, m, and N the length, the cardinality and the size of T, respectively. The language of T is defined as ℒ(T) = {S_1 ⋯ S_n : S_i ∈ T[i] for all i ∈ [1,n]}. ED strings have been introduced to represent a set of closely-related DNA sequences, also known as a pangenome. The basic question we investigate here is: Given two ED strings, how fast can we check whether the two languages they represent have a nonempty intersection? We call the underlying problem the ED String Intersection (EDSI) problem. For two ED strings T₁ and T₂ of lengths n₁ and n₂, cardinalities m₁ and m₂, and sizes N₁ and N₂, respectively, we show the following: - There is no 𝒪((N₁N₂)^{1-ε})-time algorithm, thus no 𝒪((N₁m₂+N₂m₁)^{1-ε})-time algorithm and no 𝒪((N₁n₂+N₂n₁)^{1-ε})-time algorithm, for any constant ε > 0, for EDSI even when T₁ and T₂ are over a binary alphabet, unless the Strong Exponential-Time Hypothesis is false. - There is no combinatorial 𝒪((N₁+N₂)^{1.2-ε}f(n₁,n₂))-time algorithm, for any constant ε > 0 and any function f, for EDSI even when T₁ and T₂ are over a binary alphabet, unless the Boolean Matrix Multiplication conjecture is false. - An 𝒪(N₁log N₁log n₁+N₂log N₂log n₂)-time algorithm for outputting a compact (RLE) representation of the intersection language of two unary ED strings. In the case when T₁ and T₂ are given in a compact representation, we show that the problem is NP-complete. - An 𝒪(N₁m₂+N₂m₁)-time algorithm for EDSI. - An Õ(N₁^{ω-1}n₂+N₂^{ω-1}n₁)-time algorithm for EDSI, where ω is the exponent of matrix multiplication; the Õ notation suppresses factors that are polylogarithmic in the input size. We also show that the techniques we develop have applications outside of ED string comparison.},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Ekim, Bariş; Sahlin, Kristoffer; Medvedev, Paul; Berger, Bonnie; Chikhi, Rayan
Efficient mapping of accurate long reads in minimizer space with mapquik Journal Article
In: Genome Research, 2023, ISSN: 1549-5469.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Ekim2023,
title = {Efficient mapping of accurate long reads in minimizer space with mapquik},
author = {Bariş Ekim and Kristoffer Sahlin and Paul Medvedev and Bonnie Berger and Rayan Chikhi},
doi = {10.1101/gr.277679.123},
issn = {1549-5469},
year = {2023},
date = {2023-06-01},
urldate = {2023-06-01},
journal = {Genome Research},
publisher = {Cold Spring Harbor Laboratory},
abstract = {DNA sequencing data continue to progress toward longer reads with increasingly lower sequencing error rates. We focus on the critical problem of mapping, or aligning, low-divergence sequences from long reads (e.g., Pacific Biosciences [PacBio] HiFi) to a reference genome, which poses challenges in terms of accuracy and computational resources when using cutting-edge read mapping approaches that are designed for all types of alignments. A natural idea would be to optimize efficiency with longer seeds to reduce the probability of extraneous matches; however, contiguous exact seeds quickly reach a sensitivity limit. We introduce mapquik, a novel strategy that creates accurate longer seeds by anchoring alignments through matches of k consecutively sampled minimizers (k-min-mers) and only indexing k-min-mers that occur once in the reference genome, thereby unlocking ultrafast mapping while retaining high sensitivity. We show that mapquik significantly accelerates the seeding and chaining steps-fundamental bottlenecks to read mapping-for both the human and maize genomes with >96% sensitivity and near-perfect specificity. On the human genome, for both real and simulated reads, mapquik achieves a 37x speedup over the state-of-the-art tool minimap2, and on the maize genome, mapquik achieves a 410x speedup over minimap2, making mapquik the fastest mapper to date. These accelerations are enabled from not only minimizer-space seeding but also a novel heuristic $O(n)$ pseudochaining algorithm, which improves upon the long-standing $O(nlog n)$ bound. Minimizer-space computation builds the foundation for achieving real-time analysis of long-read sequencing data.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Lemane, Téo; Lezzoche, Nolan; Lecubin, Julien; Pelletier, Eric; Lescot, Magali; Chikhi, Rayan; Peterlongo, Pierre
kmindex and ORA: indexing and real-time user-friendly queries in terabyte-sized complex genomic datasets Unpublished
bioRxiv, 2023.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@unpublished{Lemane2023,
title = {kmindex and ORA: indexing and real-time user-friendly queries in terabyte-sized complex genomic datasets},
author = {Téo Lemane and Nolan Lezzoche and Julien Lecubin and Eric Pelletier and Magali Lescot and Rayan Chikhi and Pierre Peterlongo},
url = {https://github.com/tlemane/kmindex
https://ocean-read-atlas.mio.osupytheas.fr/},
doi = {10.1101/2023.05.31.543043},
year = {2023},
date = {2023-06-01},
urldate = {2023-06-01},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Public sequencing databases contain vast amounts of biological information, yet they are largely underutilized as one cannot efficiently search them for any sequence(s) of interest. We present kmindex, an innovative approach that can index thousands of highly complex metagenomes and perform sequence searches in a fraction of a second. The index construction is an order of magnitude faster than previous methods, while search times are two orders of magnitude faster. With negligible false positive rates below 0.01%, kmindex outperforms the precision of existing approaches by four orders of magnitude. We demonstrate the scalability of kmindex by successfully indexing 1,393 complex marine seawater metagenome samples from the Tara Oceans project. Additionally, we introduce the publicly accessible web server “Ocean Read Atlas” (ORA) at https://ocean-read-atlas.mio.osupytheas.fr/, which enables real-time queries on the Tara Oceans dataset. The open-source kmindex software is available at https://github.com/tlemane/kmindex.},
howpublished = {bioRxiv},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {unpublished}
}
Ayad, Lorraine A K; Loukides, Grigorios; Pissis, Solon P
Text Indexing for Long Patterns: Anchors are All you Need Proceedings Article
In: pp. 2117–2131, Proceedings VLDB Endowment, 2023, ISSN: 2150-8097.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Ayad2023-vw,
title = {Text Indexing for Long Patterns: Anchors are All you Need},
author = {Lorraine A K Ayad and Grigorios Loukides and Solon P Pissis},
doi = {10.14778/3598581.3598586},
issn = {2150-8097},
year = {2023},
date = {2023-05-01},
urldate = {2023-05-01},
journal = {Proceedings VLDB Endowment},
volume = {16},
number = {9},
pages = {2117–2131},
publisher = {Proceedings VLDB Endowment},
abstract = {In many real-world database systems, a large fraction of the data is represented by strings: sequences of letters over some alphabet. This is because strings can easily encode data arising from different sources. It is often crucial to represent such string datasets in a compact form but also to simultaneously enable fast pattern matching queries. This is the classic text indexing problem. The four absolute measures anyone should pay attention to when designing or implementing a text index are: (i) index space; (ii) query time; (iii) construction space; and (iv) construction time. Unfortunately, however, most (if not all) widely-used indexes (e.g., suffix tree, suffix array, or their compressed counterparts) are not optimized for all four measures simultaneously, as it is difficult to have the best of all four worlds. Here, we take an important step in this direction by showing that text indexing with locally consistent anchors (lc-anchors) offers remarkably good performance in all four measures, when we have at hand a lower bound l on the length of the queried patterns --- which is arguably a quite reasonable assumption in practical applications. Specifically, we improve on the construction of the index proposed by Loukides and Pissis, which is based on bidirectional string anchors (bd-anchors), a new type of lc-anchors, by: (i) designing an average-case linear-time algorithm to compute bd-anchors; and (ii) developing a semi-external-memory implementation to construct the index in small space using near-optimal work. We then present an extensive experimental evaluation, based on the four measures, using real benchmark datasets. The results show that, for long patterns, the index constructed using our improved algorithms compares favorably to all classic indexes: (compressed) suffix tree; (compressed) suffix array; and the FM-index.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Forgia, Marco; Navarro, Beatriz; Daghino, Stefania; Cervera, Amelia; Gisel, Andreas; Perotto, Silvia; Aghayeva, Dilzara N.; Akinyuwa, Mary F.; Gobbi, Emanuela; Zheludev, Ivan N.; Edgar, Robert C.; Chikhi, Rayan; Turina, Massimo; Babaian, Artem; Serio, Francesco Di; Peña, Marcos
Hybrids of RNA viruses and viroid-like elements replicate in fungi Journal Article
In: Nature Communications, vol. 14, no. 1, pp. 2591, 2023, ISSN: 2041-1723.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Forgia2023,
title = {Hybrids of RNA viruses and viroid-like elements replicate in fungi},
author = {Marco Forgia and Beatriz Navarro and Stefania Daghino and Amelia Cervera and Andreas Gisel and Silvia Perotto and Dilzara N. Aghayeva and Mary F. Akinyuwa and Emanuela Gobbi and Ivan N. Zheludev and Robert C. Edgar and Rayan Chikhi and Massimo Turina and Artem Babaian and Francesco Di Serio and Marcos Peña},
doi = {10.1038/s41467-023-38301-2},
issn = {2041-1723},
year = {2023},
date = {2023-05-01},
urldate = {2023-05-01},
journal = {Nature Communications},
volume = {14},
number = {1},
pages = {2591},
publisher = {Springer Science and Business Media LLC},
abstract = {Earth’s life may have originated as self-replicating RNA, and it has been argued that RNA viruses and viroid-like elements are remnants of such pre-cellular RNA world. RNA viruses are defined by linear RNA genomes encoding an RNA-dependent RNA polymerase (RdRp), whereas viroid-like elements consist of small, single-stranded, circular RNA genomes that, in some cases, encode paired self-cleaving ribozymes. Here we show that the number of candidate viroid-like elements occurring in geographically and ecologically diverse niches is much higher than previously thought. We report that, amongst these circular genomes, fungal ambiviruses are viroid-like elements that undergo rolling circle replication and encode their own viral RdRp. Thus, ambiviruses are distinct infectious RNAs showing hybrid features of viroid-like RNAs and viruses. We also detected similar circular RNAs, containing active ribozymes and encoding RdRps, related to mitochondrial-like fungal viruses, highlighting fungi as an evolutionary hub for RNA viruses and viroid-like elements. Our findings point to a deep co-evolutionary history between RNA viruses and subviral elements and offer new perspectives in the origin and evolution of primordial infectious agents, and RNA life.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Mille, Marie; Ripoll, Julie; Cazaux, Bastien; Rivals, Eric
dipwmsearch: a Python package for searching di-PWM motifs Journal Article
In: Bioinformatics, vol. 39, no. 4, 2023, ISSN: 1367-4811.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Mille2023-mj,
title = {dipwmsearch: a Python package for searching di-PWM motifs},
author = {Marie Mille and Julie Ripoll and Bastien Cazaux and Eric Rivals},
url = {https://rivals.lirmm.net/dipwmsearch/},
doi = {10.1093/bioinformatics/btad141},
issn = {1367-4811},
year = {2023},
date = {2023-04-01},
urldate = {2023-04-01},
journal = {Bioinformatics},
volume = {39},
number = {4},
abstract = {Seeking probabilistic motifs in a sequence is a common task to annotate putative transcription factor binding sites or other RNA/DNA binding sites. Useful motif representations include position weight matrices (PWMs), dinucleotide PWMs (di-PWMs), and hidden Markov models (HMMs). Dinucleotide PWMs not only combine the simplicity of PWMs—a matrix form and a cumulative scoring function—but also incorporate dependency between adjacent positions in the motif (unlike PWMs which disregard any dependency). For instance to represent binding sites, the HOCOMOCO database provides di-PWM motifs derived from experimental data. Currently, two programs, SPRy-SARUS and MOODS, can search for occurrences of di-PWMs in sequences.
We propose a Python package called dipwmsearch, which provides an original and efficient algorithm for this task (it first enumerates matching words for the di-PWM, and then searches these all at once in the sequence, even if the latter contains IUPAC codes). The user benefits from an easy installation via Pypi or conda, a comprehensive documentation, and executable scripts that facilitate the use of di-PWMs.
dipwmsearch is available at https://pypi.org/project/dipwmsearch/ and https://gite.lirmm.fr/rivals/dipwmsearch/ under Cecill license.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
We propose a Python package called dipwmsearch, which provides an original and efficient algorithm for this task (it first enumerates matching words for the di-PWM, and then searches these all at once in the sequence, even if the latter contains IUPAC codes). The user benefits from an easy installation via Pypi or conda, a comprehensive documentation, and executable scripts that facilitate the use of di-PWMs.
dipwmsearch is available at https://pypi.org/project/dipwmsearch/ and https://gite.lirmm.fr/rivals/dipwmsearch/ under Cecill license.
Frouin, Arthur; Laporte, Fabien; Hafner, Lukas; Maury, Mylene; McCaw, Zachary R.; Julienne, Hanna; Henches, Léo; Chikhi, Rayan; Lecuit, Marc; Aschard, Hugues
ChoruMM: a versatile multi-components mixed model for bacterial-GWAS Unpublished
bioRxiv, 2023.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@unpublished{Frouin2023,
title = {ChoruMM: a versatile multi-components mixed model for bacterial-GWAS},
author = {Arthur Frouin and Fabien Laporte and Lukas Hafner and Mylene Maury and Zachary R. McCaw and Hanna Julienne and Léo Henches and Rayan Chikhi and Marc Lecuit and Hugues Aschard},
doi = {10.1101/2023.03.28.534531},
year = {2023},
date = {2023-03-01},
urldate = {2023-03-01},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Genome-wide Association Studies (GWAS) have been central to studying the genetics of complex human outcomes, and there is now tremendous interest in implementing GWAS-like approaches to study pathogenic bacteria. A variety of methods have been proposed to address the complex linkage structure of bacterial genomes, however, some questions remain about to optimize the genetic modelling of bacteria to decipher causal variations from correlated ones. Here we examined the genetic structure underlying whole-genome sequencing data from 3,824 Listeria monocytogenes strains, and demonstrate that the standard human genetics model, commonly assumed by existing bacterial GWAS methods, is inadequate for studying such highly structured organisms. We leverage these results to develop ChoruMM, a robust and powerful approach that consists of a multi-component linear mixed model, where components are inferred from a hierarchical clustering of the bacteria genetic relatedness matrix. Our ChoruMM approach also includes post-processing and visualization tools that address the pervasive long-range correlation observed in bacteria genome and allow to assess the type I error rate calibration.},
howpublished = {bioRxiv},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {unpublished}
}