Rizzo, Nicola; Equi, Massimo; Norri, Tuukka; Mäkinen, Veli

Elastic founder graphs improved and enhanced Journal Article

In: Theoretical Computer Science, vol. 982, no. 114269, pp. 114269, 2024, ISSN: 0304-3975.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Rizzo2024-jy,

title = {Elastic founder graphs improved and enhanced},

author = {Nicola Rizzo and Massimo Equi and Tuukka Norri and Veli Mäkinen},

doi = {10.1016/j.tcs.2023.114269},

issn = {0304-3975},

year = {2024},

date = {2024-01-01},

urldate = {2024-01-01},

journal = {Theoretical Computer Science},

volume = {982},

number = {114269},

pages = {114269},

publisher = {Elsevier BV},

abstract = {Indexing labeled graphs for pattern matching is a central challenge of pangenomics. Equi et al. (2022) [14] developed the Elastic Founder Graph (EFG) representing an alignment of m sequences of length n, drawn from alphabet Σ plus the special gap character: the paths spell the original sequences or their recombination. By enforcing the semi-repeat-free property, the EFG admits a polynomial-space index for linear-time pattern matching, breaking through the conditional lower bounds on indexing labeled graphs (Equi et al. [13]). In this work, we improve the space of the EFG index answering pattern matching queries in linear time, from linear in the length of all strings spelled by three consecutive node labels, to linear in the size of the edge labels. Then, we develop linear-time construction algorithms optimizing for different metrics: we improve the existing linearithmic construction algorithms to O(mn), by solving the novel exclusive ancestor set problem on trees; we propose, for the simplified gapless setting, an O(mn)-time solution minimizing the maximum block height, that we generalize by substituting block height with prefix-aware height. Finally, to show the versatility of the framework, we develop a BWT-based EFG index and study how to encode and perform document listing queries on a set of paths of the graphs, reporting which paths present a given pattern as a substring. We propose the EFG framework as an improved and enhanced version of the framework for the gapless setting, along with construction methods that are valid in any setting concerned with the segmentation of aligned sequences.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Benoit, Gaëtan; Raguideau, Sébastien; James, Robert; Phillippy, Adam M.; Chikhi, Rayan; Quince, Christopher

High-quality metagenome assembly from long accurate reads with metaMDBG Journal Article

In: Nature Biotechnology, 2024, ISSN: 1546-1696.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Benoit2024,

title = {High-quality metagenome assembly from long accurate reads with metaMDBG},

author = {Gaëtan Benoit and Sébastien Raguideau and Robert James and Adam M. Phillippy and Rayan Chikhi and Christopher Quince},

doi = {10.1038/s41587-023-01983-6},

issn = {1546-1696},

year = {2024},

date = {2024-01-01},

urldate = {2024-01-01},

journal = {Nature Biotechnology},

publisher = {Springer Science and Business Media LLC},

abstract = {We introduce metaMDBG, a metagenomics assembler for PacBio HiFi reads. MetaMDBG combines a de Bruijn graph assembly in a minimizer space with an iterative assembly over sequences of minimizers to address variations in genome coverage depth and an abundance-based filtering strategy to simplify strain complexity. For complex communities, we obtained up to twice as many high-quality circularized prokaryotic metagenome-assembled genomes as existing methods and had better recovery of viruses and plasmids.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Sauer, Lena M.; Cánovas, Rodrigo; Roche, Daniel Barry; Shams-Eldin, Hosam; Ravel, Patrice; Colinge, Jacques; Schwarz, Ralph T.; Mamoun, Choukri Ben; Rivals, Eric; Cornillot, Emmanuel

In: Malaria Journal, vol. 22, no. 1, pp. 27-46, 2023, ISBN: 1475-2875.

Abstract | Links | BibTeX | Tags: Misc

@article{Sauer2023,

title = {FT-GPI, a highly sensitive and accurate predictor of GPI-anchored proteins, reveals the composition and evolution of the GPI proteome in Plasmodium species},

author = {Lena M. Sauer and Rodrigo Cánovas and Daniel Barry Roche and Hosam Shams-Eldin and Patrice Ravel and Jacques Colinge and Ralph T. Schwarz and Choukri Ben Mamoun and Eric Rivals and Emmanuel Cornillot},

doi = {10.1186/s12936-022-04430-0},

isbn = {1475-2875},

year = {2023},

date = {2023-12-01},

urldate = {2023-12-01},

journal = {Malaria Journal},

volume = {22},

number = {1},

pages = {27-46},

publisher = {BioMed Central},

abstract = {Protozoan parasites are known to attach specific and diverse group of proteins to their plasma membrane via a GPI anchor. In malaria parasites, GPI-anchored proteins (GPI-APs) have been shown to play an important role in host–pathogen interactions and a key function in host cell invasion and immune evasion. Because of their immunogenic properties, some of these proteins have been considered as malaria vaccine candidates. However, identification of all possible GPI-APs encoded by these parasites remains challenging due to their sequence diversity and limitations of the tools used for their characterization.},

keywords = {Misc},

pubstate = {published},

tppubtype = {article}

}

Kang, Xiongbin; Xu, Jialu; Luo, Xiao; Schönhuth, Alexander

Hybrid-hybrid correction of errors in long reads with HERO Journal Article

In: Genome Biol., vol. 24, no. 1, pp. 275, 2023, ISBN: 1474-760X.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Kang2023-jk,

title = {Hybrid-hybrid correction of errors in long reads with HERO},

author = {Xiongbin Kang and Jialu Xu and Xiao Luo and Alexander Schönhuth},

doi = {10.1186/s13059-023-03112-7},

isbn = {1474-760X},

year = {2023},

date = {2023-12-01},

urldate = {2023-12-01},

journal = {Genome Biol.},

volume = {24},

number = {1},

pages = {275},

abstract = {Although generally superior, hybrid approaches for correcting errors in third-generation sequencing (TGS) reads, using next-generation sequencing (NGS) reads, mistake haplotype-specific variants for errors in polyploid and mixed samples. We suggest HERO, as the first “hybrid-hybrid”approach, to make use of both de Bruijn graphs and overlap graphs for optimal catering to the particular strengths of NGS and TGS reads. Extensive benchmarking experiments demonstrate that HERO improves indel and mismatch error rates by on average 65% (27~95%) and 20% (4~61%). Using HERO prior to genome assembly significantly improves the assemblies in the majority of the relevant categories.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Bernardini, Giulia; Fici, Gabriele; Gawrychowski, Paweł; Pissis, Solon P

Substring Complexity in Sublinear Space Proceedings Article

In: Iwata, Satoru; Kakimura, Naonori (Ed.): 34th International Symposium on Algorithms and Computation (ISAAC 2023), pp. 12:1–12:19, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2023, ISBN: 978-3-95977-289-1.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Bernardini2023-nr,

title = {Substring Complexity in Sublinear Space},

author = {Giulia Bernardini and Gabriele Fici and Paweł Gawrychowski and Solon P Pissis},

editor = {Iwata, Satoru and Kakimura, Naonori},

url = {https://arxiv.org/abs/2112.10376},

doi = {10.4230/LIPICS.ISAAC.2023.12},

isbn = {978-3-95977-289-1},

year = {2023},

date = {2023-11-01},

urldate = {2023-11-01},

booktitle = {34th International Symposium on Algorithms and Computation (ISAAC 2023)},

volume = {283},

pages = {12:1--12:19},

publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {Shannon’s entropy is a definitive lower bound for statistical compression. Unfortunately, no such clear measure exists for the compressibility of repetitive strings. Thus, ad hoc measures are employed to estimate the repetitiveness of strings, e.g., the size z of the Lempel–Ziv parse or the number r of equal-letter runs of the Burrows-Wheeler transform. A more recent one is the size γ of a smallest string attractor. Let T be a string of length n. A string attractor of T is a set of positions of T capturing the occurrences of all the substrings of T. Unfortunately, Kempa and Prezza [STOC 2018] showed that computing γ is NP-hard. Kociumaka et al. [LATIN 2020] considered a new measure of compressibility that is based on the function S_T(k) counting the number of distinct substrings of length k of T, also known as the substring complexity of T. This new measure is defined as δ = sup{S_T(k)/k, k ≥ 1} and lower bounds all the relevant ad hoc measures previously considered. In particular, δ ≤ γ always holds and δ can be computed in 𝒪(n) time using Θ(n) working space. Kociumaka et al. showed that one can construct an 𝒪(δ log n/(δ))-sized representation of T supporting efficient direct access and efficient pattern matching queries on T. Given that for highly compressible strings, δ is significantly smaller than n, it is natural to pose the following question: Can we compute δ efficiently using sublinear working space? It is straightforward to show that in the comparison model, any algorithm computing δ using 𝒪(b) space requires Ω(n^{2-o(1)}/b) time through a reduction from the element distinctness problem [Yao, SIAM J. Comput. 1994]. We thus wanted to investigate whether we can indeed match this lower bound. We address this algorithmic challenge by showing the following bounds to compute δ: - 𝒪((n³log b)/b²) time using 𝒪(b) space, for any b ∈ [1,n], in the comparison model. - 𝒪̃(n²/b) time using 𝒪̃(b) space, for any b ∈ [√n,n], in the word RAM model. This gives an 𝒪̃(n^{1+ε})-time and 𝒪̃(n^{1-ε})-space algorithm to compute δ, for any 0 < ε ≤ 1/2. Let us remark that our algorithms compute S_T(k), for all k, within the same complexities.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Loukides, Grigorios; Pissis, Solon P; Sweering, Michelle

Bidirectional string anchors for improved text indexing and top-$K$ similarity search Journal Article

In: IEEE Trans. Knowl. Data Eng., vol. 35, no. 11, pp. 11093–11111, 2023, ISBN: 1558-2191.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Loukides2023-qt,

title = {Bidirectional string anchors for improved text indexing and top-$K$ similarity search},

author = {Grigorios Loukides and Solon P Pissis and Michelle Sweering},

doi = {10.1109/TKDE.2022.3231780},

isbn = {1558-2191},

year = {2023},

date = {2023-11-01},

urldate = {2023-11-01},

journal = {IEEE Trans. Knowl. Data Eng.},

volume = {35},

number = {11},

pages = {11093–11111},

publisher = {Institute of Electrical and Electronics Engineers (IEEE)},

abstract = {The minimizers sampling mechanism is a popular mechanism for string sampling. However, minimizers sampling mechanisms lack good guarantees on the expected size of their samples for different combinations of their input parameters. Furthermore, indexes constructed over minimizers samples lack good worst-case guarantees for on-line pattern searches. In response, we propose bidirectional string anchors (bd-anchors), a new string sampling mechanism. Given an integer ℓ , our mechanism selects the lexicographically smallest rotation in every length-ℓ fragment. We show that, like minimizers samples, bd-anchors samples are approximately uniform, locally consistent, and computable in linear time. Furthermore, our experiments demonstrate that the bd-anchors sample sizes decrease proportionally to ℓ ; and that these sizes are competitive to or smaller than the minimizers sample sizes. We theoretically justify these results by analyzing the expected size of bd-anchors samples. We also prove that computing a total order on the input alphabet which minimizes the bd-anchors sample size is NP-hard. We next highlight the benefits of bd-anchors in two important applications: text indexing and top-K similarity search. For the first application, we develop an index for performing on-line pattern searches in near-optimal time, and show experimentally that a simple implementation of our index is consistently faster for on-line pattern searches than an analogous implementation of a minimizers-based index; we also show that it is substantially faster than two classic text indexes. For the second application, we develop a heuristic for top-K similarity search under edit distance, and show experimentally that it is generally as accurate as the state-of-the-art tool for the same purpose but more than one order of magnitude faster.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Romashchenko, Nikolai; Linard, Benjamin; Pardi, Fabio; Rivals, Eric

EPIK: Precise and scalable evolutionary placement with informative textitk-mers Journal Article

In: Bioinformatics, vol. 39, no. 12, 2023, ISSN: 1367-4811.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Romashchenko2023-kq,

title = {EPIK: Precise and scalable evolutionary placement with informative textitk-mers},

author = {Nikolai Romashchenko and Benjamin Linard and Fabio Pardi and Eric Rivals},

url = {https://github.com/phylo42/IPK

https://github.com/phylo42/EPIK},

doi = {10.1093/bioinformatics/btad692},

issn = {1367-4811},

year = {2023},

date = {2023-11-01},

urldate = {2023-11-01},

journal = {Bioinformatics},

volume = {39},

number = {12},

publisher = {Oxford University Press (OUP)},

abstract = {Phylogenetic placement enables phylogenetic analysis of massive collections of newly sequenced DNA, when de novo tree inference is too unreliable or inefficient. Assuming that a high-quality reference tree is available, the idea is to seek the correct placement of the new sequences in that tree. Recently, alignment-free approaches to phylogenetic placement have emerged, both to circumvent the need to align the new sequences and to avoid the calculations that typically follow the alignment step. A promising approach is based on the inference of k-mers that can be potentially related to the reference sequences, also called phylo-k-mers. However, its usage is limited by the time and memory-consuming stage of reference data preprocessing and the large numbers of k-mers to consider.

We suggest a filtering method for selecting informative phylo-k-mers based on mutual information, which can significantly improve the efficiency of placement, at the cost of a small loss in placement accuracy. This method is implemented in IPK, a new tool for computing phylo-k-mers that significantly outperforms the software previously available. We also present EPIK, a new software for phylogenetic placement, supporting filtered phylo-k-mer databases. Our experiments on real-world data show that EPIK is the fastest phylogenetic placement tool available, when placing hundreds of thousands and millions of queries while still providing accurate placements.

IPK and EPIK are freely available at https://github.com/phylo42/IPK and https://github.com/phylo42/EPIK. Both are implemented in C++ and Python and supported on Linux and MacOS.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

We suggest a filtering method for selecting informative phylo-k-mers based on mutual information, which can significantly improve the efficiency of placement, at the cost of a small loss in placement accuracy. This method is implemented in IPK, a new tool for computing phylo-k-mers that significantly outperforms the software previously available. We also present EPIK, a new software for phylogenetic placement, supporting filtered phylo-k-mer databases. Our experiments on real-world data show that EPIK is the fastest phylogenetic placement tool available, when placing hundreds of thousands and millions of queries while still providing accurate placements.

IPK and EPIK are freely available at https://github.com/phylo42/IPK and https://github.com/phylo42/EPIK. Both are implemented in C++ and Python and supported on Linux and MacOS.

Andreace, Francesco; Lechat, Pierre; Dufresne, Yoann; Chikhi, Rayan

Comparing methods for constructing and representing human pangenome graphs Journal Article

In: Genome Biology, vol. 24, no. 1, pp. 274, 2023, ISSN: 1474-760X.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Andreace2023,

title = {Comparing methods for constructing and representing human pangenome graphs},

author = {Francesco Andreace and Pierre Lechat and Yoann Dufresne and Rayan Chikhi},

doi = {10.1186/s13059-023-03098-2},

issn = {1474-760X},

year = {2023},

date = {2023-11-01},

urldate = {2023-11-01},

journal = {Genome Biology},

volume = {24},

number = {1},

pages = {274},

publisher = {Springer Science and Business Media LLC},

abstract = {As a single reference genome cannot possibly represent all the variation present across human individuals, pangenome graphs have been introduced to incorporate population diversity within a wide range of genomic analyses. Several data structures have been proposed for representing collections of genomes as pangenomes, in particular graphs.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

González, Camila Duitama; Rangavittal, Samarth; Vicedomini, Riccardo; Chikhi, Rayan; Richard, Hugues

aKmerBroom: Ancient oral DNA decontamination using Bloom filters on k-mer sets Journal Article

In: iScience, vol. 26, no. 11, pp. 108057, 2023, ISSN: 2589-0042.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{DuitamaGonzalez2023,

title = {aKmerBroom: Ancient oral DNA decontamination using Bloom filters on k-mer sets},

author = {Camila Duitama González and Samarth Rangavittal and Riccardo Vicedomini and Rayan Chikhi and Hugues Richard},

doi = {10.1016/j.isci.2023.108057},

issn = {2589-0042},

year = {2023},

date = {2023-11-01},

urldate = {2023-11-01},

journal = {iScience},

volume = {26},

number = {11},

pages = {108057},

publisher = {Elsevier BV},

abstract = {Dental calculus samples are modeled as a mixture of DNA coming from dental plaque and contaminants. Current computational decontamination methods such as Recentrifuge and DeconSeq require either a reference database or sequenced negative controls, and therefore have limited use cases. We present a reference-free decontamination tool tailored for the removal of contaminant DNA of ancient oral sample called aKmerBroom. Our tool builds a Bloom filter of known ancient and modern oral k-mers, then scans an input set of ancient metagenomic reads using multiple passes to iteratively retain reads likely to be of oral origin. On synthetic data, aKmerBroom achieves over 89.53% sensitivity and 94.00% specificity. On real datasets, aKmerBroom shows higher read retainment (+60% on average) than other methods. We anticipate aKmerBroom will be a valuable tool for the processing of ancient oral samples as it will prevent contaminated datasets from being completely discarded in downstream analyses.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Willink, Beatriz; Tunström, Kalle; Nilén, Sofie; Chikhi, Rayan; Lemane, Téo; Takahashi, Michihiko; Takahashi, Yuma; Svensson, Erik I.; Wheat, Christopher West

The genomics and evolution of inter-sexual mimicry and female-limited polymorphisms in damselflies Journal Article

In: Nature Ecology & Evolution, vol. 8, no. 1, pp. 83–97, 2023, ISSN: 2397-334X.

Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG, WP3: Translational CPG

@article{Willink2023,

title = {The genomics and evolution of inter-sexual mimicry and female-limited polymorphisms in damselflies},

author = {Beatriz Willink and Kalle Tunström and Sofie Nilén and Rayan Chikhi and Téo Lemane and Michihiko Takahashi and Yuma Takahashi and Erik I. Svensson and Christopher West Wheat},

doi = {10.1038/s41559-023-02243-1},

issn = {2397-334X},

year = {2023},

date = {2023-11-01},

urldate = {2023-11-01},

journal = {Nature Ecology & Evolution},

volume = {8},

number = {1},

pages = {83–97},

publisher = {Springer Science and Business Media LLC},

abstract = {Sex-limited morphs can provide profound insights into the evolution and genomic architecture of complex phenotypes. Inter-sexual mimicry is one particular type of sex-limited polymorphism in which a novel morph resembles the opposite sex. While inter-sexual mimics are known in both sexes and a diverse range of animals, their evolutionary origin is poorly understood. Here, we investigated the genomic basis of female-limited morphs and male mimicry in the common bluetail damselfly. Differential gene expression between morphs has been documented in damselflies, but no causal locus has been previously identified. We found that male mimicry originated in an ancestrally sexually dimorphic lineage in association with multiple structural changes, probably driven by transposable element activity. These changes resulted in ~900 kb of novel genomic content that is partly shared by male mimics in a close relative, indicating that male mimicry is a trans-species polymorphism. More recently, a third morph originated following the translocation of part of the male-mimicry sequence into a genomic position ~3.5 mb apart. We provide evidence of balancing selection maintaining male mimicry, in line with previous field population studies. Our results underscore how structural variants affecting a handful of potentially regulatory genes and morph-specific genes can give rise to novel and complex phenotypic polymorphisms.},

keywords = {WP2: Evolutionary/Comparative CPG, WP3: Translational CPG},

pubstate = {published},

tppubtype = {article}

}

Romashchenko, Nikolai; Linard, Benjamin; Rivals, Eric; Pardi, Fabio

Computing Phylo- k-mers Journal Article

In: IEEE/ACM Trans. Comput. Biol. Bioinform., vol. 20, no. 5, pp. 2889–2897, 2023, ISBN: 1557-9964.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Romashchenko2023-hj,

title = {Computing Phylo- k-mers},

author = {Nikolai Romashchenko and Benjamin Linard and Eric Rivals and Fabio Pardi},

url = {https://hal-lirmm.ccsd.cnrs.fr/lirmm-03778953v2},

doi = {10.1109/TCBB.2023.3278049},

isbn = {1557-9964},

year = {2023},

date = {2023-09-01},

urldate = {2023-09-01},

journal = {IEEE/ACM Trans. Comput. Biol. Bioinform.},

volume = {20},

number = {5},

pages = {2889–2897},

abstract = {Finding the correct position of new sequences within an established phylogenetic tree is an increasingly relevant problem in evolutionary bioinformatics and metagenomics. Recently, alignment-free approaches for this task have been proposed. One such approach is based on the concept of phylogenetically-informative k-mers or phylo- k-mers for short. In practice, phylo- k-mers are inferred from a set of related reference sequences and are equipped with scores expressing the probability of their appearance in different locations within the input reference phylogeny. Computing phylo- k-mers, however, represents a computational bottleneck to their applicability in real-world problems such as the phylogenetic analysis of metabarcoding reads and the detection of novel recombinant viruses. Here we consider the problem of phylo- k-mer computation: how can we efficiently find all k-mers whose probability lies above a given threshold for a given tree node? We describe and analyze algorithms for this problem, relying on branch-and-bound and divide-and-conquer techniques. We exploit the redundancy of adjacent windows of the alignment to save on computation. Besides computational complexity analyses, we provide an empirical evaluation of the relative performance of their implementations on simulated and real-world data. The divide-and-conquer algorithms are found to surpass the branch-and-bound approach, especially when many phylo- k-mers are found.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Cozzi, Davide; Rossi, Massimiliano; Rubinacci, Simone; Gagie, Travis; Köppl, Dominik; Boucher, Christina; Bonizzoni, Paola

μ- PBWT: a lightweight r-indexing of the PBWT for storing and querying UK Biobank data Journal Article

In: Bioinformatics, vol. 39, no. 9, 2023, ISSN: 1367-4811.

Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG

@article{Cozzi2023-bx,

title = {μ- PBWT: a lightweight r-indexing of the PBWT for storing and querying UK Biobank data},

author = {Davide Cozzi and Massimiliano Rossi and Simone Rubinacci and Travis Gagie and Dominik Köppl and Christina Boucher and Paola Bonizzoni},

url = {https://github.com/dlcgold/muPBWT},

doi = {10.1093/bioinformatics/btad552},

issn = {1367-4811},

year = {2023},

date = {2023-09-01},

urldate = {2023-09-01},

journal = {Bioinformatics},

volume = {39},

number = {9},

publisher = {Oxford University Press (OUP)},

abstract = {The Positional Burrows–Wheeler Transform (PBWT) is a data structure that indexes haplotype sequences in a manner that enables finding maximal haplotype matches in h sequences containing w variation sites in O(hw) time. This represents a significant improvement over classical quadratic-time approaches. However, the original PBWT data structure does not allow for queries over Biobank panels that consist of several millions of haplotypes, if an index of the haplotypes must be kept entirely in memory.

In this article, we leverage the notion of r-index proposed for the BWT to present a memory-efficient method for constructing and storing the run-length encoded PBWT, and computing set maximal matches (SMEMs) queries in haplotype sequences. We implement our method, which we refer to as μ-PBWT, and evaluate it on datasets of 1000 Genome Project and UK Biobank data. Our experiments demonstrate that the μ-PBWT reduces the memory usage up to a factor of 20% compared to the best current PBWT-based indexing. In particular, μ-PBWT produces an index that stores high-coverage whole genome sequencing data of chromosome 20 in about a third of the space of its BCF file. μ-PBWT is an adaptation of techniques for the run-length compressed BWT for the PBWT (RLPBWT) and it is based on keeping in memory only a succinct representation of the RLPBWT that still allows the efficient computation of set maximal matches (SMEMs) over the original panel.

Our implementation is open source and available at https://github.com/dlcgold/muPBWT. The binary is available at https://bioconda.github.io/recipes/mupbwt/README.html.},

keywords = {WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {article}

}

In this article, we leverage the notion of r-index proposed for the BWT to present a memory-efficient method for constructing and storing the run-length encoded PBWT, and computing set maximal matches (SMEMs) queries in haplotype sequences. We implement our method, which we refer to as μ-PBWT, and evaluate it on datasets of 1000 Genome Project and UK Biobank data. Our experiments demonstrate that the μ-PBWT reduces the memory usage up to a factor of 20% compared to the best current PBWT-based indexing. In particular, μ-PBWT produces an index that stores high-coverage whole genome sequencing data of chromosome 20 in about a third of the space of its BCF file. μ-PBWT is an adaptation of techniques for the run-length compressed BWT for the PBWT (RLPBWT) and it is based on keeping in memory only a succinct representation of the RLPBWT that still allows the efficient computation of set maximal matches (SMEMs) over the original panel.

Our implementation is open source and available at https://github.com/dlcgold/muPBWT. The binary is available at https://bioconda.github.io/recipes/mupbwt/README.html.

Bonnet, Konstantinn; Marschall, Tobias; Doerr, Daniel

Constructing founder sets under allelic and non-allelic homologous recombination Journal Article

In: Algorithms Mol. Biol., vol. 18, no. 1, pp. 15, 2023, ISBN: 1748-7188.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Bonnet2023-gs,

title = {Constructing founder sets under allelic and non-allelic homologous recombination},

author = {Konstantinn Bonnet and Tobias Marschall and Daniel Doerr},

doi = {10.1186/s13015-023-00241-3},

isbn = {1748-7188},

year = {2023},

date = {2023-09-01},

urldate = {2023-09-01},

journal = {Algorithms Mol. Biol.},

volume = {18},

number = {1},

pages = {15},

abstract = {Homologous recombination between the maternal and paternal copies of a chromosome is a key mechanism for human inheritance and shapes population genetic properties of our species. However, a similar mechanism can also act between different copies of the same sequence, then called non-allelic homologous recombination (NAHR). This process can result in genomic rearrangements—including deletion, duplication, and inversion—and is underlying many genomic disorders. Despite its importance for genome evolution and disease, there is a lack of computational models to study genomic loci prone to NAHR. In this work, we propose such a computational model, providing a unified framework for both (allelic) homologous recombination and NAHR. Our model represents a set of genomes as a graph, where haplotypes correspond to walks through this graph. We formulate two founder set problems under our recombination model, provide flow-based algorithms for their solution, describe exact methods to characterize the number of recombinations, and demonstrate scalability to problem instances arising in practice.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Ayad, Lorraine A K; Chikhi, Rayan; Pissis, Solon P

Seedability: optimizing alignment parameters for sensitive sequence comparison Journal Article

In: Bioinform. Adv., vol. 3, no. 1, 2023, ISSN: 2635-0041.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG

@article{Ayad2023-li,

title = {Seedability: optimizing alignment parameters for sensitive sequence comparison},

author = {Lorraine A K Ayad and Rayan Chikhi and Solon P Pissis},

url = {https://github.com/lorrainea/Seedability},

doi = {10.1093/bioadv/vbad108},

issn = {2635-0041},

year = {2023},

date = {2023-08-01},

urldate = {2023-08-01},

journal = {Bioinform. Adv.},

volume = {3},

number = {1},

publisher = {Oxford University Press (OUP)},

abstract = {Most sequence alignment techniques make use of exact k-mer hits, called seeds, as anchors to optimize alignment speed. A large number of bioinformatics tools employing seed-based alignment techniques, such as Minimap2, use a single value of k per sequencing technology, without a strong guarantee that this is the best possible value. Given the ubiquity of sequence alignment, identifying values of k that lead to more sensitive alignments is thus an important task. To aid this, we present Seedability, a seed-based alignment framework designed for estimating an optimal seed k-mer length (as well as a minimal number of shared seeds) based on a given alignment identity threshold. In particular, we were motivated to make Minimap2 more sensitive in the pairwise alignment of short sequences.

The experimental results herein show improved alignments of short and divergent sequences when using the parameter values determined by Seedability in comparison to the default values of Minimap2. We also show several cases of pairs of real divergent sequences, where the default parameter values of Minimap2 yield no output alignments, but the values output by Seedability produce plausible alignments.

https://github.com/lorrainea/Seedability (distributed under GPL v3.0).},

keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {article}

}

The experimental results herein show improved alignments of short and divergent sequences when using the parameter values determined by Seedability in comparison to the default values of Minimap2. We also show several cases of pairs of real divergent sequences, where the default parameter values of Minimap2 yield no output alignments, but the values output by Seedability produce plausible alignments.

https://github.com/lorrainea/Seedability (distributed under GPL v3.0).

Rizzo, Nicola; Cáceres, Manuel; Mäkinen, Veli

Finding Maximal Exact Matches in Graphs Proceedings Article

In: Belazzougui, Djamal; Ouangraoua, A"ida (Ed.): 23rd International Workshop on Algorithms in Bioinformatics (WABI 2023), pp. 10:1–10:17, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2023, ISBN: 978-3-95977-294-5.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Rizzo2023-hg,

title = {Finding Maximal Exact Matches in Graphs},

author = {Nicola Rizzo and Manuel Cáceres and Veli Mäkinen},

editor = {Belazzougui, Djamal and Ouangraoua, A"{i}da},

doi = {10.4230/LIPIcs.WABI.2023.10},

isbn = {978-3-95977-294-5},

year = {2023},

date = {2023-08-01},

urldate = {2023-08-01},

booktitle = {23rd International Workshop on Algorithms in Bioinformatics (WABI 2023)},

volume = {273},

pages = {10:1--10:17},

publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {We study the problem of finding maximal exact matches (MEMs) between a query string Q and a labeled graph G. MEMs are an important class of seeds, often used in seed-chain-extend type of practical alignment methods because of their strong connections to classical metrics. A principled way to speed up chaining is to limit the number of MEMs by considering only MEMs of length at least κ (κ-MEMs). However, on arbitrary input graphs, the problem of finding MEMs cannot be solved in truly sub-quadratic time under SETH (Equi et al., ICALP 2019) even on acyclic graphs. In this paper we show an O(n⋅ L ⋅ d^{L-1} + m + M_{κ,L})-time algorithm finding all κ-MEMs between Q and G spanning exactly L nodes in G, where n is the total length of node labels, d is the maximum degree of a node in G, m = |Q|, and M_{κ,L} is the number of output MEMs. We use this algorithm to develop a κ-MEM finding solution on indexable Elastic Founder Graphs (Equi et al., Algorithmica 2022) running in time O(nH² + m + M_κ), where H is the maximum number of nodes in a block, and M_κ is the total number of κ-MEMs. Our results generalize to the analysis of multiple query strings (MEMs between G and any of the strings). Additionally, we provide some preliminary experimental results showing that the number of graph MEMs is an order of magnitude smaller than the number of string MEMs of the corresponding concatenated collection.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Lee, Sewon; Kim, Gyuri; Karin, Eli Levy; Mirdita, Milot; Park, Sukhwan; Chikhi, Rayan; Babaian, Artem; Kryshtafovych, Andriy; Steinegger, Martin

Petascale Homology Search for Structure Prediction Unpublished

bioRxiv, 2023.

Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG

@unpublished{Lee2023,

title = {Petascale Homology Search for Structure Prediction},

author = {Sewon Lee and Gyuri Kim and Eli Levy Karin and Milot Mirdita and Sukhwan Park and Rayan Chikhi and Artem Babaian and Andriy Kryshtafovych and Martin Steinegger},

doi = {10.1101/2023.07.10.548308},

year = {2023},

date = {2023-07-01},

urldate = {2023-07-01},

publisher = {Cold Spring Harbor Laboratory},

abstract = {The recent CASP15 competition highlighted the critical role of multiple sequence alignments (MSAs) in protein structure prediction, as demonstrated by the success of the top AlphaFold2-based prediction methods. To push the boundaries of MSA utilization, we conducted a petabase-scale search of the Sequence Read Archive (SRA), resulting in gigabytes of aligned homologs for CASP15 targets. These were merged with default MSAs produced by ColabFold-search and provided to ColabFold-predict. By using SRA data, we achieved highly accurate predictions (GDT_TS > 70) for 66% of the non-easy targets, whereas using ColabFold-search default MSAs scored highly in only 52%. Next, we tested the effect of deep homology search and ColabFold’s advanced features, such as more recycles, on prediction accuracy. While SRA homologs were most significant for improving ColabFold’s CASP15 ranking from 11th to 3rd place, other strategies contributed too. We analyze these in the context of existing strategies to improve prediction.},

howpublished = {bioRxiv},

keywords = {WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {unpublished}

}

Loukides, Grigorios; Pissis, Solon P.; Thankachan, Sharma V.; Zuba, Wiktor

Suffix-prefix queries on a dictionary Proceedings Article

In: Bulteau, Laurent; Lipt'ak, Zsuzsanna (Ed.): 34th Annual Symposium on Combinatorial Pattern Matching (CPM 2023), pp. 21:1–21:20, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2023, ISBN: 978-3-95977-276-1.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Loukides2023-rk,

title = {Suffix-prefix queries on a dictionary},

author = {Grigorios Loukides and Solon P. Pissis and Sharma V. Thankachan and Wiktor Zuba},

editor = {Bulteau, Laurent and Lipt'{a}k, Zsuzsanna},

doi = {10.4230/LIPIcs.CPM.2023.21},

isbn = {978-3-95977-276-1},

year = {2023},

date = {2023-06-01},

urldate = {2023-06-01},

booktitle = {34th Annual Symposium on Combinatorial Pattern Matching (CPM 2023)},

volume = {259},

pages = {21:1--21:20},

publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {In the all-pairs suffix-prefix (APSP) problem, we are given a dictionary R of k strings, S_1,…,S_k, of total length n, and we are asked to find the length SPL_{i,j} of the longest string that is both a suffix of S_i and a prefix of S_j, for all i,j ∈ [1,k]. APSP is a classic problem in string algorithms with many applications in bioinformatics. When all strings of the dictionary are over an integer alphabet of size σ ≤ n^𝒪(1), APSP can be solved in the optimal 𝒪(n+k²) time with the use of the generalized suffix tree of the dictionary [Gusfield et al., Inf. Process. Lett. 1992]. In many bioinformatics applications, such as in sequence assembly, the size k of dictionary R is very large. In particular, k² usually dominates n, and thus the k² factor is the bottleneck both in the time and in the space complexity of such applications. We thus initiate a holistic study on several data structure variants of APSP. In particular, we consider the following types of queries: - One-to-One(i,j): output SPL_{i,j}. - One-to-All(i): output SPL_{i,j} for every j ∈ [1,k]. - Report(i,𝓁): output all distinct j ∈ [1,k] such that SPL_{i,j} ≥ 𝓁, where 𝓁 ≥ 0 is an integer. - Count(i,𝓁): output the number of distinct j ∈ [1,k] such that SPL_{i,j} ≥ 𝓁, where 𝓁 ≥ 0 is an integer. - Top(i,K): output K distinct j ∈ [1,k] with the highest values of SPL_{i,j} breaking ties arbitrarily. We assume the standard word RAM model of computation with word size w = Ω(log n) and an integer alphabet of size σ ≤ n^𝒪(1). We show the following upper bounds: Query | Space (words) | Query time | Note One-to-One(i,j) | 𝒪(n) | 𝒪(log log k) | Theorem 11 One-to-All(i) | 𝒪(n) | 𝒪(k) | Theorem 14 Report(i,𝓁) | 𝒪(n) | 𝒪(log n/log log n+output) | Theorem 19(i) Count(i,𝓁) | 𝒪(n) | 𝒪(log n/log log n) | Theorem 19(ii) Top(i,K) | 𝒪(n) | 𝒪(log² n/log log n+K) | Theorem 22 We also present efficient algorithms for constructing these data structures.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Gabory, Esteban; Mwaniki, Moses Njagi; Pisanti, Nadia; Pissis, Solon P; Radoszewski, Jakub; Sweering, Michelle; Zuba, Wiktor

Comparing elastic-degenerate strings: Algorithms, lower bounds, and applications Proceedings Article

In: Bulteau, Laurent; Lipt'ak, Zsuzsanna (Ed.): 34th Annual Symposium on Combinatorial Pattern Matching (CPM 2023), pp. 11:1–11:20, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2023, ISBN: 978-3-95977-276-1.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG

@inproceedings{Gabory2023-vh,

title = {Comparing elastic-degenerate strings: Algorithms, lower bounds, and applications},

author = {Esteban Gabory and Moses Njagi Mwaniki and Nadia Pisanti and Solon P Pissis and Jakub Radoszewski and Michelle Sweering and Wiktor Zuba},

editor = {Bulteau, Laurent and Lipt'{a}k, Zsuzsanna},

doi = {10.4230/LIPIcs.CPM.2023.11},

isbn = {978-3-95977-276-1},

year = {2023},

date = {2023-06-01},

urldate = {2023-06-01},

booktitle = {34th Annual Symposium on Combinatorial Pattern Matching (CPM 2023)},

volume = {259},

pages = {11:1--11:20},

publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {An elastic-degenerate (ED) string T is a sequence of n sets T[1],…,T[n] containing m strings in total whose cumulative length is N. We call n, m, and N the length, the cardinality and the size of T, respectively. The language of T is defined as ℒ(T) = {S_1 ⋯ S_n : S_i ∈ T[i] for all i ∈ [1,n]}. ED strings have been introduced to represent a set of closely-related DNA sequences, also known as a pangenome. The basic question we investigate here is: Given two ED strings, how fast can we check whether the two languages they represent have a nonempty intersection? We call the underlying problem the ED String Intersection (EDSI) problem. For two ED strings T₁ and T₂ of lengths n₁ and n₂, cardinalities m₁ and m₂, and sizes N₁ and N₂, respectively, we show the following: - There is no 𝒪((N₁N₂)^{1-ε})-time algorithm, thus no 𝒪((N₁m₂+N₂m₁)^{1-ε})-time algorithm and no 𝒪((N₁n₂+N₂n₁)^{1-ε})-time algorithm, for any constant ε > 0, for EDSI even when T₁ and T₂ are over a binary alphabet, unless the Strong Exponential-Time Hypothesis is false. - There is no combinatorial 𝒪((N₁+N₂)^{1.2-ε}f(n₁,n₂))-time algorithm, for any constant ε > 0 and any function f, for EDSI even when T₁ and T₂ are over a binary alphabet, unless the Boolean Matrix Multiplication conjecture is false. - An 𝒪(N₁log N₁log n₁+N₂log N₂log n₂)-time algorithm for outputting a compact (RLE) representation of the intersection language of two unary ED strings. In the case when T₁ and T₂ are given in a compact representation, we show that the problem is NP-complete. - An 𝒪(N₁m₂+N₂m₁)-time algorithm for EDSI. - An Õ(N₁^{ω-1}n₂+N₂^{ω-1}n₁)-time algorithm for EDSI, where ω is the exponent of matrix multiplication; the Õ notation suppresses factors that are polylogarithmic in the input size. We also show that the techniques we develop have applications outside of ED string comparison.},

keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Sládeček, Tomáš; Gažiová, Michaela; Kucharík, Marcel; Zaťková, Andrea; Pös, Zuzana; Pös, Ondrej; Krampl, Werner; Tomková, Erika; Hýblová, Michaela; Minárik, Gabriel; Radvánszky, Ján; Budiš, Jaroslav; Szemes, Tomáš

In: Sci. Rep., vol. 13, no. 1, pp. 10531, 2023, ISBN: 2045-2322.

Abstract | Links | BibTeX | Tags: WP3: Translational CPG

@article{Sladecek2023-oq,

title = {Combination of expert guidelines-based and machine learning-based approaches leads to superior accuracy of automated prediction of clinical effect of copy number variations},

author = {Tomáš Sládeček and Michaela Gažiová and Marcel Kucharík and Andrea Zaťková and Zuzana Pös and Ondrej Pös and Werner Krampl and Erika Tomková and Michaela Hýblová and Gabriel Minárik and Ján Radvánszky and Jaroslav Budiš and Tomáš Szemes},

url = {https://predict.genovisio.com/},

doi = {10.1038/s41598-023-37352-1},

isbn = {2045-2322},

year = {2023},

date = {2023-06-01},

urldate = {2023-06-01},

journal = {Sci. Rep.},

volume = {13},

number = {1},

pages = {10531},

abstract = {Clinical interpretation of copy number variants (CNVs) is a complex process that requires skilled clinical professionals. General recommendations have been recently released to guide the CNV interpretation based on predefined criteria to uniform the decision process. Several semiautomatic computational methods have been proposed to recommend appropriate choices, relieving clinicians of tedious searching in vast genomic databases. We have developed and evaluated such a tool called MarCNV and tested it on CNV records collected from the ClinVar database. Alternatively, the emerging machine learning-based tools, such as the recently published ISV (Interpretation of Structural Variants), showed promising ways of even fully automated predictions using broader characterization of affected genomic elements. Such tools utilize features additional to ACMG criteria, thus providing supporting evidence and the potential to improve CNV classification. Since both approaches contribute to evaluation of CNVs clinical impact, we propose a combined solution in the form of a decision support tool based on automated ACMG guidelines (MarCNV) supplemented by a machine learning-based pathogenicity prediction (ISV) for the classification of CNVs. We provide evidence that such a combined approach is able to reduce the number of uncertain classifications and reveal potentially incorrect classifications using automated guidelines. CNV interpretation using MarCNV, ISV, and combined approach is available for non-commercial use at https://predict.genovisio.com/.},

keywords = {WP3: Translational CPG},

pubstate = {published},

tppubtype = {article}

}

Ekim, Bariş; Sahlin, Kristoffer; Medvedev, Paul; Berger, Bonnie; Chikhi, Rayan

Efficient mapping of accurate long reads in minimizer space with mapquik Journal Article

In: Genome Research, 2023, ISSN: 1549-5469.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Ekim2023,

title = {Efficient mapping of accurate long reads in minimizer space with mapquik},

author = {Bariş Ekim and Kristoffer Sahlin and Paul Medvedev and Bonnie Berger and Rayan Chikhi},

doi = {10.1101/gr.277679.123},

issn = {1549-5469},

year = {2023},

date = {2023-06-01},

urldate = {2023-06-01},

journal = {Genome Research},

publisher = {Cold Spring Harbor Laboratory},

abstract = {DNA sequencing data continue to progress toward longer reads with increasingly lower sequencing error rates. We focus on the critical problem of mapping, or aligning, low-divergence sequences from long reads (e.g., Pacific Biosciences [PacBio] HiFi) to a reference genome, which poses challenges in terms of accuracy and computational resources when using cutting-edge read mapping approaches that are designed for all types of alignments. A natural idea would be to optimize efficiency with longer seeds to reduce the probability of extraneous matches; however, contiguous exact seeds quickly reach a sensitivity limit. We introduce mapquik, a novel strategy that creates accurate longer seeds by anchoring alignments through matches of k consecutively sampled minimizers (k-min-mers) and only indexing k-min-mers that occur once in the reference genome, thereby unlocking ultrafast mapping while retaining high sensitivity. We show that mapquik significantly accelerates the seeding and chaining steps-fundamental bottlenecks to read mapping-for both the human and maize genomes with >96% sensitivity and near-perfect specificity. On the human genome, for both real and simulated reads, mapquik achieves a 37x speedup over the state-of-the-art tool minimap2, and on the maize genome, mapquik achieves a 410x speedup over minimap2, making mapquik the fastest mapper to date. These accelerations are enabled from not only minimizer-space seeding but also a novel heuristic $O(n)$ pseudochaining algorithm, which improves upon the long-standing $O(nlog n)$ bound. Minimizer-space computation builds the foundation for achieving real-time analysis of long-read sequencing data.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}