Lemane, Téo; Lezzoche, Nolan; Lecubin, Julien; Pelletier, Eric; Lescot, Magali; Chikhi, Rayan; Peterlongo, Pierre
kmindex and ORA: indexing and real-time user-friendly queries in terabyte-sized complex genomic datasets Unpublished
bioRxiv, 2023.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@comment{Lemane2023: url holds the software repository only (the field must contain a single URL); the Ocean Read Atlas web server link is carried in note, which the unpublished type requires.}
@unpublished{Lemane2023,
title = {kmindex and ORA: indexing and real-time user-friendly queries in terabyte-sized complex genomic datasets},
author = {Téo Lemane and Nolan Lezzoche and Julien Lecubin and Eric Pelletier and Magali Lescot and Rayan Chikhi and Pierre Peterlongo},
url = {https://github.com/tlemane/kmindex},
doi = {10.1101/2023.05.31.543043},
year = {2023},
date = {2023-06-01},
urldate = {2023-06-01},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Public sequencing databases contain vast amounts of biological information, yet they are largely underutilized as one cannot efficiently search them for any sequence(s) of interest. We present kmindex, an innovative approach that can index thousands of highly complex metagenomes and perform sequence searches in a fraction of a second. The index construction is an order of magnitude faster than previous methods, while search times are two orders of magnitude faster. With negligible false positive rates below 0.01%, kmindex outperforms the precision of existing approaches by four orders of magnitude. We demonstrate the scalability of kmindex by successfully indexing 1,393 complex marine seawater metagenome samples from the Tara Oceans project. Additionally, we introduce the publicly accessible web server “Ocean Read Atlas” (ORA) at https://ocean-read-atlas.mio.osupytheas.fr/, which enables real-time queries on the Tara Oceans dataset. The open-source kmindex software is available at https://github.com/tlemane/kmindex.},
howpublished = {bioRxiv},
note = {bioRxiv preprint. Ocean Read Atlas web server: https://ocean-read-atlas.mio.osupytheas.fr/},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {unpublished}
}
Ayad, Lorraine A K; Loukides, Grigorios; Pissis, Solon P
Text Indexing for Long Patterns: Anchors are All you Need Proceedings Article
In: pp. 2117–2131, Proceedings VLDB Endowment, 2023, ISSN: 2150-8097.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@comment{Ayad2023-vw: PVLDB is a journal-style proceedings series, so journal, volume and number are kept; booktitle is supplied because the inproceedings type requires it.}
@inproceedings{Ayad2023-vw,
title = {Text Indexing for Long Patterns: Anchors are All you Need},
author = {Lorraine A K Ayad and Grigorios Loukides and Solon P Pissis},
doi = {10.14778/3598581.3598586},
issn = {2150-8097},
year = {2023},
date = {2023-05-01},
urldate = {2023-05-01},
booktitle = {Proceedings of the VLDB Endowment},
journal = {Proceedings VLDB Endowment},
volume = {16},
number = {9},
pages = {2117--2131},
publisher = {VLDB Endowment},
abstract = {In many real-world database systems, a large fraction of the data is represented by strings: sequences of letters over some alphabet. This is because strings can easily encode data arising from different sources. It is often crucial to represent such string datasets in a compact form but also to simultaneously enable fast pattern matching queries. This is the classic text indexing problem. The four absolute measures anyone should pay attention to when designing or implementing a text index are: (i) index space; (ii) query time; (iii) construction space; and (iv) construction time. Unfortunately, however, most (if not all) widely-used indexes (e.g., suffix tree, suffix array, or their compressed counterparts) are not optimized for all four measures simultaneously, as it is difficult to have the best of all four worlds. Here, we take an important step in this direction by showing that text indexing with locally consistent anchors (lc-anchors) offers remarkably good performance in all four measures, when we have at hand a lower bound l on the length of the queried patterns --- which is arguably a quite reasonable assumption in practical applications. Specifically, we improve on the construction of the index proposed by Loukides and Pissis, which is based on bidirectional string anchors (bd-anchors), a new type of lc-anchors, by: (i) designing an average-case linear-time algorithm to compute bd-anchors; and (ii) developing a semi-external-memory implementation to construct the index in small space using near-optimal work. We then present an extensive experimental evaluation, based on the four measures, using real benchmark datasets. The results show that, for long patterns, the index constructed using our improved algorithms compares favorably to all classic indexes: (compressed) suffix tree; (compressed) suffix array; and the FM-index.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Liao, Wen-Wei; Asri, Mobin; Ebler, Jana; Doerr, Daniel; Haukness, Marina; Hickey, Glenn; Lu, Shuangjia; Lucas, Julian K; Monlong, Jean; Abel, Haley J; Buonaiuto, Silvia; Chang, Xian H; Cheng, Haoyu; Chu, Justin; Colonna, Vincenza; Eizenga, Jordan M; Feng, Xiaowen; Fischer, Christian; Fulton, Robert S; Garg, Shilpa; Groza, Cristian; Guarracino, Andrea; Harvey, William T; Heumos, Simon; Howe, Kerstin; Jain, Miten; Lu, Tsung-Yu; Markello, Charles; Martin, Fergal J; Mitchell, Matthew W; Munson, Katherine M; Mwaniki, Moses Njagi; Novak, Adam M; Olsen, Hugh E; Pesout, Trevor; Porubsky, David; Prins, Pjotr; Sibbesen, Jonas A; Sirén, Jouni; Tomlinson, Chad; Villani, Flavia; Vollger, Mitchell R; Antonacci-Fulton, Lucinda L; Baid, Gunjan; Baker, Carl A; Belyaeva, Anastasiya; Billis, Konstantinos; Carroll, Andrew; Chang, Pi-Chuan; Cody, Sarah; Cook, Daniel E; Cook-Deegan, Robert M; Cornejo, Omar E; Diekhans, Mark; Ebert, Peter; Fairley, Susan; Fedrigo, Olivier; Felsenfeld, Adam L; Formenti, Giulio; Frankish, Adam; Gao, Yan; Garrison, Nanibaa' A; Giron, Carlos Garcia; Green, Richard E; Haggerty, Leanne; Hoekzema, Kendra; Hourlier, Thibaut; Ji, Hanlee P; Kenny, Eimear E; Koenig, Barbara A; Kolesnikov, Alexey; Korbel, Jan O; Kordosky, Jennifer; Koren, Sergey; Lee, Hojoon; Lewis, Alexandra P; Magalhães, Hugo; Marco-Sola, Santiago; Marijon, Pierre; McCartney, Ann; McDaniel, Jennifer; Mountcastle, Jacquelyn; Nattestad, Maria; Nurk, Sergey; Olson, Nathan D; Popejoy, Alice B; Puiu, Daniela; Rautiainen, Mikko; Regier, Allison A; Rhie, Arang; Sacco, Samuel; Sanders, Ashley D; Schneider, Valerie A; Schultz, Baergen I; Shafin, Kishwar; Smith, Michael W; Sofia, Heidi J; Tayoun, Ahmad N Abou; Thibaud-Nissen, Françoise; Tricomi, Francesca Floriana; Wagner, Justin; Walenz, Brian; Wood, Jonathan M D; Zimin, Aleksey V; Bourque, Guillaume; Chaisson, Mark J P; Flicek, Paul; Phillippy, Adam M; Zook, Justin M; Eichler, Evan E; Haussler, David; Wang, Ting; Jarvis, Erich D; Miga, Karen H; Garrison, 
Erik; Marschall, Tobias; Hall, Ira M; Li, Heng; Paten, Benedict
A draft human pangenome reference Journal Article
In: Nature, vol. 617, no. 7960, pp. 312–324, 2023, ISBN: 1476-4687.
Abstract | Links | BibTeX | Tags: WP3: Translational CPG
@comment{Liao2023-do: 1476-4687 is a serial identifier (8 digits), so it is stored in issn rather than isbn.}
@article{Liao2023-do,
title = {A draft human pangenome reference},
author = {Wen-Wei Liao and Mobin Asri and Jana Ebler and Daniel Doerr and Marina Haukness and Glenn Hickey and Shuangjia Lu and Julian K Lucas and Jean Monlong and Haley J Abel and Silvia Buonaiuto and Xian H Chang and Haoyu Cheng and Justin Chu and Vincenza Colonna and Jordan M Eizenga and Xiaowen Feng and Christian Fischer and Robert S Fulton and Shilpa Garg and Cristian Groza and Andrea Guarracino and William T Harvey and Simon Heumos and Kerstin Howe and Miten Jain and Tsung-Yu Lu and Charles Markello and Fergal J Martin and Matthew W Mitchell and Katherine M Munson and Moses Njagi Mwaniki and Adam M Novak and Hugh E Olsen and Trevor Pesout and David Porubsky and Pjotr Prins and Jonas A Sibbesen and Jouni Sirén and Chad Tomlinson and Flavia Villani and Mitchell R Vollger and Lucinda L Antonacci-Fulton and Gunjan Baid and Carl A Baker and Anastasiya Belyaeva and Konstantinos Billis and Andrew Carroll and Pi-Chuan Chang and Sarah Cody and Daniel E Cook and Robert M Cook-Deegan and Omar E Cornejo and Mark Diekhans and Peter Ebert and Susan Fairley and Olivier Fedrigo and Adam L Felsenfeld and Giulio Formenti and Adam Frankish and Yan Gao and Nanibaa' A Garrison and Carlos Garcia Giron and Richard E Green and Leanne Haggerty and Kendra Hoekzema and Thibaut Hourlier and Hanlee P Ji and Eimear E Kenny and Barbara A Koenig and Alexey Kolesnikov and Jan O Korbel and Jennifer Kordosky and Sergey Koren and Hojoon Lee and Alexandra P Lewis and Hugo Magalhães and Santiago Marco-Sola and Pierre Marijon and Ann McCartney and Jennifer McDaniel and Jacquelyn Mountcastle and Maria Nattestad and Sergey Nurk and Nathan D Olson and Alice B Popejoy and Daniela Puiu and Mikko Rautiainen and Allison A Regier and Arang Rhie and Samuel Sacco and Ashley D Sanders and Valerie A Schneider and Baergen I Schultz and Kishwar Shafin and Michael W Smith and Heidi J Sofia and Ahmad N Abou Tayoun and Françoise Thibaud-Nissen and Francesca Floriana Tricomi and Justin Wagner and Brian Walenz and 
Jonathan M D Wood and Aleksey V Zimin and Guillaume Bourque and Mark J P Chaisson and Paul Flicek and Adam M Phillippy and Justin M Zook and Evan E Eichler and David Haussler and Ting Wang and Erich D Jarvis and Karen H Miga and Erik Garrison and Tobias Marschall and Ira M Hall and Heng Li and Benedict Paten},
doi = {10.1038/s41586-023-05896-x},
issn = {1476-4687},
year = {2023},
date = {2023-05-01},
urldate = {2023-05-01},
journal = {Nature},
volume = {617},
number = {7960},
pages = {312--324},
publisher = {Springer Science and Business Media LLC},
abstract = {Here the Human Pangenome Reference Consortium presents a first draft of the human pangenome reference. The pangenome contains 47 phased, diploid assemblies from a cohort of genetically diverse individuals. These assemblies cover more than 99% of the expected sequence in each genome and are more than 99% accurate at the structural and base pair levels. Based on alignments of the assemblies, we generate a draft pangenome that captures known variants and haplotypes and reveals new alleles at structurally complex loci. We also add 119 million base pairs of euchromatic polymorphic sequences and 1,115 gene duplications relative to the existing reference GRCh38. Roughly 90 million of the additional base pairs are derived from structural variation. Using our draft pangenome to analyse short-read data reduced small variant discovery errors by 34% and increased the number of structural variants detected per haplotype by 104% compared with GRCh38-based workflows, which enabled the typing of the vast majority of structural variant alleles per sample.},
keywords = {WP3: Translational CPG},
pubstate = {published},
tppubtype = {article}
}
Forgia, Marco; Navarro, Beatriz; Daghino, Stefania; Cervera, Amelia; Gisel, Andreas; Perotto, Silvia; Aghayeva, Dilzara N.; Akinyuwa, Mary F.; Gobbi, Emanuela; Zheludev, Ivan N.; Edgar, Robert C.; Chikhi, Rayan; Turina, Massimo; Babaian, Artem; Serio, Francesco Di; Peña, Marcos
Hybrids of RNA viruses and viroid-like elements replicate in fungi Journal Article
In: Nature Communications, vol. 14, no. 1, pp. 2591, 2023, ISSN: 2041-1723.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@comment{Forgia2023: Di Serio is written in Last, First form because a capitalised particle in First Last order is parsed as a given name, which yields the surname Serio.}
@article{Forgia2023,
title = {Hybrids of RNA viruses and viroid-like elements replicate in fungi},
author = {Marco Forgia and Beatriz Navarro and Stefania Daghino and Amelia Cervera and Andreas Gisel and Silvia Perotto and Dilzara N. Aghayeva and Mary F. Akinyuwa and Emanuela Gobbi and Ivan N. Zheludev and Robert C. Edgar and Rayan Chikhi and Massimo Turina and Artem Babaian and Di Serio, Francesco and Marcos Peña},
doi = {10.1038/s41467-023-38301-2},
issn = {2041-1723},
year = {2023},
date = {2023-05-01},
urldate = {2023-05-01},
journal = {Nature Communications},
volume = {14},
number = {1},
pages = {2591},
publisher = {Springer Science and Business Media LLC},
abstract = {Earth’s life may have originated as self-replicating RNA, and it has been argued that RNA viruses and viroid-like elements are remnants of such pre-cellular RNA world. RNA viruses are defined by linear RNA genomes encoding an RNA-dependent RNA polymerase (RdRp), whereas viroid-like elements consist of small, single-stranded, circular RNA genomes that, in some cases, encode paired self-cleaving ribozymes. Here we show that the number of candidate viroid-like elements occurring in geographically and ecologically diverse niches is much higher than previously thought. We report that, amongst these circular genomes, fungal ambiviruses are viroid-like elements that undergo rolling circle replication and encode their own viral RdRp. Thus, ambiviruses are distinct infectious RNAs showing hybrid features of viroid-like RNAs and viruses. We also detected similar circular RNAs, containing active ribozymes and encoding RdRps, related to mitochondrial-like fungal viruses, highlighting fungi as an evolutionary hub for RNA viruses and viroid-like elements. Our findings point to a deep co-evolutionary history between RNA viruses and subviral elements and offer new perspectives in the origin and evolution of primordial infectious agents, and RNA life.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Mille, Marie; Ripoll, Julie; Cazaux, Bastien; Rivals, Eric
dipwmsearch: a Python package for searching di-PWM motifs Journal Article
In: Bioinformatics, vol. 39, no. 4, 2023, ISSN: 1367-4811.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@comment{Mille2023-mj: the journal uses article numbers instead of page ranges; pages carries the article number taken from the DOI suffix.}
@article{Mille2023-mj,
title = {dipwmsearch: a Python package for searching di-PWM motifs},
author = {Marie Mille and Julie Ripoll and Bastien Cazaux and Eric Rivals},
url = {https://rivals.lirmm.net/dipwmsearch/},
doi = {10.1093/bioinformatics/btad141},
issn = {1367-4811},
year = {2023},
date = {2023-04-01},
urldate = {2023-04-01},
journal = {Bioinformatics},
volume = {39},
number = {4},
pages = {btad141},
abstract = {Seeking probabilistic motifs in a sequence is a common task to annotate putative transcription factor binding sites or other RNA/DNA binding sites. Useful motif representations include position weight matrices (PWMs), dinucleotide PWMs (di-PWMs), and hidden Markov models (HMMs). Dinucleotide PWMs not only combine the simplicity of PWMs—a matrix form and a cumulative scoring function—but also incorporate dependency between adjacent positions in the motif (unlike PWMs which disregard any dependency). For instance to represent binding sites, the HOCOMOCO database provides di-PWM motifs derived from experimental data. Currently, two programs, SPRy-SARUS and MOODS, can search for occurrences of di-PWMs in sequences.
We propose a Python package called dipwmsearch, which provides an original and efficient algorithm for this task (it first enumerates matching words for the di-PWM, and then searches these all at once in the sequence, even if the latter contains IUPAC codes). The user benefits from an easy installation via Pypi or conda, a comprehensive documentation, and executable scripts that facilitate the use of di-PWMs.
dipwmsearch is available at https://pypi.org/project/dipwmsearch/ and https://gite.lirmm.fr/rivals/dipwmsearch/ under Cecill license.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
We propose a Python package called dipwmsearch, which provides an original and efficient algorithm for this task (it first enumerates matching words for the di-PWM, and then searches these all at once in the sequence, even if the latter contains IUPAC codes). The user benefits from an easy installation via Pypi or conda, a comprehensive documentation, and executable scripts that facilitate the use of di-PWMs.
dipwmsearch is available at https://pypi.org/project/dipwmsearch/ and https://gite.lirmm.fr/rivals/dipwmsearch/ under Cecill license.
Denti, Luca; Khorsand, Parsoa; Bonizzoni, Paola; Hormozdiari, Fereydoun; Chikhi, Rayan
SVDSS: structural variation discovery in hard-to-call genomic regions using sample-specific strings from accurate long reads Journal Article
In: Nat. Methods, vol. 20, no. 4, pp. 550–558, 2023, ISBN: 1548-7105.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@comment{Denti2023-xa: 1548-7105 is a serial identifier, so it is stored in issn; the journal name is stored in full so that styles can abbreviate it.}
@article{Denti2023-xa,
title = {SVDSS: structural variation discovery in hard-to-call genomic regions using sample-specific strings from accurate long reads},
author = {Luca Denti and Parsoa Khorsand and Paola Bonizzoni and Fereydoun Hormozdiari and Rayan Chikhi},
doi = {10.1038/s41592-022-01674-1},
issn = {1548-7105},
year = {2023},
date = {2023-04-01},
urldate = {2023-04-01},
journal = {Nature Methods},
volume = {20},
number = {4},
pages = {550--558},
publisher = {Springer Science and Business Media LLC},
abstract = {Structural variants (SVs) account for a large amount of sequence variability across genomes and play an important role in human genomics and precision medicine. Despite intense efforts over the years, the discovery of SVs in individuals remains challenging due to the diploid and highly repetitive structure of the human genome, and by the presence of SVs that vastly exceed sequencing read lengths. However, the recent introduction of low-error long-read sequencing technologies such as PacBio HiFi may finally enable these barriers to be overcome. Here we present SV discovery with sample-specific strings (SVDSS)—a method for discovery of SVs from long-read sequencing technologies (for example, PacBio HiFi) that combines and effectively leverages mapping-free, mapping-based and assembly-based methodologies for overall superior SV discovery performance. Our experiments on several human samples show that SVDSS outperforms state-of-the-art mapping-based methods for discovery of insertion and deletion SVs in PacBio HiFi reads and achieves notable improvements in calling SVs in repetitive regions of the genome.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Porubsky, David; Vollger, Mitchell R; Harvey, William T; Rozanski, Allison N; Ebert, Peter; Hickey, Glenn; Hasenfeld, Patrick; Sanders, Ashley D; Stober, Catherine; Consortium, Human Pangenome Reference; Korbel, Jan O; Paten, Benedict; Marschall, Tobias; Eichler, Evan E
Gaps and complex structurally variant loci in phased genome assemblies Journal Article
In: Genome Res., vol. 33, no. 4, pp. 496–510, 2023, ISSN: 1549-5469.
Abstract | Links | BibTeX | Tags: WP3: Translational CPG
@comment{Porubsky2023-ue: the consortium is a corporate author and is wrapped in an extra pair of braces so that it is kept as one indivisible name instead of being split into given names and the surname Consortium.}
@article{Porubsky2023-ue,
title = {Gaps and complex structurally variant loci in phased genome assemblies},
author = {David Porubsky and Mitchell R Vollger and William T Harvey and Allison N Rozanski and Peter Ebert and Glenn Hickey and Patrick Hasenfeld and Ashley D Sanders and Catherine Stober and {Human Pangenome Reference Consortium} and Jan O Korbel and Benedict Paten and Tobias Marschall and Evan E Eichler},
doi = {10.1101/gr.277334.122},
issn = {1549-5469},
year = {2023},
date = {2023-04-01},
urldate = {2023-04-01},
journal = {Genome Research},
volume = {33},
number = {4},
pages = {496--510},
abstract = {There has been tremendous progress in phased genome assembly production by combining long-read data with parental information or linked-read data. Nevertheless, a typical phased genome assembly generated by trio-hifiasm still generates more than 140 gaps. We perform a detailed analysis of gaps, assembly breaks, and misorientations from 182 haploid assemblies obtained from a diversity panel of 77 unique human samples. Although trio-based approaches using HiFi are the current gold standard, chromosome-wide phasing accuracy is comparable when using Strand-seq instead of parental data. Importantly, the majority of assembly gaps cluster near the largest and most identical repeats (including segmental duplications [35.4%], satellite DNA [22.3%], or regions enriched in GA/AT-rich DNA [27.4%]). Consequently, 1513 protein-coding genes overlap assembly gaps in at least one haplotype, and 231 are recurrently disrupted or missing from five or more haplotypes. Furthermore, we estimate that 6-7 Mbp of DNA are misorientated per haplotype irrespective of whether trio-free or trio-based approaches are used. Of these misorientations, 81% correspond to bona fide large inversion polymorphisms in the human species, most of which are flanked by large segmental duplications. We also identify large-scale alignment discontinuities consistent with 11.9 Mbp of deletions and 161.4 Mbp of insertions per haploid genome. Although 99% of this variation corresponds to satellite DNA, we identify 230 regions of euchromatic DNA with frequent expansions and contractions, nearly half of which overlap with 197 protein-coding genes. Such variable and incompletely assembled regions are important targets for future algorithmic development and pangenome representation.},
keywords = {WP3: Translational CPG},
pubstate = {published},
tppubtype = {article}
}
Břinda, Karel; Lima, Leandro; Pignotti, Simone; Quinones-Olvera, Natalia; Salikhov, Kamil; Chikhi, Rayan; Kucherov, Gregory; Iqbal, Zamin; Baym, Michael
Efficient and Robust Search of Microbial Genomes via Phylogenetic Compression Unpublished
bioRxiv, 2023.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@comment{Brinda2023: the unpublished type requires a note field; it states the preprint server already named in howpublished.}
@unpublished{Brinda2023,
title = {Efficient and Robust Search of Microbial Genomes via Phylogenetic Compression},
author = {Karel Břinda and Leandro Lima and Simone Pignotti and Natalia Quinones-Olvera and Kamil Salikhov and Rayan Chikhi and Gregory Kucherov and Zamin Iqbal and Michael Baym},
doi = {10.1101/2023.04.15.536996},
year = {2023},
date = {2023-04-01},
urldate = {2023-04-01},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Comprehensive collections approaching millions of sequenced genomes have become central information sources in the life sciences. However, the rapid growth of these collections makes it effectively impossible to search these data using tools such as BLAST and its successors. Here, we present a technique called phylogenetic compression, which uses evolutionary history to guide compression and efficiently search large collections of microbial genomes using existing algorithms and data structures. We show that, when applied to modern diverse collections approaching millions of genomes, lossless phylogenetic compression improves the compression ratios of assemblies, de Bruijn graphs, and k-mer indexes by one to two orders of magnitude. Additionally, we develop a pipeline for a BLAST-like search over these phylogeny-compressed reference data, and demonstrate it can align genes, plasmids, or entire sequencing experiments against all sequenced bacteria until 2019 on ordinary desktop computers within a few hours. Phylogenetic compression has broad applications in computational biology and may provide a fundamental design principle for future genomics infrastructure.},
howpublished = {bioRxiv},
note = {bioRxiv preprint},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {unpublished}
}
Carr, Victoria R; Pissis, Solon P; Mullany, Peter; Shoaie, Saeed; Gomez-Cabrero, David; Moyes, David L
Palidis: fast discovery of novel insertion sequences Journal Article
In: Microb. Genom., vol. 9, no. 3, 2023, ISSN: 2057-5858.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@comment{Carr2023-za: the journal name is stored in full so that the bibliography style decides whether to abbreviate it.}
@article{Carr2023-za,
title = {Palidis: fast discovery of novel insertion sequences},
author = {Victoria R Carr and Solon P Pissis and Peter Mullany and Saeed Shoaie and David Gomez-Cabrero and David L Moyes},
doi = {10.1099/mgen.0.000917},
issn = {2057-5858},
year = {2023},
date = {2023-03-01},
urldate = {2023-03-01},
journal = {Microbial Genomics},
volume = {9},
number = {3},
abstract = {The diversity of microbial insertion sequences, crucial mobile genetic elements in generating diversity in microbial genomes, needs to be better represented in current microbial databases. Identification of these sequences in microbiome communities presents some significant problems that have led to their underrepresentation. Here, we present a bioinformatics pipeline called Palidis that recognizes insertion sequences in metagenomic sequence data rapidly by identifying inverted terminal repeat regions from mixed microbial community genomes. Applying Palidis to 264 human metagenomes identifies 879 unique insertion sequences, with 519 being novel and not previously characterized. Querying this catalogue against a large database of isolate genomes reveals evidence of horizontal gene transfer events across bacterial classes. We will continue to apply this tool more widely, building the Insertion Sequence Catalogue, a valuable resource for researchers wishing to query their microbial genomes for insertion sequences.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Frouin, Arthur; Laporte, Fabien; Hafner, Lukas; Maury, Mylene; McCaw, Zachary R.; Julienne, Hanna; Henches, Léo; Chikhi, Rayan; Lecuit, Marc; Aschard, Hugues
ChoruMM: a versatile multi-components mixed model for bacterial-GWAS Unpublished
bioRxiv, 2023.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@comment{Frouin2023: the unpublished type requires a note field; it states the preprint server already named in howpublished.}
@unpublished{Frouin2023,
title = {ChoruMM: a versatile multi-components mixed model for bacterial-GWAS},
author = {Arthur Frouin and Fabien Laporte and Lukas Hafner and Mylene Maury and Zachary R. McCaw and Hanna Julienne and Léo Henches and Rayan Chikhi and Marc Lecuit and Hugues Aschard},
doi = {10.1101/2023.03.28.534531},
year = {2023},
date = {2023-03-01},
urldate = {2023-03-01},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Genome-wide Association Studies (GWAS) have been central to studying the genetics of complex human outcomes, and there is now tremendous interest in implementing GWAS-like approaches to study pathogenic bacteria. A variety of methods have been proposed to address the complex linkage structure of bacterial genomes, however, some questions remain about to optimize the genetic modelling of bacteria to decipher causal variations from correlated ones. Here we examined the genetic structure underlying whole-genome sequencing data from 3,824 Listeria monocytogenes strains, and demonstrate that the standard human genetics model, commonly assumed by existing bacterial GWAS methods, is inadequate for studying such highly structured organisms. We leverage these results to develop ChoruMM, a robust and powerful approach that consists of a multi-component linear mixed model, where components are inferred from a hierarchical clustering of the bacteria genetic relatedness matrix. Our ChoruMM approach also includes post-processing and visualization tools that address the pervasive long-range correlation observed in bacteria genome and allow to assess the type I error rate calibration.},
howpublished = {bioRxiv},
note = {bioRxiv preprint},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {unpublished}
}
Luo, Xiao; Kang, Xiongbin; Schönhuth, Alexander
Predicting the prevalence of complex genetic diseases from individual genotype profiles using capsule networks Journal Article
In: Nat. Mach. Intell., vol. 5, no. 2, pp. 114–125, 2023, ISBN: 2522-5839.
Abstract | Links | BibTeX | Tags: WP3: Translational CPG
@comment{Luo2023-le: 2522-5839 is a serial identifier, so it is stored in issn; the journal name is stored in full so that styles can abbreviate it.}
@article{Luo2023-le,
title = {Predicting the prevalence of complex genetic diseases from individual genotype profiles using capsule networks},
author = {Xiao Luo and Xiongbin Kang and Alexander Schönhuth},
doi = {10.1038/s42256-022-00604-2},
issn = {2522-5839},
year = {2023},
date = {2023-02-01},
urldate = {2023-02-01},
journal = {Nature Machine Intelligence},
volume = {5},
number = {2},
pages = {114--125},
publisher = {Springer Science and Business Media LLC},
abstract = {Diseases that have a complex genetic architecture tend to suffer from considerable amounts of genetic variants that, although playing a role in the disease, have not yet been revealed as such. Two major causes for this phenomenon are genetic variants that do not stack up effects, but interact in complex ways; in addition, as recently suggested, the omnigenic model postulates that variants interact in a holistic manner to establish disease phenotypes. Here we present DiseaseCapsule, as a capsule-network-based approach that explicitly addresses to capture the hierarchical structure of the underlying genome data, and has the potential to fully capture the non-linear relationships between variants and disease. DiseaseCapsule is the first such approach to operate in a whole-genome manner when predicting disease occurrence from individual genotype profiles. In experiments, we evaluated DiseaseCapsule on amyotrophic lateral sclerosis (ALS) and Parkinson’s disease, with a particular emphasis on ALS, which is known to have a complex genetic architecture and is affected by 40% missing heritability. On ALS, DiseaseCapsule achieves 86.9% accuracy on hold-out test data in predicting disease occurrence, thereby outperforming all other approaches by large margins. Also, DiseaseCapsule required sufficiently less training data for reaching optimal performance. Last but not least, the systematic exploitation of the network architecture yielded 922 genes of particular interest, and 644 ‘non-additive’ genes that are crucial factors in DiseaseCapsule, but remain masked within linear schemes.},
keywords = {WP3: Translational CPG},
pubstate = {published},
tppubtype = {article}
}
Mwaniki, Njagi Moses; Pisanti, Nadia
Optimal Sequence Alignment to ED-Strings Proceedings Article
In: Bansal, Mukul S.; Cai, Zhipeng; Mangul, Serghei (Ed.): Bioinformatics Research and Applications, pp. 204–216, Springer Nature, Germany, 2023, ISSN: 0302-9743.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@comment{Mwaniki2022b: volume is the number within the book series named in series; issn is the ISSN of that series.}
@inproceedings{Mwaniki2022b,
title = {Optimal Sequence Alignment to ED-Strings},
author = {Njagi Moses Mwaniki and Nadia Pisanti},
editor = {Mukul S. Bansal and Zhipeng Cai and Serghei Mangul},
doi = {10.1007/978-3-031-23198-8_19},
issn = {0302-9743},
year = {2023},
date = {2023-01-01},
urldate = {2022-01-01},
booktitle = {Bioinformatics Research and Applications},
volume = {13760},
pages = {204--216},
publisher = {Springer Nature},
address = {Germany},
series = {Lecture Notes in Computer Science},
abstract = {Partial Order Alignment (POA) was introduced by Lee et al. in 2002 to allow the alignment of a string to a graph-like structure representing a set of aligned strings (a Multiple Sequence Alignment, MSA). However, the POA edit transcript (the sequence of edit operations that describe the alignment) does not reflect the possible elasticity of the MSA (different gaps sizes in the aligned string), leaving room for possible misalignment and its propagation in progressive MSA. Elastic-Degenerate Strings (ED-strings) are strings that can represent the outcome of an MSA by highlighting gaps and variants as a list of strings that can differ in size and that can possibly include the empty string. In this paper, we define a method that optimally aligns a string to an ED-string, the latter compactly representing an MSA, overcoming the ambiguity in the POA edit transcript while maintaining its time and space complexity.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Rivals, Eric; Sweering, Michelle; Wang, Pengfei
Convergence of the number of period sets in strings Proceedings Article
In: Etessami, Kousha; Feige, Uriel; Puppis, Gabriele (Ed.): 50th International Colloquium on Automata, Languages, and Programming (ICALP 2023), pp. 100:1–100:14, Schloss Dagstuhl -- Leibniz-Zentrum f{"u}r Informatik, Dagstuhl, Germany, 2023, ISBN: 978-3-95977-278-5.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@comment{Rivals2022: LaTeX control sequences in publisher and abstract are written with their backslashes; the accent is given as a brace group so BibTeX treats it as a single letter.}
@inproceedings{Rivals2022,
title = {Convergence of the number of period sets in strings},
author = {Eric Rivals and Michelle Sweering and Pengfei Wang},
editor = {Kousha Etessami and Uriel Feige and Gabriele Puppis},
doi = {10.4230/LIPIcs.ICALP.2023.100},
isbn = {978-3-95977-278-5},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
booktitle = {50th International Colloquium on Automata, Languages, and Programming (ICALP 2023)},
volume = {261},
pages = {100:1--100:14},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum f{\"u}r Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {Consider words of length $n$. The set of all periods of a word of length $n$ is a subset of $\{0, 1, 2, \ldots , n-1\}$. However, any subset of $\{0, 1, 2, \ldots , n-1\}$ is not necessarily a valid set of periods. In a seminal paper in 1981, Guibas and Odlyzko have proposed to encode the set of periods of a word into an $n$ long binary string, called an autocorrelation, where a one at position $i$ denotes a period of $i$. They considered the question of recognizing a valid period set, and also studied the number of valid period sets for length $n$, denoted $\kappa_n$. They conjectured that $\ln(\kappa_n)$ asymptotically converges to a constant times $\ln^2(n)$. If improved lower bounds for $\ln(\kappa_n)/\ln^2(n)$ were proposed in 2001, the question of a tight upper bound has remained opened since Guibas and Odlyzko’s paper. Here, we exhibit an upper bound for this fraction, which implies its convergence and closes this long standing conjecture. Moreover, we extend our result to find similar bounds for the number of correlations: a generalization of autocorrelations which encodes the overlaps between two strings.},
howpublished = {arXiv},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Rizzo, Nicola; Cáceres, Manuel; Mäkinen, Veli
Chaining of maximal exact matches in graphs Proceedings Article
In: Nardini, Franco Maria; Pisanti, Nadia; Venturini, Rossano (Ed.): String Processing and Information Retrieval, pp. 353–366, Springer Nature Switzerland, Cham, 2023, ISBN: 978-3-031-43980-3.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@inproceedings{Rizzo2023-ym,
title = {Chaining of maximal exact matches in graphs},
author = {Nicola Rizzo and Manuel Cáceres and Veli Mäkinen},
editor = {Nardini, Franco Maria and Pisanti, Nadia and Venturini, Rossano},
doi = {10.1007/978-3-031-43980-3_29},
isbn = {978-3-031-43980-3},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
booktitle = {String Processing and Information Retrieval},
volume = {14240},
pages = {353--366},
publisher = {Springer Nature Switzerland},
address = {Cham},
series = {Lecture Notes in Computer Science},
abstract = {We show how to chain maximal exact matches (MEMs) between a query string Q and a labeled directed acyclic graph (DAG) $G=(V,E)$ to solve the longest common subsequence (LCS) problem between Q and G. We obtain our result via a new symmetric formulation of chaining in DAGs that we solve in $O(m+n+k^2|V| + |E| + kN\log N)$ time, where $m=|Q|$, n is the total length of node labels, k is the minimum number of paths covering the nodes of G and N is the number of MEMs between Q and node labels, which we show encode full MEMs.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Sitarčík, Jozef; Vinař, Tomáš; Brejová, Broňa; Krampl, Werner; Budiš, Jaroslav; Radvánszky, Ján; Lucká, Mária
WarpSTR: determining tandem repeat lengths using raw nanopore signals Journal Article
In: Bioinformatics, vol. 39, no. 6, pp. btad388, 2023, ISSN: 1367-4811.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{10.1093/bioinformatics/btad388,
title = {{WarpSTR}: determining tandem repeat lengths using raw nanopore signals},
author = {Jozef Sitarčík and Tomáš Vinař and Broňa Brejová and Werner Krampl and Jaroslav Budiš and Ján Radvánszky and Mária Lucká},
url = {https://github.com/fmfi-compbio/warpstr},
doi = {10.1093/bioinformatics/btad388},
issn = {1367-4811},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
journal = {Bioinformatics},
volume = {39},
number = {6},
pages = {btad388},
abstract = {Short tandem repeats (STRs) are regions of a genome containing many consecutive copies of the same short motif, possibly with small variations. Analysis of STRs has many clinical uses but is limited by technology mainly due to STRs surpassing the used read length. Nanopore sequencing, as one of long-read sequencing technologies, produces very long reads, thus offering more possibilities to study and analyze STRs. Basecalling of nanopore reads is however particularly unreliable in repeating regions, and therefore direct analysis from raw nanopore data is required. Here, we present WarpSTR, a novel method for characterizing both simple and complex tandem repeats directly from raw nanopore signals using a finite-state automaton and a search algorithm analogous to dynamic time warping. By applying this approach to determine the lengths of 241 STRs, we demonstrate that our approach decreases the mean absolute error of the STR length estimate compared to basecalling and STRique. WarpSTR is freely available at https://github.com/fmfi-compbio/warpstr},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
González, Camila Duitama; Vicedomini, Riccardo; Lemane, Téo; Rascovan, Nicolas; Richard, Hugues; Chikhi, Rayan
decOM: Similarity-based microbial source tracking of ancient oral samples using k-mer-based methods Unpublished
bioRxiv, 2023.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@unpublished{Gonzalez2023,
title = {{decOM}: Similarity-based microbial source tracking of ancient oral samples using k-mer-based methods},
author = {Camila Duitama González and Riccardo Vicedomini and Téo Lemane and Nicolas Rascovan and Hugues Richard and Rayan Chikhi},
doi = {10.1101/2023.01.26.525439},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Background: The analysis of ancient oral metagenomes from archaeological human and animal samples is largely confounded by contaminant DNA sequences from modern and environmental sources. Existing methods for Microbial Source Tracking (MST) estimate the proportions of environmental sources, but do not perform well on ancient metagenomes. We developed a novel method called decOM for Microbial Source Tracking and classification of ancient and modern metagenomic samples using k-mer matrices.
Results: We analysed a collection of 360 ancient oral, modern oral, sediment/soil and skin metagenomes, using stratified five-fold cross-validation. decOM estimates the contributions of these source environments in ancient oral metagenomic samples with high accuracy, outperforming two state-of-the-art methods for source tracking, FEAST and mSourceTracker.
Conclusions: decOM is a high-accuracy microbial source tracking method, suitable for ancient oral metagenomic data sets. The decOM method is generic and could also be adapted for MST of other ancient and modern types of metagenomes. We anticipate that decOM will be a valuable tool for MST of ancient metagenomic studies.},
howpublished = {bioRxiv},
note = {bioRxiv preprint},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {unpublished}
}
Results: We analysed a collection of 360 ancient oral, modern oral, sediment/soil and skin metagenomes, using stratified five-fold cross-validation. decOM estimates the contributions of these source environments in ancient oral metagenomic samples with high accuracy, outperforming two state-of-the-art methods for source tracking, FEAST and mSourceTracker.
Conclusions: decOM is a high-accuracy microbial source tracking method, suitable for ancient oral metagenomic data sets. The decOM method is generic and could also be adapted for MST of other ancient and modern types of metagenomes. We anticipate that decOM will be a valuable tool for MST of ancient metagenomic studies.
Blassel, Luc; Medvedev, Paul; Chikhi, Rayan
Mapping-friendly sequence reductions: Going beyond homopolymer compression Journal Article
In: iScience, vol. 25, no. 11, pp. 105305, 2022, ISSN: 2589-0042.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Blassel2022,
  author    = {Blassel, Luc and Medvedev, Paul and Chikhi, Rayan},
  title     = {Mapping-friendly sequence reductions: Going beyond homopolymer compression},
  journal   = {iScience},
  volume    = {25},
  number    = {11},
  pages     = {105305},
  year      = {2022},
  date      = {2022-11-01},
  urldate   = {2022-11-01},
  issn      = {2589-0042},
  doi       = {10.1016/j.isci.2022.105305},
  abstract  = {Sequencing errors continue to pose algorithmic challenges to methods working with sequencing data. One of the simplest and most prevalent techniques for ameliorating the detrimental effects of homopolymer expansion/contraction errors present in long reads is homopolymer compression. It collapses runs of repeated nucleotides, to remove some sequencing errors and improve mapping sensitivity. Though our intuitive understanding justifies why homopolymer compression works, it in no way implies that it is the best transformation that can be done. In this paper, we explore if there are transformations that can be applied in the same pre-processing manner as homopolymer compression that would achieve better alignment sensitivity. We introduce a more general framework than homopolymer compression, called mapping-friendly sequence reductions. We transform the reference and the reads using these reductions and then apply an alignment algorithm. We demonstrate that some mapping-friendly sequence reductions lead to improved mapping accuracy, outperforming homopolymer compression.},
  keywords  = {WP1: Primary CPG},
  pubstate  = {published},
  tppubtype = {article}
}
Luo, Xiao; Kang, Xiongbin; Schönhuth, Alexander
VeChat: correcting errors in long reads using variation graphs Journal Article
In: Nat Commun, vol. 13, no. 1, pp. 6657, 2022, ISSN: 2041-1723.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Luo2022b,
title = {{VeChat}: correcting errors in long reads using variation graphs},
author = {Xiao Luo and Xiongbin Kang and Alexander Schönhuth},
doi = {10.1038/s41467-022-34381-8},
issn = {2041-1723},
year = {2022},
date = {2022-11-01},
urldate = {2022-11-01},
journal = {Nature Communications},
volume = {13},
number = {1},
pages = {6657},
abstract = {Error correction is the canonical first step in long-read sequencing data analysis. Current self-correction methods, however, are affected by consensus sequence induced biases that mask true variants in haplotypes of lower frequency showing in mixed samples. Unlike consensus sequence templates, graph-based reference systems are not affected by such biases, so do not mistakenly mask true variants as errors. We present VeChat, as an approach to implement this idea: VeChat is based on variation graphs, as a popular type of data structure for pangenome reference systems. Extensive benchmarking experiments demonstrate that long reads corrected by VeChat contain 4 to 15 (Pacific Biosciences) and 1 to 10 times (Oxford Nanopore Technologies) less errors than when being corrected by state of the art approaches. Further, using VeChat prior to long-read assembly significantly improves the haplotype awareness of the assemblies. VeChat is an easy-to-use open-source tool and publicly available at https://github.com/HaploKit/vechat .},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
Lemane, Téo; Chikhi, Rayan; Peterlongo, Pierre
kmdiff, large-scale and user-friendly differential k-mer analyses Journal Article
In: Bioinformatics, vol. 38, no. 24, pp. 5443–5445, 2022, ISSN: 1367-4803.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG
@article{Lemane2022,
title = {kmdiff, large-scale and user-friendly differential k-mer analyses},
author = {Téo Lemane and Rayan Chikhi and Pierre Peterlongo},
url = {https://github.com/tlemane/kmdiff},
doi = {10.1093/bioinformatics/btac689},
issn = {1367-4803},
year = {2022},
date = {2022-10-01},
urldate = {2022-10-01},
journal = {Bioinformatics},
volume = {38},
number = {24},
pages = {5443--5445},
abstract = {Genome Wide Association Studies (GWAS) elucidate links between genotypes and phenotypes. Recent studies point out the interest of conducting such experiments using k-mers as the base signal instead of single-nucleotide polymorphisms. We propose a tool, kmdiff, that performs differential k-mer analyses on large sequencing cohorts in an order of magnitude less time and memory than previously possible.
AVAILABILITY: https://github.com/tlemane/kmdiff.},
keywords = {WP1: Primary CPG},
pubstate = {published},
tppubtype = {article}
}
AVAILABILITY: https://github.com/tlemane/kmdiff.
Petescia, Alessia; Nebohacova, Martina; Nosek, Jozef; Brejova, Brona; Vinar, Tomas
Contaminant Detection using Differentiating k-mers Presentation
Poster Presentation at ECCB 2022, 12.09.2022.
@misc{Petescia-presentation,
  author       = {Petescia, Alessia and Nebohacova, Martina and Nosek, Jozef and Brejova, Brona and Vinar, Tomas},
  title        = {Contaminant Detection using Differentiating k-mers},
  howpublished = {Poster Presentation at ECCB 2022},
  issue        = {ECCB 2022},
  year         = {2022},
  date         = {2022-09-12},
  urldate      = {2022-09-12},
  keywords     = {Misc},
  pubstate     = {published},
  tppubtype    = {presentation}
}