Willink, Beatriz; Tunström, Kalle; Nilén, Sofie; Chikhi, Rayan; Lemane, Téo; Takahashi, Michihiko; Takahashi, Yuma; Svensson, Erik I.; Wheat, Christopher West
The genomics and evolution of inter-sexual mimicry and female-limited polymorphisms in damselflies Journal Article
In: Nature Ecology & Evolution, vol. 8, no. 1, pp. 83–97, 2023, ISSN: 2397-334X.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG, WP3: Translational CPG
@article{Willink2023,
title = {The genomics and evolution of inter-sexual mimicry and female-limited polymorphisms in damselflies},
author = {Beatriz Willink and Kalle Tunström and Sofie Nilén and Rayan Chikhi and Téo Lemane and Michihiko Takahashi and Yuma Takahashi and Erik I. Svensson and Christopher West Wheat},
doi = {10.1038/s41559-023-02243-1},
issn = {2397-334X},
year = {2023},
date = {2023-11-01},
urldate = {2023-11-01},
journal = {Nature Ecology & Evolution},
volume = {8},
number = {1},
pages = {83–97},
publisher = {Springer Science and Business Media LLC},
abstract = {Sex-limited morphs can provide profound insights into the evolution and genomic architecture of complex phenotypes. Inter-sexual mimicry is one particular type of sex-limited polymorphism in which a novel morph resembles the opposite sex. While inter-sexual mimics are known in both sexes and a diverse range of animals, their evolutionary origin is poorly understood. Here, we investigated the genomic basis of female-limited morphs and male mimicry in the common bluetail damselfly. Differential gene expression between morphs has been documented in damselflies, but no causal locus has been previously identified. We found that male mimicry originated in an ancestrally sexually dimorphic lineage in association with multiple structural changes, probably driven by transposable element activity. These changes resulted in ~900 kb of novel genomic content that is partly shared by male mimics in a close relative, indicating that male mimicry is a trans-species polymorphism. More recently, a third morph originated following the translocation of part of the male-mimicry sequence into a genomic position ~3.5 mb apart. We provide evidence of balancing selection maintaining male mimicry, in line with previous field population studies. Our results underscore how structural variants affecting a handful of potentially regulatory genes and morph-specific genes can give rise to novel and complex phenotypic polymorphisms.},
keywords = {WP2: Evolutionary/Comparative CPG, WP3: Translational CPG},
pubstate = {published},
tppubtype = {article}
}
Cozzi, Davide; Rossi, Massimiliano; Rubinacci, Simone; Gagie, Travis; Köppl, Dominik; Boucher, Christina; Bonizzoni, Paola
μ- PBWT: a lightweight r-indexing of the PBWT for storing and querying UK Biobank data Journal Article
In: Bioinformatics, vol. 39, no. 9, 2023, ISSN: 1367-4811.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{Cozzi2023-bx,
title = {μ- PBWT: a lightweight r-indexing of the PBWT for storing and querying UK Biobank data},
author = {Davide Cozzi and Massimiliano Rossi and Simone Rubinacci and Travis Gagie and Dominik Köppl and Christina Boucher and Paola Bonizzoni},
url = {https://github.com/dlcgold/muPBWT},
doi = {10.1093/bioinformatics/btad552},
issn = {1367-4811},
year = {2023},
date = {2023-09-01},
urldate = {2023-09-01},
journal = {Bioinformatics},
volume = {39},
number = {9},
publisher = {Oxford University Press (OUP)},
abstract = {The Positional Burrows–Wheeler Transform (PBWT) is a data structure that indexes haplotype sequences in a manner that enables finding maximal haplotype matches in h sequences containing w variation sites in O(hw) time. This represents a significant improvement over classical quadratic-time approaches. However, the original PBWT data structure does not allow for queries over Biobank panels that consist of several millions of haplotypes, if an index of the haplotypes must be kept entirely in memory.
In this article, we leverage the notion of r-index proposed for the BWT to present a memory-efficient method for constructing and storing the run-length encoded PBWT, and computing set maximal matches (SMEMs) queries in haplotype sequences. We implement our method, which we refer to as μ-PBWT, and evaluate it on datasets of 1000 Genome Project and UK Biobank data. Our experiments demonstrate that the μ-PBWT reduces the memory usage up to a factor of 20% compared to the best current PBWT-based indexing. In particular, μ-PBWT produces an index that stores high-coverage whole genome sequencing data of chromosome 20 in about a third of the space of its BCF file. μ-PBWT is an adaptation of techniques for the run-length compressed BWT for the PBWT (RLPBWT) and it is based on keeping in memory only a succinct representation of the RLPBWT that still allows the efficient computation of set maximal matches (SMEMs) over the original panel.
Our implementation is open source and available at https://github.com/dlcgold/muPBWT. The binary is available at https://bioconda.github.io/recipes/mupbwt/README.html.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
In this article, we leverage the notion of r-index proposed for the BWT to present a memory-efficient method for constructing and storing the run-length encoded PBWT, and computing set maximal matches (SMEMs) queries in haplotype sequences. We implement our method, which we refer to as μ-PBWT, and evaluate it on datasets of 1000 Genome Project and UK Biobank data. Our experiments demonstrate that the μ-PBWT reduces the memory usage up to a factor of 20% compared to the best current PBWT-based indexing. In particular, μ-PBWT produces an index that stores high-coverage whole genome sequencing data of chromosome 20 in about a third of the space of its BCF file. μ-PBWT is an adaptation of techniques for the run-length compressed BWT for the PBWT (RLPBWT) and it is based on keeping in memory only a succinct representation of the RLPBWT that still allows the efficient computation of set maximal matches (SMEMs) over the original panel.
Our implementation is open source and available at https://github.com/dlcgold/muPBWT. The binary is available at https://bioconda.github.io/recipes/mupbwt/README.html.
Ayad, Lorraine A K; Chikhi, Rayan; Pissis, Solon P
Seedability: optimizing alignment parameters for sensitive sequence comparison Journal Article
In: Bioinform. Adv., vol. 3, no. 1, 2023, ISSN: 2635-0041.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@article{Ayad2023-li,
title = {Seedability: optimizing alignment parameters for sensitive sequence comparison},
author = {Lorraine A K Ayad and Rayan Chikhi and Solon P Pissis},
url = {https://github.com/lorrainea/Seedability},
doi = {10.1093/bioadv/vbad108},
issn = {2635-0041},
year = {2023},
date = {2023-08-01},
urldate = {2023-08-01},
journal = {Bioinform. Adv.},
volume = {3},
number = {1},
publisher = {Oxford University Press (OUP)},
abstract = {Most sequence alignment techniques make use of exact k-mer hits, called seeds, as anchors to optimize alignment speed. A large number of bioinformatics tools employing seed-based alignment techniques, such as Minimap2, use a single value of k per sequencing technology, without a strong guarantee that this is the best possible value. Given the ubiquity of sequence alignment, identifying values of k that lead to more sensitive alignments is thus an important task. To aid this, we present Seedability, a seed-based alignment framework designed for estimating an optimal seed k-mer length (as well as a minimal number of shared seeds) based on a given alignment identity threshold. In particular, we were motivated to make Minimap2 more sensitive in the pairwise alignment of short sequences.
The experimental results herein show improved alignments of short and divergent sequences when using the parameter values determined by Seedability in comparison to the default values of Minimap2. We also show several cases of pairs of real divergent sequences, where the default parameter values of Minimap2 yield no output alignments, but the values output by Seedability produce plausible alignments.
https://github.com/lorrainea/Seedability (distributed under GPL v3.0).},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
The experimental results herein show improved alignments of short and divergent sequences when using the parameter values determined by Seedability in comparison to the default values of Minimap2. We also show several cases of pairs of real divergent sequences, where the default parameter values of Minimap2 yield no output alignments, but the values output by Seedability produce plausible alignments.
https://github.com/lorrainea/Seedability (distributed under GPL v3.0).
Lee, Sewon; Kim, Gyuri; Karin, Eli Levy; Mirdita, Milot; Park, Sukhwan; Chikhi, Rayan; Babaian, Artem; Kryshtafovych, Andriy; Steinegger, Martin
Petascale Homology Search for Structure Prediction Unpublished
bioRxiv, 2023.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@unpublished{Lee2023,
title = {Petascale Homology Search for Structure Prediction},
author = {Sewon Lee and Gyuri Kim and Eli Levy Karin and Milot Mirdita and Sukhwan Park and Rayan Chikhi and Artem Babaian and Andriy Kryshtafovych and Martin Steinegger},
doi = {10.1101/2023.07.10.548308},
year = {2023},
date = {2023-07-01},
urldate = {2023-07-01},
publisher = {Cold Spring Harbor Laboratory},
abstract = {The recent CASP15 competition highlighted the critical role of multiple sequence alignments (MSAs) in protein structure prediction, as demonstrated by the success of the top AlphaFold2-based prediction methods. To push the boundaries of MSA utilization, we conducted a petabase-scale search of the Sequence Read Archive (SRA), resulting in gigabytes of aligned homologs for CASP15 targets. These were merged with default MSAs produced by ColabFold-search and provided to ColabFold-predict. By using SRA data, we achieved highly accurate predictions (GDT_TS > 70) for 66% of the non-easy targets, whereas using ColabFold-search default MSAs scored highly in only 52%. Next, we tested the effect of deep homology search and ColabFold’s advanced features, such as more recycles, on prediction accuracy. While SRA homologs were most significant for improving ColabFold’s CASP15 ranking from 11th to 3rd place, other strategies contributed too. We analyze these in the context of existing strategies to improve prediction.},
howpublished = {bioRxiv},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {unpublished}
}
Gabory, Esteban; Mwaniki, Moses Njagi; Pisanti, Nadia; Pissis, Solon P; Radoszewski, Jakub; Sweering, Michelle; Zuba, Wiktor
Comparing elastic-degenerate strings: Algorithms, lower bounds, and applications Proceedings Article
In: Bulteau, Laurent; Lipt'ak, Zsuzsanna (Ed.): 34th Annual Symposium on Combinatorial Pattern Matching (CPM 2023), pp. 11:1–11:20, Schloss Dagstuhl - Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2023, ISBN: 978-3-95977-276-1.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@inproceedings{Gabory2023-vh,
title = {Comparing elastic-degenerate strings: Algorithms, lower bounds, and applications},
author = {Esteban Gabory and Moses Njagi Mwaniki and Nadia Pisanti and Solon P Pissis and Jakub Radoszewski and Michelle Sweering and Wiktor Zuba},
editor = {Bulteau, Laurent and Lipt'{a}k, Zsuzsanna},
doi = {10.4230/LIPIcs.CPM.2023.11},
isbn = {978-3-95977-276-1},
year = {2023},
date = {2023-06-01},
urldate = {2023-06-01},
booktitle = {34th Annual Symposium on Combinatorial Pattern Matching (CPM 2023)},
volume = {259},
pages = {11:1--11:20},
publisher = {Schloss Dagstuhl - Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {An elastic-degenerate (ED) string T is a sequence of n sets T[1],…,T[n] containing m strings in total whose cumulative length is N. We call n, m, and N the length, the cardinality and the size of T, respectively. The language of T is defined as ℒ(T) = {S_1 ⋯ S_n : S_i ∈ T[i] for all i ∈ [1,n]}. ED strings have been introduced to represent a set of closely-related DNA sequences, also known as a pangenome. The basic question we investigate here is: Given two ED strings, how fast can we check whether the two languages they represent have a nonempty intersection? We call the underlying problem the ED String Intersection (EDSI) problem. For two ED strings T₁ and T₂ of lengths n₁ and n₂, cardinalities m₁ and m₂, and sizes N₁ and N₂, respectively, we show the following: - There is no 𝒪((N₁N₂)^{1-ε})-time algorithm, thus no 𝒪((N₁m₂+N₂m₁)^{1-ε})-time algorithm and no 𝒪((N₁n₂+N₂n₁)^{1-ε})-time algorithm, for any constant ε > 0, for EDSI even when T₁ and T₂ are over a binary alphabet, unless the Strong Exponential-Time Hypothesis is false. - There is no combinatorial 𝒪((N₁+N₂)^{1.2-ε}f(n₁,n₂))-time algorithm, for any constant ε > 0 and any function f, for EDSI even when T₁ and T₂ are over a binary alphabet, unless the Boolean Matrix Multiplication conjecture is false. - An 𝒪(N₁log N₁log n₁+N₂log N₂log n₂)-time algorithm for outputting a compact (RLE) representation of the intersection language of two unary ED strings. In the case when T₁ and T₂ are given in a compact representation, we show that the problem is NP-complete. - An 𝒪(N₁m₂+N₂m₁)-time algorithm for EDSI. - An Õ(N₁^{ω-1}n₂+N₂^{ω-1}n₁)-time algorithm for EDSI, where ω is the exponent of matrix multiplication; the Õ notation suppresses factors that are polylogarithmic in the input size. We also show that the techniques we develop have applications outside of ED string comparison.},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Denti, Luca; Khorsand, Parsoa; Bonizzoni, Paola; Hormozdiari, Fereydoun; Chikhi, Rayan
SVDSS: structural variation discovery in hard-to-call genomic regions using sample-specific strings from accurate long reads Journal Article
In: Nat. Methods, vol. 20, no. 4, pp. 550–558, 2023, ISBN: 1548-7105.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{Denti2023-xa,
title = {SVDSS: structural variation discovery in hard-to-call genomic regions using sample-specific strings from accurate long reads},
author = {Luca Denti and Parsoa Khorsand and Paola Bonizzoni and Fereydoun Hormozdiari and Rayan Chikhi},
doi = {10.1038/s41592-022-01674-1},
isbn = {1548-7105},
year = {2023},
date = {2023-04-01},
urldate = {2023-04-01},
journal = {Nat. Methods},
volume = {20},
number = {4},
pages = {550–558},
publisher = {Springer Science and Business Media LLC},
abstract = {Structural variants (SVs) account for a large amount of sequence variability across genomes and play an important role in human genomics and precision medicine. Despite intense efforts over the years, the discovery of SVs in individuals remains challenging due to the diploid and highly repetitive structure of the human genome, and by the presence of SVs that vastly exceed sequencing read lengths. However, the recent introduction of low-error long-read sequencing technologies such as PacBio HiFi may finally enable these barriers to be overcome. Here we present SV discovery with sample-specific strings (SVDSS)—a method for discovery of SVs from long-read sequencing technologies (for example, PacBio HiFi) that combines and effectively leverages mapping-free, mapping-based and assembly-based methodologies for overall superior SV discovery performance. Our experiments on several human samples show that SVDSS outperforms state-of-the-art mapping-based methods for discovery of insertion and deletion SVs in PacBio HiFi reads and achieves notable improvements in calling SVs in repetitive regions of the genome.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Břinda, Karel; Lima, Leandro; Pignotti, Simone; Quinones-Olvera, Natalia; Salikhov, Kamil; Chikhi, Rayan; Kucherov, Gregory; Iqbal, Zamin; Baym, Michael
Efficient and Robust Search of Microbial Genomes via Phylogenetic Compression Unpublished
bioRxiv, 2023.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@unpublished{Brinda2023,
title = {Efficient and Robust Search of Microbial Genomes via Phylogenetic Compression},
author = {Karel Břinda and Leandro Lima and Simone Pignotti and Natalia Quinones-Olvera and Kamil Salikhov and Rayan Chikhi and Gregory Kucherov and Zamin Iqbal and Michael Baym},
doi = {10.1101/2023.04.15.536996},
year = {2023},
date = {2023-04-01},
urldate = {2023-04-01},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Comprehensive collections approaching millions of sequenced genomes have become central information sources in the life sciences. However, the rapid growth of these collections makes it effectively impossible to search these data using tools such as BLAST and its successors. Here, we present a technique called phylogenetic compression, which uses evolutionary history to guide compression and efficiently search large collections of microbial genomes using existing algorithms and data structures. We show that, when applied to modern diverse collections approaching millions of genomes, lossless phylogenetic compression improves the compression ratios of assemblies, de Bruijn graphs, and k-mer indexes by one to two orders of magnitude. Additionally, we develop a pipeline for a BLAST-like search over these phylogeny-compressed reference data, and demonstrate it can align genes, plasmids, or entire sequencing experiments against all sequenced bacteria until 2019 on ordinary desktop computers within a few hours. Phylogenetic compression has broad applications in computational biology and may provide a fundamental design principle for future genomics infrastructure.},
howpublished = {bioRxiv},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {unpublished}
}
Carr, Victoria R; Pissis, Solon P; Mullany, Peter; Shoaie, Saeed; Gomez-Cabrero, David; Moyes, David L
Palidis: fast discovery of novel insertion sequences Journal Article
In: Microb. Genom., vol. 9, no. 3, 2023, ISSN: 2057-5858.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{Carr2023-za,
title = {Palidis: fast discovery of novel insertion sequences},
author = {Victoria R Carr and Solon P Pissis and Peter Mullany and Saeed Shoaie and David Gomez-Cabrero and David L Moyes},
doi = {10.1099/mgen.0.000917},
issn = {2057-5858},
year = {2023},
date = {2023-03-01},
urldate = {2023-03-01},
journal = {Microb. Genom.},
volume = {9},
number = {3},
abstract = {The diversity of microbial insertion sequences, crucial mobile genetic elements in generating diversity in microbial genomes, needs to be better represented in current microbial databases. Identification of these sequences in microbiome communities presents some significant problems that have led to their underrepresentation. Here, we present a bioinformatics pipeline called Palidis that recognizes insertion sequences in metagenomic sequence data rapidly by identifying inverted terminal repeat regions from mixed microbial community genomes. Applying Palidis to 264 human metagenomes identifies 879 unique insertion sequences, with 519 being novel and not previously characterized. Querying this catalogue against a large database of isolate genomes reveals evidence of horizontal gene transfer events across bacterial classes. We will continue to apply this tool more widely, building the Insertion Sequence Catalogue, a valuable resource for researchers wishing to query their microbial genomes for insertion sequences.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Sitarčík, Jozef; Vinař, Tomáš; Brejová, Broňa; Krampl, Werner; Budiš, Jaroslav; Radvánszky, Ján; Lucká, Mária
WarpSTR: determining tandem repeat lengths using raw nanopore signals Journal Article
In: Bioinformatics, vol. 39, no. 6, pp. btad388, 2023, ISSN: 1367-4811.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{10.1093/bioinformatics/btad388,
title = {WarpSTR: determining tandem repeat lengths using raw nanopore signals},
author = {Jozef Sitarčík and Tomáš Vinař and Broňa Brejová and Werner Krampl and Jaroslav Budiš and Ján Radvánszky and Mária Lucká},
url = {https://github.com/fmfi-compbio/warpstr},
doi = {10.1093/bioinformatics/btad388},
issn = {1367-4811},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
journal = {Bioinformatics},
volume = {39},
number = {6},
pages = {btad388},
abstract = {Short tandem repeats (STRs) are regions of a genome containing many consecutive copies of the same short motif, possibly with small variations. Analysis of STRs has many clinical uses but is limited by technology mainly due to STRs surpassing the used read length. Nanopore sequencing, as one of long-read sequencing technologies, produces very long reads, thus offering more possibilities to study and analyze STRs. Basecalling of nanopore reads is however particularly unreliable in repeating regions, and therefore direct analysis from raw nanopore data is required. Here, we present WarpSTR, a novel method for characterizing both simple and complex tandem repeats directly from raw nanopore signals using a finite-state automaton and a search algorithm analogous to dynamic time warping. By applying this approach to determine the lengths of 241 STRs, we demonstrate that our approach decreases the mean absolute error of the STR length estimate compared to basecalling and STRique. WarpSTR is freely available at https://github.com/fmfi-compbio/warpstr},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
González, Camila Duitama; Vicedomini, Riccardo; Lemane, Téo; Rascovan, Nicolas; Richard, Hugues; Chikhi, Rayan
decOM: Similarity-based microbial source tracking of ancient oral samples using k-mer-based methods Unpublished
bioRxiv, 2023.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@unpublished{Gonzalez2023,
title = {decOM: Similarity-based microbial source tracking of ancient oral samples using k-mer-based methods},
author = {Camila Duitama González and Riccardo Vicedomini and Téo Lemane and Nicolas Rascovan and Hugues Richard and Rayan Chikhi},
doi = {10.1101/2023.01.26.525439},
year = {2023},
date = {2023-01-01},
urldate = {2023-01-01},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Background: The analysis of ancient oral metagenomes from archaeological human and animal samples is largely confounded by contaminant DNA sequences from modern and environmental sources. Existing methods for Microbial Source Tracking (MST) estimate the proportions of environmental sources, but do not perform well on ancient metagenomes. We developed a novel method called decOM for Microbial Source Tracking and classification of ancient and modern metagenomic samples using k-mer matrices.
Results: We analysed a collection of 360 ancient oral, modern oral, sediment/soil and skin metagenomes, using stratified five-fold cross-validation. decOM estimates the contributions of these source environments in ancient oral metagenomic samples with high accuracy, outperforming two state-of-the-art methods for source tracking, FEAST and mSourceTracker.
Conclusions: decOM is a high-accuracy microbial source tracking method, suitable for ancient oral metagenomic data sets. The decOM method is generic and could also be adapted for MST of other ancient and modern types of metagenomes. We anticipate that decOM will be a valuable tool for MST of ancient metagenomic studies.},
howpublished = {bioRxiv},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {unpublished}
}
Results: We analysed a collection of 360 ancient oral, modern oral, sediment/soil and skin metagenomes, using stratified five-fold cross-validation. decOM estimates the contributions of these source environments in ancient oral metagenomic samples with high accuracy, outperforming two state-of-the-art methods for source tracking, FEAST and mSourceTracker.
Conclusions: decOM is a high-accuracy microbial source tracking method, suitable for ancient oral metagenomic data sets. The decOM method is generic and could also be adapted for MST of other ancient and modern types of metagenomes. We anticipate that decOM will be a valuable tool for MST of ancient metagenomic studies.
Porrelli, Stefano; Gerbault-Seureau, Michèle; Rozzi, Roberto; Chikhi, Rayan; Curaudeau, Manon; Ropiquet, Anne; Hassanin, Alexandre
Draft genome of the lowland anoa (Bubalus depressicornis) and comparison with buffalo genome assemblies (Bovidae, Bubalina) Journal Article
In: G3 Genes|Genomes|Genetics, vol. 12, iss. 11, 2022, ISSN: 2160-1836.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{Porrelli2022,
title = {Draft genome of the lowland anoa (Bubalus depressicornis) and comparison with buffalo genome assemblies (Bovidae, Bubalina)},
author = {Stefano Porrelli and Michèle Gerbault-Seureau and Roberto Rozzi and Rayan Chikhi and Manon Curaudeau and Anne Ropiquet and Alexandre Hassanin},
editor = {D -J Koning},
doi = {10.1093/g3journal/jkac234},
issn = {2160-1836},
year = {2022},
date = {2022-09-01},
urldate = {2022-09-01},
journal = {G3 Genes|Genomes|Genetics},
volume = {12},
issue = {11},
publisher = {Oxford University Press (OUP)},
abstract = {Genomic data for wild species of the genus Bubalus (Asian buffaloes) are still lacking while several whole genomes are currently available for domestic water buffaloes. To address this, we sequenced the genome of a wild endangered dwarf buffalo, the lowland anoa (Bubalus depressicornis), produced a draft genome assembly and made comparison to published buffalo genomes. The lowland anoa genome assembly was 2.56 Gbp long and contained 103,135 contigs, the longest contig being 337.39 kbp long. N50 and L50 values were 38.73 and 19.83 kbp, respectively, mean coverage was 44× and GC content was 41.74%. Two strategies were adopted to evaluate genome completeness: (1) determination of genomic features with de novo and homology-based predictions using annotations of chromosome-level genome assembly of the river buffalo and (2) employment of benchmarking against universal single-copy orthologs (BUSCO). Homology-based predictions identified 94.51% complete and 3.65% partial genomic features. De novo gene predictions identified 32,393 genes, representing 97.14% of the reference’s annotated genes, whilst BUSCO search against the mammalian orthologs database identified 71.1% complete, 11.7% fragmented, and 17.2% missing orthologs, indicating a good level of completeness for downstream analyses. Repeat analyses indicated that the lowland anoa genome contains 42.12% of repetitive regions. The genome assembly of the lowland anoa is expected to contribute to comparative genome analyses among bovid species.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Charalampopoulos, Panagiotis; Pissis, Solon P.; Radoszewski, Jakub
Longest Palindromic Substring in Sublinear Time Proceedings Article
In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 20:1–20:9, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@inproceedings{Charalampopoulos2022b,
title = {Longest Palindromic Substring in Sublinear Time},
author = {Charalampopoulos, Panagiotis and Pissis, Solon P. and Radoszewski, Jakub},
editor = {Bannai, Hideo and Holub, Jan},
doi = {10.4230/LIPIcs.CPM.2022.20},
issn = {1868-8969},
year = {2022},
date = {2022-06-22},
urldate = {2022-06-22},
booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},
volume = {223},
pages = {20:1--20:9},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {We revisit the classic algorithmic problem of computing a longest palidromic substring. This problem is solvable by a celebrated 𝒪(n)-time algorithm [Manacher, J. ACM 1975], where n is the length of the input string. For small alphabets, 𝒪(n) is not necessarily optimal in the word RAM model of computation: a string of length n over alphabet [0,σ) can be stored in 𝒪(n log σ/log n) space and read in 𝒪(n log σ/log n) time. We devise a simple 𝒪(n log σ/log n)-time algorithm for computing a longest palindromic substring. In particular, our algorithm works in sublinear time if σ = $2^{o(log n)}$. Our technique relies on periodicity and on the 𝒪(n log σ/log n)-time constructible data structure of Kempa and Kociumaka [STOC 2019] that answers longest common extension queries in 𝒪(1) time.},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Bernardini, Giulia; Conte, Alessio; Gabory, Esteban; Grossi, Roberto; Loukides, Grigorios; Pissis, Solon P.; Punzi, Giulia; Sweering, Michelle
On Strings Having the Same Length- k Substrings Proceedings Article
In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 16:1–16:17, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@inproceedings{Bernardini2022b,
title = {On Strings Having the Same Length- k Substrings},
author = {Bernardini, Giulia and Conte, Alessio and Gabory, Esteban and Grossi, Roberto and Loukides, Grigorios and Pissis, Solon P. and Punzi, Giulia and Sweering, Michelle},
editor = {Bannai, Hideo and Holub, Jan},
doi = {10.4230/LIPIcs.CPM.2022.16},
issn = {1868-8969},
year = {2022},
date = {2022-06-22},
urldate = {2022-06-22},
booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},
volume = {223},
pages = {16:1--16:17},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {Let Substr_k(X) denote the set of length-k substrings of a given string X for a given integer k > 0. We study the following basic string problem, called z-Shortest 𝒮_k-Equivalent Strings: Given a set 𝒮_k of n length-k strings and an integer z > 0, list z shortest distinct strings T₁,…,T_z such that Substr_k(T_i) = 𝒮_k, for all i ∈ [1,z]. The z-Shortest 𝒮_k-Equivalent Strings problem arises naturally as an encoding problem in many real-world applications; e.g., in data privacy, in data compression, and in bioinformatics. The 1-Shortest 𝒮_k-Equivalent Strings, referred to as Shortest 𝒮_k-Equivalent String, asks for a shortest string X such that Substr_k(X) = 𝒮_k.
Our main contributions are summarized below:
- Given a directed graph G(V,E), the Directed Chinese Postman (DCP) problem asks for a shortest closed walk that visits every edge of G at least once. DCP can be solved in 𝒪̃(|E||V|) time using an algorithm for min-cost flow. We show, via a non-trivial reduction, that if Shortest 𝒮_k-Equivalent String over a binary alphabet has a near-linear-time solution then so does DCP.
- We show that the length of a shortest string output by Shortest 𝒮_k-Equivalent String is in 𝒪(k+n²). We generalize this bound by showing that the total length of z shortest strings is in 𝒪(zk+zn²+z²n). We derive these upper bounds by showing (asymptotically tight) bounds on the total length of z shortest Eulerian walks in general directed graphs.
- We present an algorithm for solving z-Shortest 𝒮_k-Equivalent Strings in 𝒪(nk+n²log²n+zn²log n+|output|) time. If z = 1, the time becomes 𝒪(nk+n²log²n) by the fact that the size of the input is Θ(nk) and the size of the output is 𝒪(k+n²).},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
Our main contributions are summarized below:
- Given a directed graph G(V,E), the Directed Chinese Postman (DCP) problem asks for a shortest closed walk that visits every edge of G at least once. DCP can be solved in 𝒪̃(|E||V|) time using an algorithm for min-cost flow. We show, via a non-trivial reduction, that if Shortest 𝒮_k-Equivalent String over a binary alphabet has a near-linear-time solution then so does DCP.
- We show that the length of a shortest string output by Shortest 𝒮_k-Equivalent String is in 𝒪(k+n²). We generalize this bound by showing that the total length of z shortest strings is in 𝒪(zk+zn²+z²n). We derive these upper bounds by showing (asymptotically tight) bounds on the total length of z shortest Eulerian walks in general directed graphs.
- We present an algorithm for solving z-Shortest 𝒮_k-Equivalent Strings in 𝒪(nk+n²log²n+zn²log n+|output|) time. If z = 1, the time becomes 𝒪(nk+n²log²n) by the fact that the size of the input is Θ(nk) and the size of the output is 𝒪(k+n²).
Bernardini, Giulia; Chen, Huiping; Loukides, Grigorios; Pissis, Solon P.; Stougie, Leen; Sweering, Michelle
Making de Bruijn Graphs Eulerian Proceedings Article
In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 12:1–12:18, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@inproceedings{Bernardini2022c,
title = {Making de Bruijn Graphs Eulerian},
author = {Bernardini, Giulia and Chen, Huiping and Loukides, Grigorios and Pissis, Solon P. and Stougie, Leen and Sweering, Michelle},
editor = {Bannai, Hideo and Holub, Jan},
doi = {10.4230/LIPIcs.CPM.2022.12},
issn = {1868-8969},
year = {2022},
date = {2022-06-06},
urldate = {2022-06-06},
booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},
volume = {223},
pages = {12:1--12:18},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {A directed multigraph is called Eulerian if it has a circuit which uses each edge exactly once. Euler’s theorem tells us that a weakly connected directed multigraph is Eulerian if and only if every node is balanced. Given a collection S of strings over an alphabet Σ, the de Bruijn graph (dBG) of order k of S is a directed multigraph $G_{S,k}(V,E)$, where V is the set of length-(k-1) substrings of the strings in S, and $G_{S,k}$ contains an edge (u,v) with multiplicity $m_{u,v}$, if and only if the string u[0]⋅ v is equal to the string u⋅ v[k-2] and this string occurs exactly $m_{u,v}$ times in total in strings in S. Let $G_{Σ,k}(V_{Σ,k},E_{Σ,k})$ be the complete dBG of $Σ^k$. The Eulerian Extension (EE) problem on $G_{S,k}$ asks to extend $G_{S,k}$ with a set $cal{B}$ of nodes from $V_{Σ,k}$ and a smallest multiset $cal{A}$ of edges from $E_{Σ,k}$ to make it Eulerian. Note that extending dBGs is algorithmically much more challenging than extending general directed multigraphs because some edges in dBGs are by definition forbidden. Extending dBGs lies at the heart of sequence assembly [Medvedev et al., WABI 2007], one of the most important tasks in bioinformatics. The novelty of our work with respect to existing works is that we allow not only to duplicate existing edges of $G_{S,k}$ but to also add novel edges and nodes, in an effort to (i) connect multiple components and (ii) reduce the total EE cost. It is easy to show that EE on $G_{S,k}$ is NP-hard via a reduction from shortest common superstring. We further show that EE remains NP-hard, even when we are not allowed to add new nodes, via a highly non-trivial reduction from 3-SAT. We thus investigate the following two problems underlying EE in dBGs:
1) When $G_{S,k}$ is not weakly connected, we are asked to connect its d > 1 components using a minimum-weight spanning tree, whose edges are paths on the underlying $G_{Σ,k}$ and weights are the corresponding path lengths. This way of connecting guarantees that no new unbalanced node is added. We show that this problem can be solved in $O(|V|klog d+|E|)$ time, which is nearly optimal, since the size of $G_{S,k}$ is $Θ(|V|k+|E|)$.
2) When $G_{S,k}$ is not balanced, we are asked to extend $G_{S,k}$ to $H_{S,k}(V cup cal{B},E cup cal{A})$ such that every node of $H_{S,k}$ is balanced and the total number $|cal{A}|$ of added edges is minimized. We show that this problem can be solved in the optimal $O(k|V| + |E|+ |cal{A}|)$ time. Let us stress that, although our main contributions are theoretical, the algorithms we design for the above two problems are practical. We combine the two algorithms in one method that makes any dBG Eulerian; and show experimentally that the cost of the obtained feasible solutions on real-world dBGs is substantially smaller than the corresponding cost obtained by existing greedy approaches.},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
1) When $G_{S,k}$ is not weakly connected, we are asked to connect its d > 1 components using a minimum-weight spanning tree, whose edges are paths on the underlying $G_{Σ,k}$ and weights are the corresponding path lengths. This way of connecting guarantees that no new unbalanced node is added. We show that this problem can be solved in $O(|V|klog d+|E|)$ time, which is nearly optimal, since the size of $G_{S,k}$ is $Θ(|V|k+|E|)$.
2) When $G_{S,k}$ is not balanced, we are asked to extend $G_{S,k}$ to $H_{S,k}(V cup cal{B},E cup cal{A})$ such that every node of $H_{S,k}$ is balanced and the total number $|cal{A}|$ of added edges is minimized. We show that this problem can be solved in the optimal $O(k|V| + |E|+ |cal{A}|)$ time. Let us stress that, although our main contributions are theoretical, the algorithms we design for the above two problems are practical. We combine the two algorithms in one method that makes any dBG Eulerian; and show experimentally that the cost of the obtained feasible solutions on real-world dBGs is substantially smaller than the corresponding cost obtained by existing greedy approaches.
Charalampopoulos, Panagiotis; Iliopoulos, Costas S.; Kociumaka, Tomasz; Pissis, Solon P.; Radoszewski, Jakub; Straszy'nski, Juliusz
Efficient Computation of Sequence Mappability Journal Article
In: Algorithmica, vol. 84, no. 5, pp. 1418–1440, 2022, ISBN: 1432-0541.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{Charalampopoulos2022,
title = {Efficient Computation of Sequence Mappability},
author = {Panagiotis Charalampopoulos and Costas S. Iliopoulos and Tomasz Kociumaka and Solon P. Pissis and Jakub Radoszewski and Juliusz Straszy'nski},
doi = {10.1007/s00453-022-00934-y},
isbn = {1432-0541},
year = {2022},
date = {2022-05-01},
urldate = {2022-05-01},
journal = {Algorithmica},
volume = {84},
number = {5},
pages = {1418--1440},
abstract = {Sequence mappability is an important task in genome resequencing. In the (k, m)-mappability problem, for a given sequence T of length n, the goal is to compute a table whose ith entry is the number of indices $j ne i$ such that the length-m substrings of T starting at positions i and j have at most k mismatches. Previous works on this problem focused on heuristics computing a rough approximation of the result or on the case of $k=1$. We present several efficient algorithms for the general case of the problem. Our main result is an algorithm that, for $k=O(1)$, works in $O(n)$space and, with high probability, in $O(n cdot min {m^k,log ^k n})$ time. Our algorithm requires a careful adaptation of the k-errata trees of Cole et al. [STOC 2004] to avoid multiple counting of pairs of substrings. Our technique can also be applied to solve the all-pairs Hamming distance problem introduced by Crochemore et al. [WABI 2017]. We further develop $O(n^2)$-time algorithms to compute all (k, m)-mappability tables for a fixed m and all $k in {0,ldots ,m}$or a fixed k and all $m in {k,ldots ,n}$. Finally, we show that, for $k,m = Theta (log n)$, the (k, m)-mappability problem cannot be solved in strongly subquadratic time unless the Strong Exponential Time Hypothesis fails. This is an improved and extended version of a paper presented at SPIRE 2018.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Bonnet, Konstantinn; Marschall, Tobias; Doerr, Daniel
Constructing Founder Sets Under Allelic and Non-Allelic Homologous Recombination Proceedings Article
In: Boucher, Christina; Rahmann, Sven (Ed.): 22nd International Workshop on Algorithms in Bioinformatics (WABI 2022), pp. 6:1–6:23, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@inproceedings{Bonnet2022,
title = {Constructing Founder Sets Under Allelic and Non-Allelic Homologous Recombination},
author = {Konstantinn Bonnet and Tobias Marschall and Daniel Doerr},
editor = {Christina Boucher and Sven Rahmann},
doi = {10.4230/LIPIcs.WABI.2022.6},
issn = {1868-8969},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
booktitle = {22nd International Workshop on Algorithms in Bioinformatics (WABI 2022)},
volume = {242},
pages = {6:1--6:23},
publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},
address = {Dagstuhl, Germany},
series = {Leibniz International Proceedings in Informatics (LIPIcs)},
abstract = {Homologous recombination between the maternal and paternal copies of a chromosome is a key mechanism for human inheritance and shapes population genetic properties of our species. However, a similar mechanism can also act between different copies of the same sequence, then called non-allelic homologous recombination (NAHR). This process can result in genomic rearrangements - including deletion, duplication, and inversion - and is underlying many genomic disorders. Despite its importance for genome evolution and disease, there is a lack of computational models to study genomic loci prone to NAHR.
In this work, we propose such a computational model, providing a unified framework for both (allelic) homologous recombination and NAHR. Our model represents a set of genomes as a graph, where human haplotypes correspond to walks through this graph. We formulate two founder set problems under our recombination model, provide flow-based algorithms for their solution, and demonstrate scalability to problem instances arising in practice.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {inproceedings}
}
In this work, we propose such a computational model, providing a unified framework for both (allelic) homologous recombination and NAHR. Our model represents a set of genomes as a graph, where human haplotypes correspond to walks through this graph. We formulate two founder set problems under our recombination model, provide flow-based algorithms for their solution, and demonstrate scalability to problem instances arising in practice.
Zhong, Haodi; Loukides, Grigorios; Pissis, Solon P.
Clustering sequence graphs Journal Article
In: Data & Knowledge Engineering, vol. 138, pp. 101981, 2022, ISSN: 0169-023X.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{Zhong2022,
title = {Clustering sequence graphs},
author = {Haodi Zhong and Grigorios Loukides and Solon P. Pissis},
doi = {https://doi.org/10.1016/j.datak.2022.101981},
issn = {0169-023X},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {Data & Knowledge Engineering},
volume = {138},
pages = {101981},
abstract = {In application domains ranging from social networks to e-commerce, it is important to cluster users with respect to both their relationships (e.g., friendship or trust) and their actions (e.g., visited locations or rated products). Motivated by these applications, we introduce here the task of clustering the nodes of a sequence graph, i.e., a graph whose nodes are labeled with strings (e.g., sequences of users' visited locations or rated products). Both string clustering algorithms and graph clustering algorithms are inappropriate to deal with this task, as they do not consider the structure of strings and graph simultaneously. Moreover, attributed graph clustering algorithms generally construct poor solutions because they need to represent a string as a vector of attributes, which inevitably loses information and may harm clustering quality. We thus introduce the problem of clustering a sequence graph. We first propose two pairwise distance measures for sequence graphs, one based on edit distance and shortest path distance and another one based on SimRank. We then formalize the problem under each measure, showing also that it is NP-hard. In addition, we design a polynomial-time 2-approximation algorithm, as well as a heuristic for the problem. Experiments using real datasets and a case study demonstrate the effectiveness and efficiency of our methods.},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Cartes, Jorge Avila; Anand, Santosh; Ciccolella, Simone; Bonizzoni, Paola; Vedova, Gianluca Della
Accurate and fast clade assignment via deep learning and frequency chaos game representation Journal Article
In: GigaScience, vol. 12, 2022, ISSN: 2047-217X, (giac119).
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@article{Cartes2022,
title = {Accurate and fast clade assignment via deep learning and frequency chaos game representation},
author = {Jorge Avila Cartes and Santosh Anand and Simone Ciccolella and Paola Bonizzoni and Gianluca Della Vedova},
url = {https://github.com/AlgoLab/CouGaR-g},
doi = {10.1093/gigascience/giac119},
issn = {2047-217X},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {GigaScience},
volume = {12},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Since the beginning of the coronavirus disease 2019 pandemic, there has been an explosion of sequencing of the severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2) virus, making it the most widely sequenced virus in the history. Several databases and tools have been created to keep track of genome sequences and variants of the virus; most notably, the GISAID platform hosts millions of complete genome sequences, and it is continuously expanding every day. A challenging task is the development of fast and accurate tools that are able to distinguish between the different SARS-CoV-2 variants and assign them to a clade.In this article, we leverage the frequency chaos game representation (FCGR) and convolutional neural networks (CNNs) to develop an original method that learns how to classify genome sequences that we implement into CouGaR-g, a tool for the clade assignment problem on SARS-CoV-2 sequences. On a testing subset of the GISAID, CouGaR-g achieved an $96.29%$ overall accuracy, while a similar tool, Covidex, obtained a $77,12%$ overall accuracy. As far as we know, our method is the first using deep learning and FCGR for intraspecies classification. Furthermore, by using some feature importance methods, CouGaR-g allows to identify k-mers that match SARS-CoV-2 marker variants.By combining FCGR and CNNs, we develop a method that achieves a better accuracy than Covidex (which is based on random forest) for clade assignment of SARS-CoV-2 genome sequences, also thanks to our training on a much larger dataset, with comparable running times. Our method implemented in CouGaR-g is able to detect k-mers that capture relevant biological information that distinguishes the clades, known as marker variants.The trained models can be tested online providing a FASTA file (with 1 or multiple sequences) at https://huggingface.co/spaces/BIASLab/sars-cov-2-classification-fcgr. CouGaR-g is also available at https://github.com/AlgoLab/CouGaR-g under the GPL.},
note = {giac119},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Badkobeh, Golnaz; Charalampopoulos, Panagiotis; Kosolobov, Dmitry; Pissis, Solon P.
Internal shortest absent word queries in constant time and linear space Journal Article
In: Theoretical Computer Science, vol. 922, pp. 271-282, 2022, ISSN: 0304-3975.
Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG
@article{Badkobeh2022,
title = {Internal shortest absent word queries in constant time and linear space},
author = {Golnaz Badkobeh and Panagiotis Charalampopoulos and Dmitry Kosolobov and Solon P. Pissis},
url = {https://arxiv.org/abs/2106.01763},
doi = {https://doi.org/10.1016/j.tcs.2022.04.029},
issn = {0304-3975},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {Theoretical Computer Science},
volume = {922},
pages = {271-282},
abstract = {Given a string T of length n over an alphabet Σ⊂1,2,…,nO(1) of size σ, we are to preprocess T so that given a range [i,j], we can return a representation of a shortest string over Σ that is absent in the fragment T[i]⋯T[j] of T. We present an O(n)-space data structure that answers such queries in constant time and can be constructed in O(nlogσn) time.},
keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {article}
}
Parmigiani, Luca; Wittler, Roland; Stoye, Jens
Revisiting pangenome openness with k-mers Unpublished
bioRxiv, 2022.
Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG
@unpublished{Parmigiani2022,
title = {Revisiting pangenome openness with k-mers},
author = {Luca Parmigiani and Roland Wittler and Jens Stoye},
doi = {10.1101/2022.11.15.516472},
year = {2022},
date = {2022-01-01},
urldate = {2022-01-01},
journal = {bioRxiv},
publisher = {Cold Spring Harbor Laboratory},
abstract = {Pangenomics is the study of related genomes collectively, usually from the same species or closely related taxa. Originally, pangenomes were defined for bacterial species. After the concept was extended to eukaryotic genomes, two definitions of pangenome evolved in parallel: the gene-based approach, which defines the pangenome as the union of all genes, and the sequence-based approach, which defines the pangenome as the set of all nonredundant genomic sequences.Estimating the total size of the pangenome for a given species has been subject of study since the very first mention of pangenomes. Traditionally, this is performed predicting the ratio at which new genes are discovered, referred to as the openness of the species.Here, we abstract each genome as a set of items, which is entirely agnostic of the two approaches (gene-based, sequence-based). Genes are a viable option for items, but also other possibilities are feasible, e.g., genome sequence substrings of fixed length k (k-mers).In the present study, we investigate the use of k-mers to estimate the openness as an alternative to genes, and compare the results. An efficient implementation is also provided.Competing Interest StatementThe authors have declared no competing interest.},
howpublished = {bioRxiv},
keywords = {WP2: Evolutionary/Comparative CPG},
pubstate = {published},
tppubtype = {unpublished}
}