Sauer, Lena M.; Cánovas, Rodrigo; Roche, Daniel Barry; Shams-Eldin, Hosam; Ravel, Patrice; Colinge, Jacques; Schwarz, Ralph T.; Mamoun, Choukri Ben; Rivals, Eric; Cornillot, Emmanuel

In: Malaria Journal, vol. 22, no. 1, pp. 27-46, 2023, ISBN: 1475-2875.

Abstract | Links | BibTeX | Tags: Misc

@article{Sauer2023,

title = {FT-GPI, a highly sensitive and accurate predictor of GPI-anchored proteins, reveals the composition and evolution of the GPI proteome in Plasmodium species},

author = {Lena M. Sauer and Rodrigo Cánovas and Daniel Barry Roche and Hosam Shams-Eldin and Patrice Ravel and Jacques Colinge and Ralph T. Schwarz and Choukri Ben Mamoun and Eric Rivals and Emmanuel Cornillot},

doi = {10.1186/s12936-022-04430-0},

isbn = {1475-2875},

year = {2023},

date = {2023-12-01},

urldate = {2023-12-01},

journal = {Malaria Journal},

volume = {22},

number = {1},

pages = {27-46},

publisher = {BioMed Central},

abstract = {Protozoan parasites are known to attach specific and diverse group of proteins to their plasma membrane via a GPI anchor. In malaria parasites, GPI-anchored proteins (GPI-APs) have been shown to play an important role in host–pathogen interactions and a key function in host cell invasion and immune evasion. Because of their immunogenic properties, some of these proteins have been considered as malaria vaccine candidates. However, identification of all possible GPI-APs encoded by these parasites remains challenging due to their sequence diversity and limitations of the tools used for their characterization.},

keywords = {Misc},

pubstate = {published},

tppubtype = {article}

}

Mwaniki, Njagi Moses; Pisanti, Nadia

Optimal Sequence Alignment to ED-Strings Proceedings Article

In: Bansal, Mukul S.; Cai, Zhipeng; Mangul, Serghei (Ed.): Bioinformatics Research and Applications, pp. 204–216, Springer Nature, Germany, 2023, ISSN: 0302-9743.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Mwaniki2022b,

title = {Optimal Sequence Alignment to ED-Strings},

author = {Njagi Moses Mwaniki and Nadia Pisanti},

editor = {Mukul S. Bansal and Zhipeng Cai and Serghei Mangul},

doi = {10.1007/978-3-031-23198-8_19},

issn = {0302-9743},

year = {2023},

date = {2023-01-01},

urldate = {2022-01-01},

booktitle = {Bioinformatics Research and Applications},

volume = {13760},

pages = {204--216},

publisher = {Springer Nature},

address = {Germany},

abstract = {Partial Order Alignment (POA) was introduced by Lee et al. in 2002 to allow the alignment of a string to a graph-like structure representing a set of aligned strings (a Multiple Sequence Alignment, MSA). However, the POA edit transcript (the sequence of edit operations that describe the alignment) does not reflect the possible elasticity of the MSA (different gaps sizes in the aligned string), leaving room for possible misalignment and its propagation in progressive MSA. Elastic-Degenerate Strings (ED-strings) are strings that can represent the outcome of an MSA by highlighting gaps and variants as a list of strings that can differ in size and that can possibly include the empty string. In this paper, we define a method that optimally aligns a string to an ED-string, the latter compactly representing an MSA, overcoming the ambiguity in the POA edit transcript while maintaining its time and space complexity.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Blassel, Luc; Medvedev, Paul; Chikhi, Rayan

Mapping-friendly sequence reductions: Going beyond homopolymer compression Journal Article

In: iScience, vol. 25, no. 11, pp. 105305, 2022, ISSN: 2589-0042.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Blassel2022,

title = {Mapping-friendly sequence reductions: Going beyond homopolymer compression},

author = {Luc Blassel and Paul Medvedev and Rayan Chikhi},

doi = {10.1016/j.isci.2022.105305},

issn = {2589-0042},

year = {2022},

date = {2022-11-01},

urldate = {2022-11-01},

journal = {iScience},

volume = {25},

number = {11},

pages = {105305},

abstract = {Sequencing errors continue to pose algorithmic challenges to methods working with sequencing data. One of the simplest and most prevalent techniques for ameliorating the detrimental effects of homopolymer expansion/contraction errors present in long reads is homopolymer compression. It collapses runs of repeated nucleotides, to remove some sequencing errors and improve mapping sensitivity. Though our intuitive understanding justifies why homopolymer compression works, it in no way implies that it is the best transformation that can be done. In this paper, we explore if there are transformations that can be applied in the same pre-processing manner as homopolymer compression that would achieve better alignment sensitivity. We introduce a more general framework than homopolymer compression, called mapping-friendly sequence reductions. We transform the reference and the reads using these reductions and then apply an alignment algorithm. We demonstrate that some mapping-friendly sequence reductions lead to improved mapping accuracy, outperforming homopolymer compression.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Luo, Xiao; Kang, Xiongbin; Schönhuth, Alexander

VeChat: correcting errors in long reads using variation graphs Journal Article

In: Nat Commun, vol. 13, no. 1, pp. 6657, 2022, ISSN: 2041-1723.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Luo2022b,

title = {VeChat: correcting errors in long reads using variation graphs},

author = {Xiao Luo and Xiongbin Kang and Alexander Schönhuth},

doi = {10.1038/s41467-022-34381-8},

issn = {2041-1723},

year = {2022},

date = {2022-11-01},

urldate = {2022-11-01},

journal = {Nat Commun},

volume = {13},

number = {1},

pages = {6657},

abstract = {Error correction is the canonical first step in long-read sequencing data analysis. Current self-correction methods, however, are affected by consensus sequence induced biases that mask true variants in haplotypes of lower frequency showing in mixed samples. Unlike consensus sequence templates, graph-based reference systems are not affected by such biases, so do not mistakenly mask true variants as errors. We present VeChat, as an approach to implement this idea: VeChat is based on variation graphs, as a popular type of data structure for pangenome reference systems. Extensive benchmarking experiments demonstrate that long reads corrected by VeChat contain 4 to 15 (Pacific Biosciences) and 1 to 10 times (Oxford Nanopore Technologies) less errors than when being corrected by state of the art approaches. Further, using VeChat prior to long-read assembly significantly improves the haplotype awareness of the assemblies. VeChat is an easy-to-use open-source tool and publicly available at https://github.com/HaploKit/vechat .},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Lemane, Téo; Chikhi, Rayan; Peterlongo, Pierre

kmdiff, large-scale and user-friendly differential k-mer analyses Journal Article

In: Bioinformatics, vol. 38, no. 24, pp. 5443-5445, 2022, ISSN: 1367-4803.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Lemane2022,

title = {kmdiff, large-scale and user-friendly differential k-mer analyses},

author = {Téo Lemane and Rayan Chikhi and Pierre Peterlongo},

url = {https://github.com/tlemane/kmdiff},

doi = {10.1093/bioinformatics/btac689},

issn = {1367-4803},

year = {2022},

date = {2022-10-01},

urldate = {2022-10-01},

journal = {Bioinformatics},

volume = {38},

number = {24},

pages = {5443-5445},

abstract = {Genome Wide Association Studies (GWAS) elucidate links between genotypes and phenotypes. Recent studies point out the interest of conducting such experiments using k-mers as the base signal instead of single-nucleotide polymorphisms. We propose a tool, kmdiff, that performs differential k-mer analyses on large sequencing cohorts in an order of magnitude less time and memory than previously possible.

AVAILABILITY: https://github.com/tlemane/kmdiff.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

AVAILABILITY: https://github.com/tlemane/kmdiff.

Dufresne, Yoann; Lemane, Teo; Marijon, Pierre; Peterlongo, Pierre; Rahman, Amatur; Kokot, Marek; Medvedev, Paul; Deorowicz, Sebastian; Chikhi, Rayan

The K-mer File Format: a standardized and compact disk representation of sets of k-mers Journal Article

In: Bioinformatics, vol. 38, no. 18, pp. 4423–4425, 2022, ISSN: 1367-4811.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Dufresne2022,

title = {The K-mer File Format: a standardized and compact disk representation of sets of k-mers},

author = {Yoann Dufresne and Teo Lemane and Pierre Marijon and Pierre Peterlongo and Amatur Rahman and Marek Kokot and Paul Medvedev and Sebastian Deorowicz and Rayan Chikhi},

url = {https://github.com/Kmer-File-Format/},

doi = {10.1093/bioinformatics/btac528},

issn = {1367-4811},

year = {2022},

date = {2022-09-01},

urldate = {2022-09-01},

journal = {Bioinformatics},

volume = {38},

number = {18},

pages = {4423--4425},

abstract = {\textbf{Summary:} Bioinformatics applications increasingly rely on ad hoc disk storage of k-mer sets, e.g. for de Bruijn graphs

or alignment indexes. Here, we introduce the K-mer File Format as a general lossless framework for storing and

manipulating k-mer sets, realizing space savings of 3–5 compared to other formats, and bringing interoperability

across tools.

\textbf{Availability and implementation:} Format specification, Cþþ/Rust API, tools: https://github.com/Kmer-File-Format/.

\textbf{Supplementary Information:} Supplementary data are available at Bioinformatics online.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

**Summary:**Bioinformatics applications increasingly rely on ad hoc disk storage of k-mer sets, e.g. for de Bruijn graphs

or alignment indexes. Here, we introduce the K-mer File Format as a general lossless framework for storing and

manipulating k-mer sets, realizing space savings of 3–5 compared to other formats, and bringing interoperability

across tools.

**Availability and implementation:**Format specification, Cþþ/Rust API, tools: <a href="https://github.com/Kmer-File-Format/">https://github.com/Kmer-File-Format/</a>.

**Supplementary Information:**Supplementary data are available at Bioinformatics online.

Kang, Xiongbin; Luo, Xiao; Schönhuth, Alexander

StrainXpress: strain aware metagenome assembly from short reads Journal Article

In: Nucleic Acids Res, vol. 50, no. 17, pp. e101, 2022, ISSN: 1362-4962.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Kang2022,

title = {StrainXpress: strain aware metagenome assembly from short reads},

author = {Xiongbin Kang and Xiao Luo and Alexander Schönhuth},

doi = {10.1093/nar/gkac543},

issn = {1362-4962},

year = {2022},

date = {2022-09-01},

urldate = {2022-09-01},

journal = {Nucleic Acids Res},

volume = {50},

number = {17},

pages = {e101},

abstract = {Next-generation sequencing-based metagenomics has enabled to identify microorganisms in characteristic habitats without the need for lengthy cultivation. Importantly, clinically relevant phenomena such as resistance to medication, virulence or interactions with the environment can vary already within species. Therefore, a major current challenge is to reconstruct individual genomes from the sequencing reads at the level of strains, and not just the level of species. However, strains of one species can differ only by minor amounts of variants, which makes it difficult to distinguish them. Despite considerable recent progress, related approaches have remained fragmentary so far. Here, we present StrainXpress, as a comprehensive solution to the problem of strain aware metagenome assembly from next-generation sequencing reads. In experiments, StrainXpress reconstructs strain-specific genomes from metagenomes that involve up to >1000 strains and proves to successfully deal with poorly covered strains. The amount of reconstructed strain-specific sequence exceeds that of the current state-of-the-art approaches by on average 26.75% across all data sets (first quartile: 18.51%, median: 26.60%, third quartile: 35.05%).},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Charalampopoulos, Panagiotis; Pissis, Solon P.; Radoszewski, Jakub

Longest Palindromic Substring in Sublinear Time Proceedings Article

In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 20:1–20:9, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG

@inproceedings{Charalampopoulos2022b,

title = {Longest Palindromic Substring in Sublinear Time},

author = {Charalampopoulos, Panagiotis and Pissis, Solon P. and Radoszewski, Jakub},

editor = {Bannai, Hideo and Holub, Jan},

doi = {10.4230/LIPIcs.CPM.2022.20},

issn = {1868-8969},

year = {2022},

date = {2022-06-22},

urldate = {2022-06-22},

booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},

volume = {223},

pages = {20:1--20:9},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {We revisit the classic algorithmic problem of computing a longest palidromic substring. This problem is solvable by a celebrated 𝒪(n)-time algorithm [Manacher, J. ACM 1975], where n is the length of the input string. For small alphabets, 𝒪(n) is not necessarily optimal in the word RAM model of computation: a string of length n over alphabet [0,σ) can be stored in 𝒪(n log σ/log n) space and read in 𝒪(n log σ/log n) time. We devise a simple 𝒪(n log σ/log n)-time algorithm for computing a longest palindromic substring. In particular, our algorithm works in sublinear time if σ = $2^{o(log n)}$. Our technique relies on periodicity and on the 𝒪(n log σ/log n)-time constructible data structure of Kempa and Kociumaka [STOC 2019] that answers longest common extension queries in 𝒪(1) time.},

keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Bernardini, Giulia; Conte, Alessio; Gabory, Esteban; Grossi, Roberto; Loukides, Grigorios; Pissis, Solon P.; Punzi, Giulia; Sweering, Michelle

On Strings Having the Same Length- k Substrings Proceedings Article

In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 16:1–16:17, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG

@inproceedings{Bernardini2022b,

title = {On Strings Having the Same Length- k Substrings},

author = {Bernardini, Giulia and Conte, Alessio and Gabory, Esteban and Grossi, Roberto and Loukides, Grigorios and Pissis, Solon P. and Punzi, Giulia and Sweering, Michelle},

editor = {Bannai, Hideo and Holub, Jan},

doi = {10.4230/LIPIcs.CPM.2022.16},

issn = {1868-8969},

year = {2022},

date = {2022-06-22},

urldate = {2022-06-22},

booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},

volume = {223},

pages = {16:1--16:17},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {Let Substr_k(X) denote the set of length-k substrings of a given string X for a given integer k > 0. We study the following basic string problem, called z-Shortest 𝒮_k-Equivalent Strings: Given a set 𝒮_k of n length-k strings and an integer z > 0, list z shortest distinct strings T₁,…,T_z such that Substr_k(T_i) = 𝒮_k, for all i ∈ [1,z]. The z-Shortest 𝒮_k-Equivalent Strings problem arises naturally as an encoding problem in many real-world applications; e.g., in data privacy, in data compression, and in bioinformatics. The 1-Shortest 𝒮_k-Equivalent Strings, referred to as Shortest 𝒮_k-Equivalent String, asks for a shortest string X such that Substr_k(X) = 𝒮_k.

Our main contributions are summarized below:

- Given a directed graph G(V,E), the Directed Chinese Postman (DCP) problem asks for a shortest closed walk that visits every edge of G at least once. DCP can be solved in 𝒪̃(|E||V|) time using an algorithm for min-cost flow. We show, via a non-trivial reduction, that if Shortest 𝒮_k-Equivalent String over a binary alphabet has a near-linear-time solution then so does DCP.

- We show that the length of a shortest string output by Shortest 𝒮_k-Equivalent String is in 𝒪(k+n²). We generalize this bound by showing that the total length of z shortest strings is in 𝒪(zk+zn²+z²n). We derive these upper bounds by showing (asymptotically tight) bounds on the total length of z shortest Eulerian walks in general directed graphs.

- We present an algorithm for solving z-Shortest 𝒮_k-Equivalent Strings in 𝒪(nk+n²log²n+zn²log n+|output|) time. If z = 1, the time becomes 𝒪(nk+n²log²n) by the fact that the size of the input is Θ(nk) and the size of the output is 𝒪(k+n²).},

keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Our main contributions are summarized below:

- Given a directed graph G(V,E), the Directed Chinese Postman (DCP) problem asks for a shortest closed walk that visits every edge of G at least once. DCP can be solved in 𝒪̃(|E||V|) time using an algorithm for min-cost flow. We show, via a non-trivial reduction, that if Shortest 𝒮_k-Equivalent String over a binary alphabet has a near-linear-time solution then so does DCP.

- We show that the length of a shortest string output by Shortest 𝒮_k-Equivalent String is in 𝒪(k+n²). We generalize this bound by showing that the total length of z shortest strings is in 𝒪(zk+zn²+z²n). We derive these upper bounds by showing (asymptotically tight) bounds on the total length of z shortest Eulerian walks in general directed graphs.

- We present an algorithm for solving z-Shortest 𝒮_k-Equivalent Strings in 𝒪(nk+n²log²n+zn²log n+|output|) time. If z = 1, the time becomes 𝒪(nk+n²log²n) by the fact that the size of the input is Θ(nk) and the size of the output is 𝒪(k+n²).

Teramo, Antonella; Binatti, Andrea; Ciabatti, Elena; Schiavoni, Gianluca; Tarrini, Giulia; Baril`a, Gregorio; Calabretto, Giulia; Vicenzetto, Cristina; Gasparini, Vanessa Rebecca; Facco, Monica; Petrini, Iacopo; Grossi, Roberto; Pisanti, Nadia; Bortoluzzi, Stefania; Falini, Brunangelo; Tiacci, Enrico; Galimberti, Sara; Semenzato, Gianpietro; Zambello, Renato

Defining TCRγδlymphoproliferative disorders by combined immunophenotypic and molecular evaluation Journal Article

In: Nature Communications, vol. 13, no. 1, pp. 3298, 2022, ISBN: 2041-1723.

Abstract | Links | BibTeX | Tags: WP3: Translational CPG

@article{Teramo2022,

title = {Defining TCRγδlymphoproliferative disorders by combined immunophenotypic and molecular evaluation},

author = {Antonella Teramo and Andrea Binatti and Elena Ciabatti and Gianluca Schiavoni and Giulia Tarrini and Gregorio Baril`a and Giulia Calabretto and Cristina Vicenzetto and Vanessa Rebecca Gasparini and Monica Facco and Iacopo Petrini and Roberto Grossi and Nadia Pisanti and Stefania Bortoluzzi and Brunangelo Falini and Enrico Tiacci and Sara Galimberti and Gianpietro Semenzato and Renato Zambello},

doi = {10.1038/s41467-022-31015-x},

isbn = {2041-1723},

year = {2022},

date = {2022-06-08},

urldate = {2022-06-08},

journal = {Nature Communications},

volume = {13},

number = {1},

pages = {3298},

abstract = {Tγδlarge granular lymphocyte leukemia (TγδLGLL) is a rare lymphoproliferative disease, scantily described in literature. A deep-analysis, in an initial cohort of 9 TγδLGLL compared to 23 healthy controls, shows that TγδLGLL dominant clonotypes are mainly public and exhibit different V-(D)-J γ/δusage between patients with symptomatic and indolent Tγδneoplasm. Moreover, some clonotypes share the same rearranged sequence. Data obtained in an enlarged cohort (n = 36) indicate the importance of a combined evaluation of immunophenotype and STAT mutational profile for the correct management of patients with Tγδcell expansions. In fact, we observe an association between Vδ2/Vγ9 clonality and indolent course, while Vδ2/Vγ9 negativity correlates with symptomatic disease. Moreover, the 7 patients with STAT3 mutations have neutropenia and a CD56-/Vδ2- phenotype, and the 3 cases with STAT5B mutations display an asymptomatic clinical course and CD56/Vδ2 expression. All these data indicate that biological characterization is needed for Tγδ-cell neoplasm definition.},

keywords = {WP3: Translational CPG},

pubstate = {published},

tppubtype = {article}

}

Bernardini, Giulia; Chen, Huiping; Loukides, Grigorios; Pissis, Solon P.; Stougie, Leen; Sweering, Michelle

Making de Bruijn Graphs Eulerian Proceedings Article

In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 12:1–12:18, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP2: Evolutionary/Comparative CPG

@inproceedings{Bernardini2022c,

title = {Making de Bruijn Graphs Eulerian},

author = {Bernardini, Giulia and Chen, Huiping and Loukides, Grigorios and Pissis, Solon P. and Stougie, Leen and Sweering, Michelle},

editor = {Bannai, Hideo and Holub, Jan},

doi = {10.4230/LIPIcs.CPM.2022.12},

issn = {1868-8969},

year = {2022},

date = {2022-06-06},

urldate = {2022-06-06},

booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},

volume = {223},

pages = {12:1--12:18},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {A directed multigraph is called Eulerian if it has a circuit which uses each edge exactly once. Euler’s theorem tells us that a weakly connected directed multigraph is Eulerian if and only if every node is balanced. Given a collection S of strings over an alphabet Σ, the de Bruijn graph (dBG) of order k of S is a directed multigraph $G_{S,k}(V,E)$, where V is the set of length-(k-1) substrings of the strings in S, and $G_{S,k}$ contains an edge (u,v) with multiplicity $m_{u,v}$, if and only if the string u[0]⋅ v is equal to the string u⋅ v[k-2] and this string occurs exactly $m_{u,v}$ times in total in strings in S. Let $G_{Σ,k}(V_{Σ,k},E_{Σ,k})$ be the complete dBG of $Σ^k$. The Eulerian Extension (EE) problem on $G_{S,k}$ asks to extend $G_{S,k}$ with a set $cal{B}$ of nodes from $V_{Σ,k}$ and a smallest multiset $cal{A}$ of edges from $E_{Σ,k}$ to make it Eulerian. Note that extending dBGs is algorithmically much more challenging than extending general directed multigraphs because some edges in dBGs are by definition forbidden. Extending dBGs lies at the heart of sequence assembly [Medvedev et al., WABI 2007], one of the most important tasks in bioinformatics. The novelty of our work with respect to existing works is that we allow not only to duplicate existing edges of $G_{S,k}$ but to also add novel edges and nodes, in an effort to (i) connect multiple components and (ii) reduce the total EE cost. It is easy to show that EE on $G_{S,k}$ is NP-hard via a reduction from shortest common superstring. We further show that EE remains NP-hard, even when we are not allowed to add new nodes, via a highly non-trivial reduction from 3-SAT. We thus investigate the following two problems underlying EE in dBGs:

1) When $G_{S,k}$ is not weakly connected, we are asked to connect its d > 1 components using a minimum-weight spanning tree, whose edges are paths on the underlying $G_{Σ,k}$ and weights are the corresponding path lengths. This way of connecting guarantees that no new unbalanced node is added. We show that this problem can be solved in $O(|V|klog d+|E|)$ time, which is nearly optimal, since the size of $G_{S,k}$ is $Θ(|V|k+|E|)$.

2) When $G_{S,k}$ is not balanced, we are asked to extend $G_{S,k}$ to $H_{S,k}(V cup cal{B},E cup cal{A})$ such that every node of $H_{S,k}$ is balanced and the total number $|cal{A}|$ of added edges is minimized. We show that this problem can be solved in the optimal $O(k|V| + |E|+ |cal{A}|)$ time. Let us stress that, although our main contributions are theoretical, the algorithms we design for the above two problems are practical. We combine the two algorithms in one method that makes any dBG Eulerian; and show experimentally that the cost of the obtained feasible solutions on real-world dBGs is substantially smaller than the corresponding cost obtained by existing greedy approaches.},

keywords = {WP1: Primary CPG, WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

1) When $G_{S,k}$ is not weakly connected, we are asked to connect its d > 1 components using a minimum-weight spanning tree, whose edges are paths on the underlying $G_{Σ,k}$ and weights are the corresponding path lengths. This way of connecting guarantees that no new unbalanced node is added. We show that this problem can be solved in $O(|V|klog d+|E|)$ time, which is nearly optimal, since the size of $G_{S,k}$ is $Θ(|V|k+|E|)$.

2) When $G_{S,k}$ is not balanced, we are asked to extend $G_{S,k}$ to $H_{S,k}(V cup cal{B},E cup cal{A})$ such that every node of $H_{S,k}$ is balanced and the total number $|cal{A}|$ of added edges is minimized. We show that this problem can be solved in the optimal $O(k|V| + |E|+ |cal{A}|)$ time. Let us stress that, although our main contributions are theoretical, the algorithms we design for the above two problems are practical. We combine the two algorithms in one method that makes any dBG Eulerian; and show experimentally that the cost of the obtained feasible solutions on real-world dBGs is substantially smaller than the corresponding cost obtained by existing greedy approaches.

Sládeček, T.; Gažiová, M.; Pös, O.; Pös, Z.; Budiš, J.; Radvánsky, J.; Szemes, T.

Combination of expert decision systems with artificial intelligence leads to superior accuracy of automated prediction of clinical effect of copy number variation Presentation

Poster Presentation at ESHG, 01.06.2022.

BibTeX | Tags: Misc, WP3: Translational CPG

@misc{Sladecek2022,

title = {Combination of expert decision systems with artificial intelligence leads to superior accuracy of automated prediction of clinical effect of copy number variation},

author = {T. Sládeček and M. Gažiová and O. Pös and Z. Pös and J. Budiš and J. Radvánsky and T. Szemes},

year = {2022},

date = {2022-06-01},

howpublished = {Poster Presentation at ESHG},

keywords = {Misc, WP3: Translational CPG},

pubstate = {published},

tppubtype = {presentation}

}

Bernardini, Giulia; Gawrychowski, Paweł; Pisanti, Nadia; Pissis, Solon P.; Rosone, Giovanna

Elastic-Degenerate String Matching via Fast Matrix Multiplication Journal Article

In: SIAM Journal on Computing, vol. 51, no. 3, pp. 549–576, 2022.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Bernardini2022,

title = {Elastic-Degenerate String Matching via Fast Matrix Multiplication},

author = {Giulia Bernardini and Paweł Gawrychowski and Nadia Pisanti and Solon P. Pissis and Giovanna Rosone},

doi = {10.1137/20m1368033},

year = {2022},

date = {2022-05-01},

urldate = {2022-05-01},

journal = {SIAM Journal on Computing},

volume = {51},

number = {3},

pages = {549--576},

publisher = {Society for Industrial & Applied Mathematics (SIAM)},

abstract = {An elastic-degenerate (ED) string is a sequence of $n$ sets of strings of total length $N$ which was recently proposed to model a set of similar sequences. The ED string matching (EDSM) problem is to find all occurrences of a pattern of length $m$ in an ED text. The EDSM problem has recently received some attention in the combinatorial pattern matching community, and an $mathcal{O}(nm^{1.5}sqrt{log m} + N)$-time algorithm is known [Aoyama et al., CPM 2018]. The standard assumption in the prior work on this question is that $N$ is substantially larger than both $n$ and $m$, and thus we would like to have a linear dependency on the former. Under this assumption, the natural open problem is whether we can decrease the 1.5 exponent in the time complexity, similarly as in the related (but, to the best of our knowledge, not equivalent) word break problem [Backurs and Indyk, FOCS 2016]. Our starting point is a conditional lower bound for the EDSM problem. We use the popular combinatorial Boolean matrix multiplication (BMM) conjecture stating that there is no truly subcubic combinatorial algorithm for BMM [Abboud and Williams, FOCS 2014]. By designing an appropriate reduction, we show that a combinatorial algorithm solving the EDSM problem in $mathcal{O}(nm^{1.5-epsilon} + N)$ time, for any $epsilon>0$, refutes this conjecture. Our reduction should be understood as an indication that decreasing the exponent requires fast matrix multiplication. String periodicity and fast Fourier transform are two standard tools in string algorithms. Our main technical contribution is that we successfully combine these tools with fast matrix multiplication to design a noncombinatorial $tilde{mathcal{O}}(nm^{omega-1}+N)$-time algorithm for EDSM, where $omega$ denotes the matrix multiplication exponent and the $tilde{mathcal{O}}(cdot)$ notation suppresses polylog factors. To the best of our knowledge, we are the first to combine these tools. In particular, using the fact that $omega<2.373$ [Alman and Williams, SODA 2021; Le Gall, ISSAC 2014; Williams, STOC 2012], we obtain an $mathcal{O}(nm^{1.373} + N)$-time algorithm for EDSM. An important building block in our solution that might find applications in other problems is a method of selecting a small set of length-$ell$ substrings of the pattern, called anchors, so that any occurrence of a string from an ED text set contains at least one but not too many (on average) such anchors inside.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Charalampopoulos, Panagiotis; Iliopoulos, Costas S.; Kociumaka, Tomasz; Pissis, Solon P.; Radoszewski, Jakub; Straszy'nski, Juliusz

Efficient Computation of Sequence Mappability Journal Article

In: Algorithmica, vol. 84, no. 5, pp. 1418–1440, 2022, ISBN: 1432-0541.

Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG

@article{Charalampopoulos2022,

title = {Efficient Computation of Sequence Mappability},

author = {Panagiotis Charalampopoulos and Costas S. Iliopoulos and Tomasz Kociumaka and Solon P. Pissis and Jakub Radoszewski and Juliusz Straszy'nski},

doi = {10.1007/s00453-022-00934-y},

isbn = {1432-0541},

year = {2022},

date = {2022-05-01},

urldate = {2022-05-01},

journal = {Algorithmica},

volume = {84},

number = {5},

pages = {1418--1440},

abstract = {Sequence mappability is an important task in genome resequencing. In the (k, m)-mappability problem, for a given sequence T of length n, the goal is to compute a table whose ith entry is the number of indices $j ne i$ such that the length-m substrings of T starting at positions i and j have at most k mismatches. Previous works on this problem focused on heuristics computing a rough approximation of the result or on the case of $k=1$. We present several efficient algorithms for the general case of the problem. Our main result is an algorithm that, for $k=O(1)$, works in $O(n)$space and, with high probability, in $O(n cdot min {m^k,log ^k n})$ time. Our algorithm requires a careful adaptation of the k-errata trees of Cole et al. [STOC 2004] to avoid multiple counting of pairs of substrings. Our technique can also be applied to solve the all-pairs Hamming distance problem introduced by Crochemore et al. [WABI 2017]. We further develop $O(n^2)$-time algorithms to compute all (k, m)-mappability tables for a fixed m and all $k in {0,ldots ,m}$or a fixed k and all $m in {k,ldots ,n}$. Finally, we show that, for $k,m = Theta (log n)$, the (k, m)-mappability problem cannot be solved in strongly subquadratic time unless the Strong Exponential Time Hypothesis fails. This is an improved and extended version of a paper presented at SPIRE 2018.},

keywords = {WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {article}

}

Baaijens, Jasmijn A.; Bonizzoni, Paola; Boucher, Christina; Vedova, Gianluca Della; Pirola, Yuri; Rizzi, Raffaella; Sirén, Jouni

Computational graph pangenomics: a tutorial on data structures and their applications Journal Article

In: Natural Computing, vol. 21, no. 1, pp. 81–108, 2022, ISBN: 1572-9796.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@article{Baaijens2022,

title = {Computational graph pangenomics: a tutorial on data structures and their applications},

author = {Jasmijn A. Baaijens and Paola Bonizzoni and Christina Boucher and Gianluca Della Vedova and Yuri Pirola and Raffaella Rizzi and Jouni Sirén},

doi = {10.1007/s11047-022-09882-6},

isbn = {1572-9796},

year = {2022},

date = {2022-03-01},

urldate = {2022-03-01},

journal = {Natural Computing},

volume = {21},

number = {1},

pages = {81--108},

abstract = {Computational pangenomics is an emerging research field that is changing the way computer scientists are facing challenges in biological sequence analysis. In past decades, contributions from combinatorics, stringology, graph theory and data structures were essential in the development of a plethora of software tools for the analysis of the human genome. These tools allowed computational biologists to approach ambitious projects at population scale, such as the 1000 Genomes Project. A major contribution of the 1000 Genomes Project is the characterization of a broad spectrum of genetic variations in the human genome, including the discovery of novel variations in the South Asian, African and European populations---thus enhancing the catalogue of variability within the reference genome. Currently, the need to take into account the high variability in population genomes as well as the specificity of an individual genome in a personalized approach to medicine is rapidly pushing the abandonment of the traditional paradigm of using a single reference genome. A graph-based representation of multiple genomes, or a graph pangenome, is replacing the linear reference genome. This means completely rethinking well-established procedures to analyze, store, and access information from genome representations. Properly addressing these challenges is crucial to face the computational tasks of ambitious healthcare projects aiming to characterize human diversity by sequencing 1M individuals (Stark et al. 2019). This tutorial aims to introduce readers to the most recent advances in the theory of data structures for the representation of graph pangenomes. We discuss efficient representations of haplotypes and the variability of genotypes in graph pangenomes, and highlight applications in solving computational problems in human and microbial (viral) pangenomes.},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {article}

}

Luo, Xiao; Kang, Xiongbin; Schönhuth, Alexander

Strainline: full-length de novo viral haplotype reconstruction from noisy long reads Journal Article

In: Genome Biology, vol. 23, iss. 1, no. 29, pp. 1–27, 2022, ISSN: 1474-760X.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG, WP3: Translational CPG

@article{Luo2022,

title = {Strainline: full-length de novo viral haplotype reconstruction from noisy long reads},

author = {Xiao Luo and Xiongbin Kang and Alexander Schönhuth},

doi = {10.1186/s13059-021-02587-6},

issn = {1474-760X},

year = {2022},

date = {2022-01-20},

urldate = {2022-01-20},

journal = {Genome Biology},

volume = {23},

number = {29},

issue = {1},

pages = {1--27},

publisher = {Springer Science and Business Media LLC},

abstract = {Haplotype-resolved de novo assembly of highly diverse virus genomes is critical in prevention, control and treatment of viral diseases. Current methods either can handle only relatively accurate short read data, or collapse haplotype-specific variations into consensus sequence. Here, we present Strainline, a novel approach to assemble viral haplotypes from noisy long reads without a reference genome. Strainline is the first approach to provide strain-resolved, full-length de novo assemblies of viral quasispecies from noisy third-generation sequencing data. Benchmarking on simulated and real datasets of varying complexity and diversity confirm this novelty and demonstrate the superiority of Strainline.},

keywords = {WP1: Primary CPG, WP3: Translational CPG},

pubstate = {published},

tppubtype = {article}

}

Gažiová, M.; Sládeček, T.; Pös, O.; Števko, M.; Krampl, W.; Pös, Z.; Hekel, R.; Hlavačka, M.; Kucharík, M.; Radvánszky, J.; Budiš, J.; Szemes, T.

Automated prediction of the clinical impact of structural copy number variations Journal Article

In: Scientific Reports, vol. 12, no. 1, pp. 555, 2022, ISSN: 2045-2322.

Abstract | Links | BibTeX | Tags: Misc, WP3: Translational CPG

@article{Gaziova2022,

title = {Automated prediction of the clinical impact of structural copy number variations},

author = {M. Gažiová and T. Sládeček and O. Pös and M. Števko and W. Krampl and Z. Pös and R. Hekel and M. Hlavačka and M. Kucharík and J. Radvánszky and J. Budiš and T. Szemes},

doi = {10.1038/s41598-021-04505-z},

issn = {2045-2322},

year = {2022},

date = {2022-01-11},

urldate = {2022-01-11},

journal = {Scientific Reports},

volume = {12},

number = {1},

pages = {555},

publisher = {Springer Science and Business Media LLC},

abstract = {Copy number variants (CNVs) play an important role in many biological processes, including the development of genetic diseases, making them attractive targets for genetic analyses. The interpretation of the effect of these structural variants is a challenging problem due to highly variable numbers of gene, regulatory, or other genomic elements affected by the CNV. This led to the demand for the interpretation tools that would relieve researchers, laboratory diagnosticians, genetic counselors, and clinical geneticists from the laborious process of annotation and classification of CNVs. We designed and validated a prediction method (ISV; Interpretation of Structural Variants) that is based on boosted trees which takes into account annotations of CNVs from several publicly available databases. The presented approach achieved more than 98{%} prediction accuracy on both copy number loss and copy number gain variants while also allowing CNVs being assigned “uncertain”significance in predictions. We believe that ISV’s prediction capability and explainability have a great potential to guide users to more precise interpretations and classifications of CNVs.},

keywords = {Misc, WP3: Translational CPG},

pubstate = {published},

tppubtype = {article}

}

Rizzo, Nicola; Mäkinen, Veli

Linear Time Construction of Indexable Elastic Founder Graphs Proceedings Article

In: Bazgan, Cristina; Fernau, Henning (Ed.): Combinatorial Algorithms, pp. 480–493, Springer International Publishing, Cham, 2022, ISBN: 978-3-031-06678-8.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@inproceedings{Rizzo2022,

title = {Linear Time Construction of Indexable Elastic Founder Graphs},

author = {Nicola Rizzo and Veli Mäkinen},

editor = {Cristina Bazgan and Henning Fernau},

url = {https://arxiv.org/abs/2201.06492},

doi = {10.1007/978-3-031-06678-8_35},

isbn = {978-3-031-06678-8},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

booktitle = {Combinatorial Algorithms},

pages = {480--493},

publisher = {Springer International Publishing},

address = {Cham},

abstract = {Pattern matching on graphs has been widely studied lately due to its importance in genomics applications. Unfortunately, even the simplest problem of deciding if a string appears as a subpath of a graph admits a quadratic lower bound under the Orthogonal Vectors Hypothesis (Equi et al. ICALP 2019, SOFSEM 2021). To avoid this bottleneck, the research has shifted towards more specific graph classes, e.g. those induced from multiple sequence alignments (MSAs). Consider segmenting $mathsf{MSA}[1..m,1..n]$ into $b$ blocks $mathsf{MSA}[1..m,1..j_1]$, $mathsf{MSA}[1..m,j_1+1..j_2]$, $ldots$, $mathsf{MSA}[1..m,j_b-1+1..n]$. The distinct strings in the rows of the blocks, after the removal of gap symbols, form the nodes of an elastic founder graph (EFG) where the edges represent the original connections observed in the MSA. An EFG is called indexable if a node label occurs as a prefix of only those paths that start from a node of the same block. Equi et al. (ISAAC 2021) showed that such EFGs support fast pattern matching and gave an $O(mn log m)$-time algorithm for preprocessing the MSA in a way that allows the construction of indexable EFGs maximizing the number of blocks and, alternatively, minimizing the maximum length of a block, in $O(n)$ and $O(n loglog n)$ time respectively. Using the suffix tree and solving a novel ancestor problem on trees, we improve the preprocessing to $O(mn)$ time and the $O(n loglog n)$-time EFG construction to $O(n)$ time, thus showing that both types of indexable EFGs can be constructed in time linear in the input size.},

howpublished = {arXiv},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {inproceedings}

}

Mwaniki, Njagi Moses; Garrison, Erik; Pisanti, Nadia

Fast Exact String to D-Texts Alignments Unpublished

arXiv, 2022.

Abstract | Links | BibTeX | Tags: WP1: Primary CPG

@unpublished{Mwaniki2022,

title = {Fast Exact String to D-Texts Alignments},

author = {Njagi Moses Mwaniki and Erik Garrison and Nadia Pisanti},

doi = {10.48550/ARXIV.2206.03242},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

publisher = {arXiv},

abstract = {In recent years, aligning a sequence to a pangenome has become a central problem in genomics and pangenomics. A fast and accurate solution to this problem can serve as a toolkit to many crucial tasks such as read-correction, Multiple Sequences Alignment (MSA), genome assemblies, variant calling, just to name a few. In this paper we propose a new, fast and exact method to align a string to a D-string, the latter possibly representing an MSA, a pan-genome or a partial assembly. An implementation of our tool dsa is publicly available at https://github.com/urbanslug/dsa.},

howpublished = {arXiv},

keywords = {WP1: Primary CPG},

pubstate = {published},

tppubtype = {unpublished}

}

Zhong, Haodi; Loukides, Grigorios; Pissis, Solon P.

Clustering sequence graphs Journal Article

In: Data & Knowledge Engineering, vol. 138, pp. 101981, 2022, ISSN: 0169-023X.

Abstract | Links | BibTeX | Tags: WP2: Evolutionary/Comparative CPG

@article{Zhong2022,

title = {Clustering sequence graphs},

author = {Haodi Zhong and Grigorios Loukides and Solon P. Pissis},

doi = {https://doi.org/10.1016/j.datak.2022.101981},

issn = {0169-023X},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

journal = {Data & Knowledge Engineering},

volume = {138},

pages = {101981},

abstract = {In application domains ranging from social networks to e-commerce, it is important to cluster users with respect to both their relationships (e.g., friendship or trust) and their actions (e.g., visited locations or rated products). Motivated by these applications, we introduce here the task of clustering the nodes of a sequence graph, i.e., a graph whose nodes are labeled with strings (e.g., sequences of users' visited locations or rated products). Both string clustering algorithms and graph clustering algorithms are inappropriate to deal with this task, as they do not consider the structure of strings and graph simultaneously. Moreover, attributed graph clustering algorithms generally construct poor solutions because they need to represent a string as a vector of attributes, which inevitably loses information and may harm clustering quality. We thus introduce the problem of clustering a sequence graph. We first propose two pairwise distance measures for sequence graphs, one based on edit distance and shortest path distance and another one based on SimRank. We then formalize the problem under each measure, showing also that it is NP-hard. In addition, we design a polynomial-time 2-approximation algorithm, as well as a heuristic for the problem. Experiments using real datasets and a case study demonstrate the effectiveness and efficiency of our methods.},

keywords = {WP2: Evolutionary/Comparative CPG},

pubstate = {published},

tppubtype = {article}

}