Blassel, Luc; Medvedev, Paul; Chikhi, Rayan

Mapping-friendly sequence reductions: Going beyond homopolymer compression Journal Article

In: iScience, vol. 25, no. 11, pp. 105305, 2022, ISSN: 2589-0042.

@article{pmid36339268,

title = {Mapping-friendly sequence reductions: Going beyond homopolymer compression},

author = {Luc Blassel and Paul Medvedev and Rayan Chikhi},

doi = {10.1016/j.isci.2022.105305},

issn = {2589-0042},

year = {2022},

date = {2022-11-01},

urldate = {2022-11-01},

journal = {iScience},

volume = {25},

number = {11},

pages = {105305},

abstract = {Sequencing errors continue to pose algorithmic challenges to methods working with sequencing data. One of the simplest and most prevalent techniques for ameliorating the detrimental effects of homopolymer expansion/contraction errors present in long reads is homopolymer compression. It collapses runs of repeated nucleotides, to remove some sequencing errors and improve mapping sensitivity. Though our intuitive understanding justifies why homopolymer compression works, it in no way implies that it is the best transformation that can be done. In this paper, we explore if there are transformations that can be applied in the same pre-processing manner as homopolymer compression that would achieve better alignment sensitivity. We introduce a more general framework than homopolymer compression, called mapping-friendly sequence reductions. We transform the reference and the reads using these reductions and then apply an alignment algorithm. We demonstrate that some mapping-friendly sequence reductions lead to improved mapping accuracy, outperforming homopolymer compression.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Lemane, Téo; Chikhi, Rayan; Peterlongo, Pierre

kmdiff, large-scale and user-friendly differential k-mer analyses Journal Article

In: Bioinformatics, 2022, ISSN: 1367-4811.

@article{pmid36315078,

title = {kmdiff, large-scale and user-friendly differential k-mer analyses},

author = {Téo Lemane and Rayan Chikhi and Pierre Peterlongo},

doi = {10.1093/bioinformatics/btac689},

issn = {1367-4811},

year = {2022},

date = {2022-10-01},

urldate = {2022-10-01},

journal = {Bioinformatics},

abstract = {Genome Wide Association Studies (GWAS) elucidate links between genotypes and phenotypes. Recent studies point out the interest of conducting such experiments using k-mers as the base signal instead of single-nucleotide polymorphisms. We propose a tool, kmdiff, that performs differential k-mer analyses on large sequencing cohorts in an order of magnitude less time and memory than previously possible.

AVAILABILITY: https://github.com/tlemane/kmdiff.

FUNDING: The work was funded by IPL Inria Neuromarkers, ANR Inception (ANR-16-CONV-0005), ANR Prairie (ANR-19-P3IA-0001), ANR SeqDigger (ANR-19-CE45-0008), H2020 ITN ALPACA grant 956229.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

AVAILABILITY: https://github.com/tlemane/kmdiff.

FUNDING: The work was funded by IPL Inria Neuromarkers, ANR Inception (ANR-16-CONV-0005), ANR Prairie (ANR-19-P3IA-0001), ANR SeqDigger (ANR-19-CE45-0008), H2020 ITN ALPACA grant 956229.

Dufresne, Yoann; Lemane, Teo; Marijon, Pierre; Peterlongo, Pierre; Rahman, Amatur; Kokot, Marek; Medvedev, Paul; Deorowicz, Sebastian; Chikhi, Rayan

The K-mer File Format: a standardized and compact disk representation of sets of k-mers Journal Article

In: Bioinformatics, vol. 38, no. 18, pp. 4423–4425, 2022, ISSN: 1367-4811.

@article{pmid35904548,

title = {The K-mer File Format: a standardized and compact disk representation of sets of k-mers},

author = {Yoann Dufresne and Teo Lemane and Pierre Marijon and Pierre Peterlongo and Amatur Rahman and Marek Kokot and Paul Medvedev and Sebastian Deorowicz and Rayan Chikhi},

doi = {10.1093/bioinformatics/btac528},

issn = {1367-4811},

year = {2022},

date = {2022-09-01},

urldate = {2022-09-01},

journal = {Bioinformatics},

volume = {38},

number = {18},

pages = {4423--4425},

abstract = {SUMMARY: Bioinformatics applications increasingly rely on ad hoc disk storage of k-mer sets, e.g. for de Bruijn graphs or alignment indexes. Here, we introduce the K-mer File Format as a general lossless framework for storing and manipulating k-mer sets, realizing space savings of 3-5× compared to other formats, and bringing interoperability across tools.

AVAILABILITY AND IMPLEMENTATION: Format specification, C++/Rust API, tools: https://github.com/Kmer-File-Format/.

SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

AVAILABILITY AND IMPLEMENTATION: Format specification, C++/Rust API, tools: https://github.com/Kmer-File-Format/.

SUPPLEMENTARY INFORMATION: Supplementary data are available at Bioinformatics online.

Charalampopoulos, Panagiotis; Pissis, Solon P.; Radoszewski, Jakub

Longest Palindromic Substring in Sublinear Time Inproceedings

In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 20:1–20:9, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.

@inproceedings{Charalampopoulos2022b,

title = {Longest Palindromic Substring in Sublinear Time},

author = {Charalampopoulos, Panagiotis and Pissis, Solon P. and Radoszewski, Jakub},

editor = {Bannai, Hideo and Holub, Jan},

doi = {10.4230/LIPIcs.CPM.2022.20},

issn = {1868-8969},

year = {2022},

date = {2022-06-22},

urldate = {2022-06-22},

booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},

volume = {223},

pages = {20:1--20:9},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {We revisit the classic algorithmic problem of computing a longest palidromic substring. This problem is solvable by a celebrated 𝒪(n)-time algorithm [Manacher, J. ACM 1975], where n is the length of the input string. For small alphabets, 𝒪(n) is not necessarily optimal in the word RAM model of computation: a string of length n over alphabet [0,σ) can be stored in 𝒪(n log σ/log n) space and read in 𝒪(n log σ/log n) time. We devise a simple 𝒪(n log σ/log n)-time algorithm for computing a longest palindromic substring. In particular, our algorithm works in sublinear time if σ = $2^{o(log n)}$. Our technique relies on periodicity and on the 𝒪(n log σ/log n)-time constructible data structure of Kempa and Kociumaka [STOC 2019] that answers longest common extension queries in 𝒪(1) time.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Bernardini, Giulia; Conte, Alessio; Gabory, Esteban; Grossi, Roberto; Loukides, Grigorios; Pissis, Solon P.; Punzi, Giulia; Sweering, Michelle

On Strings Having the Same Length- k Substrings Inproceedings

In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 16:1–16:17, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.

@inproceedings{Bernardini2022b,

title = {On Strings Having the Same Length- k Substrings},

author = {Bernardini, Giulia and Conte, Alessio and Gabory, Esteban and Grossi, Roberto and Loukides, Grigorios and Pissis, Solon P. and Punzi, Giulia and Sweering, Michelle},

editor = {Bannai, Hideo and Holub, Jan},

doi = {10.4230/LIPIcs.CPM.2022.16},

issn = {1868-8969},

year = {2022},

date = {2022-06-22},

booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},

volume = {223},

pages = {16:1--16:17},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {Let Substr_k(X) denote the set of length-k substrings of a given string X for a given integer k > 0. We study the following basic string problem, called z-Shortest 𝒮_k-Equivalent Strings: Given a set 𝒮_k of n length-k strings and an integer z > 0, list z shortest distinct strings T₁,…,T_z such that Substr_k(T_i) = 𝒮_k, for all i ∈ [1,z]. The z-Shortest 𝒮_k-Equivalent Strings problem arises naturally as an encoding problem in many real-world applications; e.g., in data privacy, in data compression, and in bioinformatics. The 1-Shortest 𝒮_k-Equivalent Strings, referred to as Shortest 𝒮_k-Equivalent String, asks for a shortest string X such that Substr_k(X) = 𝒮_k.

Our main contributions are summarized below:

- Given a directed graph G(V,E), the Directed Chinese Postman (DCP) problem asks for a shortest closed walk that visits every edge of G at least once. DCP can be solved in 𝒪̃(|E||V|) time using an algorithm for min-cost flow. We show, via a non-trivial reduction, that if Shortest 𝒮_k-Equivalent String over a binary alphabet has a near-linear-time solution then so does DCP.

- We show that the length of a shortest string output by Shortest 𝒮_k-Equivalent String is in 𝒪(k+n²). We generalize this bound by showing that the total length of z shortest strings is in 𝒪(zk+zn²+z²n). We derive these upper bounds by showing (asymptotically tight) bounds on the total length of z shortest Eulerian walks in general directed graphs.

- We present an algorithm for solving z-Shortest 𝒮_k-Equivalent Strings in 𝒪(nk+n²log²n+zn²log n+|output|) time. If z = 1, the time becomes 𝒪(nk+n²log²n) by the fact that the size of the input is Θ(nk) and the size of the output is 𝒪(k+n²).},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Our main contributions are summarized below:

- Given a directed graph G(V,E), the Directed Chinese Postman (DCP) problem asks for a shortest closed walk that visits every edge of G at least once. DCP can be solved in 𝒪̃(|E||V|) time using an algorithm for min-cost flow. We show, via a non-trivial reduction, that if Shortest 𝒮_k-Equivalent String over a binary alphabet has a near-linear-time solution then so does DCP.

- We show that the length of a shortest string output by Shortest 𝒮_k-Equivalent String is in 𝒪(k+n²). We generalize this bound by showing that the total length of z shortest strings is in 𝒪(zk+zn²+z²n). We derive these upper bounds by showing (asymptotically tight) bounds on the total length of z shortest Eulerian walks in general directed graphs.

- We present an algorithm for solving z-Shortest 𝒮_k-Equivalent Strings in 𝒪(nk+n²log²n+zn²log n+|output|) time. If z = 1, the time becomes 𝒪(nk+n²log²n) by the fact that the size of the input is Θ(nk) and the size of the output is 𝒪(k+n²).

Teramo, Antonella; Binatti, Andrea; Ciabatti, Elena; Schiavoni, Gianluca; Tarrini, Giulia; Baril`a, Gregorio; Calabretto, Giulia; Vicenzetto, Cristina; Gasparini, Vanessa Rebecca; Facco, Monica; Petrini, Iacopo; Grossi, Roberto; Pisanti, Nadia; Bortoluzzi, Stefania; Falini, Brunangelo; Tiacci, Enrico; Galimberti, Sara; Semenzato, Gianpietro; Zambello, Renato

Defining TCRγδlymphoproliferative disorders by combined immunophenotypic and molecular evaluation Journal Article

In: Nature Communications, vol. 13, no. 1, pp. 3298, 2022, ISBN: 2041-1723.

@article{Teramo2022,

title = {Defining TCRγδlymphoproliferative disorders by combined immunophenotypic and molecular evaluation},

author = {Antonella Teramo and Andrea Binatti and Elena Ciabatti and Gianluca Schiavoni and Giulia Tarrini and Gregorio Baril`a and Giulia Calabretto and Cristina Vicenzetto and Vanessa Rebecca Gasparini and Monica Facco and Iacopo Petrini and Roberto Grossi and Nadia Pisanti and Stefania Bortoluzzi and Brunangelo Falini and Enrico Tiacci and Sara Galimberti and Gianpietro Semenzato and Renato Zambello},

doi = {10.1038/s41467-022-31015-x},

isbn = {2041-1723},

year = {2022},

date = {2022-06-08},

urldate = {2022-06-08},

journal = {Nature Communications},

volume = {13},

number = {1},

pages = {3298},

abstract = {Tγδlarge granular lymphocyte leukemia (TγδLGLL) is a rare lymphoproliferative disease, scantily described in literature. A deep-analysis, in an initial cohort of 9 TγδLGLL compared to 23 healthy controls, shows that TγδLGLL dominant clonotypes are mainly public and exhibit different V-(D)-J γ/δusage between patients with symptomatic and indolent Tγδneoplasm. Moreover, some clonotypes share the same rearranged sequence. Data obtained in an enlarged cohort (n = 36) indicate the importance of a combined evaluation of immunophenotype and STAT mutational profile for the correct management of patients with Tγδcell expansions. In fact, we observe an association between Vδ2/Vγ9 clonality and indolent course, while Vδ2/Vγ9 negativity correlates with symptomatic disease. Moreover, the 7 patients with STAT3 mutations have neutropenia and a CD56-/Vδ2- phenotype, and the 3 cases with STAT5B mutations display an asymptomatic clinical course and CD56/Vδ2 expression. All these data indicate that biological characterization is needed for Tγδ-cell neoplasm definition.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Bernardini, Giulia; Chen, Huiping; Loukides, Grigorios; Pissis, Solon P.; Stougie, Leen; Sweering, Michelle

Making de Bruijn Graphs Eulerian Inproceedings

In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 12:1–12:18, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.

@inproceedings{Bernardini2022c,

title = {Making de Bruijn Graphs Eulerian},

author = {Bernardini, Giulia and Chen, Huiping and Loukides, Grigorios and Pissis, Solon P. and Stougie, Leen and Sweering, Michelle},

editor = {Bannai, Hideo and Holub, Jan},

doi = {10.4230/LIPIcs.CPM.2022.12},

issn = {1868-8969},

year = {2022},

date = {2022-06-06},

urldate = {2022-06-06},

booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},

volume = {223},

pages = {12:1--12:18},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {A directed multigraph is called Eulerian if it has a circuit which uses each edge exactly once. Euler’s theorem tells us that a weakly connected directed multigraph is Eulerian if and only if every node is balanced. Given a collection S of strings over an alphabet Σ, the de Bruijn graph (dBG) of order k of S is a directed multigraph $G_{S,k}(V,E)$, where V is the set of length-(k-1) substrings of the strings in S, and $G_{S,k}$ contains an edge (u,v) with multiplicity $m_{u,v}$, if and only if the string u[0]⋅ v is equal to the string u⋅ v[k-2] and this string occurs exactly $m_{u,v}$ times in total in strings in S. Let $G_{Σ,k}(V_{Σ,k},E_{Σ,k})$ be the complete dBG of $Σ^k$. The Eulerian Extension (EE) problem on $G_{S,k}$ asks to extend $G_{S,k}$ with a set $\cal{B}$ of nodes from $V_{Σ,k}$ and a smallest multiset $\cal{A}$ of edges from $E_{Σ,k}$ to make it Eulerian. Note that extending dBGs is algorithmically much more challenging than extending general directed multigraphs because some edges in dBGs are by definition forbidden. Extending dBGs lies at the heart of sequence assembly [Medvedev et al., WABI 2007], one of the most important tasks in bioinformatics. The novelty of our work with respect to existing works is that we allow not only to duplicate existing edges of $G_{S,k}$ but to also add novel edges and nodes, in an effort to (i) connect multiple components and (ii) reduce the total EE cost. It is easy to show that EE on $G_{S,k}$ is NP-hard via a reduction from shortest common superstring. We further show that EE remains NP-hard, even when we are not allowed to add new nodes, via a highly non-trivial reduction from 3-SAT. We thus investigate the following two problems underlying EE in dBGs:

1) When $G_{S,k}$ is not weakly connected, we are asked to connect its d > 1 components using a minimum-weight spanning tree, whose edges are paths on the underlying $G_{Σ,k}$ and weights are the corresponding path lengths. This way of connecting guarantees that no new unbalanced node is added. We show that this problem can be solved in $O(|V|klog d+|E|)$ time, which is nearly optimal, since the size of $G_{S,k}$ is $Θ(|V|k+|E|)$.

2) When $G_{S,k}$ is not balanced, we are asked to extend $G_{S,k}$ to $H_{S,k}(V \cup \cal{B},E \cup \cal{A})$ such that every node of $H_{S,k}$ is balanced and the total number $|\cal{A}|$ of added edges is minimized. We show that this problem can be solved in the optimal $O(k|V| + |E|+ |cal{A}|)$ time. Let us stress that, although our main contributions are theoretical, the algorithms we design for the above two problems are practical. We combine the two algorithms in one method that makes any dBG Eulerian; and show experimentally that the cost of the obtained feasible solutions on real-world dBGs is substantially smaller than the corresponding cost obtained by existing greedy approaches.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

1) When $G_{S,k}$ is not weakly connected, we are asked to connect its d > 1 components using a minimum-weight spanning tree, whose edges are paths on the underlying $G_{Σ,k}$ and weights are the corresponding path lengths. This way of connecting guarantees that no new unbalanced node is added. We show that this problem can be solved in $O(|V|klog d+|E|)$ time, which is nearly optimal, since the size of $G_{S,k}$ is $Θ(|V|k+|E|)$.

2) When $G_{S,k}$ is not balanced, we are asked to extend $G_{S,k}$ to $H_{S,k}(V \cup \cal{B},E \cup \cal{A})$ such that every node of $H_{S,k}$ is balanced and the total number $|\cal{A}|$ of added edges is minimized. We show that this problem can be solved in the optimal $O(k|V| + |E|+ |cal{A}|)$ time. Let us stress that, although our main contributions are theoretical, the algorithms we design for the above two problems are practical. We combine the two algorithms in one method that makes any dBG Eulerian; and show experimentally that the cost of the obtained feasible solutions on real-world dBGs is substantially smaller than the corresponding cost obtained by existing greedy approaches.

Bonnet, Konstantinn; Marschall, Tobias; Doerr, Daniel

Constructing founder sets under allelic and non-allelic homologous recombination Unpublished

bioRxiv, 2022.

@unpublished{Bonnet2022,

title = {Constructing founder sets under allelic and non-allelic homologous recombination},

author = {Konstantinn Bonnet and Tobias Marschall and Daniel Doerr},

doi = {10.1101/2022.05.27.493721},

year = {2022},

date = {2022-05-29},

urldate = {2022-05-01},

publisher = {Cold Spring Harbor Laboratory},

abstract = {Homologous recombination between the maternal and paternal copies of a chromosome is a key mechanism for human inheritance and shapes population genetic properties of our species. However, a similar mechanism can also act between different copies of the same sequence, then called non-allelic homologous recombination (NAHR). This process can result in genomic rearrangements—including deletion, duplication, and inversion—and is underlying many genomic disorders. Despite its importance for genome evolution and disease, there is a lack of computational models to study genomic loci prone to NAHR. In this work, we propose such a computational model, providing a unified framework for both (allelic) homologous recombination and NAHR. Our model represents a set of genomes as a graph, where human haplotypes correspond to walks through this graph. We formulate two founder set problems under our recombination model, provide flow-based algorithms for their solution, and demonstrate scalability to problem instances arising in practice.},

howpublished = {bioRxiv},

keywords = {},

pubstate = {published},

tppubtype = {unpublished}

}

Bernardini, Giulia; Gawrychowski, Paweł; Pisanti, Nadia; Pissis, Solon P.; Rosone, Giovanna

Elastic-Degenerate String Matching via Fast Matrix Multiplication Journal Article

In: SIAM Journal on Computing, vol. 51, no. 3, pp. 549–576, 2022.

@article{Bernardini2022,

title = {Elastic-Degenerate String Matching via Fast Matrix Multiplication},

author = {Giulia Bernardini and Paweł Gawrychowski and Nadia Pisanti and Solon P. Pissis and Giovanna Rosone},

doi = {10.1137/20m1368033},

year = {2022},

date = {2022-05-01},

urldate = {2022-05-01},

journal = {SIAM Journal on Computing},

volume = {51},

number = {3},

pages = {549--576},

publisher = {Society for Industrial & Applied Mathematics (SIAM)},

abstract = {An elastic-degenerate (ED) string is a sequence of $n$ sets of strings of total length $N$ which was recently proposed to model a set of similar sequences. The ED string matching (EDSM) problem is to find all occurrences of a pattern of length $m$ in an ED text. The EDSM problem has recently received some attention in the combinatorial pattern matching community, and an $\mathcal{O}(nm^{1.5}\sqrt{\log m} + N)$-time algorithm is known [Aoyama et al., CPM 2018]. The standard assumption in the prior work on this question is that $N$ is substantially larger than both $n$ and $m$, and thus we would like to have a linear dependency on the former. Under this assumption, the natural open problem is whether we can decrease the 1.5 exponent in the time complexity, similarly as in the related (but, to the best of our knowledge, not equivalent) word break problem [Backurs and Indyk, FOCS 2016]. Our starting point is a conditional lower bound for the EDSM problem. We use the popular combinatorial Boolean matrix multiplication (BMM) conjecture stating that there is no truly subcubic combinatorial algorithm for BMM [Abboud and Williams, FOCS 2014]. By designing an appropriate reduction, we show that a combinatorial algorithm solving the EDSM problem in $\mathcal{O}(nm^{1.5-\epsilon} + N)$ time, for any $\epsilon>0$, refutes this conjecture. Our reduction should be understood as an indication that decreasing the exponent requires fast matrix multiplication. String periodicity and fast Fourier transform are two standard tools in string algorithms. Our main technical contribution is that we successfully combine these tools with fast matrix multiplication to design a noncombinatorial $\tilde{\mathcal{O}}(nm^{\omega-1}+N)$-time algorithm for EDSM, where $\omega$ denotes the matrix multiplication exponent and the $\tilde{\mathcal{O}}(\cdot)$ notation suppresses polylog factors. To the best of our knowledge, we are the first to combine these tools. In particular, using the fact that $\omega<2.373$ [Alman and Williams, SODA 2021; Le Gall, ISSAC 2014; Williams, STOC 2012], we obtain an $\mathcal{O}(nm^{1.373} + N)$-time algorithm for EDSM. An important building block in our solution that might find applications in other problems is a method of selecting a small set of length-$\ell$ substrings of the pattern, called anchors, so that any occurrence of a string from an ED text set contains at least one but not too many (on average) such anchors inside.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Charalampopoulos, Panagiotis; Iliopoulos, Costas S.; Kociumaka, Tomasz; Pissis, Solon P.; Radoszewski, Jakub; Straszy'nski, Juliusz

Efficient Computation of Sequence Mappability Journal Article

In: Algorithmica, vol. 84, no. 5, pp. 1418–1440, 2022, ISBN: 1432-0541.

@article{Charalampopoulos2022,

title = {Efficient Computation of Sequence Mappability},

author = {Panagiotis Charalampopoulos and Costas S. Iliopoulos and Tomasz Kociumaka and Solon P. Pissis and Jakub Radoszewski and Juliusz Straszy'nski},

doi = {10.1007/s00453-022-00934-y},

isbn = {1432-0541},

year = {2022},

date = {2022-05-01},

urldate = {2022-05-01},

journal = {Algorithmica},

volume = {84},

number = {5},

pages = {1418--1440},

abstract = {Sequence mappability is an important task in genome resequencing. In the (k, m)-mappability problem, for a given sequence T of length n, the goal is to compute a table whose ith entry is the number of indices $j ne i$ such that the length-m substrings of T starting at positions i and j have at most k mismatches. Previous works on this problem focused on heuristics computing a rough approximation of the result or on the case of $k=1$. We present several efficient algorithms for the general case of the problem. Our main result is an algorithm that, for $k=O(1)$, works in $O(n)$space and, with high probability, in $O(n cdot min {m^k,log ^k n})$ time. Our algorithm requires a careful adaptation of the k-errata trees of Cole et al. [STOC 2004] to avoid multiple counting of pairs of substrings. Our technique can also be applied to solve the all-pairs Hamming distance problem introduced by Crochemore et al. [WABI 2017]. We further develop $O(n^2)$-time algorithms to compute all (k, m)-mappability tables for a fixed m and all $k in {0,ldots ,m}$or a fixed k and all $m in {k,ldots ,n}$. Finally, we show that, for $k,m = \Theta (\log n)$, the (k, m)-mappability problem cannot be solved in strongly subquadratic time unless the Strong Exponential Time Hypothesis fails. This is an improved and extended version of a paper presented at SPIRE 2018.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Luo, Xiao; Kang, Xiongbin; Schönhuth, Alexander

Strainline: full-length de novo viral haplotype reconstruction from noisy long reads Journal Article

In: Genome Biology, vol. 23, no. 1, pp. 29, 2022, ISSN: 1474-760X.

@article{Luo2022,

title = {Strainline: full-length de novo viral haplotype reconstruction from noisy long reads},

author = {Xiao Luo and Xiongbin Kang and Alexander Schönhuth},

doi = {10.1186/s13059-021-02587-6},

issn = {1474-760X},

year = {2022},

date = {2022-01-20},

urldate = {2022-01-01},

journal = {Genome Biology},

volume = {23},

number = {1},

pages = {29},

publisher = {Springer Science and Business Media LLC},

abstract = {Haplotype-resolved de novo assembly of highly diverse virus genomes is critical in prevention, control and treatment of viral diseases. Current methods either can handle only relatively accurate short read data, or collapse haplotype-specific variations into consensus sequence. Here, we present Strainline, a novel approach to assemble viral haplotypes from noisy long reads without a reference genome. Strainline is the first approach to provide strain-resolved, full-length de novo assemblies of viral quasispecies from noisy third-generation sequencing data. Benchmarking on simulated and real datasets of varying complexity and diversity confirm this novelty and demonstrate the superiority of Strainline.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Gažiová, M.; Sládeček, T.; Pös, O.; Števko, M.; Krampl, W.; Pös, Z.; Hekel, R.; Hlavačka, M.; Kucharík, M.; Radvánszky, J.; Budiš, J.; Szemes, T.

Automated prediction of the clinical impact of structural copy number variations Journal Article

In: Scientific Reports, vol. 12, no. 1, pp. 555, 2022, ISSN: 2045-2322.

@article{Gaziova2022,

title = {Automated prediction of the clinical impact of structural copy number variations},

author = {M. Gažiová and T. Sládeček and O. Pös and M. Števko and W. Krampl and Z. Pös and R. Hekel and M. Hlavačka and M. Kucharík and J. Radvánszky and J. Budiš and T. Szemes},

doi = {10.1038/s41598-021-04505-z},

issn = {2045-2322},

year = {2022},

date = {2022-01-11},

urldate = {2022-01-01},

journal = {Scientific Reports},

volume = {12},

number = {1},

pages = {555},

publisher = {Springer Science and Business Media LLC},

abstract = {Copy number variants (CNVs) play an important role in many biological processes, including the development of genetic diseases, making them attractive targets for genetic analyses. The interpretation of the effect of these structural variants is a challenging problem due to highly variable numbers of gene, regulatory, or other genomic elements affected by the CNV. This led to the demand for the interpretation tools that would relieve researchers, laboratory diagnosticians, genetic counselors, and clinical geneticists from the laborious process of annotation and classification of CNVs. We designed and validated a prediction method (ISV; Interpretation of Structural Variants) that is based on boosted trees which takes into account annotations of CNVs from several publicly available databases. The presented approach achieved more than 98{%} prediction accuracy on both copy number loss and copy number gain variants while also allowing CNVs being assigned “uncertain”significance in predictions. We believe that ISV’s prediction capability and explainability have a great potential to guide users to more precise interpretations and classifications of CNVs.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Rizzo, Nicola; Mäkinen, Veli

Linear Time Construction of Indexable Elastic Founder Graphs Inproceedings

In: Bazgan, Cristina; Fernau, Henning (Ed.): Combinatorial Algorithms, pp. 480–493, Springer International Publishing, Cham, 2022, ISBN: 978-3-031-06678-8.

@inproceedings{Rizzo2022,

title = {Linear Time Construction of Indexable Elastic Founder Graphs},

author = {Nicola Rizzo and Veli Mäkinen},

editor = {Cristina Bazgan and Henning Fernau},

url = {https://arxiv.org/abs/2201.06492},

doi = {10.1007/978-3-031-06678-8_35},

isbn = {978-3-031-06678-8},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

booktitle = {Combinatorial Algorithms},

pages = {480--493},

publisher = {Springer International Publishing},

address = {Cham},

abstract = {Pattern matching on graphs has been widely studied lately due to its importance in genomics applications. Unfortunately, even the simplest problem of deciding if a string appears as a subpath of a graph admits a quadratic lower bound under the Orthogonal Vectors Hypothesis (Equi et al. ICALP 2019, SOFSEM 2021). To avoid this bottleneck, the research has shifted towards more specific graph classes, e.g. those induced from multiple sequence alignments (MSAs). Consider segmenting $mathsf{MSA}[1..m,1..n]$ into $b$ blocks $mathsf{MSA}[1..m,1..j_1]$, $mathsf{MSA}[1..m,j_1+1..j_2]$, $ldots$, $mathsf{MSA}[1..m,j_b-1+1..n]$. The distinct strings in the rows of the blocks, after the removal of gap symbols, form the nodes of an elastic founder graph (EFG) where the edges represent the original connections observed in the MSA. An EFG is called indexable if a node label occurs as a prefix of only those paths that start from a node of the same block. Equi et al. (ISAAC 2021) showed that such EFGs support fast pattern matching and gave an $O(mn log m)$-time algorithm for preprocessing the MSA in a way that allows the construction of indexable EFGs maximizing the number of blocks and, alternatively, minimizing the maximum length of a block, in $O(n)$ and $O(n loglog n)$ time respectively. Using the suffix tree and solving a novel ancestor problem on trees, we improve the preprocessing to $O(mn)$ time and the $O(n loglog n)$-time EFG construction to $O(n)$ time, thus showing that both types of indexable EFGs can be constructed in time linear in the input size.},

howpublished = {arXiv},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Mwaniki, Njagi Moses; Garrison, Erik; Pisanti, Nadia

Fast Exact String to D-Texts Alignments Unpublished

arXiv, 2022.

@unpublished{Mwaniki2022,

title = {Fast Exact String to D-Texts Alignments},

author = {Njagi Moses Mwaniki and Erik Garrison and Nadia Pisanti},

doi = {10.48550/ARXIV.2206.03242},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

publisher = {arXiv},

abstract = {In recent years, aligning a sequence to a pangenome has become a central problem in genomics and pangenomics. A fast and accurate solution to this problem can serve as a toolkit to many crucial tasks such as read-correction, Multiple Sequences Alignment (MSA), genome assemblies, variant calling, just to name a few. In this paper we propose a new, fast and exact method to align a string to a D-string, the latter possibly representing an MSA, a pan-genome or a partial assembly. An implementation of our tool dsa is publicly available at https://github.com/urbanslug/dsa.},

howpublished = {arXiv},

keywords = {},

pubstate = {published},

tppubtype = {unpublished}

}

Zhong, Haodi; Loukides, Grigorios; Pissis, Solon P.

Clustering sequence graphs Journal Article

In: Data & Knowledge Engineering, vol. 138, pp. 101981, 2022, ISSN: 0169-023X.

@article{Zhong2022,

title = {Clustering sequence graphs},

author = {Haodi Zhong and Grigorios Loukides and Solon P. Pissis},

doi = {https://doi.org/10.1016/j.datak.2022.101981},

issn = {0169-023X},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

journal = {Data & Knowledge Engineering},

volume = {138},

pages = {101981},

abstract = {In application domains ranging from social networks to e-commerce, it is important to cluster users with respect to both their relationships (e.g., friendship or trust) and their actions (e.g., visited locations or rated products). Motivated by these applications, we introduce here the task of clustering the nodes of a sequence graph, i.e., a graph whose nodes are labeled with strings (e.g., sequences of users' visited locations or rated products). Both string clustering algorithms and graph clustering algorithms are inappropriate to deal with this task, as they do not consider the structure of strings and graph simultaneously. Moreover, attributed graph clustering algorithms generally construct poor solutions because they need to represent a string as a vector of attributes, which inevitably loses information and may harm clustering quality. We thus introduce the problem of clustering a sequence graph. We first propose two pairwise distance measures for sequence graphs, one based on edit distance and shortest path distance and another one based on SimRank. We then formalize the problem under each measure, showing also that it is NP-hard. In addition, we design a polynomial-time 2-approximation algorithm, as well as a heuristic for the problem. Experiments using real datasets and a case study demonstrate the effectiveness and efficiency of our methods.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Cartes, Jorge Avila; Anand, Santosh; Ciccolella, Simone; Bonizzoni, Paola; Vedova, Gianluca Della

Accurate and Fast Clade Assignment via Deep Learning and Frequency Chaos Game Representation Journal Article

In: bioRxiv, 2022.

@article{Cartes2022,

title = {Accurate and Fast Clade Assignment via Deep Learning and Frequency Chaos Game Representation},

author = {Jorge Avila Cartes and Santosh Anand and Simone Ciccolella and Paola Bonizzoni and Gianluca Della Vedova},

doi = {10.1101/2022.06.13.495912},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

journal = {bioRxiv},

publisher = {Cold Spring Harbor Laboratory},

abstract = {Background Since the beginning of the COVID-19 pandemic there has been an explosion of sequencing of the SARS-CoV-2 virus, making it the most widely sequenced virus in the history. Several databases and tools have been created to keep track of genome sequences and variants of the virus, most notably the GISAID platform hosts millions of complete genome sequences, and it is continuously expanding every day. A challenging task is the development of fast and accurate tools that are able to distinguish between the different SARS-CoV-2 variants and assign them to a clade.Results In this paper, we leverage the Frequency Chaos Game Representation (FCGR) and Convolutional Neural Networks (CNNs) to develop an original method that learns how to classify genome sequences that we implement into CouGaR-g, a tool for the clade assignment problem on SARS-CoV-2 sequences. On a testing subset of the GISAID, CouGaR-g achieves an 96.29% overall accuracy, while a similar tool, Covidex, obtained a 77, 12% overall accuracy. As far as we know, our method is the first using Deep Learning and FCGR for intra-species classification. Furthermore, by using some feature importance methods CouGaR-g allows to identify k-mers that matches SARS-CoV-2 marker variants.Conclusions By combining FCGR and CNNs, we develop a method that achieves a better accuracy than Covidex (which is based on Random Forest) for clade assignment of SARS-CoV-2 genome sequences, also thanks to our training on a much larger dataset, with comparable running times. Our method implemented in CouGaR-g is able to detect k-mers that capture relevant biological information that distinguishes the clades, known as marker variants.Availability The trained models can be tested online providing a FASTA file (with one or multiple sequences) at https://huggingface.co/spaces/BIASLab/sars-cov-2-classification-fcgr. CouGaR-g is also available at https://github.com/AlgoLab/CouGaR-g under the GPL.Competing Interest StatementThe authors have declared no competing interest.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Rizzo, Nicola; Mäkinen, Veli

Indexable Elastic Founder Graphs of Minimum Height Inproceedings

In: Bannai, Hideo; Holub, Jan (Ed.): 33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022), pp. 19:1–19:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.

@inproceedings{Rizzo2022b,

title = {Indexable Elastic Founder Graphs of Minimum Height},

author = {Nicola Rizzo and Veli Mäkinen},

editor = {Hideo Bannai and Jan Holub},

doi = {10.4230/LIPIcs.CPM.2022.19},

issn = {1868-8969},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

booktitle = {33rd Annual Symposium on Combinatorial Pattern Matching (CPM 2022)},

volume = {223},

pages = {19:1--19:19},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {Indexable elastic founder graphs have been recently proposed as a data structure for genomics applications supporting fast pattern matching queries. Consider segmenting a multiple sequence alignment MSA[1..m,1..n] into b blocks MSA[1..m,1..j₁], MSA[1..m,j₁+1..j₂], …, MSA[1..m,j_{b-1}+1..n]. The resulting elastic founder graph (EFG) is obtained by merging in each block the strings that are equivalent after the removal of gap symbols, taking the strings as the nodes of the block and the original MSA connections as edges. We call an elastic founder graph indexable if a node label occurs as a prefix of only those paths that start from a node of the same block. Equi et al. (ISAAC 2021) showed that such EFGs support fast pattern matching and studied their construction maximizing the number of blocks and minimizing the maximum length of a block, but left open the case of minimizing the maximum number of distinct strings in a block that we call graph height. For the simplified gapless setting, we give an O(mn) time algorithm to find a segmentation of an MSA minimizing the height of the resulting indexable founder graph, by combining previous results in segmentation algorithms and founder graphs. For the general setting, the known techniques yield a linear-time parameterized solution on constant alphabet Σ, taking time O(m n² log|Σ|) in the worst case, so we study the refined measure of prefix-aware height, that omits counting strings that are prefixes of another considered string. The indexable EFG minimizing the maximum prefix-aware height provides a lower bound for the original height: by exploiting exploiting suffix trees built from the MSA rows and the data structure answering weighted ancestor queries in constant time of Belazzougui et al. (CPM 2021), we give an O(mn)-time algorithm for the optimal EFG under this alternative height. },

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Badkobeh, Golnaz; Charalampopoulos, Panagiotis; Kosolobov, Dmitry; Pissis, Solon P.

Internal shortest absent word queries in constant time and linear space Journal Article

In: Theoretical Computer Science, vol. 922, pp. 271-282, 2022, ISSN: 0304-3975.

@article{BADKOBEH2022271,

title = {Internal shortest absent word queries in constant time and linear space},

author = {Golnaz Badkobeh and Panagiotis Charalampopoulos and Dmitry Kosolobov and Solon P. Pissis},

url = {https://arxiv.org/abs/2106.01763},

doi = {https://doi.org/10.1016/j.tcs.2022.04.029},

issn = {0304-3975},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

journal = {Theoretical Computer Science},

volume = {922},

pages = {271-282},

abstract = {Given a string T of length n over an alphabet Σ⊂1,2,…,nO(1) of size σ, we are to preprocess T so that given a range [i,j], we can return a representation of a shortest string over Σ that is absent in the fragment T[i]⋯T[j] of T. We present an O(n)-space data structure that answers such queries in constant time and can be constructed in O(nlogσn) time.},

keywords = {},

pubstate = {published},

tppubtype = {article}

}

Mirka, Renee; Williamson, David P.

An Experimental Evaluation of Semidefinite Programming and Spectral Algorithms for Max Cut Inproceedings

In: Schulz, Christian; Uçar, Bora (Ed.): 20th International Symposium on Experimental Algorithms (SEA 2022), pp. 19:1–19:14, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2022, ISSN: 1868-8969.

@inproceedings{mirka_et_al:LIPIcs.SEA.2022.19,

title = {An Experimental Evaluation of Semidefinite Programming and Spectral Algorithms for Max Cut},

author = {Renee Mirka and David P. Williamson},

editor = {Christian Schulz and Bora Uçar},

url = {https://drops.dagstuhl.de/opus/volltexte/2022/16553},

doi = {10.4230/LIPIcs.SEA.2022.19},

issn = {1868-8969},

year = {2022},

date = {2022-01-01},

urldate = {2022-01-01},

booktitle = {20th International Symposium on Experimental Algorithms (SEA 2022)},

volume = {233},

pages = {19:1--19:14},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

abstract = {We experimentally evaluate the performance of several Max Cut approximation algorithms. In particular, we compare the results of the Goemans and Williamson algorithm using semidefinite programming with Trevisan’s algorithm using spectral partitioning. The former algorithm has a known .878 approximation guarantee whereas the latter has a .614 approximation guarantee. We investigate whether this gap in approximation guarantees is evident in practice or whether the spectral algorithm performs as well as the SDP. We also compare the performances to the standard greedy Max Cut algorithm which has a .5 approximation guarantee and two additional spectral algorithms. The algorithms are tested on Erdős-Renyi random graphs, complete graphs from TSPLIB, and real-world graphs from the Network Repository. We find, unsurprisingly, that the spectral algorithms provide a significant speed advantage over the SDP. In our experiments, the spectral algorithms return cuts with values which are competitive with those of the SDP.},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}

Charalampopoulos, Panagiotis; Chen, Huiping; Christen, Peter; Loukides, Grigorios; Pisanti, Nadia; Pissis, Solon P.; Radoszewski, Jakub

Pattern Masking for Dictionary Matching Inproceedings

In: Ahn, Hee-Kap; Sadakane, Kunihiko (Ed.): 32nd International Symposium on Algorithms and Computation (ISAAC 2021), pp. 65:1–65:19, Schloss Dagstuhl -- Leibniz-Zentrum für Informatik, Dagstuhl, Germany, 2021, ISSN: 1868-8969.

@inproceedings{Charalampopoulos2021,

title = {Pattern Masking for Dictionary Matching},

author = {Panagiotis Charalampopoulos and Huiping Chen and Peter Christen and Grigorios Loukides and Nadia Pisanti and Solon P. Pissis and Jakub Radoszewski},

editor = {Hee-Kap Ahn and Kunihiko Sadakane},

doi = {10.4230/LIPIcs.ISAAC.2021.65},

issn = {1868-8969},

year = {2021},

date = {2021-01-01},

urldate = {2021-01-01},

booktitle = {32nd International Symposium on Algorithms and Computation (ISAAC 2021)},

volume = {212},

pages = {65:1--65:19},

publisher = {Schloss Dagstuhl -- Leibniz-Zentrum für Informatik},

address = {Dagstuhl, Germany},

series = {Leibniz International Proceedings in Informatics (LIPIcs)},

keywords = {},

pubstate = {published},

tppubtype = {inproceedings}

}