NVBIO
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
kmers_inl.h
Go to the documentation of this file.
1 /*
2  * nvbio
3  * Copyright (c) 2011-2014, NVIDIA CORPORATION. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of the NVIDIA CORPORATION nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <cstdlib>
30 #include <nvbio/basic/primitives.h>
31 #include <nvbio/basic/atomics.h>
32 #include <nvbio/basic/cuda/sort.h>
34 #include <nvbio/strings/seeds.h>
35 
36 #include "assembly_types.h"
37 
38 /* K-mer Extraction/Counting/Uniqueness Functionality */
39 
40 // extract the string set sequence id from the kmer coordinates
41 struct get_kmer_seq_id : public thrust::unary_function<SequenceSetKmerCoord,uint32>
42 {
44  uint32 operator() (const SequenceSetKmerCoord kmer_coord) const
45  {
46  return kmer_coord.x;
47  }
48 };
49 
51 {
53  uint32 operator() (const SequenceSetKmerCoord kmer_coord) const
54  {
55  return kmer_coord.w;
56  }
57 };
58 
59 // extract the kmer size from the kmer coordinates
60 struct get_kmer_size : public thrust::unary_function<SequenceSetKmerCoord,uint32>
61 {
63  uint32 operator() (const SequenceSetKmerCoord kmer_coord) const
64  {
65  return kmer_coord.z - kmer_coord.y;
66  }
67 };
68 
69 struct get_global_id : public thrust::unary_function<SequenceSetKmerCoord,uint32>
70 {
73  {
74  return coord.z;
75  }
76 };
77 
78 // store the global kmer id in its coordinate
80 {
82 
84  set_global_id(SequenceSetKmerCoord* coords) : coords(coords) { }
85 
87  void operator() (const uint32 idx) const
88  {
89  coords[idx].z = idx;
90  }
91 };
92 
93 // store the global kmer id and the active region id in its coordinate
95 {
98 
101  coords(coords), active_region_ids(_active_region_ids) { }
102 
104  void operator() (const uint32 idx) const
105  {
106  SequenceSetKmerCoord& coord = coords[idx];
107  coord.z = idx; // since the kmer size is fixed, can reuse this field
108  coord.w = active_region_ids[coord.x];
109  }
110 };
111 
112 // maps from the coordinate of a kmer
113 // to the global id of its prefix
114 struct get_prefix_global_id : public thrust::unary_function<uint32,uint32>
115 {
118  {
119  return coord.z + coord.x;
120  }
121 };
122 
123 // maps from the id of a kmer
124 // to the global id of its prefix
125 struct get_prefix_global_id_by_idx : public thrust::unary_function<uint32,uint32>
126 {
128 
131  coords(_coords) { }
132 
134  uint32 operator() (const uint32 idx) const
135  {
136  const SequenceSetKmerCoord coord = coords[idx];
137  return coord.z + coord.x;
138  }
139 };
140 
141 // maps from the id of a kmer
142 // to the global id of its suffix
143 struct get_suffix_global_id_by_idx : public thrust::unary_function<uint32,uint32>
144 {
146 
149  coords(_coords) { }
150 
152  uint32 operator() (const uint32 idx) const
153  {
154  const SequenceSetKmerCoord coord = coords[idx];
155  return coord.z + coord.x + 1;
156  }
157 };
158 
159 // creates a map from the id of a kmer
160 // to the global id of its prefix
162 {
165 
167  compute_prefix_global_id(const SequenceSetKmerCoord* _coords, uint32* _prefix_ids):
168  coords(_coords), prefix_ids(_prefix_ids) { }
169 
171  void operator() (const uint32 id) const
172  {
173  const SequenceSetKmerCoord coord = coords[id];
174  prefix_ids[id] = coord.z + coord.x;
175  }
176 };
177 
178 // creates a map from the id of a kmer
179 // to the global id of its suffix
181 {
184 
186  compute_suffix_global_id(const SequenceSetKmerCoord* _coords, uint32* _suffix_ids):
187  coords(_coords), suffix_ids(_suffix_ids) { }
188 
190  void operator() (const uint32 id) const
191  {
192  const SequenceSetKmerCoord coord = coords[id];
193  suffix_ids[id] = coord.z + coord.x + 1;
194  }
195 };
196 
197 // maps kmer global id to its UID
198 struct global_to_uid : public thrust::unary_function<uint64,uint64>
199 {
200  const uint32* uid_map;
202  global_to_uid(const uint32* _uid_map) : uid_map(_uid_map) { }
203 
205  uint32 operator() (const uint32 global_id) const
206  {
207  return uid_map[global_id];
208  }
209 };
210 
211 // creates a map from the global id to the id in the sorted array
213 {
217 
219  void operator() (const thrust::tuple<uint32, SequenceSetKmerCoord> idx_coord) const
220  {
221  const uint32 idx = thrust::get<0>(idx_coord);
222  const SequenceSetKmerCoord coord = thrust::get<1>(idx_coord);
223  global_to_sorted_id_map[coord.z] = idx; // TODO: not coalesced
224  }
225 };
226 
227 // compute the kmer keys: 64-bit compacted kmer sequence
228 template <typename string_set_type>
230 {
231  typedef typename string_set_type::string_type sequence;
232  const string_set_type string_set;
236 
238  kmers_to_64b_functor(const uint32 _kmer_size, const uint32 _dna_symbol_size, const string_set_type _string_set) :
239  kmer_size(_kmer_size), dna_symbol_size(_dna_symbol_size), symbol_mask((1u << dna_symbol_size) - 1u),
240  string_set(_string_set) {}
241 
243  uint64 operator() (const SequenceSetKmerCoord kmer_coord) const
244  {
245  const sequence seq = string_set[kmer_coord.x];
246  const uint32 seq_pos = kmer_coord.y;
247  const uint32 seq_len = seq.length();
248 
249  uint64 kmer_key = 0u;
250  for (uint32 i = 0; i < kmer_size; i++) {
251  kmer_key |= uint64(seq_pos + i < seq_len ? (seq[seq_pos + i] & symbol_mask) : 0u) << ((kmer_size-1-i)*dna_symbol_size); //(i*dna_symbol_size);
252  }
253  return kmer_key;
254  }
255 };
256 
257 // equality based on kmer key and sequence id coordinate
259 {
260  const uint64* keys;
262 
264  kmer_key_sid_eq(const uint64* _keys, const SequenceSetKmerCoord* _coords):
265  keys(_keys), coords(_coords) { }
266 
267  NVBIO_HOST_DEVICE bool operator()(const uint32 i, const uint32 j)
268  {
269  return keys[i] == keys[j] && coords[i].x == coords[j].x;
270  }
271 };
272 
273 // equality based on kmer key and region id coordinate
275 {
276  const uint64* keys;
278 
280  kmer_key_rid_eq(const uint64* _keys, const SequenceSetKmerCoord* _coords):
281  keys(_keys), coords(_coords) { }
282 
283  NVBIO_HOST_DEVICE bool operator()(const uint32 i, const uint32 j)
284  {
285  return keys[i] == keys[j] && coords[i].w == coords[j].w;
286  }
287 };
288 
289 // equality based on sequence id and position
291 {
294  kmer_pos_sid_eq(const SequenceSetKmerCoord* _coords): coords(_coords) { }
295 
296  NVBIO_HOST_DEVICE bool operator()(const uint32 i, const uint32 j)
297  {
298  const SequenceSetKmerCoord ci = coords[i];
299  const SequenceSetKmerCoord cj = coords[j];
300  return ci.y == cj.y && ci.x == cj.x;
301  }
302 };
303 
304 // equality based on kmer key only
306 {
307  const uint32* keys;
308 
310  kmer_uid_eq(const uint32* _keys) : keys(_keys) { }
311 
313  bool operator()(const uint32 i, const uint32 j)
314  {
315  return keys[i] == keys[j];
316  }
317 };
318 
319 // extract i-th word from the kmer in a string set
320 template <typename string_set_type>
322 {
323  typedef typename string_set_type::string_type sequence;
324  const string_set_type string_set;
329  const uint32 i;
330 
332  kmer_word_extractor_functor(const string_set_type _string_set, const uint32 _dna_symbol_size,
333  const SequenceSetKmerCoord* _coords, const uint32 _i) :
334  dna_symbol_size(_dna_symbol_size),
337  coords(_coords), string_set(_string_set), i(_i) {}
338 
340  uint32 operator() (const uint32 kmer_idx) const
341  {
342  SequenceSetKmerCoord kmer_coord = coords[kmer_idx];
343  const uint32 kmer_pos = kmer_coord.y;
344  const uint32 kmer_size = kmer_coord.z - kmer_coord.y; // TODO: z-field is currently used for the global id, use kmer_size if fixed
345  const sequence seq = string_set[kmer_coord.x];
346  const uint32 seq_len = seq.length();
347 
348  uint32 word = 0u;
349  for (uint32 j = 0; j < kmer_size; j++) {
350  const uint32 jj = kmer_pos + i*symbols_per_word + j;
351  const uint32 c = jj < seq_len ? char_to_iupac16(dna_to_char(seq[jj])) : 0u;
352  word |= (c << (symbol_offset - j*dna_symbol_size));
353  }
354  return word;
355  }
356 };
357 
358 // add tuples of kmer and sequence counts
360 {
362  thrust::tuple<uint32, uint32> operator() (const thrust::tuple<uint32, uint32>& x, const thrust::tuple<uint32, uint32>& y) const
363  {
364  return thrust::tuple<uint32, uint32>(thrust::get<0>(x) + thrust::get<0>(y), thrust::get<1>(x) + thrust::get<1>(y));
365  }
366 };
367 
368 // check if a kmer occurs more than once in at least one sequence
369 // given a tuple of the number of sequences that contain it and
370 // the number of times it occurs in all the sequences
372 {
374  bool operator() (const thrust::tuple<uint32, uint32>& nSeq_nOcc) const
375  {
376  return thrust::get<0>(nSeq_nOcc) >= thrust::get<1>(nSeq_nOcc);
377  }
378 };
379 
380 // given the last id of a kmer in the group of consecutive equal kmers
381 // mark all identical consecutive kmers as unique in the map
383 {
387 
389  mark_kmer_uniqueness(uint8* _is_unique_map, const uint64* _sorted_kmer_keys, const SequenceSetKmerCoord* _sorted_coords) :
390  is_unique_map(_is_unique_map), sorted_kmer_keys(_sorted_kmer_keys), sorted_coords(_sorted_coords) { }
391 
393  void operator() (const uint32 kmer_id) const
394  {
395  const uint64 kmer_key = sorted_kmer_keys[kmer_id];
396  const uint32 rid = sorted_coords[kmer_id].w;
397 
398  int32 idx = kmer_id;
399  while(idx >= 0) {
400  if(sorted_coords[idx].w != rid || sorted_kmer_keys[idx] != kmer_key) break;
401  is_unique_map[idx] = 1; //TODO: thrust documentation states first, but last in experiments
402  idx--;
403  }
404  }
405 };
406 
408 {
412 
414  store_kmer_unique_ids(uint32* _unique_id_map, const uint64* _sorted_kmer_keys, const SequenceSetKmerCoord* _sorted_coords) :
415  unique_id_map(_unique_id_map), sorted_kmer_keys(_sorted_kmer_keys), sorted_coords(_sorted_coords) { }
416 
418  void operator() (const thrust::tuple<uint32, uint32>& uid_kid) const
419  {
420  const uint32 uid = thrust::get<0>(uid_kid);
421  const uint32 kid = thrust::get<1>(uid_kid);
422  const uint64 kmer_key = sorted_kmer_keys[kid];
423  const uint32 rid = sorted_coords[kid].w;
424 
425  int32 idx = kid;
426  while(idx >= 0) {
427  if(sorted_coords[idx].w != rid || sorted_kmer_keys[idx] != kmer_key) break;
428  unique_id_map[idx] = uid;
429  idx--; //TODO: thrust documentation states first, but last in experiments
430  }
431  }
432 };
433 
434 //struct populate_unique_kmer_data
435 //{
436 // const uint64* sorted_kmer_keys;
437 // const SequenceSetKmerCoord* sorted_coords;
438 // const uint32* unique_kmer_idxs;
439 //
440 // // fill in unique kmer annotations
441 // uint8* unique_flag_map;
442 // uint32* unique_UID_map;
443 //
444 // NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
445 // populate_unique_kmer_data(const uint64* _sorted_kmer_keys, const SequenceSetKmerCoord* _coords,
446 // const uint32* _unique_kmer_idxs, uint8* _unique_flag_map, uint32* _unique_UID_map) :
447 // sorted_kmer_keys(_sorted_kmer_keys), sorted_coords(_coords), unique_kmer_idxs(_unique_kmer_idxs),
448 // unique_flag_map(_unique_flag_map), unique_UID_map (_unique_UID_map) { }
449 //
450 // NVBIO_FORCEINLINE NVBIO_HOST_DEVICE
451 // void operator() (const uint32 uid) const
452 // {
453 // const uint32 unique_kmer_idx = unique_kmer_idxs[uid];
454 // const uint64 key = sorted_kmer_keys[unique_kmer_idx]; // find the group of coordinates with this key and region
455 // const uint32 region = sorted_coords[unique_kmer_idx].w;
456 // uint32 global_kmer_id = sorted_coords[unique_kmer_idx].z;
457 // unique_flag_map[global_kmer_id] = 1;
458 // unique_UID_map[global_kmer_id] = uid;
459 //
460 // int32 idx = unique_kmer_idx - 1;
461 // while(idx >= 0) {
462 // if(sorted_kmer_keys[idx] != key || sorted_coords[idx].w != region) break;
463 // global_kmer_id = sorted_coords[idx].z;
464 // unique_flag_map[global_kmer_id] = 1; //TODO: thrust documentation states first, but last in experiments
465 // unique_UID_map[global_kmer_id] = uid;
466 // idx--;
467 // }
468 // }
469 //};
470 
472 {
476 
477  // fill in unique kmer annotations
480 
482  populate_unique_kmer_data(const uint32* _kmer_counts, const SequenceSetKmerCoord* _coords,
483  const uint32* _unique_kmer_idxs, uint8* _unique_flag_map, uint32* _unique_UID_map) :
484  kmer_counts(_kmer_counts), sorted_coords(_coords), unique_kmer_idxs(_unique_kmer_idxs),
485  unique_flag_map(_unique_flag_map), unique_UID_map (_unique_UID_map) { }
486 
488  void operator() (const uint32 uid) const
489  {
490  const uint32 unique_kmer_idx = unique_kmer_idxs[uid];
491  const uint32 count = kmer_counts[uid];
492  uint32 global_kmer_id = sorted_coords[unique_kmer_idx].z;
493  unique_flag_map[global_kmer_id] = 1;
494  unique_UID_map[global_kmer_id] = uid;
495 
496  for(uint32 i = 1; i < count; i++) {
497  global_kmer_id = sorted_coords[unique_kmer_idx - i].z; //TODO: thrust bug: pointer to last instead of first
498  unique_flag_map[global_kmer_id] = 1; // TODO: not coalesced
499  unique_UID_map[global_kmer_id] = uid;
500  }
501  }
502 };
503 
504 // marks kmers that have a prefix satisfying the following:
505 // the prefix is unique and has a matching kmer
506 // (i.e. it is not the last (k-1)mer in the sequence)
508 {
511  mark_unique_prefix_kmer(uint8* _unique_prefix_map) : unique_prefix_map(_unique_prefix_map) { }
512 
514  void operator() (const thrust::tuple<uint8, uint64>& uflag_prefix) const
515  {
516  const uint8 is_unique_prefix = thrust::get<0>(uflag_prefix);
517  const uint32 kmer_id = thrust::get<1>(uflag_prefix);
518 
519  if((is_unique_prefix == 1) && (kmer_id != (uint32) -1)) {
520  unique_prefix_map[kmer_id] = 1; // TODO: writes not coalesced
521  }
522  }
523 };
524 
525 // ---------- Super-kmers: chains of consecutive kmers
526 
527 // given a non-unique kmer N, finds the closest unique predecessor and successor kmers in the sequence, P and S
528 // returns the new coordinate for the chain starting with the P and ending with S [P...N...S]
529 // returns a dummy coordinate (N.x, -1, -1, 0) if no such previous unique kmer exists
530 // TODO: could use shared memory here
531 template <typename string_set_type>
533 {
534  typedef typename string_set_type::string_type sequence;
535  const string_set_type string_set;
536  const uint32* repeat_global_ids; // global ids of the repeat kmers
537  const uint8* global_unique_flag_map; // unsorted
540  const SequenceSetKmerCoord* coords; // sorted
542 
543  // structures to populate
547 
549  extract_super_kmers_functor(const uint32* _repeat_global_ids,
550  const uint8* _is_unique_map, const uint32* _global_UID_map, const uint32* _sorted_idx_map,
551  const SequenceSetKmerCoord* _coords, const string_set_type _string_set, const uint32 _kmer_size,
552  SequenceSetKmerCoord* _super_coords, uint32* _prefix_uids, uint32* _suffix_uids):
553  repeat_global_ids(_repeat_global_ids), global_unique_flag_map(_is_unique_map), global_UID_map(_global_UID_map),
554  global_sorted_idx_map(_sorted_idx_map),
555  coords(_coords), string_set(_string_set), kmer_size(_kmer_size),
556  super_coords(_super_coords), prefix_uids(_prefix_uids), suffix_uids(_suffix_uids){ }
557 
559  void operator()(const uint32 idx) const {
560  const uint32 global_id = repeat_global_ids[idx];
561  const SequenceSetKmerCoord coord = coords[global_sorted_idx_map[global_id]];
562  const sequence seq = string_set[coord.x];
563  const uint32 seq_len = seq.length();
564  SequenceSetKmerCoord super_coord = make_uint4(coord.x, (uint32)-1, (uint32)-1, coord.w);
565  uint32 prefix_id = (uint32) -1;
566  uint32 suffix_id = (uint32) -1;
567 
568  // find the unique predecessor
569  uint32 i = 1u;
570  while(i <= coord.y) { // search only this sequence
571  if(global_unique_flag_map[global_id-i] == 1) {
572  super_coord.y = coord.y - i;
573  super_coord.z = coord.y + kmer_size;
574  prefix_id = global_UID_map[global_id-i];
575  break;
576  }
577  i++;
578  }
579 
580  if(super_coord.y != (uint32) -1) { // otherwise this chain will be ignored
581  // find the unique successor
582  i = 1u;
583  while(i < seq_len - coord.y - kmer_size + 1) { // search only until the end of the sequence
584  if(global_unique_flag_map[global_id+i] == 1) {
585  super_coord.z = coord.y + i + kmer_size;
586  suffix_id = global_UID_map[global_id+i];
587  break;
588  }
589  i++;
590  }
591  }
592 
593  super_coords[idx] = super_coord;
594  prefix_uids[idx] = prefix_id;
595  suffix_uids[idx] = suffix_id;
596  }
597 };
598 
599 template <typename string_set_type>
601 {
602  typedef typename string_set_type::string_type sequence;
603  const string_set_type string_set;
605 
607  print_super_kmer_functor(const string_set_type _string_set, const SequenceSetKmerCoord* _coords) :
608  coords(_coords), string_set(_string_set) {}
609 
611  void operator() (const uint32 kmer_idx) const
612  {
613  SequenceSetKmerCoord kmer_coord = coords[kmer_idx];
614  const uint32 kmer_pos = kmer_coord.y;
615  const uint32 kmer_size = kmer_coord.z - kmer_coord.y;
616  const sequence seq = string_set[kmer_coord.x];
617  const uint32 seq_len = seq.length();
618 
619  uint8 kmer_seq[14];
620  for (uint32 k = 0; k < kmer_size; k++) {
621  uint8 c = iupac16_to_char(char_to_iupac16(dna_to_char(seq[kmer_pos + k])));
622  kmer_seq[k] = c;
623  }
624  kmer_seq[kmer_size] = '\0';
625 
626  printf("id: %llu, reg %u, seq %s \n", kmer_idx, kmer_coord.w, kmer_seq);
627  }
628 };
629 
630 // returns true if the super kmer does not represent a valid chain of kmers
631 // based on the value of its unique predecessor id
633 {
635  bool operator()(const uint32 chain_prefix_id)
636  {
637  return chain_prefix_id == (uint32) -1;
638  }
639 };
640 
641 struct is_unique_suffix_id : public thrust::unary_function<uint32,bool>
642 {
644  bool operator()(const uint32 suffix_id)
645  {
646  return suffix_id != (uint32) -1;
647  }
648 };
649 
650 struct is_unique_uid : public thrust::unary_function<uint32,uint8>
651 {
654  is_unique_uid(const uint32 _max_unique_uid) : max_unique_uid(_max_unique_uid) { }
655 
658  {
659  return uid < max_unique_uid;
660  }
661 };
662 
664 {
666  bool operator()(const uint8 flag)
667  {
668  return (flag != 1);
669  }
670 };
671 
672 // collapse kmers that start at the same position and belong to the same sequence
673 // by just returning the one with max length
675 {
677  thrust::tuple<SequenceSetKmerCoord, uint32, uint32> operator() (
678  const thrust::tuple<SequenceSetKmerCoord, uint32, uint32>& x,
679  const thrust::tuple<SequenceSetKmerCoord, uint32, uint32>& y) const
680  {
681  const SequenceSetKmerCoord i = thrust::get<0>(x);
682  const SequenceSetKmerCoord j = thrust::get<0>(y);
683 
684  if(i.z > j.z) {
685  return x;
686  }
687  return y;
688  }
689 };
690 
691 // computes the max prefix overlap with the closest lexicographically smaller super-kmer
692 // assumes that the function is not called for idx == 0
693 // returns the number of kmers by which the chains overlap
694 // no overlap if the previous kmer is from a different region
695 template <typename string_set_type>
697 {
698  typedef typename string_set_type::string_type sequence;
699  const string_set_type string_set;
700  const SequenceSetKmerCoord* coords; // sorted in lexicographic order
702 
704  find_max_kmer_overlaps(const string_set_type _string_set, const SequenceSetKmerCoord* _coords,
705  const uint32 _kmer_size): string_set(_string_set), coords(_coords), kmer_size(_kmer_size) { }
706 
708  uint32 operator() (const uint32 idx) const
709  {
710  const SequenceSetKmerCoord coord_curr = coords[idx];
711  const SequenceSetKmerCoord coord_prev = coords[idx-1];
712 
713  if(coord_curr.w != coord_prev.w) return 0;
714 
715  const sequence seq_curr = string_set[coord_curr.x];
716  const sequence seq_prev = string_set[coord_prev.x];
717  const uint32 max_overlap =
718  (coord_curr.z - coord_curr.y) > (coord_prev.z - coord_prev.y) ?
719  (coord_prev.z - coord_prev.y) : (coord_curr.z - coord_curr.y);
720 
721  uint32 overlap = 0u;
722  for (uint32 j = 0; j < max_overlap; j++)
723  {
724  if(seq_curr[coord_curr.y + j] == seq_prev[coord_prev.y +j]) {
725  overlap++;
726  } else {
727  break;
728  }
729  }
730  uint32 n_ovp = (overlap >= kmer_size) ? overlap - kmer_size + 1 : 0;
731  return n_ovp;
732  }
733 };
734 
735 // returns the number of kmers that did not overlap with the lexicographic
736 // super-kmer predecessor and are not unique
737 // i.e. total number of kmers in the chain - number of overlapping kmers
738 // does not count the unique first and last kmers
739 struct num_non_overlapped_repeats : public thrust::unary_function<uint32,uint32>
740 {
743  const uint32* overlaps;
745 
748  const uint32* _suffix_kmer_ids,
749  const uint32* _overlaps,
750  const uint32 _kmer_size):
751  coords(_coords), suffix_kmer_ids(_suffix_kmer_ids), overlaps(_overlaps), kmer_size(_kmer_size) { }
752 
754  uint32 operator() (const uint32 idx) const
755  {
756  const SequenceSetKmerCoord coord = coords[idx];
757  uint32 len = coord.z - coord.y;
758  const uint32 n_kmers = (len >= kmer_size) ? len - kmer_size + 1 : 0;
759 
760  uint32 n_unique = 0;
761  if(overlaps[idx] == 0) {
762  n_unique++; // skip the header unique kmer if this is the first chain starting with this kmer
763  // otherwise, it will be included in the overlap
764  }
765  if(overlaps[idx] != n_kmers && suffix_kmer_ids[idx] != (uint32) -1) { // if not all the kmers overlapped
766  n_unique++; // skip the last unique kmer
767  }
768  return n_kmers - overlaps[idx] - n_unique;
769  }
770 };
771 
772 struct num_adj_repeats : public thrust::unary_function<uint32,uint32>
773 {
776  const uint32* overlaps;
778 
781  const uint32* _suffix_kmer_ids,
782  const uint32* _overlaps,
783  const uint32 _kmer_size):
784  coords(_coords), suffix_kmer_ids(_suffix_kmer_ids), overlaps(_overlaps), kmer_size(_kmer_size) { }
785 
787  uint32 operator() (const uint32 idx) const
788  {
789  const SequenceSetKmerCoord coord = coords[idx];
790  uint32 len = coord.z - coord.y;
791  const uint32 n_kmers = (len >= kmer_size) ? len - kmer_size + 1 : 0; // TODO: remove len check
792 
793  uint32 n_repeat_kmers_to_extract = n_kmers - overlaps[idx];
794  if(overlaps[idx] == 0) {
795  n_repeat_kmers_to_extract--; // skip the first unique kmer
796  }
797  if(overlaps[idx] != n_kmers && suffix_kmer_ids[idx] != (uint32) -1) { // if not all the kmers overlapped
798  n_repeat_kmers_to_extract--; // skip the last unique kmer
799  }
800 
801  // number of kmers to extract will be the same as number of edges, plus RU edge if exists
802  uint32 n_adj = n_repeat_kmers_to_extract;
803  if(overlaps[idx] != n_kmers && suffix_kmer_ids[idx] != (uint32) -1) { // if not all the kmers overlapped
804  n_adj++; // add the last R-U edge
805  }
806 
807  return n_adj;
808  }
809 };
810 
811 template <typename string_set_type>
813 {
814  typedef typename string_set_type::string_type sequence;
815  const string_set_type string_set;
825 
829 
831  extract_non_overlapped_repeats(const string_set_type _string_set,
832  const SequenceSetKmerCoord* _chain_coords,
833  const uint32* _chain_pref_gids, const uint32* _chain_suffix_gids,
834  const uint32* _chain_offsets,
835  const uint32* _adj_extraction_offsets,
836  const uint32* _chain_overlaps,
837  const uint32 _kmer_size, const uint32 _uid_offset,
838  SequenceSetKmerCoord* _kmers_out, uint32* _prefix_uids, uint32* _suffix_uids):
839  string_set(_string_set),
840  super_coords(_chain_coords),
841  super_prefix_global_ids(_chain_pref_gids),
842  super_suffix_global_ids(_chain_suffix_gids),
843  chain_offsets(_chain_offsets),
844  adj_extraction_offsets(_adj_extraction_offsets),
845  chain_overlaps(_chain_overlaps),
846  kmer_size(_kmer_size), uid_offset(_uid_offset),
847  kmers_out(_kmers_out),
848  prefix_uids(_prefix_uids), suffix_uids(_suffix_uids),
849  seeder(_kmer_size, 1u) { }
850 
852  void operator() (const uint32 idx) const
853  {
854  const SequenceSetKmerCoord chain_coord = super_coords[idx];
855  const sequence seq = string_set[chain_coord.x];
856  const uint32 seq_len = seq.length();
857  const uint32 offset = chain_offsets[idx] + uid_offset;
858  const uint32 adj_offset = adj_extraction_offsets[idx];
859 
860  const uint8 unique_suffix = (super_suffix_global_ids[idx] != (uint32) -1);
861  const uint32 n_kmers_to_extract = chain_offsets[idx+1] - chain_offsets[idx];
862 
863  uint32 len = chain_coord.z - chain_coord.y;
864  const uint32 n_kmers = (len >= kmer_size) ? len - kmer_size + 1 : 0; // TODO: remove len check
865 
866  // if everything overlapped or nothing to extract and no possible R-U edge
867  if(chain_overlaps[idx] == n_kmers || (n_kmers_to_extract + unique_suffix == 0)) return;
868 
869  const uint32 kmer_id = (chain_coord.z - unique_suffix) - n_kmers_to_extract - kmer_size + 1;
870  for (uint32 j = 0; j < n_kmers_to_extract; j++) {
871  const uint2 kmer = seeder.seed(seq_len, kmer_id + j);
872  kmers_out[offset + j] = make_uint4(chain_coord.x, kmer.x, kmer.y, chain_coord.w);
873 
874  suffix_uids[adj_offset + j] = offset + j;
875  if(j < n_kmers_to_extract - 1) {
876  prefix_uids[adj_offset + j + 1] = offset + j;
877  } else {
878  if(unique_suffix == 1) {
879  prefix_uids[adj_offset + j + 1] = offset + j; // the last kmer is a prefix only to the RU (k+1)mer
880  }
881  }
882  }
883  if(unique_suffix == 1) {
884  suffix_uids[adj_offset + n_kmers_to_extract] = super_suffix_global_ids[idx];
885  }
886 
887  // find the first prefix
888  if(chain_overlaps[idx] == 0 || chain_overlaps[idx] == 1) {
889  // overlap is just the unique header kmer or none
890  // use the unique id of the kmer
891  prefix_uids[adj_offset] = super_prefix_global_ids[idx];
892  } else {
893  uint32 pred_chain_idx = idx - 1;
894  while(1) {
895  if(chain_overlaps[pred_chain_idx] >= chain_overlaps[idx]) { // not stored by the previous chain // >=2
896  pred_chain_idx--;
897  } else {
898  // ignores the unique predecessor that overlapped and was not extracted
899  prefix_uids[adj_offset] = uid_offset + chain_offsets[pred_chain_idx] + (chain_overlaps[idx] - chain_overlaps[pred_chain_idx]) - 2; // last overlapped kmer
900  break;
901  }
902  }
903  }
904  }
905 };
906 
908 {
910  const uint32* overlaps;
912 
914  count_overlapped_adjacencies(const uint32* _adj_extraction_offsets, const uint32* _overlaps, uint32* _counts):
915  adj_extraction_offsets(_adj_extraction_offsets),
916  overlaps(_overlaps),
917  counts(_counts) { }
918 
920  void operator() (const uint32 idx) const
921  {
922  int32 n_to_update = overlaps[idx] - 1;
923  if(n_to_update <= 0) return;
924 
925  uint32 pred_chain_idx = idx - 1;
926  while(n_to_update > 0) {
927  const int32 pred_overlap = overlaps[pred_chain_idx] - 1;
928  if(pred_overlap >= n_to_update) { // everything is shared with the pred
929  pred_chain_idx--;
930  } else {
931  // update the counts in this pred chain
932  const uint32 n_update_pred = n_to_update - (pred_overlap > 0 ? pred_overlap : 0);
933  const uint32 pred_offset = adj_extraction_offsets[pred_chain_idx];
934  for(uint32 i = 0; i < n_update_pred; i++) {
935  nvbio::atomic_add(&counts[pred_offset + i], 1);
936  }
937  n_to_update -= n_update_pred;
938  }
939  }
940  }
941 };
942 
944 {
951 
953  extract_repeat_adjacencies(const uint32* _node_offsets, const uint32* _prefix_uids,
954  const uint32* _suffix_uids, const uint32* _edge_counts,
955  uint32* _node_adj_map, uint32* _edge_counts_shuffled):
956  node_offsets(_node_offsets),
957  prefix_uids(_prefix_uids),
958  suffix_uids(_suffix_uids),
959  edge_counts(_edge_counts),
960  node_adj_map(_node_adj_map),
961  edge_counts_shuffled(_edge_counts_shuffled){ }
962 
964  void operator() (const uint32 prefix_idx) const
965  {
966  const uint32 from_uid = prefix_uids[prefix_idx];
967  const uint32 offset = node_offsets[from_uid];
968  const uint32 num_edges = node_offsets[from_uid+1] - node_offsets[from_uid];
969 
970  uint32 i = 0;
971  while(i < num_edges) { // at most this many prefixes to handle
972  if(prefix_idx < i) break;
973  const uint32 uid = prefix_uids[prefix_idx - i];
974  if(uid != from_uid) break; // handled all the prefixes with the given UID
975  node_adj_map[offset + i] = suffix_uids[prefix_idx - i];
976  edge_counts_shuffled[offset + i] = edge_counts[prefix_idx - i];
977  i++;
978  }
979  }
980 };
981 
983 {
990 
992  extract_unique_adjacencies(const uint32* _node_offsets, const uint32* _prefix_uids,
993  const uint32* _suffix_uids, const uint32* _edge_counts,
994  uint32* _node_adj_map, uint32* _edge_counts_shuffled):
995  node_offsets(_node_offsets),
996  prefix_uids(_prefix_uids),
997  suffix_uids(_suffix_uids),
998  edge_counts(_edge_counts),
999  node_adj_map(_node_adj_map),
1000  edge_counts_shuffled(_edge_counts_shuffled){ }
1001 
1003  void operator() (const uint32 prefix_idx) const
1004  {
1005  const uint32 from_uid = prefix_uids[prefix_idx];
1006  const uint32 offset_last = node_offsets[from_uid+1] - 1;
1007  const uint32 num_edges = node_offsets[from_uid+1] - node_offsets[from_uid];
1008 
1009  uint32 i = 0;
1010  uint32 n_unique = 0;
1011  while(i < num_edges) { // at most this many prefixes to handle
1012  if(prefix_idx < i) break;
1013  const uint32 uid = prefix_uids[prefix_idx - i];
1014  if(uid != from_uid) break; // handled all the prefixes with the given UID
1015  if(suffix_uids[prefix_idx - i] == (uint32) -1) { // repeat (TODO: fill with -1)
1016  i++;
1017  continue;
1018  }
1019  node_adj_map[offset_last - n_unique] = suffix_uids[prefix_idx - i]; // fill out from the end
1020  edge_counts_shuffled[offset_last - n_unique] = edge_counts[prefix_idx - i];
1021  i++;
1022  n_unique++;
1023  }
1024  }
1025 };
1026 
1027 // converts a coordinate to the corresponding sequence -- used for debugging
1028 template <typename string_set_type>
1030 {
1031  typedef typename string_set_type::string_type sequence;
1032  const string_set_type string_set;
1034  const uint32* offsets;
1036 
1038  super_coord_to_seq(const string_set_type _string_set,
1039  const SequenceSetKmerCoord* _super_coords, const uint32* _offsets,
1040  uint8* _sequences) :
1041  string_set(_string_set), super_coords(_super_coords), offsets(_offsets),
1042  sequences(_sequences) {}
1043 
1045  void operator() (const uint64 idx) const
1046  {
1047  const SequenceSetKmerCoord coord = super_coords[idx];
1048  const sequence seq = string_set[coord.x];
1049  const uint32 seq_pos = coord.y;
1050  const uint32 offset = offsets[idx];
1051 
1052  for (uint32 i = 0; i < coord.z - coord.y; i++) {
1053  uint8 c = dna_to_char(seq[seq_pos + i]);
1054  sequences[offset + i] = c;
1055  }
1056  }
1057 };
1058 
1059 // generate the coordinates of each kmer in the given set
1060 template <typename string_set_type>
1062 {
1063  n_kmers = enumerate_string_set_seeds(
1064  string_set,
1065  uniform_seeds_functor<>(kmer_size, 1u),
1066  coords);
1067 
1068  // record the index into the kmer array in the coordinates vector
1070  thrust::make_counting_iterator<uint32>(0u),
1071  thrust::make_counting_iterator<uint32>(0u) + n_kmers,
1072  set_global_id_region_id(plain_view(coords), plain_view(active_region_ids)));
1073 }
1074 
1075 // generate the 64-bit compacted kmer sequence keys
1076 // assumes the coordinates have already been generated
1077 template <typename string_set_type>
1079 {
1080  kmers_64b.resize(n_kmers);
1082  coords.begin(),
1083  coords.begin() + n_kmers,
1084  kmers_64b.begin(),
1085  kmers_to_64b_functor<string_set_type>(kmer_size, 2, string_set));
1086 }
1087 
1088 // sort the kmers based on the 64-bit kmer sequence key only
1089 // assumes the keys have already been generated
1090 template <typename string_set_type>
1092 {
1093  // identical keys from the same sequence will be consecutive
1094  // identical keys from the same region will also be consecutive
1095  thrust::stable_sort_by_key(
1096  kmers_64b.begin(),
1097  kmers_64b.begin() + n_kmers,
1098  coords.begin());
1099 }
1100 
1101 // stable sort the kmers based on the 64-bit kmer sequence key
1102 // assumes the kmers are sorted by seq id only already
1103 // assumes the keys have already been generated
1104 template <typename string_set_type>
1105 template <typename meta_iterator_type>
1106 void D_KmerSet<string_set_type>::sort_kmers_by_64b_keys_meta(const meta_iterator_type meta_data)
1107 {
1108  thrust::stable_sort_by_key(
1109  kmers_64b.begin(),
1110  kmers_64b.begin() + n_kmers,
1111  thrust::make_zip_iterator(thrust::make_tuple(coords.begin(), meta_data)));
1112 }
1113 
1114 // sort the kmers based on the 64-bit kmer sequence key by region
1115 // assumes the keys have already been generated
1116 template <typename string_set_type>
1118 {
1119  // identical keys from the same sequence will be consecutive
1120  // identical keys from the same region will also be consecutive
1121  thrust::stable_sort_by_key(
1122  kmers_64b.begin(),
1123  kmers_64b.begin() + n_kmers,
1124  coords.begin());
1125 
1126  D_VectorU32::iterator region_ids = get_scratch_space(n_kmers);
1128  coords.begin(),
1129  coords.begin() + n_kmers,
1130  region_ids,
1131  get_kmer_reg_id());
1132 
1133  thrust::stable_sort_by_key(
1134  region_ids,
1135  region_ids + n_kmers,
1136  thrust::make_zip_iterator(thrust::make_tuple(coords.begin(), kmers_64b.begin())));
1137 
1138  reset_scratch_space();
1139 }
1140 
1141 // sort kmers by key and set sequence id
1142 // doing 2-pass sort to use the much faster thrust radix-sort
1143 // instead of merge-sort on custom keys
1144 template <typename string_set_type>
1146 {
1147  thrust::stable_sort_by_key(
1148  kmers_64b.begin(),
1149  kmers_64b.begin() + n_kmers,
1150  coords.begin());
1151  D_VectorU32 seq_ids(n_kmers);
1153  coords.begin(),
1154  coords.begin() + n_kmers,
1155  seq_ids.begin(),
1156  get_kmer_seq_id());
1157  thrust::stable_sort_by_key(
1158  seq_ids.begin(),
1159  seq_ids.begin() + n_kmers,
1160  thrust::make_zip_iterator(thrust::make_tuple(kmers_64b.begin(), coords.begin())));
1161 }
1162 
1163 // sort kmers by key and set sequence id
1164 // sort the additional meta-data along with the kmers
1165 // doing 2-pass sort to use the much faster thrust radix-sort
1166 // instead of merge-sort on custom keys
1167 template <typename string_set_type>
1168 template <typename meta_iterator_type>
1170 {
1171  thrust::stable_sort_by_key(
1172  kmers_64b.begin(),
1173  kmers_64b.begin() + n_kmers,
1174  thrust::make_zip_iterator(thrust::make_tuple(coords.begin(), meta_data)));
1175  D_VectorU32 seq_ids(n_kmers);
1177  coords.begin(),
1178  coords.begin() + n_kmers,
1179  seq_ids.begin(),
1180  get_kmer_seq_id());
1181  thrust::stable_sort_by_key(
1182  seq_ids.begin(),
1183  seq_ids.begin() + n_kmers,
1184  thrust::make_zip_iterator(thrust::make_tuple(kmers_64b.begin(), coords.begin(), meta_data)));
1185 }
1186 
1187 // sort kmers in lexicographic order - sorts the indices and returns the iterator
1188 // kmer length can vary
1189 // if kmer_size is set, all kmers are assumed to have this size
1190 template <typename string_set_type>
1191 D_VectorU32::iterator sort_kmers_lexicographic(const string_set_type string_set,
1192  D_VectorSetKmerCoord& coords,
1193  const uint32 n_kmers, const uint32 fixed_kmer_size,
1194  D_VectorU32& indices)
1195 {
1196  const uint32 WORD_BITS = uint32(8u * sizeof(uint32));
1197  const uint32 SYMBOL_SIZE = 4u;
1198  const uint32 SYMBOLS_PER_WORD = WORD_BITS/SYMBOL_SIZE;
1199 
1200  // maximum size of a kmer in the set
1201  uint32 max_length = fixed_kmer_size;
1202  if(fixed_kmer_size == 0u) {
1203  max_length = thrust::reduce(
1205  thrust::make_transform_iterator(coords.begin() + n_kmers, get_kmer_size()),
1206  0u,
1207  thrust::maximum<uint32>());
1208  }
1209 
1210  // maximum number of words needed to represent a kmer
1211  const uint32 max_words = (max_length + SYMBOLS_PER_WORD-1)/SYMBOLS_PER_WORD;
1212 
1213  // ----- LSD radix-sort (word by word) -----
1214  //uint32 n_kmers = coords.size();
1215  D_VectorU32 radices(2*n_kmers); // kmer words (sort keys)
1216  indices.resize(2*n_kmers); // kmer ids (values to be sorted)
1217  thrust::copy(
1218  thrust::make_counting_iterator<uint32>(0u),
1219  thrust::make_counting_iterator<uint32>(n_kmers),
1220  indices.begin());
1221 
1222  // ping-pong buffers
1224  cuda::SortEnactor sort_enactor;
1225  sort_buffers.selector = 0;
1226  sort_buffers.keys[0] = nvbio::device_view(radices);
1227  sort_buffers.keys[1] = nvbio::device_view(radices) + n_kmers;
1228  sort_buffers.values[0] = nvbio::device_view(indices);
1229  sort_buffers.values[1] = nvbio::device_view(indices) + n_kmers;
1230 
1231  // sort in LSD order
1232  for (int32 word_idx = max_words-1; word_idx >= 0; --word_idx) {
1233  // extract the given radix word from each kmer
1235  indices.begin() + sort_buffers.selector * n_kmers,
1236  indices.begin() + sort_buffers.selector * n_kmers + n_kmers,
1237  radices.begin() + sort_buffers.selector * n_kmers,
1239  string_set,
1240  SYMBOL_SIZE,
1241  plain_view(coords),
1242  word_idx));
1243  // sort the words
1244  sort_enactor.sort(n_kmers, sort_buffers);
1245  }
1246 
1247  return indices.begin() + sort_buffers.selector * n_kmers;
1248 }
1249 
1250 template <typename string_set_type>
1251 void segmented_sort_super_kmers_lexicographic(const string_set_type string_set, uint32 n_coords,
1252  D_VectorSetKmerCoord& super_coords, D_VectorU32& super_prefix_ids, D_VectorU32& super_suffix_ids)
1253 {
1254  D_VectorU32 ids(n_coords);
1255  D_VectorSetKmerCoord temp(super_coords);
1256  D_VectorU32 temp_pref(super_prefix_ids);
1257  D_VectorU32 temp_suf(super_suffix_ids);
1258 
1259  // first pass: sort by coord key
1260  D_VectorU32::iterator sorted_ids = sort_kmers_lexicographic(string_set, super_coords, n_coords, 0u, ids);
1261  thrust::gather(
1262  sorted_ids,
1263  sorted_ids + n_coords,
1264  temp.begin(),
1265  super_coords.begin());
1266  thrust::gather(
1267  sorted_ids,
1268  sorted_ids + n_coords,
1269  thrust::make_zip_iterator(thrust::make_tuple(temp_pref.begin(), temp_suf.begin())),
1270  thrust::make_zip_iterator(thrust::make_tuple(super_prefix_ids.begin(), super_suffix_ids.begin())));
1271 
1272  // second pass: stable sort by region id => as a result coord keys will be sorted by region
1273  //D_VectorU32 reg_ids(n_coords);
1275  super_coords.begin(),
1276  super_coords.begin() + n_coords,
1277  ids.begin(),
1278  get_kmer_reg_id());
1279 
1280  thrust::stable_sort_by_key(
1281  ids.begin(),
1282  ids.begin() + n_coords,
1283  thrust::make_zip_iterator(thrust::make_tuple(super_coords.begin(), super_prefix_ids.begin(), super_suffix_ids.begin())));
1284 }
1285 
1286 // count the kmer occurrences across all the sequences in the set
1287 // assumes that the keys have been sorted
1288 template <typename string_set_type>
1290 {
1291  // count the number of distinct keys and extract them
1292  kmers_64b_distinct.resize(n_kmers);
1293  kmers_counts.resize(n_kmers);
1294  D_VectorU8 d_temp_storage;
1295  n_distinct = cuda::runlength_encode(
1296  n_kmers,
1297  kmers_64b.begin(),
1298  kmers_64b_distinct.begin(),
1299  kmers_counts.begin(),
1300  d_temp_storage);
1301 }
1302 
1303 // count the kmer occurrences across all the sequences in the set
1304 // assumes that the keys have been sorted
1305 template <typename string_set_type>
1307 {
1308  // count the number of distinct keys and extract them
1309  kmers_64b_unique_idxs.resize(n_kmers);
1310  kmers_counts.resize(n_kmers);
1311  n_unique = thrust::reduce_by_key(
1312  thrust::counting_iterator<uint32>(0),
1313  thrust::counting_iterator<uint32>(0) + n_kmers,
1314  thrust::constant_iterator<uint32>(1),
1315  kmers_64b_unique_idxs.begin(),
1316  kmers_counts.begin(),
1317  kmer_key_rid_eq(plain_view(kmers_64b), plain_view(coords))).first - kmers_64b_unique_idxs.begin();
1318 }
1319 
1320 // partition kmers by index into the sorted kmer key set
1321 // based on whether they are unique or not
1322 // a kmer is not unique if it occurs more than once in at least one sequence
1323 template <typename string_set_type>
1325 {
1326  // count the number of distinct kmers per sequence
1327  D_VectorU32::iterator distinct_idxs_per_seq = get_scratch_space(n_kmers);
1328  D_VectorU32::iterator count_per_seq = get_scratch_space(n_kmers);
1329  thrust::counting_iterator<uint64> ids(0);
1330  uint64 n_distinct_per_seq = thrust::reduce_by_key(
1331  ids,
1332  ids + n_kmers,
1333  thrust::constant_iterator<uint32>(1),
1334  distinct_idxs_per_seq,
1335  count_per_seq,
1336  kmer_key_sid_eq(plain_view(kmers_64b), plain_view(coords))).first - distinct_idxs_per_seq;
1337 
1338  D_VectorU32::iterator distinct_idxs = get_scratch_space(n_distinct_per_seq);
1339  D_VectorU32::iterator seq_count = get_scratch_space(n_distinct_per_seq); // number of sequences containing the same kmer
1340  D_VectorU32::iterator kmer_counts = get_scratch_space(n_distinct_per_seq); // number of kmer occurrences across all sequences
1341  n_distinct = thrust::reduce_by_key(
1342  distinct_idxs_per_seq,
1343  distinct_idxs_per_seq + n_distinct_per_seq,
1344  thrust::make_zip_iterator(thrust::make_tuple(thrust::constant_iterator<uint32>(1), count_per_seq)),
1345  distinct_idxs,
1346  thrust::make_zip_iterator(thrust::make_tuple(seq_count, kmer_counts)),
1347  kmer_key_rid_eq(plain_view(kmers_64b), plain_view(coords)),
1348  kmer_count_tuple_sum()).first - distinct_idxs;
1349 
1350  // partition the distinct indices into unique and non-unique kmers
1351  kmers_64b_unique_idxs.resize(n_distinct);
1352  kmers_counts.resize(n_distinct);
1353  n_unique = thrust::copy_if(
1354  thrust::make_zip_iterator(thrust::make_tuple(distinct_idxs, kmer_counts)),
1355  thrust::make_zip_iterator(thrust::make_tuple(distinct_idxs + n_distinct, kmer_counts + n_distinct)),
1356  thrust::make_zip_iterator(thrust::make_tuple(seq_count, kmer_counts)),
1357  thrust::make_zip_iterator(thrust::make_tuple(kmers_64b_unique_idxs.begin(), kmers_counts.begin())),
1358  is_unique_kmer()) - thrust::make_zip_iterator(thrust::make_tuple(kmers_64b_unique_idxs.begin(), kmers_counts.begin()));
1359 
1360  reset_scratch_space();
1361 }
1362 
1363 template <typename string_set_type>
1365 {
1366  D_VectorU8 unique_map_sorted(n_kmers);
1368  kmers_64b_unique_idxs.begin(),
1369  kmers_64b_unique_idxs.begin() + n_unique,
1370  mark_kmer_uniqueness(plain_view(unique_map_sorted), plain_view(kmers_64b), plain_view(coords)));
1371 
1372  global_unique_flag_map.resize(n_kmers);
1373  thrust::gather(
1374  global_to_sorted_id_map.begin(),
1375  global_to_sorted_id_map.begin() + n_kmers,
1376  unique_map_sorted.begin(),
1377  global_unique_flag_map.begin());
1378 }
1379 
1380 template <typename string_set_type>
1382 {
1383  D_VectorU32 id_map_sorted(n_kmers);
1384  thrust::fill(id_map_sorted.begin(), id_map_sorted.begin() + n_kmers, (uint32) -1);
1386  thrust::make_zip_iterator(thrust::make_tuple(thrust::make_counting_iterator<uint32>(0u), kmers_64b_unique_idxs.begin())),
1387  thrust::make_zip_iterator(thrust::make_tuple(thrust::make_counting_iterator<uint32>(0u) + n_unique, kmers_64b_unique_idxs.begin() + n_unique)),
1388  store_kmer_unique_ids(plain_view(id_map_sorted), plain_view(kmers_64b), plain_view(coords)));
1389 
1390  global_to_UID_map.resize(n_kmers);
1391  thrust::gather(
1392  global_to_sorted_id_map.begin(),
1393  global_to_sorted_id_map.begin() + n_kmers,
1394  id_map_sorted.begin(),
1395  global_to_UID_map.begin());
1396 }
1397 
1398 template <typename string_set_type>
1400 {
1401  global_to_sorted_id_map.resize(n_kmers);
1403  thrust::make_zip_iterator(thrust::make_tuple(thrust::make_counting_iterator<uint32>(0u), coords.begin())),
1404  thrust::make_zip_iterator(thrust::make_tuple(thrust::make_counting_iterator<uint32>(0u) + n_kmers, coords.begin() + n_kmers)),
1405  global_to_sorted_id(plain_view(global_to_sorted_id_map)));
1406 }
1407 
1408 template <typename string_set_type>
1410 {
1411  global_unique_flag_map.resize(n_kmers);
1412  global_to_UID_map.resize(n_kmers);
1413  thrust::fill(global_to_UID_map.begin(), global_to_UID_map.begin() + n_kmers, (uint32) -1);
1415  thrust::make_counting_iterator<uint32>(0u),
1416  thrust::make_counting_iterator<uint32>(0u) + n_unique,
1417  populate_unique_kmer_data(plain_view(kmers_counts), plain_view(coords), plain_view(kmers_64b_unique_idxs),
1418  plain_view(global_unique_flag_map), plain_view(global_to_UID_map)));
1419 }
1420 
1421 
1422 struct test_funct : public thrust::unary_function<uint32,uint32>
1423 {
1426  {
1427  return uid;
1428  }
1429 };
1430 
1431 // only keep the coordinates that have a unique prefix
1432 template <typename string_set_type>
1434 {
1435  D_VectorSetKmerCoord unique_pref_coords(n_kmers);
1436  uint32 n_unique_coords = thrust::copy_if( // remove_if
1437  coords.begin(),
1438  coords.begin() + n_kmers,
1439  thrust::make_permutation_iterator(
1440  unique_map.begin(),
1442  unique_pref_coords.begin(),
1443  thrust::identity<uint8>()) - unique_pref_coords.begin();
1444 
1445  thrust::copy(unique_pref_coords.begin(), unique_pref_coords.begin() + n_unique_coords, coords.begin());
1446  n_kmers = n_unique_coords;
1447 }
1448 
1449 // for every kmer that is not unique N
1450 // find its closest unique predecessor P and successor S kmer in the same sequence
1451 // and output a kmer chain (P ->... N ... -> S)
1452 // as a new 'super' kmer coordinate
1453 template <typename string_set_type>
1455 {
1456  // collect the ids of the repeat kmers
1457  kmers_64b_repeat_idxs.resize(n_kmers);
1458  uint32 n_chains = thrust::copy_if(
1459  thrust::make_counting_iterator<uint32>(0u),
1460  thrust::make_counting_iterator<uint32>(0u) + n_kmers,
1461  global_unique_flag_map.begin(),
1462  kmers_64b_repeat_idxs.begin(),
1463  thrust::logical_not<uint8>()) - kmers_64b_repeat_idxs.begin();
1464 
1465  D_VectorSetKmerCoord super_kmers(n_chains);
1466  D_VectorU32 prefix_uids(n_chains);
1467  D_VectorU32 suffix_uids(n_chains);
1469  thrust::make_counting_iterator(0),
1470  thrust::make_counting_iterator(0) + n_chains,
1472  plain_view(kmers_64b_repeat_idxs),
1473  plain_view(global_unique_flag_map),
1474  plain_view(global_to_UID_map),
1475  plain_view(global_to_sorted_id_map),
1476  plain_view(coords),
1477  string_set,
1478  kmer_size,
1479  plain_view(super_kmers),
1480  plain_view(prefix_uids),
1481  plain_view(suffix_uids)));
1482 
1483  // collapse chains that belong to the same sequence and same unique kmer predecessor
1484  // (these will be consecutive by construction)
1485  // keep the max chain length coord per same kmer and seq id
1486  // -- will keep one copy of the chains that is terminated with a unique kmer
1487  // and the longest chain amongst the ones that are not
1488  super_coords.resize(n_chains);
1489  super_prefix_uids.resize(n_chains);
1490  super_suffix_uids.resize(n_chains);
1491  uint32 n_collapsed_chains = thrust::reduce_by_key(
1492  thrust::make_counting_iterator(0u),
1493  thrust::make_counting_iterator(0u) + n_chains,
1494  thrust::make_zip_iterator(thrust::make_tuple(super_kmers.begin(), prefix_uids.begin(), suffix_uids.begin())),
1495  thrust::make_discard_iterator(),
1496  thrust::make_zip_iterator(thrust::make_tuple(super_coords.begin(), super_prefix_uids.begin(), super_suffix_uids.begin())),
1497  kmer_pos_sid_eq(plain_view(super_kmers)),
1498  collapse_same_start_kmers()).second - thrust::make_zip_iterator(thrust::make_tuple(super_coords.begin(), super_prefix_uids.begin(), super_suffix_uids.begin()));
1499 
1500  // eliminate non-unique kmers that are not preceded by a unique kmer
1501  D_VectorSetKmerCoord t1(n_chains);
1502  D_VectorU32 t2(n_chains);
1503  D_VectorU32 t3(n_chains);
1504  n_super_coords = thrust::remove_copy_if(
1505  thrust::make_zip_iterator(thrust::make_tuple(
1506  super_coords.begin(),
1507  super_prefix_uids.begin(),
1508  super_suffix_uids.begin())),
1509  thrust::make_zip_iterator(thrust::make_tuple(
1510  super_coords.begin() + n_collapsed_chains,
1511  super_prefix_uids.begin() + n_collapsed_chains,
1512  super_suffix_uids.begin() + n_collapsed_chains)),
1513  super_prefix_uids.begin(),
1514  thrust::make_zip_iterator(thrust::make_tuple(
1515  t1.begin(),
1516  t2.begin(),
1517  t3.begin())),
1518  is_invalid_super_kmer()) - thrust::make_zip_iterator(thrust::make_tuple(
1519  t1.begin(),
1520  t2.begin(),
1521  t3.begin()));
1522 
1523 
1524  thrust::copy(t1.begin(), t1.begin() + n_chains, super_coords.begin());
1525  thrust::copy(t2.begin(), t2.begin() + n_chains, super_prefix_uids.begin());
1526  thrust::copy(t3.begin(), t3.begin() + n_chains, super_suffix_uids.begin());
1527 
1528  //super_coords.erase(super_coords.begin() + n_super_coords, super_coords.end());
1529  //super_prefix_uids.erase(super_prefix_uids.begin() + n_super_coords, super_prefix_uids.end());
1530  //super_suffix_uids.erase(super_suffix_uids.begin() + n_super_coords, super_suffix_uids.end());
1531 
1532 // printf("Collapsed Filtered Super Kmer Coords: \n");
1533 // for(uint64 i = 0; i < n_collapsed_chains_filtered; i++) {
1534 // SequenceSetKmerCoord c = (SequenceSetKmerCoord) super_coords[i];
1535 // printf("coord [%u %u %u %u] pref %llu suf %llu\n", c.x, c.y, c.z, c.w, (uint64) super_prefix_global_ids[i],
1536 // (uint64) super_suffix_global_ids[i]);
1537 // }
1538 
1539  reset_scratch_space();
1540 }
1541 
1542 // collapses lexicographically sorted chains of kmers by computing their prefix overlaps
1543 // and extracting kmers not included in the overlapping region
1544 // along with kmer adjacencies and adjacency count information
1545 template <typename string_set_type>
1547  D_VectorU32& prefix_ids_out, D_VectorU32& suffix_ids_out, D_VectorU32& adj_counts)
1548 {
1549  uint32 n_super_kmers = n_super_coords;
1550 
1551  // find overlap with the previous chain (in sorted order)
1552  // overlap is measured in # of shared prefix kmers
1553  D_VectorU32 max_predecessor_overlaps(n_super_kmers);
1555  thrust::make_counting_iterator<uint32>(1u), // skip the first chain
1556  thrust::make_counting_iterator<uint32>(0u) + n_super_kmers,
1557  max_predecessor_overlaps.begin() + 1,
1558  find_max_kmer_overlaps<string_set_type>(string_set, plain_view(super_coords), kmer_size));
1559 
1560  // find the number of kmer nodes that need to be created
1561  // the first kmer in a chain is a unique kmer and should be ignored
1562  // the last kmer in the chain might be unique, in which case it is also ignored
1563  thrust::device_vector<uint8> temp_storage;
1564  D_VectorU32 chain_extraction_offsets(n_super_kmers + 1);
1566  n_super_kmers,
1568  thrust::make_counting_iterator<uint32>(0u),
1570  plain_view(super_coords),
1571  plain_view(super_suffix_uids),
1572  plain_view(max_predecessor_overlaps), kmer_size)),
1573  chain_extraction_offsets.begin() + 1,
1574  thrust::plus<uint32>(),
1575  temp_storage);
1576 
1577  D_VectorU32 adj_extraction_offsets(n_super_kmers + 1);
1579  n_super_kmers,
1581  thrust::make_counting_iterator<uint32>(0u),
1583  plain_view(super_coords),
1584  plain_view(super_suffix_uids),
1585  plain_view(max_predecessor_overlaps), kmer_size)),
1586  adj_extraction_offsets.begin() + 1,
1587  thrust::plus<uint32>(),
1588  temp_storage);
1589 
1590  // extract non-unique kmers and the (k+1)mer prefix and suffix kmer ids from the collapsed chains
1591  n_repeat = chain_extraction_offsets[n_super_kmers];
1592  uint32 n_prefsuf_to_extract = adj_extraction_offsets[n_super_kmers];
1593 
1594  //printf("n_super_coords %u \n", n_super_coords);
1595  //printf("n_repeat %u \n", n_repeat);
1596  //printf("Num X-R-X edges %u \n", n_prefsuf_to_extract);
1597 
1598  // allocate the necessary storage and populate
1599  kmers_out.resize(n_unique + n_repeat);
1600  prefix_ids_out.resize(n_prefsuf_to_extract);
1601  suffix_ids_out.resize(n_prefsuf_to_extract);
1603  thrust::make_counting_iterator<uint32>(0u),
1604  thrust::make_counting_iterator<uint32>(0u) + n_super_kmers,
1606  string_set,
1607  plain_view(super_coords),
1608  plain_view(super_prefix_uids),
1609  plain_view(super_suffix_uids),
1610  plain_view(chain_extraction_offsets),
1611  plain_view(adj_extraction_offsets),
1612  plain_view(max_predecessor_overlaps),
1613  kmer_size, n_unique,
1614  plain_view(kmers_out),
1615  plain_view(prefix_ids_out),
1616  plain_view(suffix_ids_out)));
1617 
1618 
1619  // count the number of repeated kmer adjacencies
1620  adj_counts.resize(n_prefsuf_to_extract);
1621  thrust::fill(adj_counts.begin(), adj_counts.begin() + n_prefsuf_to_extract, 1);
1623  thrust::make_counting_iterator<uint32>(1u),
1624  thrust::make_counting_iterator<uint32>(0u) + n_super_kmers,
1626  plain_view(adj_extraction_offsets),
1627  plain_view(max_predecessor_overlaps),
1628  plain_view(adj_counts)));
1629 
1630 }