Fermat
knn_inline.h
1 /*
2  * Copyright (c) 2010-2018, NVIDIA Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or far materials provided with the distribution.
12  * * Neither the name of NVIDIA Corporation nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #include <cugar/basic/cuda/arch.h>
29 
30 //#define KD_KNN_STATS
31 #ifdef KD_KNN_STATS
32 #define KD_KNN_STATS_DEF(type,name,value) type name = value;
33 #define KD_KNN_STATS_ADD(name,value) name += value;
34 #else
35 #define KD_KNN_STATS_DEF(type,name,value)
36 #define KD_KNN_STATS_ADD(name,value)
37 #endif
38 
39 namespace cugar {
40 namespace cuda {
41 
42 namespace kd_knn {
43 
44 #define KNN_LOOKUP_BLOCK_SIZE 64
45 #define KNN_LOOKUP_CTA_BLOCKS 32
46 
47 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
48 float comp(const float2 v, const uint32 i)
49 {
50  return i == 0 ? v.x : v.y;
51 }
52 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
53 void set_comp(float2& v, const uint32 i, const float t)
54 {
55  if (i == 0) v.x = t;
56  else v.y = t;
57 }
58 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
59 float sq_length(const float2 v)
60 {
61  return v.x*v.x + v.y*v.y;
62 }
63 
64 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
65 float comp(const float3 v, const uint32 i)
66 {
67  return
68  i == 0 ? v.x :
69  i == 1 ? v.y :
70  v.z;
71 }
72 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
73 void set_comp(float3& v, const uint32 i, const float t)
74 {
75  if (i == 0) v.x = t;
76  else if (i == 1) v.y = t;
77  else v.z = t;
78 }
79 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
80 float sq_length(const float3 v)
81 {
82  return v.x*v.x + v.y*v.y + v.z*v.z;
83 }
84 
85 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
86 float comp(const float4 v, const uint32 i)
87 {
88  return i < 2 ?
89  (i == 0 ? v.x : v.y) :
90  (i == 2 ? v.z : v.w);
91 }
92 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
93 void set_comp(float4& v, const uint32 i, const float t)
94 {
95  if (i == 0) v.x = t;
96  else if (i == 1) v.y = t;
97  else if (i == 2) v.z = t;
98  else v.w = t;
99 }
100 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE
101 float sq_length(const float4 v)
102 {
103  return v.x*v.x + v.y*v.y + v.z*v.z+ v.w*v.w;
104 }
105 
106 template <typename VectorType, typename PointIterator>
107 __device__ void lookup_2d(
108  const VectorType query,
109  const Kd_node* kd_nodes,
110  const uint2* kd_ranges,
111  const uint2* kd_leaves,
112  const PointIterator kd_points,
113  Kd_knn<2>::Result* results)
114 {
115  //
116  // 1-st pass: find the leaf containing the query point and compute an upper bound
117  // on the search distance
118  //
119 
120  uint32 idx;
121  float dist2 = 1.0e16f;
122 
123  // start from the root node
124  uint32 node_index = 0;
125 
126  // keep track of which leaf we visited
127  uint32 first_leaf = 0;
128 
129  while (1)
130  {
131  const Kd_node node = Kd_node::load_ldg( kd_nodes + node_index );
132 
133  if (node.is_leaf())
134  {
135  // find the closest neighbor in this leaf
136  const uint2 leaf = __ldg( &kd_leaves[ node.get_leaf_index() ] );
137  for (uint32 i = leaf.x; i < leaf.y; ++i)
138  {
139  const VectorType delta = kd_points[i] - query;
140  const float d2 = delta[0]*delta[0] + delta[1]*delta[1];
141  if (dist2 > d2)
142  {
143  dist2 = d2;
144  idx = i;
145  }
146  }
147 
148  // keep track of which leaf we found
149  first_leaf = node_index;
150  break;
151  }
152  else
153  {
154  const uint32 split_dim = node.get_split_dim();
155  const float split_plane = node.get_split_plane();
156 
157  node_index = node.get_child_offset() + (comp( query, split_dim ) < split_plane ? 0u : 1u);
158  }
159  }
160 
161  //
162  // 2-nd pass: visit the tree with a stack and careful pruning
163  //
164 
165  int32 stackp = 1;
166  float4 stack[64];
167 
168  // place a sentinel node in the stack
169  stack[0] = make_float4( 0.0f, 0.0f, 0.0f, binary_cast<float>(uint32(-1)) );
170 
171  // start from the root node
172  node_index = 0;
173 
174  float2 cdist = make_float2( 0.0f, 0.0f );
175 
176  while (node_index != uint32(-1))
177  {
178  const Kd_node node = Kd_node::load_ldg( kd_nodes + node_index );
179 
180  if (node.is_leaf())
181  {
182  if (first_leaf != node_index)
183  {
184  // find the closest neighbor in this leaf
185  const uint2 leaf = __ldg( &kd_leaves[ node.get_leaf_index() ] );
186  for (uint32 i = leaf.x; i < leaf.y; ++i)
187  {
188  const VectorType delta = kd_points[i] - query;
189  const float d2 = delta[0]*delta[0] + delta[1]*delta[1];
190  if (dist2 > d2)
191  {
192  dist2 = d2;
193  idx = i;
194  }
195  }
196  }
197 
198  // pop the next node from the stack
199  while (stackp > 0)
200  {
201  const float4 stack_node = stack[ --stackp ];
202  node_index = binary_cast<uint32>( stack_node.w );
203  cdist = make_float2( stack_node.x, stack_node.y );
204 
205  if (sq_length( cdist ) < dist2)
206  break;
207  }
208  }
209  else
210  {
211  const uint32 split_dim = node.get_split_dim();
212  const float split_plane = node.get_split_plane();
213 
214  const float split_dist = comp( query, split_dim ) - split_plane;
215 
216  const uint32 select = split_dist <= 0.0f ? 0u : 1u;
217 
218  node_index = node.get_child_offset() + select;
219 
220  // compute the vector distance to the far node
221  float2 cdist_far = cdist;
222  set_comp( cdist_far, split_dim, split_dist );
223 
224  // check whether we should push the far node on the stack
225  const float dist_far2 = sq_length( cdist_far );
226 
227  if (dist_far2 < dist2)
228  {
229  stack[ stackp++ ] = make_float4(
230  cdist_far.x,
231  cdist_far.y,
232  0,
233  binary_cast<float>( node.get_child_offset() + 1u - select ) );
234  }
235  }
236  }
237 
238  // write the result
239  results->index = idx;
240  results->dist2 = dist2;
241 }
242 
243 template <typename VectorType, typename PointIterator>
244 __device__ void lookup_3d(
245  const VectorType query,
246  const Kd_node* kd_nodes,
247  const uint2* kd_ranges,
248  const uint2* kd_leaves,
249  const PointIterator kd_points,
250  Kd_knn<3>::Result* results)
251 {
252  //
253  // 1-st pass: find the leaf containing the query point and compute an upper bound
254  // on the search distance
255  //
256 
257  uint32 idx;
258  float dist2 = 1.0e16f;
259 
260  // start from the root node
261  uint32 node_index = 0;
262 
263  // keep track of which leaf we visited
264  uint32 first_leaf = 0;
265 
266  while (1)
267  {
268  const Kd_node node = Kd_node::load_ldg( kd_nodes + node_index );
269 
270  if (node.is_leaf())
271  {
272  // find the closest neighbor in this leaf
273  const uint2 leaf = __ldg( &kd_leaves[ node.get_leaf_index() ]);
274  for (uint32 i = leaf.x; i < leaf.y; ++i)
275  {
276  const VectorType delta = kd_points[i] - query;
277  const float d2 = delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2];
278  if (dist2 > d2)
279  {
280  dist2 = d2;
281  idx = i;
282  }
283  }
284 
285  // keep track of which leaf we found
286  first_leaf = node_index;
287  break;
288  }
289  else
290  {
291  const uint32 split_dim = node.get_split_dim();
292  const float split_plane = node.get_split_plane();
293 
294  node_index = node.get_child_offset() + (comp( query, split_dim ) < split_plane ? 0u : 1u);
295  }
296  }
297 
298  //
299  // 2-nd pass: visit the tree with a stack and careful pruning
300  //
301 
302  int32 stackp = 1;
303  float4 stack[64];
304 
305  // place a sentinel node in the stack
306  stack[0] = make_float4( 0.0f, 0.0f, 0.0f, binary_cast<float>(uint32(-1)) );
307 
308  // start from the root node
309  node_index = 0;
310 
311  float3 cdist = make_float3( 0.0f, 0.0f, 0.0f );
312 
313  while (node_index != uint32(-1))
314  {
315  const Kd_node node = Kd_node::load_ldg( kd_nodes + node_index );
316 
317  if (node.is_leaf())
318  {
319  if (first_leaf != node_index)
320  {
321  // find the closest neighbor in this leaf
322  const uint2 leaf = __ldg( &kd_leaves[ node.get_leaf_index() ] );
323  for (uint32 i = leaf.x; i < leaf.y; ++i)
324  {
325  const VectorType delta = kd_points[i] - query;
326  const float d2 = delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2];
327  if (dist2 > d2)
328  {
329  dist2 = d2;
330  idx = i;
331  }
332  }
333  }
334 
335  // pop the next node from the stack
336  while (stackp > 0)
337  {
338  const float4 stack_node = stack[ --stackp ];
339  node_index = binary_cast<uint32>( stack_node.w );
340  cdist = make_float3( stack_node.x, stack_node.y, stack_node.z );
341 
342  if (sq_length( cdist ) < dist2)
343  break;
344  }
345  }
346  else
347  {
348  const uint32 split_dim = node.get_split_dim();
349  const float split_plane = node.get_split_plane();
350 
351  const float split_dist = comp( query, split_dim ) - split_plane;
352 
353  const uint32 select = split_dist <= 0.0f ? 0u : 1u;
354 
355  node_index = node.get_child_offset() + select;
356 
357  // compute the vector distance to the far node
358  float3 cdist_far = cdist;
359  set_comp( cdist_far, split_dim, split_dist );
360 
361  // check whether we should push the far node on the stack
362  const float dist_far2 = sq_length( cdist_far );
363 
364  if (dist_far2 < dist2)
365  {
366  stack[ stackp++ ] = make_float4(
367  cdist_far.x,
368  cdist_far.y,
369  cdist_far.z,
370  binary_cast<float>( node.get_child_offset() + 1u - select ) );
371  }
372  }
373  }
374 
375  // write the result
376  results->index = idx;
377  results->dist2 = dist2;
378 }
379 
380 struct Compare
381 {
382  CUGAR_FORCEINLINE CUGAR_HOST_DEVICE bool operator() (const float2 op1, const float2 op2) const
383  {
384  return (op1.y == op2.y) ?
385  op1.x < op2.x :
386  op1.y < op2.y;
387  }
388 };
389 
390 #if 0
391 CUGAR_FORCEINLINE CUGAR_HOST_DEVICE float norm3(const float4 v) { return v.x*v.x + v.y*v.y + v.z*v.z; }
392 
393 #define KNN_REORDER_STACK(LhsIndex, RhsIndex) \
394 if (norm3( stack[stackp - (LhsIndex)] ) < \
395  norm3( stack[stackp - (RhsIndex)] )) \
396  { \
397  const float4 tmpv = stack[stackp - (LhsIndex)]; \
398  stack[stackp - (LhsIndex)] = stack[stackp - (RhsIndex)]; \
399  stack[stackp - (RhsIndex)] = tmpv; \
400  }
401 #else
402 #define KNN_REORDER_STACK(LhsIndex, RhsIndex)
403 #endif
404 
405 template <uint32 K>
407 {
408  const static uint32 M = 8;
409 };
410 
411 template <uint32 K, typename VectorType, typename PointIterator>
412 __device__ void lookup_2d(
413  const VectorType query,
414  const Kd_node* kd_nodes,
415  const uint2* kd_ranges,
416  const uint2* kd_leaves,
417  const PointIterator kd_points,
418  Kd_knn<3>::Result* results)
419 {
420  KD_KNN_STATS_DEF(uint32, leaf_tests, 0u);
421  KD_KNN_STATS_DEF(uint32, point_tests, 0u);
422  KD_KNN_STATS_DEF(uint32, node_pops, 0u);
423  KD_KNN_STATS_DEF(uint32, node_pushes, 0u);
424 
425  //
426  // 1-st pass: find the smallest node containing the query point and compute an upper bound
427  // on the search distance
428  //
429  typedef vector_view<float2*> queue_vector_type;
430  float2 queue_storage[K+1];
431  queue_vector_type queue_vector(0,queue_storage);
433  float max_dist2 = 1.0e16f;
434 
435  // start from the root node
436  uint32 node_index = 0;
437 
438  // keep track of which node we visited
439  uint32 entry_subtree = 0;
440 
441  while (1)
442  {
443  const Kd_node node = Kd_node::load_ldg( kd_nodes + node_index );
444  const uint2 range = kd_ranges[ node_index ];
445 
446  if (range.y - range.x < K)
447  break;
448 
449  entry_subtree = node_index;
450 
451  if (node.is_leaf())
452  break;
453  else
454  {
455  const uint32 split_dim = node.get_split_dim();
456  const float split_plane = node.get_split_plane();
457 
458  node_index = node.get_child_offset() + (comp( query, split_dim ) < split_plane ? 0u : 1u);
459  }
460  }
461 
462  //
463  // 2-nd pass: in the entry subtree, find an upper bound on distance
464  // looking at the first K neighbors encountered during a traversal
465  //
466 
467  // find the closest nodes with at least M primitives
468  const uint32 M = knn_lookup_traits<K>::M;
469 
470  int32 stackp = 1;
471  float4 stack[64];
472 
473  // place a sentinel node in the stack
474  stack[0] = make_float4( 1.0e8f, 1.0e8f, 1.0e8f, binary_cast<float>(uint32(-1)) );
475 
476  if (K <= 8)
477  {
478  // find the closest neighbors in this node
479  const uint2 range = kd_ranges[ entry_subtree ];
480  for (uint32 i = range.x; i < range.y; ++i)
481  {
482  const VectorType delta = kd_points[i] - query;
483  const float d2 = delta[0]*delta[0] + delta[1]*delta[1];
484 
485  // check whether the queue is already full
486  if (queue.size() == K && d2 < queue.top().y)
487  queue.pop();
488 
489  if (queue.size() < K)
490  queue.push( make_float2(binary_cast<float>(i), d2) );
491  }
492  // set the maximum distance bound
493  max_dist2 = queue.top().y;
494  }
495  else
496  {
497  // start from the entry node
498  node_index = entry_subtree;
499 
500  max_dist2 = 0.0f;
501  uint32 found = 0;
502 
503  float2 cdist = make_float2( 0.0f, 0.0f );
504  while (node_index != uint32(-1))
505  {
506  KD_KNN_STATS_ADD( node_tests, 1u );
507  const Kd_node node = Kd_node::load_ldg( kd_nodes + node_index );
508  const uint2 range = kd_ranges[ node_index ];
509 
510  if (node.is_leaf() || range.y - range.x <= M)
511  {
512  // find the closest neighbors in this node
513  KD_KNN_STATS_ADD( leaf_tests, 1u );
514  KD_KNN_STATS_ADD( point_tests, range.y - range.x );
515  for (uint32 i = range.x; i < range.y; ++i)
516  {
517  const VectorType delta = kd_points[i] - query;
518  const float d2 = delta[0]*delta[0] + delta[1]*delta[1];
519 
520  // reset the maximum distance bound
521  max_dist2 = cugar::max( max_dist2, d2 );
522  if (++found == K)
523  break;
524  }
525  if (found == K)
526  break;
527 
528  // pop the next node from the stack
529  while (stackp > 0)
530  {
531  KD_KNN_STATS_ADD( node_pops, 1u );
532  const float4 stack_node = stack[ --stackp ];
533  node_index = binary_cast<uint32>( stack_node.w );
534  cdist = make_float2( stack_node.x, stack_node.y );
535 
536  if (sq_length( cdist ) < max_dist2)
537  break;
538  }
539  }
540  else
541  {
542  const uint32 split_dim = node.get_split_dim();
543  const float split_plane = node.get_split_plane();
544 
545  const float split_dist = comp( query, split_dim ) - split_plane;
546 
547  const uint32 select = split_dist <= 0.0f ? 0u : 1u;
548 
549  node_index = node.get_child_offset() + select;
550 
551  // compute the vector distance to the far node
552  float2 cdist_far = cdist;
553  set_comp( cdist_far, split_dim, split_dist );
554 
555  // check whether we should push the far node on the stack
556  const float dist_far2 = sq_length( cdist_far );
557 
558  if (dist_far2 < max_dist2)
559  {
560  KD_KNN_STATS_ADD( node_pushes, 1u );
561 
562  #if 0
563  // partially reorder the stack
564  if (stackp >= 1)
565  {
566  const float4 last_cdist = stack[ stackp-1 ];
567  const float last_dist2 = sq_length( last_cdist );
568 
569  const uint32 index =
570  last_dist2 < dist_far2 ? stackp-1 : stackp;
571 
572  if (last_dist2 < dist_far2)
573  stack[ stackp ] = last_cdist;
574 
575  stack[ index ] = make_float4(
576  cdist_far.x,
577  cdist_far.y,
578  0,
579  binary_cast<float>( node.get_child_offset() + 1u - select ) );
580 
581  stackp++;
582  }
583  #else
584  stack[ stackp++ ] = make_float4(
585  cdist_far.x,
586  cdist_far.y,
587  0,
588  binary_cast<float>( node.get_child_offset() + 1u - select ) );
589 
590  if (stackp >= 4)
591  {
592  KNN_REORDER_STACK(4, 3);
593  KNN_REORDER_STACK(2, 1);
594  KNN_REORDER_STACK(4, 2);
595  KNN_REORDER_STACK(3, 1);
596  KNN_REORDER_STACK(3, 2);
597  }
598  #endif
599  }
600  }
601  }
602 
603  // remove the excluded node
604  entry_subtree = uint32(-1);
605  }
606 
607  //
608  // 3-rd pass: restart traversal from the root node, this time pruning
609  // the tree using the computed upper bound on distance
610  //
611 
612  stackp = 1;
613 
614  // start from the root node
615  node_index = 0;
616 
617  float2 cdist = make_float2( 0.0f, 0.0f );
618 
619  while (node_index != uint32(-1))
620  {
621  KD_KNN_STATS_ADD( node_tests, 1u );
622  const Kd_node node = Kd_node::load_ldg( kd_nodes + node_index );
623 
624  if (node.is_leaf() || entry_subtree == node_index)
625  {
626  if (entry_subtree != node_index)
627  {
628  // find the closest neighbors in this leaf
629  const uint2 range = __ldg( &kd_leaves[ node.get_leaf_index() ] );
630 
631  KD_KNN_STATS_ADD( leaf_tests, 1u );
632  KD_KNN_STATS_ADD( point_tests, range.y - range.x );
633  for (uint32 i = range.x; i < range.y; ++i)
634  {
635  const VectorType delta = kd_points[i] - query;
636  const float d2 = delta[0]*delta[0] + delta[1]*delta[1];
637  if (max_dist2 > d2)
638  {
639  KD_KNN_STATS_ADD( point_pushes, 1u );
640 
641  // check whether the queue is already full
642  if (queue.size() == K && d2 < queue.top().y)
643  queue.pop();
644 
645  if (queue.size() < K)
646  queue.push( make_float2( binary_cast<float>(i), d2 ) );
647 
648  // reset the maximum distance bound
649  if (queue.size() == K)
650  max_dist2 = queue.top().y;
651  }
652  }
653  }
654 
655  // pop the next node from the stack
656  while (stackp > 0)
657  {
658  KD_KNN_STATS_ADD( node_pops, 1u );
659  const float4 stack_node = stack[ --stackp ];
660  node_index = binary_cast<uint32>( stack_node.w );
661  cdist = make_float2( stack_node.x, stack_node.y );
662 
663  if (sq_length( cdist ) < max_dist2)
664  break;
665  }
666  }
667  else
668  {
669  const uint32 split_dim = node.get_split_dim();
670  const float split_plane = node.get_split_plane();
671 
672  const float split_dist = comp( query, split_dim ) - split_plane;
673 
674  const uint32 select = split_dist <= 0.0f ? 0u : 1u;
675 
676  node_index = node.get_child_offset() + select;
677 
678  // compute the vector distance to the far node
679  float2 cdist_far = cdist;
680  set_comp( cdist_far, split_dim, split_dist );
681 
682  // check whether we should push the far node on the stack
683  const float dist_far2 = sq_length( cdist_far );
684 
685  if (dist_far2 < max_dist2)
686  {
687  KD_KNN_STATS_ADD( node_pushes, 1u );
688 
689  #if 0
690  // partially reorder the stack
691  if (stackp >= 1)
692  {
693  const float4 last_cdist = stack[ stackp-1 ];
694  const float last_dist2 = sq_length( last_cdist );
695 
696  const uint32 index =
697  last_dist2 < dist_far2 ? stackp-1 : stackp;
698 
699  if (last_dist2 < dist_far2)
700  stack[ stackp ] = last_cdist;
701 
702  stack[ index ] = make_float4(
703  cdist_far.x,
704  cdist_far.y,
705  cdist_far.z,
706  binary_cast<float>( node.get_child_offset() + 1u - select ) );
707 
708  stackp++;
709  }
710  #else
711  stack[ stackp++ ] = make_float4(
712  cdist_far.x,
713  cdist_far.y,
714  0,
715  binary_cast<float>( node.get_child_offset() + 1u - select ) );
716 
717  if (stackp >= 4)
718  {
719  KNN_REORDER_STACK(4, 3);
720  KNN_REORDER_STACK(2, 1);
721  KNN_REORDER_STACK(4, 2);
722  KNN_REORDER_STACK(3, 1);
723  KNN_REORDER_STACK(3, 2);
724  }
725  #endif
726  }
727  }
728  }
729 
730  // write the results
731  for (uint32 i = 0; i < K; ++i)
732  ((float2*)results)[i] = queue[i];
733 }
734 
735 template <uint32 K, typename VectorType, typename PointIterator>
736 __device__ void lookup_3d(
737  const VectorType query,
738  const Kd_node* kd_nodes,
739  const uint2* kd_ranges,
740  const uint2* kd_leaves,
741  const PointIterator kd_points,
742  Kd_knn<3>::Result* results)
743 {
744  KD_KNN_STATS_DEF(uint32, leaf_tests, 0u);
745  KD_KNN_STATS_DEF(uint32, point_tests, 0u);
746  KD_KNN_STATS_DEF(uint32, node_pops, 0u);
747  KD_KNN_STATS_DEF(uint32, node_pushes, 0u);
748 
749  //
750  // 1-st pass: find the smallest node containing the query point and compute an upper bound
751  // on the search distance
752  //
753  typedef vector_view<float2*> queue_vector_type;
754  float2 queue_storage[K+1];
755  //typedef vector_view< register_array<float2,K+1> > queue_vector_type;
756  //register_array<float2,K+1> queue_storage;
757  queue_vector_type queue_vector(0,queue_storage);
759  float max_dist2 = 1.0e16f;
760 
761  // start from the root node
762  uint32 node_index = 0;
763 
764  // keep track of which node we visited
765  uint32 entry_subtree = 0;
766 
767  while (1)
768  {
769  const Kd_node node = Kd_node::load_ldg( kd_nodes + node_index );
770  const uint2 range = kd_ranges[ node_index ];
771 
772  if (range.y - range.x < K)
773  break;
774 
775  entry_subtree = node_index;
776 
777  if (node.is_leaf())
778  break;
779  else
780  {
781  const uint32 split_dim = node.get_split_dim();
782  const float split_plane = node.get_split_plane();
783 
784  node_index = node.get_child_offset() + (comp( query, split_dim ) < split_plane ? 0u : 1u);
785  }
786  }
787 
788  //
789  // 2-nd pass: in the entry subtree, find an upper bound on distance
790  // looking at the first K neighbors encountered during a traversal
791  //
792 
793  // find the closest nodes with at least M primitives
794  const uint32 M = knn_lookup_traits<K>::M;
795 
796  int32 stackp = 1;
797  float4 stack[64];
798 
799  // place a sentinel node in the stack
800  stack[0] = make_float4( 1.0e8f, 1.0e8f, 1.0e8f, binary_cast<float>(uint32(-1)) );
801 
802  if (K <= 8)
803  {
804  // find the closest neighbors in this node
805  const uint2 range = kd_ranges[ entry_subtree ];
806  for (uint32 i = range.x; i < range.y; ++i)
807  {
808  const VectorType delta = kd_points[i] - query;
809  const float d2 = delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2];
810 
811  // check whether the queue is already full
812  if (queue.size() == K && d2 < queue.top().y)
813  queue.pop();
814 
815  if (queue.size() < K)
816  queue.push( make_float2(binary_cast<float>(i), d2) );
817  }
818  // set the maximum distance bound
819  max_dist2 = queue.top().y;
820  }
821  else
822  {
823  // start from the entry node
824  node_index = entry_subtree;
825 
826  max_dist2 = 0.0f;
827  uint32 found = 0;
828 
829  float3 cdist = make_float3( 0.0f, 0.0f, 0.0f );
830  while (node_index != uint32(-1))
831  {
832  KD_KNN_STATS_ADD( node_tests, 1u );
833  const Kd_node node = Kd_node::load_ldg( kd_nodes + node_index );
834  const uint2 range = kd_ranges[ node_index ];
835 
836  if (node.is_leaf() || range.y - range.x <= M)
837  {
838  // find the closest neighbors in this node
839  KD_KNN_STATS_ADD( leaf_tests, 1u );
840  KD_KNN_STATS_ADD( point_tests, range.y - range.x );
841  for (uint32 i = range.x; i < range.y; ++i)
842  {
843  const VectorType delta = kd_points[i] - query;
844  const float d2 = delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2];
845 
846  // reset the maximum distance bound
847  max_dist2 = cugar::max( max_dist2, d2 );
848  if (++found == K)
849  break;
850  }
851  if (found == K)
852  break;
853 
854  // pop the next node from the stack
855  while (stackp > 0)
856  {
857  KD_KNN_STATS_ADD( node_pops, 1u );
858  const float4 stack_node = stack[ --stackp ];
859  node_index = binary_cast<uint32>( stack_node.w );
860  cdist = make_float3( stack_node.x, stack_node.y, stack_node.z );
861 
862  if (sq_length( cdist ) < max_dist2)
863  break;
864  }
865  }
866  else
867  {
868  const uint32 split_dim = node.get_split_dim();
869  const float split_plane = node.get_split_plane();
870 
871  const float split_dist = comp( query, split_dim ) - split_plane;
872 
873  const uint32 select = split_dist <= 0.0f ? 0u : 1u;
874 
875  node_index = node.get_child_offset() + select;
876 
877  // compute the vector distance to the far node
878  float3 cdist_far = cdist;
879  set_comp( cdist_far, split_dim, split_dist );
880 
881  // check whether we should push the far node on the stack
882  const float dist_far2 = sq_length( cdist_far );
883 
884  if (dist_far2 < max_dist2)
885  {
886  KD_KNN_STATS_ADD( node_pushes, 1u );
887 
888  #if 0
889  // partially reorder the stack
890  if (stackp >= 1)
891  {
892  const float4 last_cdist = stack[ stackp-1 ];
893  const float last_dist2 = sq_length( last_cdist );
894 
895  const uint32 index =
896  last_dist2 < dist_far2 ? stackp-1 : stackp;
897 
898  if (last_dist2 < dist_far2)
899  stack[ stackp ] = last_cdist;
900 
901  stack[ index ] = make_float4(
902  cdist_far.x,
903  cdist_far.y,
904  cdist_far.z,
905  binary_cast<float>( node.get_child_offset() + 1u - select ) );
906 
907  stackp++;
908  }
909  #else
910  stack[ stackp++ ] = make_float4(
911  cdist_far.x,
912  cdist_far.y,
913  cdist_far.z,
914  binary_cast<float>( node.get_child_offset() + 1u - select ) );
915 
916  if (stackp >= 4)
917  {
918  KNN_REORDER_STACK(4, 3);
919  KNN_REORDER_STACK(2, 1);
920  KNN_REORDER_STACK(4, 2);
921  KNN_REORDER_STACK(3, 1);
922  KNN_REORDER_STACK(3, 2);
923  }
924  #endif
925  }
926  }
927  }
928 
929  // remove the excluded node
930  entry_subtree = uint32(-1);
931  }
932 
933  //
934  // 3-rd pass: restart traversal from the root node, this time pruning
935  // the tree using the computed upper bound on distance
936  //
937 
938  stackp = 1;
939 
940  // start from the root node
941  node_index = 0;
942 
943  float3 cdist = make_float3( 0.0f, 0.0f, 0.0f );
944 
945  while (node_index != uint32(-1))
946  {
947  KD_KNN_STATS_ADD( node_tests, 1u );
948  const Kd_node node = Kd_node::load_ldg( kd_nodes + node_index );
949 
950  if (node.is_leaf() || entry_subtree == node_index)
951  {
952  if (entry_subtree != node_index)
953  {
954  // find the closest neighbors in this leaf
955  const uint2 range = __ldg( &kd_leaves[ node.get_leaf_index() ] );
956 
957  KD_KNN_STATS_ADD( leaf_tests, 1u );
958  KD_KNN_STATS_ADD( point_tests, range.y - range.x );
959  for (uint32 i = range.x; i < range.y; ++i)
960  {
961  const VectorType delta = kd_points[i] - query;
962  const float d2 = delta[0]*delta[0] + delta[1]*delta[1] + delta[2]*delta[2];
963  if (max_dist2 > d2)
964  {
965  KD_KNN_STATS_ADD( point_pushes, 1u );
966 
967  // check whether the queue is already full
968  if (queue.size() == K && d2 < queue.top().y)
969  queue.pop();
970 
971  if (queue.size() < K)
972  queue.push( make_float2( binary_cast<float>(i), d2 ) );
973 
974  // reset the maximum distance bound
975  if (queue.size() == K)
976  max_dist2 = queue.top().y;
977  }
978  }
979  }
980 
981  // pop the next node from the stack
982  while (stackp > 0)
983  {
984  KD_KNN_STATS_ADD( node_pops, 1u );
985  const float4 stack_node = stack[ --stackp ];
986  node_index = binary_cast<uint32>( stack_node.w );
987  cdist = make_float3( stack_node.x, stack_node.y, stack_node.z );
988 
989  if (sq_length( cdist ) < max_dist2)
990  break;
991  }
992  }
993  else
994  {
995  const uint32 split_dim = node.get_split_dim();
996  const float split_plane = node.get_split_plane();
997 
998  const float split_dist = comp( query, split_dim ) - split_plane;
999 
1000  const uint32 select = split_dist <= 0.0f ? 0u : 1u;
1001 
1002  node_index = node.get_child_offset() + select;
1003 
1004  // compute the vector distance to the far node
1005  float3 cdist_far = cdist;
1006  set_comp( cdist_far, split_dim, split_dist );
1007 
1008  // check whether we should push the far node on the stack
1009  const float dist_far2 = sq_length( cdist_far );
1010 
1011  if (dist_far2 < max_dist2)
1012  {
1013  KD_KNN_STATS_ADD( node_pushes, 1u );
1014 
1015  #if 0
1016  // partially reorder the stack
1017  if (stackp >= 1)
1018  {
1019  const float4 last_cdist = stack[ stackp-1 ];
1020  const float last_dist2 = sq_length( last_cdist );
1021 
1022  const uint32 index =
1023  last_dist2 < dist_far2 ? stackp-1 : stackp;
1024 
1025  if (last_dist2 < dist_far2)
1026  stack[ stackp ] = last_cdist;
1027 
1028  stack[ index ] = make_float4(
1029  cdist_far.x,
1030  cdist_far.y,
1031  cdist_far.z,
1032  binary_cast<float>( node.get_child_offset() + 1u - select ) );
1033 
1034  stackp++;
1035  }
1036  #else
1037  stack[ stackp++ ] = make_float4(
1038  cdist_far.x,
1039  cdist_far.y,
1040  cdist_far.z,
1041  binary_cast<float>( node.get_child_offset() + 1u - select ) );
1042 
1043  if (stackp >= 4)
1044  {
1045  KNN_REORDER_STACK(4, 3);
1046  KNN_REORDER_STACK(2, 1);
1047  KNN_REORDER_STACK(4, 2);
1048  KNN_REORDER_STACK(3, 1);
1049  KNN_REORDER_STACK(3, 2);
1050  }
1051  #endif
1052  }
1053  }
1054  }
1055 
1056  // write the results
1057  for (uint32 i = 0; i < K; ++i)
1058  ((float2*)results)[i] = queue[i];
1059 }
1060 
1061 template <uint32 DIM>
1063 
1064 template <>
1066 {
1067  template <typename VectorType, typename PointIterator>
1068  __device__ static void lookup(
1069  const VectorType query,
1070  const Kd_node* kd_nodes,
1071  const uint2* kd_ranges,
1072  const uint2* kd_leaves,
1073  const PointIterator kd_points,
1074  Kd_knn<2>::Result* results)
1075  {
1076  lookup_2d( query, kd_nodes, kd_ranges, kd_leaves, kd_points, results );
1077  }
1078 
1079  template <uint32 K, typename VectorType, typename PointIterator>
1080  __device__ static void lookup(
1081  const VectorType query,
1082  const Kd_node* kd_nodes,
1083  const uint2* kd_ranges,
1084  const uint2* kd_leaves,
1085  const PointIterator kd_points,
1086  Kd_knn<2>::Result* results)
1087  {
1088  lookup_2d<K>( query, kd_nodes, kd_ranges, kd_leaves, kd_points, results );
1089  }
1090 };
1091 
1092 template <>
1094 {
1095  template <typename VectorType, typename PointIterator>
1096  __device__ static void lookup(
1097  const VectorType query,
1098  const Kd_node* kd_nodes,
1099  const uint2* kd_ranges,
1100  const uint2* kd_leaves,
1101  const PointIterator kd_points,
1102  Kd_knn<3>::Result* results)
1103  {
1104  lookup_3d( query, kd_nodes, kd_ranges, kd_leaves, kd_points, results );
1105  }
1106 
1107  template <uint32 K, typename VectorType, typename PointIterator>
1108  __device__ static void lookup(
1109  const VectorType query,
1110  const Kd_node* kd_nodes,
1111  const uint2* kd_ranges,
1112  const uint2* kd_leaves,
1113  const PointIterator kd_points,
1114  Kd_knn<3>::Result* results)
1115  {
1116  lookup_3d<K>( query, kd_nodes, kd_ranges, kd_leaves, kd_points, results );
1117  }
1118 };
1119 
1120 template <uint32 DIM, typename QueryIterator, typename PointIterator>
1121 __global__ void lookup_kernel_1(
1122  const uint32 n_points,
1123  const QueryIterator points_begin,
1124  const Kd_node* kd_nodes,
1125  const uint2* kd_ranges,
1126  const uint2* kd_leaves,
1127  const PointIterator kd_points,
1128  Kd_knn<DIM>::Result* results)
1129 {
1130  const uint32 grid_size = gridDim.x * blockDim.x;
1131 
1132  // loop through all logical blocks associated to this physical one
1133  for (uint32 base_idx = blockIdx.x * blockDim.x;
1134  base_idx < n_points;
1135  base_idx += grid_size)
1136  {
1137  const uint32 index = threadIdx.x + base_idx;
1138  if (index >= n_points)
1139  return;
1140 
1142  points_begin[ index ],
1143  kd_nodes,
1144  kd_ranges,
1145  kd_leaves,
1146  kd_points,
1147  results + index );
1148  }
1149 }
1150 
1151 template <uint32 DIM, uint32 K, typename QueryIterator, typename PointIterator>
1152 //__launch_bounds__(KNN_LOOKUP_BLOCK_SIZE, KNN_LOOKUP_CTA_BLOCKS)
1153 __global__ void lookup_kernel(
1154  const uint32 n_points,
1155  const QueryIterator points_begin,
1156  const Kd_node* kd_nodes,
1157  const uint2* kd_ranges,
1158  const uint2* kd_leaves,
1159  const PointIterator kd_points,
1160  Kd_knn<DIM>::Result* results)
1161 {
1162  const uint32 grid_size = gridDim.x * blockDim.x;
1163 
1164  // loop through all logical blocks associated to this physical one
1165  for (uint32 base_idx = blockIdx.x * blockDim.x;
1166  base_idx < n_points;
1167  base_idx += grid_size)
1168  {
1169  const uint32 index = threadIdx.x + base_idx;
1170  if (index >= n_points)
1171  return;
1172 
1174  points_begin[ index ],
1175  kd_nodes,
1176  kd_ranges,
1177  kd_leaves,
1178  kd_points,
1179  results + index*K );
1180  }
1181 }
1182 
1183 } // namespace knn
1184 
1185 // perform a k-nn lookup for a set of query points
1186 //
1187 // \param points_begin beginning of the query point sequence
1188 // \param points_end end of the query point sequence
1189 // \param kd_nodes k-d tree nodes
1190 // \param kd_ranges k-d tree node ranges
1191 // \param kd_leaves k-d tree leaves
1192 // \param kd_points k-d tree points
1193 template <uint32 DIM>
1194 template <typename QueryIterator, typename PointIterator>
1196  const QueryIterator points_begin,
1197  const QueryIterator points_end,
1198  const Kd_node* kd_nodes,
1199  const uint2* kd_ranges,
1200  const uint2* kd_leaves,
1201  const PointIterator kd_points,
1202  Result* results)
1203 {
1204  const uint32 n_points = uint32( points_end - points_begin );
1205 
1206  const uint32 BLOCK_SIZE = 128;
1207  const uint32 max_blocks = (uint32)cuda::max_active_blocks(
1208  kd_knn::lookup_kernel_1<DIM,QueryIterator,PointIterator>, BLOCK_SIZE, 0);
1209  const uint32 n_blocks = cugar::min( max_blocks, uint32(n_points + BLOCK_SIZE-1) / BLOCK_SIZE );
1210 
1211  kd_knn::lookup_kernel_1<DIM> <<<n_blocks,BLOCK_SIZE>>> (
1212  n_points,
1213  points_begin,
1214  kd_nodes,
1215  kd_ranges,
1216  kd_leaves,
1217  kd_points,
1218  results );
1219 
1220  //cudaDeviceSynchronize();
1221 }
1222 
1223 // perform a k-nn lookup for a set of query points
1224 //
1225 // \param points_begin beginning of the query point sequence
1226 // \param points_end end of the query point sequence
1227 // \param kd_nodes k-d tree nodes
1228 // \param kd_ranges k-d tree node ranges
1229 // \param kd_leaves k-d tree leaves
1230 // \param kd_points k-d tree points
1231 template <uint32 DIM>
1232 template <uint32 K, typename QueryIterator, typename PointIterator>
1234  const QueryIterator points_begin,
1235  const QueryIterator points_end,
1236  const Kd_node* kd_nodes,
1237  const uint2* kd_ranges,
1238  const uint2* kd_leaves,
1239  const PointIterator kd_points,
1240  Result* results)
1241 {
1242  const uint32 n_points = uint32( points_end - points_begin );
1243 
1244  const uint32 BLOCK_SIZE = KNN_LOOKUP_BLOCK_SIZE;
1245  const uint32 max_blocks = (uint32)cuda::max_active_blocks(
1246  kd_knn::lookup_kernel<DIM,K,QueryIterator,PointIterator>, BLOCK_SIZE, 0);
1247  const uint32 n_blocks = cugar::min( max_blocks, uint32(n_points + BLOCK_SIZE-1) / BLOCK_SIZE );
1248 
1249  kd_knn::lookup_kernel<DIM,K> <<<n_blocks,BLOCK_SIZE>>> (
1250  n_points,
1251  points_begin,
1252  kd_nodes,
1253  kd_ranges,
1254  kd_leaves,
1255  kd_points,
1256  results );
1257 
1258  //cudaDeviceSynchronize();
1259 }
1260 
1261 } // namespace cuda
1262 } // namespace cugar
1263 
Definition: knn.h:52
Definition: vector_view.h:87
Definition: knn_inline.h:1062
void run(const QueryIterator points_begin, const QueryIterator points_end, const Kd_node *kd_nodes, const uint2 *kd_ranges, const uint2 *kd_leaves, const PointIterator kd_points, Result *results)
Definition: knn_inline.h:1195
CUGAR_HOST_DEVICE uint32 is_leaf() const
Definition: kd_node.h:142
Definition: knn_inline.h:380
Definition: kd_node.h:110
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE uint32 size() const
Definition: priority_queue_inline.h:51
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE Key & top()
Definition: priority_queue_inline.h:106
Definition: knn_inline.h:406
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE void push(const Key key)
Definition: priority_queue_inline.h:59
CUGAR_HOST_DEVICE uint32 get_split_dim() const
Definition: kd_node.h:187
Definition: priority_queue.h:89
Define a vector_view POD type and plain_view() for std::vector.
Definition: diff.h:38
CUGAR_HOST_DEVICE uint32 get_leaf_index() const
Definition: kd_node.h:154
CUGAR_HOST_DEVICE float get_split_plane() const
Definition: kd_node.h:191
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE Out binary_cast(const In in)
Definition: types.h:288
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE uint8 comp(const uchar2 a, const char c)
Definition: numbers.h:218
CUGAR_HOST_DEVICE uint32 get_child_offset() const
Definition: kd_node.h:148
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE void pop()
Definition: priority_queue_inline.h:81