Fermat
sah_builder_inline.h
1 /*
2  * Copyright (c) 2010-2011, NVIDIA Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of NVIDIA Corporation nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nih/basic/utils.h>
31 #include <nih/basic/cuda/scan.h>
32 #include <nih/basic/cuda_config.h>
33 #include <thrust/scan.h>
34 #include <thrust/sort.h>
35 
36 namespace nih {
37 namespace cuda {
38 
39 #define SAH_SINGLE_WARP 0
40 #define SAH_MAX_BINS 128
41 //#define SAH_OBJECT_REORDERING
42 //#define SAH_CHECKS
43 
44 #define ACCESS_BINS(a,id,axis,index,stride) a[id + (axis*BINS + index)*stride]
45 
46 
47 void init_bins(const uint32 BINS, const uint32 n_nodes, Sah_builder::Bins bins);
48 
49 void sah_split(
50  const uint32 BINS,
51  Sah_builder::Bins bins,
52  Sah_builder::Queue qin,
53  const int input_node_offset,
54  Sah_builder::Queue qout,
55  uint32* n_output,
56  int output_node_offset,
57  Bvh_node* nodes,
58  uint32* n_leaves,
59  const uint32 max_leaf_size,
60  const float max_cost);
61 
62 void distribute_objects(
63  const uint32 BINS,
64  Sah_builder::Objects objects,
65  const int n_objects,
66  Sah_builder::Queue queue,
67  const int input_node_offset,
68  Sah_builder::Bins bins,
69  uint32* new_index);
70 
71 void setup_leaves(
72  const int n_objects,
73  const int32* leaf_ids,
74  uint2* leaves);
75 
76 
77 FORCE_INLINE NIH_DEVICE void update_bin(float3* bin_bmin, float3* bin_bmax, int32* bin_counter, const Bbox4f bbox)
78 {
79  const float3 bmin = *bin_bmin;
80  if (bbox[0][0] < bmin.x) atomicMin( (int32*)&(bin_bmin->x), __float_as_int(bbox[0][0]) );
81  if (bbox[0][1] < bmin.y) atomicMin( (int32*)&(bin_bmin->y), __float_as_int(bbox[0][1]) );
82  if (bbox[0][2] < bmin.z) atomicMin( (int32*)&(bin_bmin->z), __float_as_int(bbox[0][2]) );
83 
84  const float3 bmax = *bin_bmax;
85  if (bbox[1][0] > bmax.x) atomicMax( (int32*)&(bin_bmax->x), __float_as_int(bbox[1][0]) );
86  if (bbox[1][1] > bmax.y) atomicMax( (int32*)&(bin_bmax->y), __float_as_int(bbox[1][1]) );
87  if (bbox[1][2] > bmax.z) atomicMax( (int32*)&(bin_bmax->z), __float_as_int(bbox[1][2]) );
88 
89  atomicAdd( bin_counter, 1 );
90 }
91 
96 template <typename Iterator>
97 __global__ void update_bins_kernel(
98  const uint32 BINS,
99  const uint32 n_objects,
100  const Iterator bboxes,
101  const Vector4f origin,
102  const Sah_builder::Objects objects,
103  const Sah_builder::Queue queue,
104  Sah_builder::Bins bins)
105 {
106  const uint32 grid_size = gridDim.x * blockDim.x;
107 
108  // loop through all logical blocks associated to this physical one
109  for (uint32 base_idx = blockIdx.x * blockDim.x;
110  base_idx < n_objects;
111  base_idx += grid_size)
112  {
113  const uint32 idx = threadIdx.x + base_idx;
114 
115  if (idx >= n_objects)
116  return;
117 
118  #ifdef SAH_OBJECT_REORDERING
119  const uint32 id = objects.index[ idx ];
120  if (id == uint32(-1))
121  continue;
122  #else
123  const uint32 id = idx;
124  #endif
125 
126  // check if the object has already been assigned to a node
127  const int32 node_id = objects.node_ids[id];
128  if (node_id > -1)
129  continue;
130 
131  const int32 split_id = objects.split_ids[id];
132 
133  const Sah_builder::Bin node_bbox = queue.bins[split_id];
134  const float3 node_size = make_float3(
135  node_bbox.bmax.x - node_bbox.bmin.x,
136  node_bbox.bmax.y - node_bbox.bmin.y,
137  node_bbox.bmax.z - node_bbox.bmin.z );
138 
139  Bbox4f bbox = bboxes[id];
140  bbox[0] += origin;
141  bbox[1] += origin;
142  const Vector3f center = (xyz(bbox[0]) + xyz(bbox[1]))*0.5f;
143 
144  const int4 bin_id = make_int4(
145  (node_size.x < 1.0e-8f ? 0 : quantize( (center[0] - node_bbox.bmin.x) / node_size.x, BINS )),
146  (node_size.y < 1.0e-8f ? 0 : quantize( (center[1] - node_bbox.bmin.y) / node_size.y, BINS )),
147  (node_size.z < 1.0e-8f ? 0 : quantize( (center[2] - node_bbox.bmin.z) / node_size.z, BINS )),
148  0 );
149 
150  objects.bin_ids[idx] = bin_id;
151 
152  const uint32 binX = split_id + (BINS * 0 + bin_id.x)*queue.size;
153  const uint32 binY = split_id + (BINS * 1 + bin_id.y)*queue.size;
154  const uint32 binZ = split_id + (BINS * 2 + bin_id.z)*queue.size;
155 
156  update_bin( bins.bmin + binX, bins.bmax + binX, (int32*)bins.size + binX, bbox );
157  update_bin( bins.bmin + binY, bins.bmax + binY, (int32*)bins.size + binY, bbox );
158  update_bin( bins.bmin + binZ, bins.bmax + binZ, (int32*)bins.size + binZ, bbox );
159  }
160 }
161 
165 template <typename Iterator>
166 inline void update_bins(
167  const uint32 BINS,
168  const uint32 n_objects,
169  const Iterator bboxes,
170  const Vector4f origin,
171  const Sah_builder::Objects objects,
172  const Sah_builder::Queue queue,
173  Sah_builder::Bins bins)
174 {
175  const uint32 BLOCK_SIZE = SAH_SINGLE_WARP ? 32 : 128;
176  const size_t max_blocks = SAH_SINGLE_WARP ? 1 : thrust::detail::device::cuda::arch::max_active_blocks(update_bins_kernel<Iterator>, BLOCK_SIZE, 0);
177  const size_t n_blocks = nih::min( max_blocks, (n_objects + BLOCK_SIZE-1) / BLOCK_SIZE );
178 
179  update_bins_kernel<<<n_blocks,BLOCK_SIZE>>> (
180  BINS,
181  n_objects,
182  bboxes,
183  origin,
184  objects,
185  queue,
186  bins );
187 
188  cudaThreadSynchronize();
189 }
190 
191 inline void start_timer(const cudaEvent_t start, const cudaEvent_t stop)
192 {
193  cudaEventRecord( start, 0 );
194 }
195 inline float stop_timer(const cudaEvent_t start, const cudaEvent_t stop)
196 {
197  float dtime;
198  cudaEventRecord( stop, 0 );
199  cudaEventSynchronize( stop );
200  cudaEventElapsedTime( &dtime, start, stop );
201  return dtime;
202 }
203 
205 {
207  typedef uint32 result_type;
208 
209  NIH_HOST_DEVICE uint32 operator() (const Sah_builder::Bin bin) const
210  {
211  return binary_cast<int32>(bin.bmin.w);
212  }
213 };
214 
215 // build a bvh given a set of bboxes
216 template <typename Iterator>
218  const uint32 BINS,
219  const Bbox3f bbox,
220  const Iterator bbox_begin,
221  const Iterator bbox_end,
222  const uint32 max_leaf_size,
223  const float max_cost)
224 {
225  const uint32 n_objects = uint32( bbox_end - bbox_begin );
226 
227  need_space( *m_nodes, n_objects*2 );
228  need_space( *m_leaves, n_objects );
229  need_space( *m_index, n_objects );
230 
231  need_space( m_bin_bmin, (n_objects / max_leaf_size) * BINS * 3 ); // might need more later on...
232  need_space( m_bin_bmax, (n_objects / max_leaf_size) * BINS * 3 ); // might need more later on...
233  need_space( m_bin_size, (n_objects / max_leaf_size) * BINS * 3 ); // might need more later on...
234  need_space( m_queue_bins, n_objects * 2 );
235  need_space( m_queue_splits, n_objects * 2 );
236  #ifdef SAH_OBJECT_REORDERING
237  need_space( m_queue_offsets, n_objects );
238  need_space( m_new_pos, n_objects*2 );
239  #endif
240 
241  need_space( m_bin_ids, n_objects );
242  need_space( m_split_ids, n_objects );
243  need_space( m_node_ids, n_objects );
244 
245  Queue queue[2];
246 
247  queue[0].bins = thrust::raw_pointer_cast( &m_queue_bins.front() );
248  queue[0].splits = thrust::raw_pointer_cast( &m_queue_splits.front() );
249  queue[1].bins = thrust::raw_pointer_cast( &m_queue_bins.front() ) + n_objects;
250  queue[1].splits = thrust::raw_pointer_cast( &m_queue_splits.front() ) + n_objects;
251  #ifdef SAH_OBJECT_REORDERING
252  queue[0].offsets = thrust::raw_pointer_cast( &m_queue_offsets.front() );
253  queue[1].offsets = thrust::raw_pointer_cast( &m_queue_offsets.front() );
254  #endif
255 
256  Objects objects;
257  objects.bin_ids = thrust::raw_pointer_cast( &m_bin_ids.front() );
258  objects.split_ids = thrust::raw_pointer_cast( &m_split_ids.front() );
259  objects.node_ids = thrust::raw_pointer_cast( &m_node_ids.front() );
260  #ifdef SAH_OBJECT_REORDERING
261  objects.index = thrust::raw_pointer_cast( &m_new_pos.front() );
262  #endif
263 
264  // assign all objects to split task 0 and node -1
265  thrust::fill( m_split_ids.begin(), m_split_ids.begin() + n_objects, 0 );
266  thrust::fill( m_node_ids.begin(), m_node_ids.begin() + n_objects, -1 );
267 
268  #ifdef SAH_OBJECT_REORDERING
269  // assign initial ordering [0,1,2,...,n_objects-1] to the objects
270  thrust::copy( thrust::make_counting_iterator(0u), thrust::make_counting_iterator(0u) + n_objects, m_new_pos.begin() );
271  #endif
272 
273  // initialize root bounding box
274  {
275  Bin bin;
276  bin.bmin = make_float4( 0.0f,0.0f,0.0f,binary_cast<float>(n_objects) );
277  bin.bmax = make_float4( bbox[1][0]-bbox[0][0],bbox[1][1]-bbox[0][1],bbox[1][2]-bbox[0][2],binary_cast<float>(Bvh_node::kInternal) );
278  m_queue_bins[0] = bin;
279  }
280 
281  int input_node_offset = 0;
282  int output_node_offset = 1;
283 
284  m_counters.resize(2);
285  m_counters[0] = 0;
286  m_counters[1] = 0;
287 
288  int in_queue = 0;
289  int out_queue = 1;
290  int n_input_tasks = 1;
291 
292  cudaEvent_t start, stop;
293  cudaEventCreate( &start );
294  cudaEventCreate( &stop );
295 
296  #ifdef SAH_OBJECT_REORDERING
297  uint32* index[2] = {
298  thrust::raw_pointer_cast( &m_new_pos.front() ),
299  thrust::raw_pointer_cast( &m_new_pos.front() ) + n_objects
300  };
301  #else
302  uint32* index[2] = { 0, 0 };
303  #endif
304 
305  Bins bins;
306 
307  m_levels[0] = 0;
308  int32 level = 0;
309 
310  // keep processing nodes in the input task queue until there's no more output
311  while (n_input_tasks)
312  {
313  // mark the beginning of the new level
314  m_levels[ level++ ] = input_node_offset;
315 
316  need_space( m_bin_bmin, n_input_tasks * BINS * 3 );
317  need_space( m_bin_bmax, n_input_tasks * BINS * 3 );
318  need_space( m_bin_size, n_input_tasks * BINS * 3 );
319 
320  bins.bmin = thrust::raw_pointer_cast( &m_bin_bmin.front() );
321  bins.bmax = thrust::raw_pointer_cast( &m_bin_bmax.front() );
322  bins.size = thrust::raw_pointer_cast( &m_bin_size.front() );
323 
324  // reset the output task counter
325  m_counters[0] = 0;
326 
327  // set queue size
328  queue[ in_queue ].size = n_input_tasks;
329 
330  start_timer( start, stop );
331  //init_bins( BINS, n_input_tasks, bins );
332  {
333  const float HUGE = 1.0e8f;
334  thrust::fill( m_bin_bmin.begin(), m_bin_bmin.begin() + n_input_tasks * BINS * 3, make_float3( HUGE, HUGE, HUGE ) );
335  thrust::fill( m_bin_bmax.begin(), m_bin_bmax.begin() + n_input_tasks * BINS * 3, make_float3( -HUGE, -HUGE, -HUGE ) );
336  thrust::fill( m_bin_size.begin(), m_bin_size.begin() + n_input_tasks * BINS * 3, 0 );
337  }
338  m_init_bins_time += stop_timer( start, stop );
339 
340  #ifdef SAH_OBJECT_REORDERING
342  thrust::make_transform_iterator( m_queue_bins.begin(), Bin_counter() ) + in_queue*n_objects,
343  thrust::make_transform_iterator( m_queue_bins.begin(), Bin_counter() ) + in_queue*n_objects + n_input_tasks,
344  m_queue_offsets.begin() );
345  #endif
346 
347  start_timer( start, stop );
348  update_bins( BINS, n_objects, bbox_begin, Vector4f(-bbox[0][0],-bbox[0][1],-bbox[0][2],0.0f), objects, queue[ in_queue ], bins);
349  m_update_bins_time += stop_timer( start, stop );
350 
351  start_timer( start, stop );
352  sah_split(
353  BINS,
354  bins,
355  queue[ in_queue ],
356  input_node_offset,
357  queue[ out_queue ],
358  thrust::raw_pointer_cast( &m_counters.front() ),
359  output_node_offset,
360  thrust::raw_pointer_cast( &m_nodes->front() ),
361  thrust::raw_pointer_cast( &m_counters.front() ) + 1,
362  max_leaf_size,
363  max_cost );
364  m_sah_split_time += stop_timer( start, stop );
365 
366  start_timer( start, stop );
367 
368  #ifdef SAH_OBJECT_REORDERING
369  thrust::fill(
370  m_new_pos.begin() + (out_queue+0)*n_objects,
371  m_new_pos.begin() + (out_queue+1)*n_objects,
372  uint32(-1) );
373  #endif
374 
375  distribute_objects(
376  BINS,
377  objects,
378  n_objects,
379  queue[ in_queue ],
380  input_node_offset,
381  bins,
382  index[ out_queue ] );
383 
384  #ifdef SAH_OBJECT_REORDERING
385  // swap objects index
386  objects.index = index[ out_queue ];
387  #endif
388 
389  m_distribute_objects_time += stop_timer( start, stop );
390 
391  // get the new number of generated tasks
392  const uint32 n_output_tasks = m_counters[0];
393 
394  // update input & output counters
395  input_node_offset = output_node_offset;
396  output_node_offset += n_output_tasks;
397  n_input_tasks = n_output_tasks;
398 
399  // swap the input & output queues
400  std::swap( in_queue, out_queue );
401  }
402 
403  m_level_count = level;
404  for (; level < 128; ++level)
405  m_levels[ level ] = output_node_offset;
406 
407  // sort the objects by their leaf id
408  thrust::copy( thrust::make_counting_iterator(0u), thrust::make_counting_iterator(0u) + n_objects, m_index->begin() );
409  thrust::sort_by_key( m_node_ids.begin(), m_node_ids.begin() + n_objects, m_index->begin() );
410 
411  // setup leaf ranges
412  setup_leaves( n_objects, objects.node_ids, thrust::raw_pointer_cast( &m_leaves->front() ) );
413 
414  m_leaf_count = m_counters[1];
415  m_node_count = output_node_offset;
416 
417  cudaEventDestroy( start );
418  cudaEventDestroy( stop );
419 }
420 
421 } // namespace cuda
422 } // namespace nih
CUGAR_HOST_DEVICE uint32 quantize(const float x, const uint32 n)
Definition: numbers.h:600
Definition: sah_builder.h:37
Definition: sah_builder.h:115
Definition: sah_builder.h:103
void build(const uint32 BINS, const Bbox3f bbox, const Iterator bbox_begin, const Iterator bbox_end, const uint32 max_leaf_size, const float max_cost=1.8f)
build a bvh given a set of bboxes
Definition: sah_builder_inline.h:217
Definition: sah_builder.h:96
Definition: sah_builder.h:80
Definition: sah_builder_inline.h:204
void exclusive_scan(const uint32 n, InputIterator d_in, OutputIterator d_out, BinaryOp op, Identity identity, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:265
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE Out binary_cast(const In in)
Definition: types.h:288