binned_sah_builder_inline.h
/*
 * Copyright (c) 2010-2011, NVIDIA Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of NVIDIA Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include <nih/basic/utils.h>
#include <nih/basic/cuda/scan.h>
#include <nih/basic/cuda_config.h>
#include <thrust/scan.h>
#include <thrust/sort.h>

namespace nih {
namespace cuda {

#define SAH_SINGLE_WARP 0
#define SAH_MAX_BINS    128
//#define SAH_CHECKS

namespace binned_sah {

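// forward declarations of the device-side building blocks, defined elsewhere in
// the library: bin initialization, SAH splitting, object distribution and leaf setup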
void init_bins(const uint32 BINS, const uint32 n_nodes, Bins bins);

void sah_split(
    const uint32 BINS,
    Bins         bins,
    Queue        qin,
    const int    input_node_offset,
    Queue        qout,
    uint32*      n_output,
    int          output_node_offset,
    Bvh_node*    nodes,
    uint32*      n_leaves,
    const uint32 max_leaf_size,
    const float  max_cost);

void distribute_objects(
    const uint32 BINS,
    Objects      objects,
    const int    n_objects,
    Queue        queue,
    const int    input_node_offset,
    Bins         bins);

void setup_leaves(
    const int    n_objects,
    const int32* leaf_ids,
    uint2*       leaves);

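// atomically grow a bin's bounding box to enclose the given bbox and increment its
// object counter; the min/max are performed on the float coordinates reinterpreted
// as integers, which preserves ordering for the non-negative coordinates produced
// by the builder's translated (local-space) bboxes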
FORCE_INLINE NIH_DEVICE void update_bin(float3* bin_bmin, float3* bin_bmax, int32* bin_counter, const Bbox4f bbox)
{
    const float3 bmin = *bin_bmin;
    if (bbox[0][0] < bmin.x) atomicMin( (int32*)&(bin_bmin->x), __float_as_int(bbox[0][0]) );
    if (bbox[0][1] < bmin.y) atomicMin( (int32*)&(bin_bmin->y), __float_as_int(bbox[0][1]) );
    if (bbox[0][2] < bmin.z) atomicMin( (int32*)&(bin_bmin->z), __float_as_int(bbox[0][2]) );

    const float3 bmax = *bin_bmax;
    if (bbox[1][0] > bmax.x) atomicMax( (int32*)&(bin_bmax->x), __float_as_int(bbox[1][0]) );
    if (bbox[1][1] > bmax.y) atomicMax( (int32*)&(bin_bmax->y), __float_as_int(bbox[1][1]) );
    if (bbox[1][2] > bmax.z) atomicMax( (int32*)&(bin_bmax->z), __float_as_int(bbox[1][2]) );

    atomicAdd( bin_counter, 1 );
}

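// bin the objects: for each object not yet assigned to a node, compute its bin index
// along each axis within its split task's bounding box and atomically merge the
// object bbox into the corresponding X, Y and Z bins; bins are laid out in SoA
// fashion and indexed as split_id + (BINS*axis + bin)*queue.size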
template <typename Iterator>
__global__ void update_bins_kernel(
    const uint32   BINS,
    const uint32   n_objects,
    const Iterator bboxes,
    const Vector4f origin,
    const Objects  objects,
    const Queue    queue,
    Bins           bins)
{
    const uint32 grid_size = gridDim.x * blockDim.x;

    // loop through all logical blocks associated to this physical one
    for (uint32 base_idx = blockIdx.x * blockDim.x;
         base_idx < n_objects;
         base_idx += grid_size)
    {
        const uint32 idx = threadIdx.x + base_idx;

        if (idx >= n_objects)
            return;

        const uint32 id = idx;

        // check if the object has already been assigned to a node
        const int32 node_id = objects.node_ids[id];
        if (node_id > -1)
            continue;

        const int32 split_id = objects.split_ids[id];

        const Bin node_bbox = queue.bins[split_id];
        const float3 node_size = make_float3(
            node_bbox.bmax.x - node_bbox.bmin.x,
            node_bbox.bmax.y - node_bbox.bmin.y,
            node_bbox.bmax.z - node_bbox.bmin.z );

        Bbox4f bbox = bboxes[id];
        bbox[0] += origin;
        bbox[1] += origin;
        const Vector3f center = (xyz(bbox[0]) + xyz(bbox[1]))*0.5f;

        const int4 bin_id = make_int4(
            (node_size.x < 1.0e-8f ? 0 : quantize( (center[0] - node_bbox.bmin.x) / node_size.x, BINS )),
            (node_size.y < 1.0e-8f ? 0 : quantize( (center[1] - node_bbox.bmin.y) / node_size.y, BINS )),
            (node_size.z < 1.0e-8f ? 0 : quantize( (center[2] - node_bbox.bmin.z) / node_size.z, BINS )),
            0 );

        objects.bin_ids[idx] = bin_id;

        const uint32 binX = split_id + (BINS * 0 + bin_id.x)*queue.size;
        const uint32 binY = split_id + (BINS * 1 + bin_id.y)*queue.size;
        const uint32 binZ = split_id + (BINS * 2 + bin_id.z)*queue.size;

        update_bin( bins.bmin + binX, bins.bmax + binX, (int32*)bins.size + binX, bbox );
        update_bin( bins.bmin + binY, bins.bmax + binY, (int32*)bins.size + binY, bbox );
        update_bin( bins.bmin + binZ, bins.bmax + binZ, (int32*)bins.size + binZ, bbox );
    }
}

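// host-side dispatcher for update_bins_kernel: choose the block count from the
// kernel's occupancy (or use a single warp when SAH_SINGLE_WARP is set) and launch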
template <typename Iterator>
inline void update_bins(
    const uint32   BINS,
    const uint32   n_objects,
    const Iterator bboxes,
    const Vector4f origin,
    const Objects  objects,
    const Queue    queue,
    Bins           bins)
{
    const uint32 BLOCK_SIZE = SAH_SINGLE_WARP ? 32 : 128;
    const size_t max_blocks = SAH_SINGLE_WARP ? 1 : thrust::detail::backend::cuda::arch::max_active_blocks(update_bins_kernel<Iterator>, BLOCK_SIZE, 0);
    const size_t n_blocks   = nih::min( max_blocks, (n_objects + BLOCK_SIZE-1) / BLOCK_SIZE );

    update_bins_kernel<<<n_blocks,BLOCK_SIZE>>> (
        BINS,
        n_objects,
        bboxes,
        origin,
        objects,
        queue,
        bins );

    cudaThreadSynchronize();
}

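// CUDA-event helpers used to accumulate the per-phase build timings
// (m_init_bins_time, m_update_bins_time, m_sah_split_time, m_distribute_objects_time)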
inline void start_timer(const cudaEvent_t start, const cudaEvent_t stop)
{
    cudaEventRecord( start, 0 );
}
inline float stop_timer(const cudaEvent_t start, const cudaEvent_t stop)
{
    float dtime;
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );
    cudaEventElapsedTime( &dtime, start, stop );
    return dtime;
}

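// functor returning the object count packed into the w component of a bin's bmin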
struct Bin_counter // NOTE: the struct declaration line is missing from the extracted listing; the name is assumed
{
    typedef Bin    argument_type;
    typedef uint32 result_type;

    NIH_HOST_DEVICE uint32 operator() (const Bin bin) const
    {
        return binary_cast<int32>(bin.bmin.w);
    }
};

} // namespace binned_sah

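///
/// build a bvh given a set of bounding boxes
///
/// The construction is breadth-first: a queue of split tasks (one per active node)
/// is processed level by level; at each level the objects are binned, every task is
/// split with a binned SAH test, and the objects are redistributed to the resulting
/// children, until no new tasks are produced.
///
/// \param BINS           number of bins per axis
/// \param bbox           scene bounding box
/// \param bbox_begin     beginning of the object bounding box sequence
/// \param bbox_end       end of the object bounding box sequence
/// \param h_bbox_begin   host-side bounding box iterator (not referenced by this implementation)
/// \param max_leaf_size  maximum number of objects per leaf
/// \param max_cost       maximum split cost, forwarded to the SAH split kernel
///
/// A hypothetical usage sketch (the builder type and its constructor are declared
/// in binned_sah_builder.h, not here):
///
/// \code
/// nih::cuda::Binned_sah_builder builder( nodes, leaves, index );  // assumed constructor
/// builder.build( 16u, scene_bbox, bboxes.begin(), bboxes.end(), h_bboxes.begin(), 4u );
/// \endcode
///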
template <typename Iterator>
void Binned_sah_builder::build(
    const uint32   BINS,
    const Bbox3f   bbox,
    const Iterator bbox_begin,
    const Iterator bbox_end,
    const Iterator h_bbox_begin,
    const uint32   max_leaf_size,
    const float    max_cost)
{
    const uint32 n_objects = uint32( bbox_end - bbox_begin );

    need_space( *m_nodes,  n_objects*2 );
    need_space( *m_leaves, n_objects );
    need_space( *m_index,  n_objects );

    need_space( m_bin_bmin, (n_objects / max_leaf_size) * BINS * 3 ); // might need more later on...
    need_space( m_bin_bmax, (n_objects / max_leaf_size) * BINS * 3 ); // might need more later on...
    need_space( m_bin_size, (n_objects / max_leaf_size) * BINS * 3 ); // might need more later on...
    need_space( m_queue_bins,   n_objects * 2 );
    need_space( m_queue_splits, n_objects * 2 );

    need_space( m_bin_ids,   n_objects );
    need_space( m_split_ids, n_objects );
    need_space( m_node_ids,  n_objects );

    Queue queue[2];

    queue[0].bins   = thrust::raw_pointer_cast( &m_queue_bins.front() );
    queue[0].splits = thrust::raw_pointer_cast( &m_queue_splits.front() );
    queue[1].bins   = thrust::raw_pointer_cast( &m_queue_bins.front() )   + n_objects;
    queue[1].splits = thrust::raw_pointer_cast( &m_queue_splits.front() ) + n_objects;

    Objects objects;
    objects.bin_ids   = thrust::raw_pointer_cast( &m_bin_ids.front() );
    objects.node_ids  = thrust::raw_pointer_cast( &m_node_ids.front() );
    objects.split_ids = thrust::raw_pointer_cast( &m_split_ids.front() );

    // assign all objects to split task 0 and node -1
    thrust::fill( m_split_ids.begin(), m_split_ids.begin() + n_objects, 0 );
    thrust::fill( m_node_ids.begin(),  m_node_ids.begin()  + n_objects, -1 );

    // initialize the root bounding box: the builder works in a local space translated
    // by -bbox[0], so the root spans [0, bbox_size]; the object count is packed into
    // bmin.w and a Bvh_node::kInvalid marker into bmax.w
    {
        Bin bin;
        bin.bmin = make_float4( 0.0f, 0.0f, 0.0f, binary_cast<float>(n_objects) );
        bin.bmax = make_float4( bbox[1][0]-bbox[0][0], bbox[1][1]-bbox[0][1], bbox[1][2]-bbox[0][2], binary_cast<float>(Bvh_node::kInvalid) );
        m_queue_bins[0] = bin;
    }

    int input_node_offset  = 0;
    int output_node_offset = 1;

    m_counters.resize(2);
    m_counters[0] = 0;  // output task counter
    m_counters[1] = 0;  // leaf counter

    int in_queue      = 0;
    int out_queue     = 1;
    int n_input_tasks = 1;

    cudaEvent_t start, stop;
    cudaEventCreate( &start );
    cudaEventCreate( &stop );

    Bins bins;

    m_levels[0] = 0;
    int32 level = 0;

    // keep processing nodes in the input task queue until there's no more output
    while (n_input_tasks)
    {
        // mark the beginning of the new level
        m_levels[ level++ ] = input_node_offset;

        need_space( m_bin_bmin, n_input_tasks * BINS * 3 );
        need_space( m_bin_bmax, n_input_tasks * BINS * 3 );
        need_space( m_bin_size, n_input_tasks * BINS * 3 );

        bins.bmin = thrust::raw_pointer_cast( &m_bin_bmin.front() );
        bins.bmax = thrust::raw_pointer_cast( &m_bin_bmax.front() );
        bins.size = thrust::raw_pointer_cast( &m_bin_size.front() );

        // reset the output task counter
        m_counters[0] = 0;

        // set queue size
        queue[ in_queue ].size = n_input_tasks;

        // clear the bins for this level (inlined replacement for init_bins)
        binned_sah::start_timer( start, stop );
        //binned_sah::init_bins( BINS, n_input_tasks, bins );
        {
            const float HUGE = 1.0e8f;
            thrust::fill( m_bin_bmin.begin(), m_bin_bmin.begin() + n_input_tasks * BINS * 3, make_float3(  HUGE,  HUGE,  HUGE ) );
            thrust::fill( m_bin_bmax.begin(), m_bin_bmax.begin() + n_input_tasks * BINS * 3, make_float3( -HUGE, -HUGE, -HUGE ) );
            thrust::fill( m_bin_size.begin(), m_bin_size.begin() + n_input_tasks * BINS * 3, 0 );
        }

        m_init_bins_time += binned_sah::stop_timer( start, stop );

        // bin the objects of all the input tasks
        binned_sah::start_timer( start, stop );
        binned_sah::update_bins( BINS, n_objects, bbox_begin, Vector4f(-bbox[0][0],-bbox[0][1],-bbox[0][2],0.0f), objects, queue[ in_queue ], bins );
        m_update_bins_time += binned_sah::stop_timer( start, stop );

        // split each input task with the binned SAH, emitting new tasks and leaves
        binned_sah::start_timer( start, stop );
        binned_sah::sah_split(
            BINS,
            bins,
            queue[ in_queue ],
            input_node_offset,
            queue[ out_queue ],
            thrust::raw_pointer_cast( &m_counters.front() ),      // output task counter
            output_node_offset,
            thrust::raw_pointer_cast( &m_nodes->front() ),
            thrust::raw_pointer_cast( &m_counters.front() ) + 1,  // leaf counter
            max_leaf_size,
            max_cost );
        m_sah_split_time += binned_sah::stop_timer( start, stop );

        // distribute the objects to the tasks generated by the splits
        binned_sah::start_timer( start, stop );

        binned_sah::distribute_objects(
            BINS,
            objects,
            n_objects,
            queue[ in_queue ],
            input_node_offset,
            bins );

        m_distribute_objects_time += binned_sah::stop_timer( start, stop );

        // get the new number of generated tasks
        const uint32 n_output_tasks = m_counters[0];

        // update input & output counters
        input_node_offset   = output_node_offset;
        output_node_offset += n_output_tasks;
        n_input_tasks       = n_output_tasks;

        // swap the input & output queues
        std::swap( in_queue, out_queue );
    }

    m_level_count = level;
    for (; level < 128; ++level)
        m_levels[ level ] = output_node_offset;

    // sort the objects by their leaf id
    thrust::copy( thrust::make_counting_iterator(0u), thrust::make_counting_iterator(0u) + n_objects, m_index->begin() );
    thrust::sort_by_key( m_node_ids.begin(), m_node_ids.begin() + n_objects, m_index->begin() );

    // setup leaf ranges
    binned_sah::setup_leaves( n_objects, objects.node_ids, thrust::raw_pointer_cast( &m_leaves->front() ) );

    m_leaf_count = m_counters[1];
    m_node_count = output_node_offset;

    cudaEventDestroy( start );
    cudaEventDestroy( stop );
}

} // namespace cuda
} // namespace nih