Fermat
All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Modules Pages
sah_builder_inline.h
1 /*
2  * Copyright (c) 2010-2011, NVIDIA Corporation
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of NVIDIA Corporation nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 #pragma once
29 
30 #include <nih/basic/utils.h>
31 #include <nih/basic/functors.h>
32 #include <nih/basic/cuda/scan.h>
33 #include <nih/basic/cuda_config.h>
34 #include <nih/thrust/iterator_wrapper.h>
35 #include <thrust/scan.h>
36 #include <thrust/sort.h>
37 #include <thrust/iterator/constant_iterator.h>
38 #include <thrust/iterator/counting_iterator.h>
39 #include <thrust/iterator/reverse_iterator.h>
40 #include <thrust/iterator/transform_iterator.h>
41 
42 namespace nih {
43 namespace cuda {
44 
45 namespace sah {
46 
// Begin a GPU timing interval: records the 'start' event on the default
// stream. The paired 'stop' event is unused here; it is accepted only so
// that start_timer/stop_timer share a symmetric signature.
inline void start_timer(const cudaEvent_t start, const cudaEvent_t stop)
{
    cudaEventRecord( start, 0 );
}
// Close a GPU timing interval opened by start_timer(): records the 'stop'
// event on the default stream, blocks until it has completed, and returns
// the elapsed time between 'start' and 'stop' in milliseconds.
inline float stop_timer(const cudaEvent_t start, const cudaEvent_t stop)
{
    cudaEventRecord( stop, 0 );
    cudaEventSynchronize( stop );

    float elapsed_ms;
    cudaEventElapsedTime( &elapsed_ms, start, stop );
    return elapsed_ms;
}
59 
// Return the index of the longest component of 'edge':
// 0 for x, 1 for y, 2 for z. Comparisons are strict, so on an
// exact tie the later axis wins (z over y over x).
FORCE_INLINE NIH_HOST_DEVICE uint32 largest_axis(const float3 edge)
{
    if (edge.x > edge.y)
        return edge.x > edge.z ? 0u : 2u;

    return edge.y > edge.z ? 1u : 2u;
}
66 
// Functor mapping a (segment id, object bbox) tuple to the object's bbox
// compressed into two 32-bit words, quantized relative to the bounding box
// of the segment's bin:
//   word x: min corner, 10 bits per axis, plus the segment's largest-axis
//           index stored in the top 2 bits
//   word y: max corner, 10 bits per axis (top 2 bits unused)
// NOTE(review): the line declaring this struct (original line 70 — per the
// doxygen index "Definition: sah_builder_inline.h:70", presumably
// `struct Bbox_compressor`) is missing from this extracted listing; only
// the body is visible below.
71 {
72  typedef thrust::tuple<uint32,Bbox4f> argument_type;
73  typedef uint2 result_type;
74 
 // bins: per-segment bounding boxes, indexed with (segment id - 1),
 // i.e. segment ids are 1-based; pointer is not owned
75  Bbox_compressor(const Bin* bins) : m_bins( bins ) {}
76 
77  FORCE_INLINE NIH_HOST_DEVICE uint2 operator() (const argument_type op) const
78  {
 // a key of uint32(-1) marks an entry with no live segment: emit zeros
79  const uint32 key = thrust::get<0>( op );
80  if (key == uint32(-1))
81  return make_uint2(0,0);
82 
83  const Bbox4f bbox = thrust::get<1>( op );
84 
 // bounds of the parent segment's bin (segment ids are 1-based)
85  const float4 bmin = m_bins[ key-1 ].bmin;
86  const float4 bmax = m_bins[ key-1 ].bmax;
87 
88  const float3 delta = make_float3( bmax.x - bmin.x, bmax.y - bmin.y, bmax.z - bmin.z );
89 
90  const uint32 axis = largest_axis( delta );
91 
 // quantize the min corner (bbox[0]) into 10 bits per axis
 // (presumably values in [0,1023] — see quantize());
 // degenerate axes (extent < 1e-8) collapse to 0
92  uint32 l;
93  l = delta.x < 1.0e-8f ? 0u : quantize( (bbox[0][0] - bmin.x) / delta.x, 1024 );
94  l |= delta.y < 1.0e-8f ? 0u : quantize( (bbox[0][1] - bmin.y) / delta.y, 1024 ) << 10;
95  l |= delta.z < 1.0e-8f ? 0u : quantize( (bbox[0][2] - bmin.z) / delta.z, 1024 ) << 20;
 // stash the segment's dominant axis in the top 2 bits
96  l |= axis << 30;
97 
 // quantize the max corner (bbox[1]) the same way
98  uint32 r;
99  r = delta.x < 1.0e-8f ? 0u : quantize( (bbox[1][0] - bmin.x) / delta.x, 1024 );
100  r |= delta.y < 1.0e-8f ? 0u : quantize( (bbox[1][1] - bmin.y) / delta.y, 1024 ) << 10;
101  r |= delta.z < 1.0e-8f ? 0u : quantize( (bbox[1][2] - bmin.z) / delta.z, 1024 ) << 20;
102 
103  return make_uint2( l, r );
104  }
105 
 // per-segment bins (not owned)
106  const Bin* m_bins;
107 };
108 
109 // evaluate the new splits, choosing which ones to keep and which ones to discard.
//
// NOTE(review): semantics below inferred from the call site in build();
// confirm against the .cu implementation:
//   n_nodes       - number of segments in this level (build() passes n_segments)
//   split_planes  - per-segment best split (build() passes m_split_index)
//   split_costs   - per-segment SAH cost of the best split
//   segment_heads - per-segment start offsets into the object ordering
//   out_splits    - per-segment keep/discard flag, later turned into node
//                   offsets by an inclusive scan (build() passes the
//                   allocation map)
//   max_leaf_size - segments at most this large may stop splitting
//   max_cost      - SAH cost threshold above which a split is discarded
110 void eval_splits(
111  const uint32 n_nodes,
112  uint32* split_planes,
113  const float* split_costs,
114  const uint32* segment_heads,
115  uint32* out_splits,
116  const uint32 max_leaf_size,
117  const float max_cost);
118 
119 // assign objects to their new nodes
//
// NOTE(review): semantics below inferred from the call site in build();
// confirm against the .cu implementation:
//   n_objects      - number of active objects this level
//   n_leaves       - leaves created so far (presumably the offset for new
//                    leaf ids)
//   order          - object ordering produced by the per-level sort
//   segment_keys   - per-object segment key
//   split_index    - per-segment chosen split position
//   allocation_map - inclusively-scanned keep/discard flags (new node offsets)
//   segment_ids    - out: per-object segment id for the next level
//   leaf_ids       - out: per-object leaf id (build() passes m_node_ids)
120 void assign_objects(
121  const uint32 n_objects,
122  const uint32 n_leaves,
123  const uint32* order,
124  const uint32* segment_keys,
125  const uint32* split_index,
126  const uint32* allocation_map,
127  uint32* segment_ids,
128  uint32* leaf_ids);
129 
130 // compute the bounding box of the output segments
//
// NOTE(review): inferred from the call site in build(); confirm against
// the .cu implementation: reads the compressed per-object bounds
// (bounds_l/bounds_r) and the input-queue bins, writes the next level's
// bins to 'out_bins' and emits this level's finished nodes into
// 'bvh_nodes' (starting at input_node_offset).
131 void compute_bins(
132  const uint32 n_segments,
133  const uint32 n_nodes,
134  const uint32 n_leaves,
135  const uint32 input_node_offset,
136  const uint32* split_index,
137  const uint32* allocation_map,
138  const Bin* in_bins,
139  const uint2* bounds_l,
140  const uint2* bounds_r,
141  Bin* out_bins,
142  Bvh_node* bvh_nodes);
143 
144 // setup the leaf array
//
// NOTE(review): inferred from the call site in build(); confirm against
// the .cu implementation: 'leaf_ids' holds the per-object leaf id, sorted
// ascending (build() passes m_node_ids after sort_by_key); each leaf's
// object range is written to 'leaves'.
145 void setup_leaves(
146  const uint32 n_objects,
147  const uint32* leaf_ids,
148  uint2* leaves);
149 
150 } // namespace sah
151 
// Build a SAH BVH over the object bboxes in [bbox_begin, bbox_end).
// Construction is breadth-first: each iteration of the while-loop below
// processes one tree level, evaluating a SAH split for every active
// segment (node), keeping the profitable splits and turning the remaining
// segments into leaves.
//
//   bbox           - root bounding box of the whole input set
//   bbox_begin/end - iterator range of per-object bounding boxes
//   max_leaf_size  - forwarded to sah::eval_splits (leaf size threshold)
//   max_cost       - forwarded to sah::eval_splits (SAH cost threshold)
//
// NOTE(review): original line 154 — the line introducing this body,
// presumably the `void ...::build(` signature line — was lost in this
// extracted listing; parameters resume at line 155.
152 // build a bvh given a set of bboxes
153 template <typename Iterator>
155  const Bbox3f bbox,
156  const Iterator bbox_begin,
157  const Iterator bbox_end,
158  const uint32 max_leaf_size,
159  const float max_cost)
160 {
161  const uint32 n_objects = uint32( bbox_end - bbox_begin );
162 
 // reserve the output arrays: at most 2*n nodes, n leaves, n indices
163  need_space( *m_nodes, n_objects*2 );
164  need_space( *m_leaves, n_objects );
165  need_space( *m_index, n_objects );
166 
 // 'storage' tracks the temporary memory footprint, reported at the end
167  uint32 storage = 0;
168 
169  need_space( m_segment_heads, n_objects+1 ); storage += (n_objects+1) * sizeof(uint32);
170  need_space( m_segment_keys, n_objects ); storage += n_objects * sizeof(uint32);
171 
 // NOTE(review): if n_objects < max_leaf_size this reserves 0 bins,
 // yet slot [0] of the input queue is written unconditionally below —
 // confirm need_space() enforces a nonzero minimum capacity.
172  need_space( m_queue_bins[0], n_objects / max_leaf_size ); storage += (n_objects / max_leaf_size) * sizeof(Bin);
173  need_space( m_queue_bins[1], n_objects / max_leaf_size ); storage += (n_objects / max_leaf_size) * sizeof(Bin);
174 
175  need_space( m_segment_ids, n_objects ); storage += n_objects * sizeof(uint32);
176  need_space( m_node_ids, n_objects ); storage += n_objects * sizeof(uint32);
177 
 // m_scan_bounds is carved into three n_objects-sized sub-arrays below
178  need_space( m_scan_bounds, n_objects * 3 ); storage += n_objects*3 * sizeof(uint2);
179  need_space( m_split_costs, n_objects ); storage += n_objects * sizeof(float);
180  need_space( m_split_index, n_objects ); storage += n_objects * sizeof(uint32);
181 
182  // assign all objects to node 0
183  thrust::fill( m_node_ids.begin(), m_node_ids.begin() + n_objects, 0 );
184 
 // all objects start in segment 1: segment ids are 1-based (see the
 // key-1 indexing in sah::Bbox_compressor)
185  thrust::fill( m_segment_keys.begin(), m_segment_keys.begin() + n_objects, 1u );
186  thrust::fill( m_segment_ids.begin(), m_segment_ids.begin() + n_objects, 1u );
187 
 // the single initial segment spans all objects
188  m_segment_heads[0] = 0;
189  m_segment_heads[1] = n_objects;
190 
 // NOTE(review): 'centroids' aliases the split-costs buffer but is not
 // referenced again in this visible body — possibly dead code.
191  thrust::device_vector<float>::iterator centroids = m_split_costs.begin();
192 
 // double-buffered bin queues: read from in_queue, write to out_queue,
 // swapped at the end of every level
193  int in_queue = 0;
194  int out_queue = 1;
195 
196  // initialize root bounding box
197  {
198  Bin bin;
 // the .w components carry the object count and the (invalid) child
 // index, bit-cast into the float slots
199  bin.bmin = make_float4( bbox[0][0],bbox[0][1],bbox[0][2],binary_cast<float>(n_objects) );
200  bin.bmax = make_float4( bbox[1][0],bbox[1][1],bbox[1][2],binary_cast<float>(Bvh_node::kInvalid) );
201  m_queue_bins[ in_queue ][0] = bin;
202  }
203 
204  m_counters.resize(2);
205  m_counters[0] = 0;
206  m_counters[1] = 0;
207 
208  uint32 input_node_offset = 0;
209  uint32 n_leaves = 0;
210  uint32 n_nodes = 1;
211  uint32 out_nodes = 1;
212 
 // CUDA events driven by sah::start_timer / sah::stop_timer for the
 // per-phase timing accumulators below
213  cudaEvent_t start, stop;
214  cudaEventCreate( &start );
215  cudaEventCreate( &stop );
216 
 // three n_objects-sized views over the m_scan_bounds scratch array
217  thrust::device_vector<uint2>::iterator bounds_l = m_scan_bounds.begin() + n_objects*0;
218  thrust::device_vector<uint2>::iterator bounds_r = m_scan_bounds.begin() + n_objects*1;
219  thrust::device_vector<uint2>::iterator bounds = m_scan_bounds.begin() + n_objects*2;
220 
221  //const float HUGE = 1.0e9f;
222 
223  uint32 n_active_objects = n_objects;
224 
225  m_levels[0] = 0;
226  int32 level = 0;
227 
 // breadth-first loop: one iteration per tree level, until no segment
 // produced children in the previous pass
228  while (out_nodes)
229  {
230  // mark the beginning of the new level
231  m_levels[ level++ ] = n_nodes;
232 
233  // compress the bounds relative to their parents
234  {
235  sah::start_timer( start, stop );
236 
237  sah::Bbox_compressor bbox_compressor(
238  thrust::raw_pointer_cast( &m_queue_bins[ in_queue ].front() ) );
239 
 // NOTE(review): original line 240 — presumably the
 // `thrust::transform(` call line — was lost in this extracted
 // listing; the arguments below quantize each (segment id, bbox)
 // pair into bounds_l via the functor above.
241  thrust::make_zip_iterator( thrust::make_tuple( m_segment_ids.begin(), bbox_begin ) ),
242  thrust::make_zip_iterator( thrust::make_tuple( m_segment_ids.begin(), bbox_begin ) ) + n_objects,
243  bounds_l,
244  bbox_compressor );
245 
246  m_compression_time += sah::stop_timer( start, stop );
247  }
248 
249  uint32 n_segments = out_nodes;
250 
251  //
252  // Build the largest axis ordering for this pass. Basically, sort the
253  // objects by the segment they belong to and their centroid along the
254  // largest axis of their segment.
255  //
256  thrust::device_vector<uint32>::iterator order = m_index->begin();
257 
258  sah::start_timer( start, stop );
259 
260  sort(
261  n_objects,
262  m_segment_ids.begin(),
263  m_segment_keys.begin(),
264  bounds_l,
265  bounds,
266  bounds_r,
267  order,
268  n_active_objects,
269  n_segments );
270 
271  m_sorting_time += sah::stop_timer( start, stop );
272 
273  sah::start_timer( start, stop );
274 
 // compute, per segment, the best split position and its SAH cost
275  eval_split_costs(
276  n_active_objects,
277  n_segments,
278  m_segment_keys.begin(),
279  m_segment_heads.begin(),
280  bounds,
281  bounds_l,
282  bounds_r,
283  m_split_costs.begin(),
284  m_split_index.begin() );
285 
 // reuse the 'bounds' scratch array as the per-segment allocation map
 // (its contents from eval_split_costs are no longer needed)
286  thrust::device_ptr<uint32> allocation_map( (uint32*)thrust::raw_pointer_cast( &*bounds ) );
287 
288  // evaluate the new splits, choosing which ones to keep and which ones to discard
289  sah::eval_splits(
290  n_segments,
291  thrust::raw_pointer_cast( &m_split_index.front() ),
292  thrust::raw_pointer_cast( &m_split_costs.front() ),
293  thrust::raw_pointer_cast( &m_segment_heads.front() ),
294  thrust::raw_pointer_cast( &*allocation_map ),
295  max_leaf_size,
296  max_cost );
297 
298  m_sah_split_time += sah::stop_timer( start, stop );
299 
300  sah::start_timer( start, stop );
301 
302  // scan the split booleans to find out the new node offsets
303  thrust::inclusive_scan( allocation_map, allocation_map + n_segments, allocation_map );
304 
 // total number of accepted splits (device->host read);
 // each split produces two child nodes
305  const uint32 n_splits = allocation_map[ n_segments-1 ];
306  out_nodes = n_splits*2;
307 
308  // assign the objects to their new nodes
309  sah::assign_objects(
310  n_active_objects,
311  n_leaves,
312  thrust::raw_pointer_cast( &*order ),
313  thrust::raw_pointer_cast( &m_segment_keys.front() ),
314  thrust::raw_pointer_cast( &m_split_index.front() ),
315  thrust::raw_pointer_cast( &*allocation_map ),
316  thrust::raw_pointer_cast( &m_segment_ids.front() ),
317  thrust::raw_pointer_cast( &m_node_ids.front() ) );
318 
319  // realloc the output queue bins if needed
320  storage -= sizeof(Bin) * m_queue_bins[ out_queue ].size();
321  need_space( m_queue_bins[ out_queue ], out_nodes );
322  storage += sizeof(Bin) * m_queue_bins[ out_queue ].size();
323 
324  // compute the bounding box of the output segments
325  sah::compute_bins(
326  n_segments,
327  n_nodes,
328  n_leaves,
329  input_node_offset,
330  thrust::raw_pointer_cast( &m_split_index.front() ),
331  thrust::raw_pointer_cast( &*allocation_map ),
332  thrust::raw_pointer_cast( &m_queue_bins[ in_queue ].front() ),
333  thrust::raw_pointer_cast( &*bounds_l ),
334  thrust::raw_pointer_cast( &*bounds_r ),
335  thrust::raw_pointer_cast( &m_queue_bins[ out_queue ].front() ),
336  thrust::raw_pointer_cast( &m_nodes->front() ) );
337 
338  m_distribute_objects_time += sah::stop_timer( start, stop );
339 
340  input_node_offset = n_nodes;
341 
 // accepted splits add nodes; rejected segments become leaves
342  n_nodes += out_nodes;
343  n_leaves += (n_segments - n_splits);
344 
 // flip the double-buffered bin queues for the next level
345  std::swap( in_queue, out_queue );
346  }
347 
 // record the level table; unused entries point past the last node
348  m_level_count = level;
349  for (; level < 128; ++level)
350  m_levels[ level ] = n_nodes;
351 
352  // sort the objects by their leaf id
 // (after the loop, m_node_ids holds the per-object leaf ids written
 // by sah::assign_objects)
353  thrust::copy( thrust::make_counting_iterator(0u), thrust::make_counting_iterator(0u) + n_objects, m_index->begin() );
354  thrust::sort_by_key( m_node_ids.begin(), m_node_ids.begin() + n_objects, m_index->begin() );
355 
356  // setup leaf ranges
357  sah::setup_leaves(
358  n_objects,
359  thrust::raw_pointer_cast( &m_node_ids.front() ),
360  thrust::raw_pointer_cast( &m_leaves->front() ) );
361 
362  m_leaf_count = n_leaves;
363  m_node_count = n_nodes;
364 
365  cudaEventDestroy( start );
366  cudaEventDestroy( stop );
367 
 // report the peak temporary storage used across calls
368  m_temp_storage = nih::max( storage, m_temp_storage );
369 }
370 
371 } // namespace cuda
372 } // namespace nih
void transform(const uint32 n, const Iterator in, const Output out, const Functor functor)
Definition: primitives_inl.h:357
CUGAR_HOST_DEVICE uint32 quantize(const float x, const uint32 n)
Definition: numbers.h:600
Definition: sah_builder_inline.h:70
Definition: sah_builder.h:37
Definition: sah_builder.h:46
void build(const uint32 BINS, const Bbox3f bbox, const Iterator bbox_begin, const Iterator bbox_end, const uint32 max_leaf_size, const float max_cost=1.8f)
build a bvh given a set of bboxes
Definition: sah_builder_inline.h:217
void inclusive_scan(const uint32 n, InputIterator d_in, OutputIterator d_out, BinaryOp op, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:228
Definition: sah_builder.h:80
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE size_t largest_axis(const Bbox< Vector_t > &bbox)
Definition: bbox_inline.h:103