30 #include <nih/basic/utils.h> 31 #include <nih/basic/cuda/scan.h> 32 #include <nih/basic/cuda_config.h> 33 #include <thrust/scan.h> 34 #include <thrust/sort.h> 39 #define SAH_SINGLE_WARP 0 40 #define SAH_MAX_BINS 128 44 #define ACCESS_BINS(a,id,axis,index,stride) a[id + (axis*BINS + index)*stride] 47 void init_bins(
const uint32 BINS,
const uint32 n_nodes, Sah_builder::Bins bins);
51 Sah_builder::Bins bins,
52 Sah_builder::Queue qin,
53 const int input_node_offset,
54 Sah_builder::Queue qout,
56 int output_node_offset,
59 const uint32 max_leaf_size,
60 const float max_cost);
62 void distribute_objects(
64 Sah_builder::Objects objects,
66 Sah_builder::Queue queue,
67 const int input_node_offset,
68 Sah_builder::Bins bins,
73 const int32* leaf_ids,
77 FORCE_INLINE NIH_DEVICE
void update_bin(float3* bin_bmin, float3* bin_bmax, int32* bin_counter,
const Bbox4f bbox)
79 const float3 bmin = *bin_bmin;
80 if (bbox[0][0] < bmin.x) atomicMin( (int32*)&(bin_bmin->x), __float_as_int(bbox[0][0]) );
81 if (bbox[0][1] < bmin.y) atomicMin( (int32*)&(bin_bmin->y), __float_as_int(bbox[0][1]) );
82 if (bbox[0][2] < bmin.z) atomicMin( (int32*)&(bin_bmin->z), __float_as_int(bbox[0][2]) );
84 const float3 bmax = *bin_bmax;
85 if (bbox[1][0] > bmax.x) atomicMax( (int32*)&(bin_bmax->x), __float_as_int(bbox[1][0]) );
86 if (bbox[1][1] > bmax.y) atomicMax( (int32*)&(bin_bmax->y), __float_as_int(bbox[1][1]) );
87 if (bbox[1][2] > bmax.z) atomicMax( (int32*)&(bin_bmax->z), __float_as_int(bbox[1][2]) );
89 atomicAdd( bin_counter, 1 );
96 template <
typename Iterator>
97 __global__
void update_bins_kernel(
99 const uint32 n_objects,
100 const Iterator bboxes,
101 const Vector4f origin,
102 const Sah_builder::Objects objects,
103 const Sah_builder::Queue queue,
104 Sah_builder::Bins bins)
106 const uint32 grid_size = gridDim.x * blockDim.x;
109 for (uint32 base_idx = blockIdx.x * blockDim.x;
110 base_idx < n_objects;
111 base_idx += grid_size)
113 const uint32 idx = threadIdx.x + base_idx;
115 if (idx >= n_objects)
118 #ifdef SAH_OBJECT_REORDERING 119 const uint32
id = objects.index[ idx ];
120 if (
id == uint32(-1))
123 const uint32
id = idx;
127 const int32 node_id = objects.node_ids[id];
131 const int32 split_id = objects.split_ids[id];
133 const Sah_builder::Bin node_bbox = queue.bins[split_id];
134 const float3 node_size = make_float3(
135 node_bbox.bmax.x - node_bbox.bmin.x,
136 node_bbox.bmax.y - node_bbox.bmin.y,
137 node_bbox.bmax.z - node_bbox.bmin.z );
139 Bbox4f bbox = bboxes[id];
142 const Vector3f center = (xyz(bbox[0]) + xyz(bbox[1]))*0.5f;
144 const int4 bin_id = make_int4(
145 (node_size.x < 1.0e-8f ? 0 :
quantize( (center[0] - node_bbox.bmin.x) / node_size.x, BINS )),
146 (node_size.y < 1.0e-8f ? 0 :
quantize( (center[1] - node_bbox.bmin.y) / node_size.y, BINS )),
147 (node_size.z < 1.0e-8f ? 0 :
quantize( (center[2] - node_bbox.bmin.z) / node_size.z, BINS )),
150 objects.bin_ids[idx] = bin_id;
152 const uint32 binX = split_id + (BINS * 0 + bin_id.x)*queue.size;
153 const uint32 binY = split_id + (BINS * 1 + bin_id.y)*queue.size;
154 const uint32 binZ = split_id + (BINS * 2 + bin_id.z)*queue.size;
156 update_bin( bins.bmin + binX, bins.bmax + binX, (int32*)bins.size + binX, bbox );
157 update_bin( bins.bmin + binY, bins.bmax + binY, (int32*)bins.size + binY, bbox );
158 update_bin( bins.bmin + binZ, bins.bmax + binZ, (int32*)bins.size + binZ, bbox );
165 template <
typename Iterator>
166 inline void update_bins(
168 const uint32 n_objects,
169 const Iterator bboxes,
170 const Vector4f origin,
171 const Sah_builder::Objects objects,
172 const Sah_builder::Queue queue,
173 Sah_builder::Bins bins)
175 const uint32 BLOCK_SIZE = SAH_SINGLE_WARP ? 32 : 128;
176 const size_t max_blocks = SAH_SINGLE_WARP ? 1 : thrust::detail::device::cuda::arch::max_active_blocks(update_bins_kernel<Iterator>, BLOCK_SIZE, 0);
177 const size_t n_blocks = nih::min( max_blocks, (n_objects + BLOCK_SIZE-1) / BLOCK_SIZE );
179 update_bins_kernel<<<n_blocks,BLOCK_SIZE>>> (
188 cudaThreadSynchronize();
191 inline void start_timer(
const cudaEvent_t start,
const cudaEvent_t stop)
193 cudaEventRecord( start, 0 );
195 inline float stop_timer(
const cudaEvent_t start,
const cudaEvent_t stop)
198 cudaEventRecord( stop, 0 );
199 cudaEventSynchronize( stop );
200 cudaEventElapsedTime( &dtime, start, stop );
207 typedef uint32 result_type;
216 template <
typename Iterator>
220 const Iterator bbox_begin,
221 const Iterator bbox_end,
222 const uint32 max_leaf_size,
223 const float max_cost)
225 const uint32 n_objects = uint32( bbox_end - bbox_begin );
227 need_space( *m_nodes, n_objects*2 );
228 need_space( *m_leaves, n_objects );
229 need_space( *m_index, n_objects );
231 need_space( m_bin_bmin, (n_objects / max_leaf_size) * BINS * 3 );
232 need_space( m_bin_bmax, (n_objects / max_leaf_size) * BINS * 3 );
233 need_space( m_bin_size, (n_objects / max_leaf_size) * BINS * 3 );
234 need_space( m_queue_bins, n_objects * 2 );
235 need_space( m_queue_splits, n_objects * 2 );
236 #ifdef SAH_OBJECT_REORDERING 237 need_space( m_queue_offsets, n_objects );
238 need_space( m_new_pos, n_objects*2 );
241 need_space( m_bin_ids, n_objects );
242 need_space( m_split_ids, n_objects );
243 need_space( m_node_ids, n_objects );
247 queue[0].bins = thrust::raw_pointer_cast( &m_queue_bins.front() );
248 queue[0].splits = thrust::raw_pointer_cast( &m_queue_splits.front() );
249 queue[1].bins = thrust::raw_pointer_cast( &m_queue_bins.front() ) + n_objects;
250 queue[1].splits = thrust::raw_pointer_cast( &m_queue_splits.front() ) + n_objects;
251 #ifdef SAH_OBJECT_REORDERING 252 queue[0].offsets = thrust::raw_pointer_cast( &m_queue_offsets.front() );
253 queue[1].offsets = thrust::raw_pointer_cast( &m_queue_offsets.front() );
257 objects.bin_ids = thrust::raw_pointer_cast( &m_bin_ids.front() );
258 objects.split_ids = thrust::raw_pointer_cast( &m_split_ids.front() );
259 objects.node_ids = thrust::raw_pointer_cast( &m_node_ids.front() );
260 #ifdef SAH_OBJECT_REORDERING 261 objects.index = thrust::raw_pointer_cast( &m_new_pos.front() );
265 thrust::fill( m_split_ids.begin(), m_split_ids.begin() + n_objects, 0 );
266 thrust::fill( m_node_ids.begin(), m_node_ids.begin() + n_objects, -1 );
268 #ifdef SAH_OBJECT_REORDERING 270 thrust::copy( thrust::make_counting_iterator(0u), thrust::make_counting_iterator(0u) + n_objects, m_new_pos.begin() );
276 bin.bmin = make_float4( 0.0f,0.0f,0.0f,binary_cast<float>(n_objects) );
277 bin.bmax = make_float4( bbox[1][0]-bbox[0][0],bbox[1][1]-bbox[0][1],bbox[1][2]-bbox[0][2],binary_cast<float>(Bvh_node::kInternal) );
278 m_queue_bins[0] = bin;
281 int input_node_offset = 0;
282 int output_node_offset = 1;
284 m_counters.resize(2);
290 int n_input_tasks = 1;
292 cudaEvent_t start, stop;
293 cudaEventCreate( &start );
294 cudaEventCreate( &stop );
296 #ifdef SAH_OBJECT_REORDERING 298 thrust::raw_pointer_cast( &m_new_pos.front() ),
299 thrust::raw_pointer_cast( &m_new_pos.front() ) + n_objects
302 uint32* index[2] = { 0, 0 };
311 while (n_input_tasks)
314 m_levels[ level++ ] = input_node_offset;
316 need_space( m_bin_bmin, n_input_tasks * BINS * 3 );
317 need_space( m_bin_bmax, n_input_tasks * BINS * 3 );
318 need_space( m_bin_size, n_input_tasks * BINS * 3 );
320 bins.bmin = thrust::raw_pointer_cast( &m_bin_bmin.front() );
321 bins.bmax = thrust::raw_pointer_cast( &m_bin_bmax.front() );
322 bins.size = thrust::raw_pointer_cast( &m_bin_size.front() );
328 queue[ in_queue ].size = n_input_tasks;
330 start_timer( start, stop );
333 const float HUGE = 1.0e8f;
334 thrust::fill( m_bin_bmin.begin(), m_bin_bmin.begin() + n_input_tasks * BINS * 3, make_float3( HUGE, HUGE, HUGE ) );
335 thrust::fill( m_bin_bmax.begin(), m_bin_bmax.begin() + n_input_tasks * BINS * 3, make_float3( -HUGE, -HUGE, -HUGE ) );
336 thrust::fill( m_bin_size.begin(), m_bin_size.begin() + n_input_tasks * BINS * 3, 0 );
338 m_init_bins_time += stop_timer( start, stop );
340 #ifdef SAH_OBJECT_REORDERING 342 thrust::make_transform_iterator( m_queue_bins.begin(),
Bin_counter() ) + in_queue*n_objects,
343 thrust::make_transform_iterator( m_queue_bins.begin(),
Bin_counter() ) + in_queue*n_objects + n_input_tasks,
344 m_queue_offsets.begin() );
347 start_timer( start, stop );
348 update_bins( BINS, n_objects, bbox_begin, Vector4f(-bbox[0][0],-bbox[0][1],-bbox[0][2],0.0f), objects, queue[ in_queue ], bins);
349 m_update_bins_time += stop_timer( start, stop );
351 start_timer( start, stop );
358 thrust::raw_pointer_cast( &m_counters.front() ),
360 thrust::raw_pointer_cast( &m_nodes->front() ),
361 thrust::raw_pointer_cast( &m_counters.front() ) + 1,
364 m_sah_split_time += stop_timer( start, stop );
366 start_timer( start, stop );
368 #ifdef SAH_OBJECT_REORDERING 370 m_new_pos.begin() + (out_queue+0)*n_objects,
371 m_new_pos.begin() + (out_queue+1)*n_objects,
382 index[ out_queue ] );
384 #ifdef SAH_OBJECT_REORDERING 386 objects.index = index[ out_queue ];
389 m_distribute_objects_time += stop_timer( start, stop );
392 const uint32 n_output_tasks = m_counters[0];
395 input_node_offset = output_node_offset;
396 output_node_offset += n_output_tasks;
397 n_input_tasks = n_output_tasks;
400 std::swap( in_queue, out_queue );
403 m_level_count = level;
404 for (; level < 128; ++level)
405 m_levels[ level ] = output_node_offset;
408 thrust::copy( thrust::make_counting_iterator(0u), thrust::make_counting_iterator(0u) + n_objects, m_index->begin() );
409 thrust::sort_by_key( m_node_ids.begin(), m_node_ids.begin() + n_objects, m_index->begin() );
412 setup_leaves( n_objects, objects.node_ids, thrust::raw_pointer_cast( &m_leaves->front() ) );
414 m_leaf_count = m_counters[1];
415 m_node_count = output_node_offset;
417 cudaEventDestroy( start );
418 cudaEventDestroy( stop );
CUGAR_HOST_DEVICE uint32 quantize(const float x, const uint32 n)
Definition: numbers.h:600
Definition: sah_builder.h:37
Definition: sah_builder.h:115
Definition: sah_builder.h:103
void build(const uint32 BINS, const Bbox3f bbox, const Iterator bbox_begin, const Iterator bbox_end, const uint32 max_leaf_size, const float max_cost=1.8f)
build a bvh given a set of bboxes
Definition: sah_builder_inline.h:217
Definition: sah_builder.h:96
Definition: sah_builder.h:80
Definition: sah_builder_inline.h:204
void exclusive_scan(const uint32 n, InputIterator d_in, OutputIterator d_out, BinaryOp op, Identity identity, thrust::device_vector< uint8 > &d_temp_storage)
Definition: primitives_inl.h:265
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE Out binary_cast(const In in)
Definition: types.h:288