30 #include <nih/basic/utils.h> 31 #include <nih/basic/cuda/scan.h> 32 #include <nih/basic/cuda_config.h> 33 #include <thrust/scan.h> 34 #include <thrust/sort.h> 39 #define SAH_SINGLE_WARP 0 40 #define SAH_MAX_BINS 128 43 namespace binned_sah {
45 void init_bins(
const uint32 BINS,
const uint32 n_nodes, Bins bins);
51 const int input_node_offset,
54 int output_node_offset,
57 const uint32 max_leaf_size,
58 const float max_cost);
60 void distribute_objects(
65 const int input_node_offset,
70 const int32* leaf_ids,
74 FORCE_INLINE NIH_DEVICE
void update_bin(float3* bin_bmin, float3* bin_bmax, int32* bin_counter,
const Bbox4f bbox)
76 const float3 bmin = *bin_bmin;
77 if (bbox[0][0] < bmin.x) atomicMin( (int32*)&(bin_bmin->x), __float_as_int(bbox[0][0]) );
78 if (bbox[0][1] < bmin.y) atomicMin( (int32*)&(bin_bmin->y), __float_as_int(bbox[0][1]) );
79 if (bbox[0][2] < bmin.z) atomicMin( (int32*)&(bin_bmin->z), __float_as_int(bbox[0][2]) );
81 const float3 bmax = *bin_bmax;
82 if (bbox[1][0] > bmax.x) atomicMax( (int32*)&(bin_bmax->x), __float_as_int(bbox[1][0]) );
83 if (bbox[1][1] > bmax.y) atomicMax( (int32*)&(bin_bmax->y), __float_as_int(bbox[1][1]) );
84 if (bbox[1][2] > bmax.z) atomicMax( (int32*)&(bin_bmax->z), __float_as_int(bbox[1][2]) );
86 atomicAdd( bin_counter, 1 );
93 template <
typename Iterator>
94 __global__
void update_bins_kernel(
96 const uint32 n_objects,
97 const Iterator bboxes,
98 const Vector4f origin,
99 const Objects objects,
103 const uint32 grid_size = gridDim.x * blockDim.x;
106 for (uint32 base_idx = blockIdx.x * blockDim.x;
107 base_idx < n_objects;
108 base_idx += grid_size)
110 const uint32 idx = threadIdx.x + base_idx;
112 if (idx >= n_objects)
115 const uint32
id = idx;
118 const int32 node_id = objects.node_ids[id];
122 const int32 split_id = objects.split_ids[id];
124 const Bin node_bbox = queue.bins[split_id];
125 const float3 node_size = make_float3(
126 node_bbox.bmax.x - node_bbox.bmin.x,
127 node_bbox.bmax.y - node_bbox.bmin.y,
128 node_bbox.bmax.z - node_bbox.bmin.z );
130 Bbox4f bbox = bboxes[id];
133 const Vector3f center = (xyz(bbox[0]) + xyz(bbox[1]))*0.5f;
135 const int4 bin_id = make_int4(
136 (node_size.x < 1.0e-8f ? 0 :
quantize( (center[0] - node_bbox.bmin.x) / node_size.x, BINS )),
137 (node_size.y < 1.0e-8f ? 0 :
quantize( (center[1] - node_bbox.bmin.y) / node_size.y, BINS )),
138 (node_size.z < 1.0e-8f ? 0 :
quantize( (center[2] - node_bbox.bmin.z) / node_size.z, BINS )),
141 objects.bin_ids[idx] = bin_id;
143 const uint32 binX = split_id + (BINS * 0 + bin_id.x)*queue.size;
144 const uint32 binY = split_id + (BINS * 1 + bin_id.y)*queue.size;
145 const uint32 binZ = split_id + (BINS * 2 + bin_id.z)*queue.size;
147 update_bin( bins.bmin + binX, bins.bmax + binX, (int32*)bins.size + binX, bbox );
148 update_bin( bins.bmin + binY, bins.bmax + binY, (int32*)bins.size + binY, bbox );
149 update_bin( bins.bmin + binZ, bins.bmax + binZ, (int32*)bins.size + binZ, bbox );
156 template <
typename Iterator>
157 inline void update_bins(
159 const uint32 n_objects,
160 const Iterator bboxes,
161 const Vector4f origin,
162 const Objects objects,
166 const uint32 BLOCK_SIZE = SAH_SINGLE_WARP ? 32 : 128;
167 const size_t max_blocks = SAH_SINGLE_WARP ? 1 : thrust::detail::backend::cuda::arch::max_active_blocks(update_bins_kernel<Iterator>, BLOCK_SIZE, 0);
168 const size_t n_blocks = nih::min( max_blocks, (n_objects + BLOCK_SIZE-1) / BLOCK_SIZE );
170 update_bins_kernel<<<n_blocks,BLOCK_SIZE>>> (
179 cudaThreadSynchronize();
182 inline void start_timer(
const cudaEvent_t start,
const cudaEvent_t stop)
184 cudaEventRecord( start, 0 );
186 inline float stop_timer(
const cudaEvent_t start,
const cudaEvent_t stop)
189 cudaEventRecord( stop, 0 );
190 cudaEventSynchronize( stop );
191 cudaEventElapsedTime( &dtime, start, stop );
198 typedef uint32 result_type;
200 NIH_HOST_DEVICE uint32 operator() (
const Bin bin)
const 209 template <
typename Iterator>
213 const Iterator bbox_begin,
214 const Iterator bbox_end,
215 const Iterator h_bbox_begin,
216 const uint32 max_leaf_size,
217 const float max_cost)
219 const uint32 n_objects = uint32( bbox_end - bbox_begin );
221 need_space( *m_nodes, n_objects*2 );
222 need_space( *m_leaves, n_objects );
223 need_space( *m_index, n_objects );
225 need_space( m_bin_bmin, (n_objects / max_leaf_size) * BINS * 3 );
226 need_space( m_bin_bmax, (n_objects / max_leaf_size) * BINS * 3 );
227 need_space( m_bin_size, (n_objects / max_leaf_size) * BINS * 3 );
228 need_space( m_queue_bins, n_objects * 2 );
229 need_space( m_queue_splits, n_objects * 2 );
231 need_space( m_bin_ids, n_objects );
232 need_space( m_split_ids, n_objects );
233 need_space( m_node_ids, n_objects );
237 queue[0].bins = thrust::raw_pointer_cast( &m_queue_bins.front() );
238 queue[0].splits = thrust::raw_pointer_cast( &m_queue_splits.front() );
239 queue[1].bins = thrust::raw_pointer_cast( &m_queue_bins.front() ) + n_objects;
240 queue[1].splits = thrust::raw_pointer_cast( &m_queue_splits.front() ) + n_objects;
243 objects.bin_ids = thrust::raw_pointer_cast( &m_bin_ids.front() );
244 objects.node_ids = thrust::raw_pointer_cast( &m_node_ids.front() );
245 objects.split_ids = thrust::raw_pointer_cast( &m_split_ids.front() );
248 thrust::fill( m_split_ids.begin(), m_split_ids.begin() + n_objects, 0 );
249 thrust::fill( m_node_ids.begin(), m_node_ids.begin() + n_objects, -1 );
254 bin.bmin = make_float4( 0.0f,0.0f,0.0f,binary_cast<float>(n_objects) );
255 bin.bmax = make_float4( bbox[1][0]-bbox[0][0],bbox[1][1]-bbox[0][1],bbox[1][2]-bbox[0][2],binary_cast<float>(Bvh_node::kInvalid) );
256 m_queue_bins[0] = bin;
259 int input_node_offset = 0;
260 int output_node_offset = 1;
262 m_counters.resize(2);
268 int n_input_tasks = 1;
270 cudaEvent_t start, stop;
271 cudaEventCreate( &start );
272 cudaEventCreate( &stop );
280 while (n_input_tasks)
283 m_levels[ level++ ] = input_node_offset;
285 need_space( m_bin_bmin, n_input_tasks * BINS * 3 );
286 need_space( m_bin_bmax, n_input_tasks * BINS * 3 );
287 need_space( m_bin_size, n_input_tasks * BINS * 3 );
289 bins.bmin = thrust::raw_pointer_cast( &m_bin_bmin.front() );
290 bins.bmax = thrust::raw_pointer_cast( &m_bin_bmax.front() );
291 bins.size = thrust::raw_pointer_cast( &m_bin_size.front() );
297 queue[ in_queue ].size = n_input_tasks;
299 binned_sah::start_timer( start, stop );
302 const float HUGE = 1.0e8f;
303 thrust::fill( m_bin_bmin.begin(), m_bin_bmin.begin() + n_input_tasks * BINS * 3, make_float3( HUGE, HUGE, HUGE ) );
304 thrust::fill( m_bin_bmax.begin(), m_bin_bmax.begin() + n_input_tasks * BINS * 3, make_float3( -HUGE, -HUGE, -HUGE ) );
305 thrust::fill( m_bin_size.begin(), m_bin_size.begin() + n_input_tasks * BINS * 3, 0 );
308 m_init_bins_time += binned_sah::stop_timer( start, stop );
310 binned_sah::start_timer( start, stop );
311 binned_sah::update_bins( BINS, n_objects, bbox_begin, Vector4f(-bbox[0][0],-bbox[0][1],-bbox[0][2],0.0f), objects, queue[ in_queue ], bins);
312 m_update_bins_time += binned_sah::stop_timer( start, stop );
314 binned_sah::start_timer( start, stop );
315 binned_sah::sah_split(
321 thrust::raw_pointer_cast( &m_counters.front() ),
323 thrust::raw_pointer_cast( &m_nodes->front() ),
324 thrust::raw_pointer_cast( &m_counters.front() ) + 1,
327 m_sah_split_time += binned_sah::stop_timer( start, stop );
329 binned_sah::start_timer( start, stop );
331 binned_sah::distribute_objects(
339 m_distribute_objects_time += binned_sah::stop_timer( start, stop );
342 const uint32 n_output_tasks = m_counters[0];
345 input_node_offset = output_node_offset;
346 output_node_offset += n_output_tasks;
347 n_input_tasks = n_output_tasks;
350 std::swap( in_queue, out_queue );
353 m_level_count = level;
354 for (; level < 128; ++level)
355 m_levels[ level ] = output_node_offset;
358 thrust::copy( thrust::make_counting_iterator(0u), thrust::make_counting_iterator(0u) + n_objects, m_index->begin() );
359 thrust::sort_by_key( m_node_ids.begin(), m_node_ids.begin() + n_objects, m_index->begin() );
362 binned_sah::setup_leaves( n_objects, objects.node_ids, thrust::raw_pointer_cast( &m_leaves->front() ) );
364 m_leaf_count = m_counters[1];
365 m_node_count = output_node_offset;
367 cudaEventDestroy( start );
368 cudaEventDestroy( stop );
Definition: binned_sah_builder.h:65
CUGAR_HOST_DEVICE uint32 quantize(const float x, const uint32 n)
Definition: numbers.h:600
Definition: sah_builder.h:37
void build(const uint32 BINS, const Bbox3f bbox, const Iterator bbox_begin, const Iterator bbox_end, const Iterator h_bbox_begin, const uint32 max_leaf_size, const float max_cost=1.8f)
Definition: binned_sah_builder_inline.h:210
Definition: binned_sah_builder_inline.h:195
Definition: binned_sah_builder.h:42
Definition: binned_sah_builder.h:58
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE Out binary_cast(const In in)
Definition: types.h:288
Definition: binned_sah_builder.h:77