31 #include <pathtracer_core.h> 32 #include <pathtracer_queues.h> 35 #include <cugar/basic/memory_arena.h> 43 #define SHADE_HITS_BLOCKSIZE 64 44 #define SHADE_HITS_CTA_BLOCKS 8 // Maxwell / Volta : 16 - Turing : 8 55 template <
typename TPTVertexProcessor>
58 TPTVertexProcessor& _vertex_processor,
64 const uint32 _vertex_info = uint32(-1),
65 const uint32 _nee_slot = uint32(-1))
67 scatter_queue.warp_append( _pixel, _ray, _weight, _cone, _vertex_info, _nee_slot );
70 template <
typename TPTVertexProcessor>
72 void trace_shadow_ray(
73 TPTVertexProcessor& _vertex_processor,
80 const uint32 _vertex_info = uint32(-1),
81 const uint32 _nee_slot = uint32(-1),
82 const uint32 _nee_sample = uint32(-1))
84 shadow_queue.warp_append( _pixel, _ray,
cugar::Vector4f(_weight, 0.0f),
cugar::Vector4f(_weight_d, 0.0f),
cugar::Vector4f(_weight_g, 0.0f), _vertex_info, _nee_slot, _nee_sample );
93 const uint32 n_pixels,
100 input_queue.hits = arena.
alloc<
Hit>(n_pixels);
101 input_queue.weights = arena.
alloc<float4>(n_pixels);
102 input_queue.weights_d = NULL;
103 input_queue.weights_g = NULL;
104 input_queue.pixels = arena.
alloc<uint4>(n_pixels);
105 input_queue.cones = arena.
alloc<float2>(n_pixels);
106 input_queue.size = arena.
alloc<uint32>(1);
109 scatter_queue.hits = arena.
alloc<
Hit>(n_pixels);
110 scatter_queue.weights = arena.
alloc<float4>(n_pixels);
111 scatter_queue.weights_d = NULL;
112 scatter_queue.weights_g = NULL;
113 scatter_queue.pixels = arena.
alloc<uint4>(n_pixels);
114 scatter_queue.cones = arena.
alloc<float2>(n_pixels);
115 scatter_queue.size = arena.
alloc<uint32>(1);
117 const uint32 n_shadow_rays = 2u;
119 shadow_queue.rays = arena.
alloc<
MaskedRay>(n_pixels*n_shadow_rays);
120 shadow_queue.hits = arena.
alloc<
Hit>(n_pixels*n_shadow_rays);
121 shadow_queue.weights = arena.
alloc<float4>(n_pixels*n_shadow_rays);
122 shadow_queue.weights_d = arena.
alloc<float4>(n_pixels*n_shadow_rays);
123 shadow_queue.weights_g = arena.
alloc<float4>(n_pixels*n_shadow_rays);
124 shadow_queue.pixels = arena.
alloc<uint4>(n_pixels*n_shadow_rays);
125 shadow_queue.size = arena.
alloc<uint32>(1);
133 template <
typename TPTContext>
136 const uint2 pixel = make_uint2(
137 threadIdx.x + blockIdx.x*blockDim.x,
138 threadIdx.y + blockIdx.y*blockDim.y );
140 if (pixel.x >= renderer.res_x || pixel.y >= renderer.res_y)
143 const int idx = pixel.x + pixel.y*renderer.res_x;
147 reinterpret_cast<float4*
>(context.in_queue.rays)[2 * idx + 0] = make_float4(ray.origin.x, ray.origin.y, ray.origin.z, __uint_as_float(ray.mask));
148 reinterpret_cast<float4*
>(context.in_queue.rays)[2 * idx + 1] = make_float4(ray.dir.x, ray.dir.y, ray.dir.z, ray.tmax);
151 context.in_queue.weights[idx] =
cugar::Vector4f(1.0f, 1.0f, 1.0f, 1.0f);
153 const float out_p =
camera_direction_pdf(U, V, W, W_len, square_pixel_focal_length, ray.dir,
false);
156 context.in_queue.pixels[idx] = make_uint4( idx, uint32(-1), uint32(-1), uint32(-1) );
159 context.in_queue.cones[idx] = make_float2( 0, out_p );
162 *context.in_queue.size = renderer.res_x * renderer.res_y;
170 template <
typename TPTContext>
174 camera_frame(renderer.camera, renderer.aspect, U, V, W);
176 const float square_pixel_focal_length = renderer.camera.square_pixel_focal_length(renderer.res_x, renderer.res_y);
178 dim3 blockSize(32, 16);
180 generate_primary_rays_kernel << < gridSize, blockSize >> > (context, renderer, U, V, W,
length(W), square_pixel_focal_length);
189 template <u
int32 NUM_WARPS,
typename TPTContext,
typename TPTVertexProcessor>
192 void shade_hits_kernel(
const uint32 in_queue_size, TPTContext context, TPTVertexProcessor vertex_processor,
RenderingContextView renderer)
194 const uint32 thread_id = threadIdx.x + blockIdx.x * blockDim.x;
196 if (thread_id < in_queue_size)
198 const uint4 packed_pixel = cugar::cuda::load<cugar::cuda::LOAD_CG>( &context.in_queue.pixels[thread_id] );
200 const PixelInfo pixel_info = packed_pixel.x;
201 const uint32 prev_vertex_info = packed_pixel.y;
202 const uint32 prev_nee_slot = packed_pixel.z;
203 const float2 cone = context.in_queue.cones[thread_id];
204 const MaskedRay ray = context.in_queue.rays[thread_id];
205 const Hit hit = context.in_queue.hits[thread_id];
208 const uint2 pixel = make_uint2(
209 pixel_info.pixel % renderer.res_x,
210 pixel_info.pixel / renderer.res_x
234 template <
typename TPTContext,
typename TPTVertexProcessor>
237 const uint32 blockSize(SHADE_HITS_BLOCKSIZE);
240 shade_hits_kernel<blockSize / 32><<< gridSize, blockSize >>>( in_queue_size, context, vertex_processor, renderer );
248 template <
typename TPTContext,
typename TPTVertexProcessor>
252 const uint32 thread_id = threadIdx.x + blockIdx.x * blockDim.x;
254 if (thread_id < in_queue_size)
256 const PixelInfo pixel_info = context.shadow_queue.pixels[thread_id].x;
257 const uint32 vertex_info = context.shadow_queue.pixels[thread_id].y;
258 const uint32 nee_slot = context.shadow_queue.pixels[thread_id].z;
259 const uint32 nee_sample = context.shadow_queue.pixels[thread_id].w;
260 const Hit hit = context.shadow_queue.hits[thread_id];
265 solve_occlusion( context, vertex_processor, renderer, hit.t > 0.0f, pixel_info, w.xyz(), w_d.xyz(), w_g.xyz(), vertex_info, nee_slot, nee_sample );
274 template <
typename TPTContext,
typename TPTVertexProcessor>
277 const uint32 blockSize(128);
279 solve_occlusion_kernel<<< gridSize, blockSize >>>( in_queue_size, context, vertex_processor, renderer );
290 primary_rt_time = 0.0f;
292 shadow_rt_time = 0.0f;
293 path_shade_time = 0.0f;
294 shadow_shade_time = 0.0f;
309 template <
typename TPTContext,
typename TPTVertexProcessor>
312 TPTVertexProcessor& vertex_processor,
318 CUDA_CHECK(cugar::cuda::sync_and_check_error(
"generate primary rays"));
320 cudaMemset(context.device_timers, 0x00,
sizeof(uint64) * 16);
322 for (context.in_bounce = 0;
323 context.in_bounce < context.options.max_path_length;
326 uint32 in_queue_size;
329 cudaMemcpy(&in_queue_size, context.in_queue.size,
sizeof(uint32), cudaMemcpyDeviceToHost);
332 if (in_queue_size == 0)
343 renderer.
get_rt_context()->trace(in_queue_size, (
Ray*)context.in_queue.rays, context.in_queue.hits);
344 CUDA_CHECK(cugar::cuda::sync_and_check_error(
"trace shaded"));
348 cudaMemset(context.shadow_queue.size, 0x00,
sizeof(uint32));
349 cudaMemset(context.scatter_queue.size, 0x00,
sizeof(uint32));
350 CUDA_CHECK(cugar::cuda::check_error(
"memset"));
357 shade_hits(in_queue_size, context, vertex_processor, renderer_view);
358 CUDA_CHECK(cugar::cuda::sync_and_check_error(
"shade hits"));
365 uint32 shadow_queue_size;
366 cudaMemcpy(&shadow_queue_size, context.shadow_queue.size,
sizeof(uint32), cudaMemcpyDeviceToHost);
370 if (shadow_queue_size)
374 renderer.
get_rt_context()->trace_shadow(shadow_queue_size, (
MaskedRay*)context.shadow_queue.rays, context.shadow_queue.hits);
375 CUDA_CHECK(cugar::cuda::sync_and_check_error(
"trace occlusion"));
380 if (shadow_queue_size)
384 solve_occlusion(shadow_queue_size, context, vertex_processor, renderer_view);
385 CUDA_CHECK(cugar::cuda::sync_and_check_error(
"solve occlusion"));
389 std::swap(context.in_queue, context.scatter_queue);
393 inline void print_timer_stats(
const uint64* device_timers,
const PTLoopStats& stats)
395 uint64 h_device_timers[16];
396 cudaMemcpy(&h_device_timers, device_timers,
sizeof(uint64) * 16, cudaMemcpyDeviceToHost);
403 const float setup_time = float(h_device_timers[SETUP_TIME]) / float(shade_events);
404 const float brdf_eval_time = float(h_device_timers[BRDF_EVAL_TIME]) / float(shade_events);
405 const float dirlight_sample_time = float(h_device_timers[DIRLIGHT_SAMPLE_TIME]) / float(shade_events);
406 const float dirlight_eval_time = float(h_device_timers[DIRLIGHT_EVAL_TIME]) / float(shade_events);
407 const float lights_preprocess_time = float(h_device_timers[LIGHTS_PREPROCESS_TIME]) / float(shade_events);
408 const float lights_sample_time = float(h_device_timers[LIGHTS_SAMPLE_TIME]) / float(shade_events);
409 const float lights_eval_time = float(h_device_timers[LIGHTS_EVAL_TIME]) / float(shade_events);
410 const float lights_mapping_time = float(h_device_timers[LIGHTS_MAPPING_TIME]) / float(shade_events);
411 const float lights_update_time = float(h_device_timers[LIGHTS_UPDATE_TIME]) / float(shade_events);
412 const float brdf_sample_time = float(h_device_timers[BRDF_SAMPLE_TIME]) / float(shade_events);
413 const float trace_shadow_time = float(h_device_timers[TRACE_SHADOW_TIME]) / float(shade_events);
414 const float trace_shaded_time = float(h_device_timers[TRACE_SHADED_TIME]) / float(shade_events);
415 const float vertex_preprocess_time = float(h_device_timers[PREPROCESS_VERTEX_TIME]) / float(shade_events);
416 const float nee_weights_time = float(h_device_timers[NEE_WEIGHTS_TIME]) / float(shade_events);
417 const float scattering_weights_time = float(h_device_timers[SCATTERING_WEIGHTS_TIME]) / float(shade_events);
418 const float fbuffer_writes_time = float(h_device_timers[FBUFFER_WRITES_TIME]) / float(shade_events);
420 const float total_time =
423 + dirlight_sample_time
427 + lights_mapping_time
432 + vertex_preprocess_time
434 + scattering_weights_time
435 + fbuffer_writes_time;
437 fprintf(stderr,
"\n device timing: %f clks\n", total_time);
438 fprintf(stderr,
" setup : %4.1f %%, %f clks\n", 100.0 * setup_time / total_time, setup_time);
439 fprintf(stderr,
" dirlight sample : %4.1f %%, %f clks\n", 100.0 * dirlight_sample_time / total_time, dirlight_sample_time);
440 fprintf(stderr,
" dirlight eval : %4.1f %%, %f clks\n", 100.0 * dirlight_eval_time / total_time, dirlight_eval_time);
441 fprintf(stderr,
" lights preproc : %4.1f %%, %f clks\n", 100.0 * lights_preprocess_time / total_time, lights_preprocess_time);
442 fprintf(stderr,
" lights sample : %4.1f %%, %f clks\n", 100.0 * lights_sample_time / total_time, lights_sample_time);
443 fprintf(stderr,
" lights eval : %4.1f %%, %f clks\n", 100.0 * lights_eval_time / total_time, lights_eval_time);
444 fprintf(stderr,
" lights map : %4.1f %%, %f clks\n", 100.0 * lights_mapping_time / total_time, lights_mapping_time);
445 fprintf(stderr,
" lights update : %4.1f %%, %f clks\n", 100.0 * lights_update_time / total_time, lights_update_time);
446 fprintf(stderr,
" brdf eval : %4.1f %%, %f clks\n", 100.0 * brdf_eval_time / total_time, brdf_eval_time);
447 fprintf(stderr,
" brdf sample : %4.1f %%, %f clks\n", 100.0 * brdf_sample_time / total_time, brdf_sample_time);
448 fprintf(stderr,
" trace shadow : %4.1f %%, %f clks\n", 100.0 * trace_shadow_time / total_time, trace_shadow_time);
449 fprintf(stderr,
" trace shaded : %4.1f %%, %f clks\n", 100.0 * trace_shaded_time / total_time, trace_shaded_time);
450 fprintf(stderr,
" preprocess : %4.1f %%, %f clks\n", 100.0 * vertex_preprocess_time / total_time, vertex_preprocess_time);
451 fprintf(stderr,
" nee weights : %4.1f %%, %f clks\n", 100.0 * nee_weights_time / total_time, nee_weights_time);
452 fprintf(stderr,
" scatter weights : %4.1f %%, %f clks\n", 100.0 * scattering_weights_time / total_time, scattering_weights_time);
453 fprintf(stderr,
" fbuffer writes : %4.1f %%, %f clks\n", 100.0 * fbuffer_writes_time / total_time, fbuffer_writes_time);
__global__ void solve_occlusion_kernel(const uint32 in_queue_size, TPTContext context, TPTVertexProcessor vertex_processor, RenderingContextView renderer)
Definition: pathtracer_kernels.h:250
void solve_occlusion(const uint32 in_queue_size, TPTContext context, TPTVertexProcessor vertex_processor, RenderingContextView renderer)
Definition: pathtracer_kernels.h:275
Definition: pathtracer_kernels.h:49
float shadow_shade_time
time spent for shading shadow samples (i.e. in solve_occlusion)
Definition: pathtracer_kernels.h:303
__global__ void generate_primary_rays_kernel(TPTContext context, RenderingContextView renderer, cugar::Vector3f U, cugar::Vector3f V, cugar::Vector3f W, const float W_len, const float square_pixel_focal_length)
Definition: pathtracer_kernels.h:134
void generate_primary_rays(TPTContext context, const RenderingContextView renderer)
Definition: pathtracer_kernels.h:171
FERMAT_HOST_DEVICE void compute_per_bounce_options(TPTContext &context, const RenderingContextView &renderer)
Definition: pathtracer_core.h:596
__global__ __launch_bounds__(SHADE_HITS_BLOCKSIZE, SHADE_HITS_CTA_BLOCKS) void shade_hits_kernel(const uint32 in_queue_size
CUGAR_HOST_DEVICE T * alloc(const uint64 sz, const uint64 alignment=sizeof(T))
Definition: memory_arena.h:69
Definition: pathtracer.h:169
CUGAR_HOST_DEVICE L divide_ri(const L x, const R y)
Definition: numbers.h:180
Definition: pathtracer_kernels.h:284
float path_rt_time
time spent for tracing scattering rays
Definition: pathtracer_kernels.h:300
FERMAT_HOST_DEVICE void camera_frame(cugar::Vector3f eye, cugar::Vector3f lookat, cugar::Vector3f up, float hfov, float aspect_ratio, cugar::Vector3f &U, cugar::Vector3f &V, cugar::Vector3f &W)
Definition: camera.h:142
float shadow_rt_time
time spent for tracing shadow rays
Definition: pathtracer_kernels.h:301
float primary_rt_time
time spent for tracing primary rays
Definition: pathtracer_kernels.h:299
FERMAT_DEVICE MaskedRay generate_primary_ray(TPTContext &context, RenderingContextView &renderer, const uint2 pixel, cugar::Vector3f U, cugar::Vector3f V, cugar::Vector3f W)
Definition: pathtracer_core.h:635
RTContext * get_rt_context() const
void path_trace_loop(TPTContext &context, TPTVertexProcessor &vertex_processor, RenderingContext &renderer, RenderingContextView &renderer_view, PTLoopStats &stats)
Definition: pathtracer_kernels.h:310
Definition: renderer.h:52
void alloc_queues(PTOptions options, const uint32 n_pixels, PTRayQueue &input_queue, PTRayQueue &scatter_queue, PTRayQueue &shadow_queue, cugar::memory_arena &arena)
Definition: pathtracer_kernels.h:91
FERMAT_HOST_DEVICE float camera_direction_pdf(const cugar::Vector3f &U, const cugar::Vector3f &V, const cugar::Vector3f &W, const float W_len, const float square_focal_length, const cugar::Vector3f out, float *out_x=0, float *out_y=0)
Definition: camera.h:206
uint64 shade_events
number of path vertex shade events
Definition: pathtracer_kernels.h:304
Definition: pathtracer_core.h:527
Definition: memory_arena.h:44
Definition: renderer_view.h:80
CUGAR_FORCEINLINE CUGAR_HOST_DEVICE uint32 length(const vector_view< Iterator > &vec)
Definition: vector_view.h:228
Definition: pathtracer_queues.h:44
PTLoopStats()
Definition: pathtracer_kernels.h:288
float path_shade_time
time spent for shading path vertices
Definition: pathtracer_kernels.h:302
void shade_hits(const uint32 in_queue_size, TPTContext context, TPTVertexProcessor vertex_processor, RenderingContextView renderer)
Definition: pathtracer_kernels.h:235
FERMAT_DEVICE bool shade_vertex(TPTContext &context, TPTVertexProcessor &vertex_processor, RenderingContextView &renderer, const uint32 bounce, const PixelInfo pixel_info, const uint2 pixel, const MaskedRay &ray, const Hit hit, const cugar::Vector4f w, const uint32 prev_vertex_info=uint32(-1), const uint32 prev_nee_vertex_id=uint32(-1), const cugar::Vector2f cone=cugar::Vector2f(0))
Definition: pathtracer_core.h:773