Fermat
psfpt_impl.h
/*
 * Fermat
 *
 * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of the NVIDIA CORPORATION nor the
 *    names of its contributors may be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#include <psfpt.h>
#include <renderer.h>
#include <rt.h>
#include <mesh/MeshStorage.h>
#include <cugar/basic/timer.h>
#include <cugar/basic/primitives.h>
#include <cugar/basic/memory_arena.h>
#include <pathtracer_core.h>
#include <pathtracer_queues.h>
#include <pathtracer_kernels.h>
#include <psfpt_vertex_processor.h>


#define SHIFT_RES 256u

#define HASH_SIZE (64u * 1024u * 1024u)
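
// SHIFT_RES is the resolution of the table of random shifts built for the sample sequence
// (see m_sequence.setup() in PSFPT::init), while HASH_SIZE is the fixed capacity of the
// path-space filtering (PSF) hash map and of its value array.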

namespace {

    // a queue of references to PSF cells that will need to be blended in after path sampling
    //
    struct PSFRefQueue
    {
        float4* weights_d;  // diffuse path weight
        float4* weights_g;  // glossy path weight
        uint2*  pixels;
        uint32* size;

        FERMAT_DEVICE
        void warp_append(const PixelInfo pixel, const PSFPTVertexProcessor::CacheInfo cache_slot, const float4 weight_d, const float4 weight_g)
        {
            const uint32 slot = cugar::cuda::warp_increment(size);

            weights_d[slot] = weight_d;
            weights_g[slot] = weight_g;

            pixels[slot] = make_uint2(pixel.packed, cache_slot.packed);
        }
    };
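
    // NOTE: warp_append() relies on cugar::cuda::warp_increment() to perform a
    // warp-aggregated atomic increment of the queue counter, handing each active lane
    // a distinct output slot for its PSF reference.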

    // the internal path tracing context
    //
    template <typename TDirectLightingSampler>
    struct PSFPTContext : PTContextBase<PSFPTOptions>, PTContextQueues
    {
        PSFRefQueue ref_queue;

        HashMap     psf_hashmap;
        float4*     psf_values;

        TDirectLightingSampler dl;
    };
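
    // NOTE: besides the state inherited from PTContextBase<PSFPTOptions> and
    // PTContextQueues, this context carries the PSF reference queue, a view of the PSF
    // hash map and its value array, and the direct-lighting sampler used for NEE.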

    // initialize the RL storage for mesh VTLs
    void init(ClusteredRLStorage* vtls_rl, const MeshVTLStorage* mesh_vtls)
    {
        vtls_rl->init(
            VTL_RL_HASH_SIZE,
            mesh_vtls->get_bvh_clusters_count(),
            mesh_vtls->get_bvh_cluster_offsets());
    }
    // initialize the RL storage for mesh VTLs
    void init(AdaptiveClusteredRLStorage* vtls_rl, const MeshVTLStorage* mesh_vtls)
    {
        vtls_rl->init(
            VTL_RL_HASH_SIZE,
            mesh_vtls->get_bvh_nodes(),
            mesh_vtls->get_bvh_parents(),
            mesh_vtls->get_bvh_ranges(),
            mesh_vtls->get_bvh_clusters_count(),
            mesh_vtls->get_bvh_clusters(),
            mesh_vtls->get_bvh_cluster_offsets());
    }
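
    // NOTE: the two overloads differ only in the amount of clustering information handed
    // to the RL storage: the adaptive variant additionally consumes the VTL BVH nodes,
    // parents, ranges and clusters, besides the cluster count and offsets.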

    // the kernel blending/splatting PSF references into the framebuffer
    //
    template <typename TDirectLightingSampler>
    __global__
    void psf_blending_kernel(const uint32 in_queue_size, PSFPTContext<TDirectLightingSampler> context, RenderingContextView renderer, const float frame_weight)
    {
        const uint32 thread_id = threadIdx.x + blockIdx.x * blockDim.x;

        if (thread_id < in_queue_size) // *context.shadow_queue.size
        {
            typedef PSFPTVertexProcessor::CacheInfo CacheInfo;

            // fetch a reference from the ref queue
            const PixelInfo       pixel_info = context.ref_queue.pixels[thread_id].x;
            const CacheInfo       cache_info = context.ref_queue.pixels[thread_id].y;
            const cugar::Vector4f w_d        = context.ref_queue.weights_d[thread_id];
            const cugar::Vector4f w_g        = context.ref_queue.weights_g[thread_id];

            // check if it's valid
            if (cache_info.is_valid())
            {
                // dereference the hashmap cell
                const uint32 cache_slot = cache_info.pixel;

                cugar::Vector4f cache_value = context.psf_values[cache_slot];
                cache_value /= cache_value.w; // normalize

                // compute the total weight
                const cugar::Vector3f w =
                    ((pixel_info.comp & Bsdf::kDiffuseMask) ? w_d.xyz() : cugar::Vector3f(0.0f)) +
                    ((pixel_info.comp & Bsdf::kGlossyMask)  ? w_g.xyz() : cugar::Vector3f(0.0f));

                // add to the composited framebuffer
                add_in<false>(renderer.fb(FBufferDesc::COMPOSITED_C), pixel_info.pixel, cugar::min( cache_value.xyz() * w, context.options.firefly_filter ), frame_weight);

                // add to the diffuse channel, if the diffuse component is present
                if (pixel_info.comp & Bsdf::kDiffuseMask)
                    add_in<true>(renderer.fb(FBufferDesc::DIFFUSE_C), pixel_info.pixel, cache_value.xyz() * w_d.xyz(), frame_weight);

                // add to the glossy channel, if the glossy component is present
                if (pixel_info.comp & Bsdf::kGlossyMask)
                    add_in<true>(renderer.fb(FBufferDesc::SPECULAR_C), pixel_info.pixel, cache_value.xyz() * w_g.xyz(), frame_weight);
            }
        }
    }
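
    // NOTE: each queue entry re-reads its PSF cell after path sampling, divides the
    // accumulated xyz sum by the total weight stored in the w component to obtain the
    // filtered average, and splats it into the composited, diffuse and specular channels
    // using the diffuse/glossy path weights recorded at enqueue time; the composited
    // contribution is additionally clamped by the firefly filter.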

    // dispatch the blending kernel
    //
    template <typename TDirectLightingSampler>
    void psf_blending(const uint32 in_queue_size, PSFPTContext<TDirectLightingSampler> context, RenderingContextView renderer)
    {
        if (!in_queue_size)
            return;

        const uint32 blockSize(128);
        const dim3 gridSize(cugar::divide_ri(in_queue_size, blockSize));
        psf_blending_kernel<<< gridSize, blockSize >>>(in_queue_size, context, renderer, 1.0f / float(renderer.instance + 1));
    }
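
    // NOTE: the kernel is launched with one thread per queue entry in blocks of 128
    // threads, and receives 1 / (instance + 1) as the frame weight used by add_in()
    // to blend this frame's contributions into the accumulated framebuffer.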

    // alloc all internal queues
    //
    void alloc_queues(
        PSFPTOptions         options,
        const uint32         n_pixels,
        PTRayQueue&          input_queue,
        PTRayQueue&          scatter_queue,
        PTRayQueue&          shadow_queue,
        PSFRefQueue&         ref_queue,
        cugar::memory_arena& arena)
    {
        ::alloc_queues( options, n_pixels, input_queue, scatter_queue, shadow_queue, arena );

        ref_queue.weights_d = arena.alloc<float4>(n_pixels * (options.max_path_length + 1));
        ref_queue.weights_g = arena.alloc<float4>(n_pixels * (options.max_path_length + 1));
        ref_queue.pixels    = arena.alloc<uint2>(n_pixels * (options.max_path_length + 1));
        ref_queue.size      = arena.alloc<uint32>(1);
    }
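
    // NOTE: on top of the standard path tracing queues allocated by ::alloc_queues(),
    // the PSF reference queue reserves one entry per pixel per path vertex, i.e.
    // n_pixels * (max_path_length + 1) slots, plus a single uint32 counter.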

} // anonymous namespace

PSFPT::PSFPT() :
    m_generator(32, cugar::LFSRGeneratorMatrix::GOOD_PROJECTIONS),
    m_random(&m_generator, 1u, 1351u)
{
    m_mesh_vtls = new MeshVTLStorage;
    m_vtls_rl   = new VTLRLStorage;
}

void PSFPT::init(int argc, char** argv, RenderingContext& renderer)
{
    const uint2  res      = renderer.res();
    const uint32 n_pixels = res.x * res.y;

    // parse the options
    m_options.parse(argc, argv);

    const char* nee_alg[] = { "mesh", "vpl", "rl" };

    fprintf(stderr, "  PSFPT settings:\n");
    fprintf(stderr, "    path-length     : %u\n", m_options.max_path_length);
    fprintf(stderr, "    direct-nee      : %u\n", m_options.direct_lighting_nee ? 1 : 0);
    fprintf(stderr, "    direct-bsdf     : %u\n", m_options.direct_lighting_bsdf ? 1 : 0);
    fprintf(stderr, "    indirect-nee    : %u\n", m_options.indirect_lighting_nee ? 1 : 0);
    fprintf(stderr, "    indirect-bsdf   : %u\n", m_options.indirect_lighting_bsdf ? 1 : 0);
    fprintf(stderr, "    visible-lights  : %u\n", m_options.visible_lights ? 1 : 0);
    fprintf(stderr, "    direct lighting : %u\n", m_options.direct_lighting ? 1 : 0);
    fprintf(stderr, "    diffuse         : %u\n", m_options.diffuse_scattering ? 1 : 0);
    fprintf(stderr, "    glossy          : %u\n", m_options.glossy_scattering ? 1 : 0);
    fprintf(stderr, "    indirect glossy : %u\n", m_options.indirect_glossy ? 1 : 0);
    fprintf(stderr, "    RR              : %u\n", m_options.rr ? 1 : 0);
    fprintf(stderr, "    nee algorithm   : %s\n", nee_alg[ m_options.nee_type ]);
    fprintf(stderr, "    filter width    : %f\n", m_options.psf_width);
    fprintf(stderr, "    filter depth    : %u\n", m_options.psf_depth);
    fprintf(stderr, "    filter min-dist : %f\n", m_options.psf_min_dist);
    fprintf(stderr, "    firefly filter  : %f\n", m_options.firefly_filter);

    // allocate the PSF cache storage
    m_psf_hash.resize(HASH_SIZE);
    m_psf_values.alloc(HASH_SIZE);
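
    // NOTE: the hash map and the value array are persistent class members, so the PSF
    // cache survives across frames; render_pass() below only clears it every
    // m_options.psf_temporal_reuse frames.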

    // pre-alloc queue storage
    {
        // determine how much storage we will need
        cugar::memory_arena arena;

        PTRayQueue  input_queue;
        PTRayQueue  scatter_queue;
        PTRayQueue  shadow_queue;
        PSFRefQueue ref_queue;

        alloc_queues(
            m_options,
            n_pixels,
            input_queue,
            scatter_queue,
            shadow_queue,
            ref_queue,
            arena );

        // alloc space for device timers
        arena.alloc<int64>( 16 );

        fprintf(stderr, "  allocating queue storage: %.1f MB\n", float(arena.size) / (1024*1024));
        m_memory_pool.alloc(arena.size);
    }
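
    // NOTE: this block is a sizing pass: the default-constructed arena only measures the
    // footprint of the very same alloc_queues() call later issued in render_pass(), and a
    // single pool of that size is allocated up front; render_pass() then carves its queues
    // out of m_memory_pool with an arena based at m_memory_pool.ptr().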

    // build the set of shifts
    const uint32 n_dimensions = 6 * (m_options.max_path_length + 1);
    fprintf(stderr, "  initializing sampler: %u dimensions\n", n_dimensions);
    m_sequence.setup(n_dimensions, SHIFT_RES);

    const uint32 n_light_paths = n_pixels;

    fprintf(stderr, "  creating mesh lights... started\n");

    // initialize the mesh lights sampler
    renderer.get_mesh_lights().init( n_light_paths, renderer, 0u );

    fprintf(stderr, "  creating mesh lights... done\n");

    // compute the scene bbox
    m_bbox = renderer.compute_bbox();

    // disable smart algorithms if there are no emissive surfaces
    if (renderer.get_mesh_lights().get_vpl_count() == 0)
        m_options.nee_type = NEE_ALGORITHM_MESH;

    if (m_options.nee_type == NEE_ALGORITHM_RL)
    {
        fprintf(stderr, "  creating mesh VTLs... started\n");
        m_mesh_vtls->init(n_light_paths, renderer, 0u );
        fprintf(stderr, "  creating mesh VTLs... done (%u VTLs, %u clusters)\n", m_mesh_vtls->get_vtl_count(), m_mesh_vtls->get_bvh_clusters_count());

        fprintf(stderr, "  initializing VTLs RL... started\n");
        ::init( m_vtls_rl, m_mesh_vtls );
        fprintf(stderr, "  initializing VTLs RL... done (%.1f MB)\n", m_vtls_rl->needed_bytes(VTL_RL_HASH_SIZE, m_mesh_vtls->get_bvh_clusters_count()) / float(1024*1024));
    }
}

void PSFPT::render(const uint32 instance, RenderingContext& renderer)
{
    // pre-multiply the previous frame for blending
    renderer.rescale_frame( instance );

    //render_pass( instance, renderer, PSFPT::kPresamplePass );
    render_pass( instance, renderer, PSFPT::kFinalPass );

    renderer.update_variances( instance );

    // clamp the framebuffer contents to a reasonably high value, just to avoid outrageous fireflies
    renderer.clamp_frame( 100.0f );
}
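
// NOTE: render() relies on rescale_frame() pre-multiplying the previously accumulated
// framebuffer (presumably by instance / (instance + 1), matching the 1 / (instance + 1)
// frame weight used in render_pass), so that the blended result stays a running average;
// the final clamp_frame(100.0f) merely caps extreme firefly values.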

void PSFPT::render_pass(const uint32 instance, RenderingContext& renderer, const PassType pass_type)
{
    //fprintf(stderr, "render started (%u)\n", instance);
    const uint2  res      = renderer.res();
    const uint32 n_pixels = res.x * res.y;

    // carve an arena out of the pre-allocated memory pool
    cugar::memory_arena arena( m_memory_pool.ptr() );

    // alloc all the queues
    PTRayQueue  input_queue;
    PTRayQueue  scatter_queue;
    PTRayQueue  shadow_queue;
    PSFRefQueue ref_queue;

    alloc_queues(
        m_options,
        n_pixels,
        input_queue,
        scatter_queue,
        shadow_queue,
        ref_queue,
        arena );

    // fetch a view of the renderer
    RenderingContextView renderer_view = renderer.view(instance);

    // instantiate our vertex processor
    PSFPTVertexProcessor vertex_processor( m_options.firefly_filter );

    // alloc space for device timers
    uint64* device_timers = arena.alloc<uint64>( 16 );

    cugar::Timer timer;
    timer.start();

    PTLoopStats stats;

    if (m_options.nee_type == NEE_ALGORITHM_RL)
    {
        if ((instance % 32) == 0)
        {
            // clear the RL hash tables after a bunch of iterations to avoid overflow...
            m_vtls_rl->clear();
        }
        else
        {
            // update the vtl cdfs
            m_vtls_rl->update();
            CUDA_CHECK(cugar::cuda::sync_and_check_error("vtl-rl update"));
        }
    }

    // setup the samples for this frame
    m_sequence.set_instance(instance);
    {
        // use the RL direct-lighting sampler
        if (m_options.nee_type == NEE_ALGORITHM_RL)
        {
            PSFPTContext<DirectLightingRL> context;
            context.options       = m_options;
            context.in_bounce     = 0;
            context.in_queue      = input_queue;
            context.scatter_queue = scatter_queue;
            context.shadow_queue  = shadow_queue;
            context.sequence      = m_sequence.view();
            context.frame_weight  = 1.0f / float(renderer_view.instance + 1);
            context.device_timers = device_timers;
            context.bbox          = m_bbox;
            context.dl            = DirectLightingRL(
                view( *m_vtls_rl ),
                m_mesh_vtls->view() );
            context.ref_queue     = ref_queue;
            context.psf_hashmap   = HashMap(
                HASH_SIZE,
                m_psf_hash.m_keys.ptr(),
                m_psf_hash.m_unique.ptr(),
                m_psf_hash.m_slots.ptr(),
                m_psf_hash.m_size.ptr()
            );
            context.psf_values    = m_psf_values.ptr();

            // initialize the shading cache
            if ((instance % m_options.psf_temporal_reuse) == 0)
                m_psf_hash.clear();

            // reset the reference queue size
            cudaMemset(context.ref_queue.size, 0x00, sizeof(uint32));
            CUDA_CHECK(cugar::cuda::sync_and_check_error("clear reference queue"));

            // perform the actual path tracing
            path_trace_loop( context, vertex_processor, renderer, renderer_view, stats );

            // blend-in the PSF references
            if (pass_type == PSFPT::kFinalPass)
            {
                uint32 ref_queue_size;
                cudaMemcpy(&ref_queue_size, context.ref_queue.size, sizeof(uint32), cudaMemcpyDeviceToHost);

                psf_blending(ref_queue_size, context, renderer_view);
                CUDA_CHECK(cugar::cuda::sync_and_check_error("psf blending"));
            }
        }
        else // use the regular mesh emitter direct-lighting sampler
        {
            // select which instantiation of the mesh light to use (VPLs or the plain mesh)
            MeshLight mesh_light = m_options.nee_type == NEE_ALGORITHM_VPL ? renderer_view.mesh_vpls : renderer_view.mesh_light;

            PSFPTContext<DirectLightingMesh> context;
            context.options       = m_options;
            context.in_bounce     = 0;
            context.in_queue      = input_queue;
            context.scatter_queue = scatter_queue;
            context.shadow_queue  = shadow_queue;
            context.sequence      = m_sequence.view();
            context.frame_weight  = 1.0f / float(renderer_view.instance + 1);
            context.device_timers = device_timers;
            context.bbox          = m_bbox;
            context.dl            = DirectLightingMesh( mesh_light );
            context.ref_queue     = ref_queue;
            context.psf_hashmap   = HashMap(
                HASH_SIZE,
                m_psf_hash.m_keys.ptr(),
                m_psf_hash.m_unique.ptr(),
                m_psf_hash.m_slots.ptr(),
                m_psf_hash.m_size.ptr()
            );
            context.psf_values    = m_psf_values.ptr();

            // initialize the shading cache
            if ((instance % m_options.psf_temporal_reuse) == 0)
                m_psf_hash.clear();

            // reset the reference queue size
            cudaMemset(context.ref_queue.size, 0x00, sizeof(uint32));
            CUDA_CHECK(cugar::cuda::sync_and_check_error("clear reference queue"));

            // perform the actual path tracing
            path_trace_loop( context, vertex_processor, renderer, renderer_view, stats );

            // blend-in the PSF references
            if (pass_type == PSFPT::kFinalPass)
            {
                uint32 ref_queue_size;
                cudaMemcpy(&ref_queue_size, context.ref_queue.size, sizeof(uint32), cudaMemcpyDeviceToHost);

                psf_blending(ref_queue_size, context, renderer_view);
                CUDA_CHECK(cugar::cuda::sync_and_check_error("psf blending"));
            }
        }
    }
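
    // NOTE: the two branches above are identical except for the direct-lighting sampler
    // they instantiate: DirectLightingRL, driven by the VTL RL tables, versus
    // DirectLightingMesh, sampling the mesh emitters or their VPL approximation.
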
    timer.stop();
    const float time = timer.seconds();
    // clear the global timer at instance zero
    if (instance == 0)
        m_time = time;
    else
        m_time += time;

    fprintf(stderr, "\r %.1fs (%.1fms = rt[%.1fms + %.1fms + %.1fms] + shade[%.1fms + %.1fms] - %uK cells) ",
        m_time,
        time * 1000.0f,
        stats.primary_rt_time * 1000.0f,
        stats.path_rt_time * 1000.0f,
        stats.shadow_rt_time * 1000.0f,
        stats.path_shade_time * 1000.0f,
        stats.shadow_shade_time * 1000.0f,
        m_psf_hash.size() / 1000);

#if defined(DEVICE_TIMING) && DEVICE_TIMING
    if (instance % 64 == 0)
        print_timer_stats( device_timers, stats );
#endif

    if (instance) // skip the first frame
    {
        m_stats.primary_rt_time   += stats.primary_rt_time;
        m_stats.path_rt_time      += stats.path_rt_time;
        m_stats.shadow_rt_time    += stats.shadow_rt_time;
        m_stats.path_shade_time   += stats.path_shade_time;
        m_stats.shadow_shade_time += stats.shadow_shade_time;
    }
}
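
// Usage sketch (hypothetical, not part of the original file): a host application would
// typically construct the renderer, initialize it with the command-line options and its
// RenderingContext, and then invoke render() once per progressive frame, e.g.:
//
//   PSFPT psfpt;
//   psfpt.init(argc, argv, rendering_context);
//   for (uint32 frame = 0; frame < n_frames; ++frame)
//       psfpt.render(frame, rendering_context);
//
// where rendering_context and n_frames are assumed to be provided by the application.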