fermat/psfpt__impl_8h_source.html

 /*
  * Fermat
  *
  * Copyright (c) 2016-2019, NVIDIA CORPORATION. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are met:
  *    * Redistributions of source code must retain the above copyright
  *      notice, this list of conditions and the following disclaimer.
  *    * Redistributions in binary form must reproduce the above copyright
  *      notice, this list of conditions and the following disclaimer in the
  *      documentation and/or other materials provided with the distribution.
  *    * Neither the name of the NVIDIA CORPORATION nor the
  *      names of its contributors may be used to endorse or promote products
  *      derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */

 #pragma once

 #include <psfpt.h>
 #include <renderer.h>
 #include <rt.h>
 #include <mesh/MeshStorage.h>
 #include <cugar/basic/timer.h>
 #include <cugar/basic/primitives.h>
 #include <cugar/basic/memory_arena.h>
 #include <pathtracer_core.h>
 #include <pathtracer_queues.h>
 #include <pathtracer_kernels.h>
 #include <psfpt_vertex_processor.h>


 #define SHIFT_RES   256u

 #define HASH_SIZE (64u * 1024u * 1024u)


 namespace {

     typedef cugar::cuda::SyncFreeHashMap<uint64, uint32, 0xFFFFFFFFFFFFFFFFllu> HashMap;

     // A queue of references to PSF cache cells, recorded during path sampling and
     // blended into the framebuffer afterwards.
     //
     // The queue is laid out as parallel arrays indexed by entry: for each entry it keeps
     // the diffuse and glossy path weights and a uint2 packing the pixel descriptor (.x)
     // together with the cache cell descriptor (.y).
     //
     struct PSFRefQueue
     {
         float4*     weights_d;  // per-entry diffuse path weight
         float4*     weights_g;  // per-entry glossy path weight
         uint2*      pixels;     // per-entry (PixelInfo::packed, CacheInfo::packed) pair
         uint32*     size;       // entry counter, advanced through cugar::cuda::warp_increment

         // Append a new reference to the queue.
         //
         // \param pixel         the pixel descriptor (stored in packed form)
         // \param cache_slot    the cache cell descriptor (stored in packed form)
         // \param weight_d      the diffuse path weight
         // \param weight_g      the glossy path weight
         //
         // NOTE(review): no capacity check is performed here; the arrays are presumably
         // sized by the caller to hold every possible entry - confirm against alloc_queues.
         FERMAT_DEVICE
         void warp_append(const PixelInfo pixel, const PSFPTVertexProcessor::CacheInfo cache_slot, const float4 weight_d, const float4 weight_g)
         {
             // obtain the index of the entry to write from the shared counter
             const uint32 entry = cugar::cuda::warp_increment(size);

             // store the packed pixel / cache cell pair
             pixels[entry] = make_uint2(pixel.packed, cache_slot.packed);

             // store the two path weights
             weights_g[entry] = weight_g;
             weights_d[entry] = weight_d;
         }
     };

     // the internal path tracing context
     //
     // Extends the base path tracing context (specialized on PSFPTOptions) and its queues
     // with the PSF-specific state: the reference queue, the PSF cache (hash map + values)
     // and the direct-lighting sampler selected by the template parameter.
     // The context is passed by value to psf_blending_kernel.
     //
     template <typename TDirectLightingSampler>
     struct PSFPTContext : PTContextBase<PSFPTOptions>, PTContextQueues
     {
         PSFRefQueue ref_queue;      // queue of PSF cell references consumed by psf_blending_kernel

         HashMap     psf_hashmap;    // hash map of the PSF cache, built in render_pass from m_psf_hash
         float4*     psf_values;     // PSF cache values; psf_blending_kernel divides each value by its .w component

         TDirectLightingSampler dl;  // direct-lighting sampler (DirectLightingRL or DirectLightingMesh in render_pass)
     };

     // Initialize the clustered RL storage associated to the mesh VTLs.
     //
     // \param vtls_rl       the RL storage to initialize
     // \param mesh_vtls     the mesh VTLs providing the BVH cluster count and cluster offsets
     //
     void init(ClusteredRLStorage* vtls_rl, const MeshVTLStorage* mesh_vtls)
     {
         const MeshVTLStorage& vtls = *mesh_vtls;

         vtls_rl->init( VTL_RL_HASH_SIZE, vtls.get_bvh_clusters_count(), vtls.get_bvh_cluster_offsets() );
     }
     // Initialize the adaptive clustered RL storage associated to the mesh VTLs.
     //
     // Besides the cluster count and offsets, the adaptive variant also receives the
     // BVH nodes, parents, ranges and clusters of the mesh VTLs.
     //
     // \param vtls_rl       the RL storage to initialize
     // \param mesh_vtls     the mesh VTLs providing the BVH data
     //
     void init(AdaptiveClusteredRLStorage* vtls_rl, const MeshVTLStorage* mesh_vtls)
     {
         const MeshVTLStorage& vtls = *mesh_vtls;

         vtls_rl->init(
             VTL_RL_HASH_SIZE,
             vtls.get_bvh_nodes(),
             vtls.get_bvh_parents(),
             vtls.get_bvh_ranges(),
             vtls.get_bvh_clusters_count(),
             vtls.get_bvh_clusters(),
             vtls.get_bvh_cluster_offsets() );
     }

     // The kernel blending/splatting PSF references into the framebuffer.
     //
     // Launch layout: 1D grid of 1D blocks, one thread per reference queue entry; threads
     // whose global index is not smaller than in_queue_size do nothing.
     //
     // For each valid reference, the cached PSF value is normalized by its w component,
     // modulated by the path weights of the components present in the pixel descriptor,
     // and added to the composited, diffuse and specular framebuffer channels.
     //
     // \param in_queue_size     number of entries in context.ref_queue
     // \param context           the path tracing context (reference queue, PSF values, options)
     // \param renderer          the rendering context view providing the framebuffer
     // \param frame_weight      the weight passed to add_in for this frame
     //
     template <typename TDirectLightingSampler>
     __global__
     void psf_blending_kernel(const uint32 in_queue_size, PSFPTContext<TDirectLightingSampler> context, RenderingContextView renderer, const float frame_weight)
     {
         const uint32 thread_id = threadIdx.x + blockIdx.x * blockDim.x;

         // guard the tail of the grid
         if (thread_id >= in_queue_size)
             return;

         typedef PSFPTVertexProcessor::CacheInfo CacheInfo;

         // fetch a reference from the ref queue: .x is the packed pixel info, .y the packed cache info
         const uint2           ref        = context.ref_queue.pixels[thread_id];
         const PixelInfo       pixel_info = ref.x;
         const CacheInfo       cache_info = ref.y;
         const cugar::Vector4f w_d = context.ref_queue.weights_d[thread_id];
         const cugar::Vector4f w_g = context.ref_queue.weights_g[thread_id];

         // check if it's valid
         if (!cache_info.is_valid())
             return;

         // dereference the hashmap cell
         const uint32 cache_slot = cache_info.pixel;

         cugar::Vector4f cache_value = context.psf_values[cache_slot];

         // a cell with no accumulated weight cannot be normalized: dividing by zero would
         // splat NaN/Inf values into the framebuffer, so skip it (this also rejects a NaN weight)
         if (!(cache_value.w > 0.0f))
             return;

         cache_value /= cache_value.w; // normalize

         const bool has_diffuse = (pixel_info.comp & Bsdf::kDiffuseMask) ? true : false;
         const bool has_glossy  = (pixel_info.comp & Bsdf::kGlossyMask)  ? true : false;

         // compute the total weight
         const cugar::Vector3f w =
             (has_diffuse ? w_d.xyz() : cugar::Vector3f(0.0f)) +
             (has_glossy  ? w_g.xyz() : cugar::Vector3f(0.0f));

         // add to the composited framebuffer, clamping the contribution with the firefly filter
         add_in<false>(renderer.fb(FBufferDesc::COMPOSITED_C), pixel_info.pixel, cugar::min( cache_value.xyz() * w, context.options.firefly_filter ), frame_weight);

         // add to the diffuse channel, if the diffuse component is present
         if (has_diffuse)
             add_in<true>(renderer.fb(FBufferDesc::DIFFUSE_C),     pixel_info.pixel, cache_value.xyz() * w_d.xyz(), frame_weight);

         // add to the glossy channel, if the glossy component is present
         if (has_glossy)
             add_in<true>(renderer.fb(FBufferDesc::SPECULAR_C),    pixel_info.pixel, cache_value.xyz() * w_g.xyz(), frame_weight);
     }

     // Dispatch the blending kernel over all the entries of the reference queue.
     //
     // The launch is skipped altogether when the queue is empty. No synchronization or
     // error checking is performed here: callers are expected to do it after the call.
     //
     // \param in_queue_size     number of entries in context.ref_queue
     // \param context           the path tracing context
     // \param renderer          the rendering context view providing the framebuffer
     //
     template <typename TDirectLightingSampler>
     void psf_blending(const uint32 in_queue_size, PSFPTContext<TDirectLightingSampler> context, RenderingContextView renderer)
     {
         // nothing to blend
         if (in_queue_size == 0)
             return;

         // one thread per queue entry, rounding the number of blocks up
         const uint32 threads_per_block = 128;
         const dim3   n_blocks(cugar::divide_ri(in_queue_size, threads_per_block));

         // weight of the current frame in the progressive average
         const float frame_weight = 1.0f / float(renderer.instance + 1);

         psf_blending_kernel <<< n_blocks, threads_per_block >>> (in_queue_size, context, renderer, frame_weight);
     }

     // Alloc all internal queues out of the given arena.
     //
     // The ray queues are allocated by the global alloc_queues() overload; the PSF
     // reference queue is then given n_pixels * (max_path_length + 1) entries
     // (presumably one per pixel per path vertex - confirm against the vertex processor)
     // plus a single counter.
     //
     // \param options           the renderer options (max_path_length is used to size the queue)
     // \param n_pixels          the number of pixels
     // \param input_queue       the input ray queue
     // \param scatter_queue     the scattering ray queue
     // \param shadow_queue      the shadow ray queue
     // \param ref_queue         the PSF reference queue
     // \param arena             the memory arena to allocate from
     //
     void alloc_queues(
         PSFPTOptions            options,
         const uint32            n_pixels,
         PTRayQueue&             input_queue,
         PTRayQueue&             scatter_queue,
         PTRayQueue&             shadow_queue,
         PSFRefQueue&            ref_queue,
         cugar::memory_arena&    arena)
     {
         ::alloc_queues( options, n_pixels, input_queue, scatter_queue, shadow_queue, arena );

         // compute the entry count in 64 bits: the arena takes a 64-bit size, and a 32-bit
         // product would silently wrap around for large resolutions / path lengths,
         // resulting in undersized buffers
         const uint64 n_refs = uint64(n_pixels) * (uint64(options.max_path_length) + 1u);

         ref_queue.weights_d     = arena.alloc<float4>(n_refs);
         ref_queue.weights_g     = arena.alloc<float4>(n_refs);
         ref_queue.pixels        = arena.alloc<uint2>(n_refs);
         ref_queue.size          = arena.alloc<uint32>(1);
     }

 } // anonymous namespace

 // Construct the renderer: set up the random number generators and allocate the
 // mesh VTL storage and the VTL RL storage objects.
 PSFPT::PSFPT() :
     m_generator(32, cugar::LFSRGeneratorMatrix::GOOD_PROJECTIONS),
     m_random(&m_generator, 1u, 1351u)
 {
     // these objects are only created here: they get initialized in init(), and only
     // when the RL next-event estimation algorithm is selected.
     // NOTE(review): the matching deallocation is not visible in this file - confirm
     // it is handled by the destructor.
     m_mesh_vtls = new MeshVTLStorage;
     m_vtls_rl = new VTLRLStorage;
 }

 // Initialize the renderer.
 //
 // Parses the command line options, prints the resulting settings, allocates the PSF
 // cache and the memory pool used for the queues, sets up the sampler, initializes the
 // mesh lights, computes the scene bounding box and, if the RL next-event estimation
 // algorithm is selected, initializes the mesh VTLs and their RL storage.
 //
 // \param argc          the number of command line options
 // \param argv          the command line options
 // \param renderer      the rendering context
 //
 void PSFPT::init(int argc, char** argv, RenderingContext& renderer)
 {
     const uint2 res = renderer.res();
     const uint32 n_pixels = res.x * res.y;

     // parse the options
     m_options.parse(argc, argv);

     // names of the NEE algorithms, indexed by m_options.nee_type
     const char* nee_alg[] = { "mesh", "vpl", "rl" };

     // guard the table lookup against an out-of-range algorithm id, which would
     // otherwise read past the end of the table
     const uint32 n_nee_algs   = uint32(sizeof(nee_alg) / sizeof(nee_alg[0]));
     const char*  nee_alg_name = uint32(m_options.nee_type) < n_nee_algs ? nee_alg[ m_options.nee_type ] : "unknown";

     fprintf(stderr, "  PSFPT settings:\n");
     fprintf(stderr, "    path-length     : %u\n", m_options.max_path_length);
     fprintf(stderr, "    direct-nee      : %u\n", m_options.direct_lighting_nee ? 1 : 0);
     fprintf(stderr, "    direct-bsdf     : %u\n", m_options.direct_lighting_bsdf ? 1 : 0);
     fprintf(stderr, "    indirect-nee    : %u\n", m_options.indirect_lighting_nee ? 1 : 0);
     fprintf(stderr, "    indirect-bsdf   : %u\n", m_options.indirect_lighting_bsdf ? 1 : 0);
     fprintf(stderr, "    visible-lights  : %u\n", m_options.visible_lights ? 1 : 0);
     fprintf(stderr, "    direct lighting : %u\n", m_options.direct_lighting ? 1 : 0);
     fprintf(stderr, "    diffuse         : %u\n", m_options.diffuse_scattering ? 1 : 0);
     fprintf(stderr, "    glossy          : %u\n", m_options.glossy_scattering ? 1 : 0);
     fprintf(stderr, "    indirect glossy : %u\n", m_options.indirect_glossy ? 1 : 0);
     fprintf(stderr, "    RR              : %u\n", m_options.rr ? 1 : 0);
     fprintf(stderr, "    nee algorithm   : %s\n", nee_alg_name);
     fprintf(stderr, "    filter width    : %f\n", m_options.psf_width);
     fprintf(stderr, "    filter depth    : %u\n", m_options.psf_depth);
     fprintf(stderr, "    filter min-dist : %f\n", m_options.psf_min_dist);
     fprintf(stderr, "    firefly filter  : %f\n", m_options.firefly_filter);

     // allocate the PSF cache storage
     m_psf_hash.resize(HASH_SIZE);
     m_psf_values.alloc(HASH_SIZE);

     // pre-alloc queue storage
     {
         // determine how much storage we will need: run the very same allocation
         // sequence used by render_pass on a default-constructed arena, and read back
         // its final size
         cugar::memory_arena arena;

         PTRayQueue  input_queue;
         PTRayQueue  scatter_queue;
         PTRayQueue  shadow_queue;
         PSFRefQueue ref_queue;

         alloc_queues(
             m_options,
             n_pixels,
             input_queue,
             scatter_queue,
             shadow_queue,
             ref_queue,
             arena );

         // alloc space for device timers
         arena.alloc<int64>( 16 );

         fprintf(stderr, "  allocating queue storage: %.1f MB\n", float(arena.size) / (1024*1024));
         m_memory_pool.alloc(arena.size);
     }

     // build the set of shifts
     const uint32 n_dimensions = 6 * (m_options.max_path_length + 1);
     fprintf(stderr, "  initializing sampler: %u dimensions\n", n_dimensions);
     m_sequence.setup(n_dimensions, SHIFT_RES);

     const uint32 n_light_paths = n_pixels;

     fprintf(stderr, "  creating mesh lights... started\n");

     // initialize the mesh lights sampler
     renderer.get_mesh_lights().init( n_light_paths, renderer, 0u );

     fprintf(stderr, "  creating mesh lights... done\n");

     // compute the scene bbox
     m_bbox = renderer.compute_bbox();

     // disable smart algorithms if there are no emissive surfaces
     if (renderer.get_mesh_lights().get_vpl_count() == 0)
         m_options.nee_type = NEE_ALGORITHM_MESH;

     if (m_options.nee_type == NEE_ALGORITHM_RL)
     {
         fprintf(stderr, "  creating mesh VTLs... started\n");
         m_mesh_vtls->init(n_light_paths, renderer, 0u );
         fprintf(stderr, "  creating mesh VTLs... done (%u VTLs, %u clusters)\n", m_mesh_vtls->get_vtl_count(), m_mesh_vtls->get_bvh_clusters_count());

         fprintf(stderr, "  initializing VTLs RL... started\n");
         ::init( m_vtls_rl, m_mesh_vtls );
         fprintf(stderr, "  initializing VTLs RL... done (%.1f MB)\n", m_vtls_rl->needed_bytes(VTL_RL_HASH_SIZE, m_mesh_vtls->get_bvh_clusters_count()) / float(1024*1024));
     }
 }

 // Render a frame.
 //
 // \param instance      the frame instance
 // \param renderer      the rendering context
 //
 void PSFPT::render(const uint32 instance, RenderingContext& renderer)
 {
     // upper bound applied to the framebuffer contents at the end of the frame:
     // a reasonably high value, just to avoid outrageous fireflies
     const float max_frame_value = 100.0f;

     // pre-multiply the previous frame for blending
     renderer.rescale_frame( instance );

     // only the final pass is executed: the presample pass is currently disabled
     //render_pass( instance, renderer, PSFPT::kPresamplePass );
     render_pass( instance, renderer, PSFPT::kFinalPass );

     // update the variance estimates
     renderer.update_variances( instance );

     // clamp the framebuffer contents
     renderer.clamp_frame( max_frame_value );
 }

 // Execute a single rendering pass.
 //
 // Carves all the queues out of the pre-allocated memory pool, sets up a path tracing
 // context using either the RL or the mesh-based direct-lighting sampler, runs the path
 // tracing loop and, for the final pass, blends the queued PSF references into the
 // framebuffer. Timing statistics are printed to stderr and accumulated in m_stats.
 //
 // \param instance      the frame instance
 // \param renderer      the rendering context
 // \param pass_type     the pass type: PSF references are blended in for kFinalPass only
 //
 void PSFPT::render_pass(const uint32 instance, RenderingContext& renderer, const PassType pass_type)
 {
     //fprintf(stderr, "render started (%u)\n", instance);
     const uint2 res = renderer.res();
     const uint32 n_pixels = res.x * res.y;

     // carve an arena out of the pre-allocated memory pool
     cugar::memory_arena arena( m_memory_pool.ptr() );

     // alloc all the queues
     PTRayQueue  input_queue;
     PTRayQueue  scatter_queue;
     PTRayQueue  shadow_queue;
     PSFRefQueue ref_queue;

     alloc_queues(
         m_options,
         n_pixels,
         input_queue,
         scatter_queue,
         shadow_queue,
         ref_queue,
         arena );

     // fetch a view of the renderer
     RenderingContextView renderer_view = renderer.view(instance);

     // instantiate our vertex processor
     PSFPTVertexProcessor vertex_processor( m_options.firefly_filter );

     // alloc space for device timers
     uint64* device_timers = arena.alloc<uint64>( 16 );

     cugar::Timer timer;
     timer.start();

     PTLoopStats stats;

     if (m_options.nee_type == NEE_ALGORITHM_RL)
     {
         if ((instance % 32) == 0)
         {
             // clear the RL hash tables after a bunch of iterations to avoid overflow...
             m_vtls_rl->clear();
         }
         else
         {
             // update the vtl cdfs
             m_vtls_rl->update();
             CUDA_CHECK(cugar::cuda::sync_and_check_error("vtl-rl update"));
         }
     }

     // decide whether the shading cache must be reset on this frame; a reuse period of
     // zero is treated as one (i.e. clear on every frame) to avoid a modulo by zero
     const uint32 psf_reuse_period = m_options.psf_temporal_reuse ? m_options.psf_temporal_reuse : 1u;
     const bool   clear_psf_cache  = (instance % psf_reuse_period) == 0;

     // setup the samples for this frame
     m_sequence.set_instance(instance);
     {
         // use the RL direct-lighting sampler
         if (m_options.nee_type == NEE_ALGORITHM_RL)
         {
             PSFPTContext<DirectLightingRL> context;
             context.options         = m_options;
             context.in_bounce       = 0;
             context.in_queue        = input_queue;
             context.scatter_queue   = scatter_queue;
             context.shadow_queue    = shadow_queue;
             context.sequence        = m_sequence.view();
             context.frame_weight    = 1.0f / float(renderer_view.instance + 1);
             context.device_timers   = device_timers;
             context.bbox            = m_bbox;
             context.dl              = DirectLightingRL(
                 view( *m_vtls_rl ),
                 m_mesh_vtls->view() );
             context.ref_queue       = ref_queue;
             context.psf_hashmap     = HashMap(
                 HASH_SIZE,
                 m_psf_hash.m_keys.ptr(),
                 m_psf_hash.m_unique.ptr(),
                 m_psf_hash.m_slots.ptr(),
                 m_psf_hash.m_size.ptr()
             );
             context.psf_values = m_psf_values.ptr();

             // initialize the shading cache
             if (clear_psf_cache)
                 m_psf_hash.clear();

             // reset the reference queue size
             CUDA_CHECK(cudaMemset(context.ref_queue.size, 0x00, sizeof(uint32)));
             CUDA_CHECK(cugar::cuda::sync_and_check_error("clear reference queue"));

             // perform the actual path tracing
             path_trace_loop( context, vertex_processor, renderer, renderer_view, stats );

             // blend-in the PSF references
             if (pass_type == PSFPT::kFinalPass)
             {
                 // read back the number of queued references; the value is initialized so
                 // that it is never used uninitialized, and the copy is checked
                 uint32 ref_queue_size = 0;
                 CUDA_CHECK(cudaMemcpy(&ref_queue_size, context.ref_queue.size, sizeof(uint32), cudaMemcpyDeviceToHost));

                 psf_blending(ref_queue_size, context, renderer_view);
                 CUDA_CHECK(cugar::cuda::sync_and_check_error("psf blending"));
             }
         }
         else // use the regular mesh emitter direct-lighting sampler
         {
             // select which instantiation of the mesh light to use (VPLs or the plain mesh)
             MeshLight mesh_light = m_options.nee_type == NEE_ALGORITHM_VPL ? renderer_view.mesh_vpls : renderer_view.mesh_light;

             PSFPTContext<DirectLightingMesh> context;
             context.options         = m_options;
             context.in_bounce       = 0;
             context.in_queue        = input_queue;
             context.scatter_queue   = scatter_queue;
             context.shadow_queue    = shadow_queue;
             context.sequence        = m_sequence.view();
             context.frame_weight    = 1.0f / float(renderer_view.instance + 1);
             context.device_timers   = device_timers;
             context.bbox            = m_bbox;
             context.dl              = DirectLightingMesh( mesh_light );
             context.ref_queue       = ref_queue;
             context.psf_hashmap     = HashMap(
                 HASH_SIZE,
                 m_psf_hash.m_keys.ptr(),
                 m_psf_hash.m_unique.ptr(),
                 m_psf_hash.m_slots.ptr(),
                 m_psf_hash.m_size.ptr()
             );
             context.psf_values = m_psf_values.ptr();

             // initialize the shading cache
             if (clear_psf_cache)
                 m_psf_hash.clear();

             // reset the reference queue size
             CUDA_CHECK(cudaMemset(context.ref_queue.size, 0x00, sizeof(uint32)));
             CUDA_CHECK(cugar::cuda::sync_and_check_error("clear reference queue"));

             // perform the actual path tracing
             path_trace_loop( context, vertex_processor, renderer, renderer_view, stats );

             // blend-in the PSF references
             if (pass_type == PSFPT::kFinalPass)
             {
                 // read back the number of queued references; the value is initialized so
                 // that it is never used uninitialized, and the copy is checked
                 uint32 ref_queue_size = 0;
                 CUDA_CHECK(cudaMemcpy(&ref_queue_size, context.ref_queue.size, sizeof(uint32), cudaMemcpyDeviceToHost));

                 psf_blending(ref_queue_size, context, renderer_view);
                 CUDA_CHECK(cugar::cuda::sync_and_check_error("psf blending"));
             }
         }
     }
     timer.stop();
     const float time = timer.seconds();
     // clear the global timer at instance zero
     if (instance == 0)
         m_time = time;
     else
         m_time += time;

     fprintf(stderr, "\r  %.1fs (%.1fms = rt[%.1fms + %.1fms + %.1fms] + shade[%.1fms + %.1fms] - %uK cells)        ",
         m_time,
         time * 1000.0f,
         stats.primary_rt_time * 1000.0f,
         stats.path_rt_time * 1000.0f,
         stats.shadow_rt_time * 1000.0f,
         stats.path_shade_time * 1000.0f,
         stats.shadow_shade_time * 1000.0f,
         m_psf_hash.size() / 1000);

 #if defined(DEVICE_TIMING) && DEVICE_TIMING
     if (instance % 64 == 0)
         print_timer_stats( device_timers, stats );
 #endif

     if (instance) // skip the first frame
     {
         m_stats.primary_rt_time += stats.primary_rt_time;
         m_stats.path_rt_time += stats.path_rt_time;
         m_stats.shadow_rt_time += stats.shadow_rt_time;
         m_stats.path_shade_time += stats.path_shade_time;
         m_stats.shadow_shade_time += stats.shadow_shade_time;
     }
 }
DirectLightingRL
Definition: direct_lighting_rl.h:45

PSFPT::init
void init(int argc, char **argv, RenderingContext &renderer)
Definition: psfpt_impl.h:196

PTContextQueues
Definition: pathtracer_kernels.h:49

PTLoopStats::shadow_shade_time
float shadow_shade_time
time spent for shading shadow samples (i.e. in solve_occlusion)
Definition: pathtracer_kernels.h:303

cugar::cuda::warp_increment
__device__ __forceinline__ unsigned int warp_increment(unsigned int *ptr)
Definition: warp_atomics.h:56

RenderingContext::update_variances
void update_variances(const uint32 instance)

cugar::Timer::start
void start()
start timing
Definition: timer.cpp:116

MeshLight
Definition: lights.h:299

MeshVTLStorage
Definition: mesh_lights.h:59

DirectLightingMesh
Definition: direct_lighting_mesh.h:41

PSFPTVertexProcessor
Definition: psfpt_vertex_processor.h:49

AdaptiveClusteredRLStorage
Definition: clustered_rl.h:161

cugar::memory_arena::alloc
CUGAR_HOST_DEVICE T * alloc(const uint64 sz, const uint64 alignment=sizeof(T))
Definition: memory_arena.h:69

cugar::Timer
Definition: timer.h:83

cugar::divide_ri
CUGAR_HOST_DEVICE L divide_ri(const L x, const R y)
Definition: numbers.h:180

PTLoopStats
Definition: pathtracer_kernels.h:284

PSFPT::render
void render(const uint32 instance, RenderingContext &renderer)
Definition: psfpt_impl.h:287

PTLoopStats::path_rt_time
float path_rt_time
time spent for tracing scattering rays
Definition: pathtracer_kernels.h:300

RenderingContext::get_mesh_lights
MeshLightsStorage & get_mesh_lights()

PSFPTOptions
Definition: psfpt.h:350

PTLoopStats::shadow_rt_time
float shadow_rt_time
time spent for tracing shadow rays
Definition: pathtracer_kernels.h:301

cugar::cuda::SyncFreeHashMap< uint64, uint32, 0xFFFFFFFFFFFFFFFFllu >

RenderingContext::clamp_frame
void clamp_frame(const float max_value)

PTLoopStats::primary_rt_time
float primary_rt_time
time spent for tracing primary rays
Definition: pathtracer_kernels.h:299

PSFPT::render_pass
void render_pass(const uint32 instance, RenderingContext &renderer, const PassType pass_type)
Definition: psfpt_impl.h:301

PTContextBase
Definition: pathtracer_core.h:570

cugar::Vector< float, 4 >

RenderingContext::view
RenderingContextView view(const uint32 instance)

cugar
Define a vector_view POD type and plain_view() for std::vector.
Definition: diff.h:38

ClusteredRLStorage
Definition: clustered_rl.h:87

RenderingContext::res
uint2 res() const

path_trace_loop
void path_trace_loop(TPTContext &context, TPTVertexProcessor &vertex_processor, RenderingContext &renderer, RenderingContextView &renderer_view, PTLoopStats &stats)
Definition: pathtracer_kernels.h:310

RenderingContext
Definition: renderer.h:52

alloc_queues
void alloc_queues(PTOptions options, const uint32 n_pixels, PTRayQueue &input_queue, PTRayQueue &scatter_queue, PTRayQueue &shadow_queue, cugar::memory_arena &arena)
Definition: pathtracer_kernels.h:91

RenderingContext::rescale_frame
void rescale_frame(const uint32 instance)

PixelInfo
Definition: pathtracer_core.h:527

cugar::memory_arena
Definition: memory_arena.h:44

RenderingContextView
Definition: renderer_view.h:80

PTRayQueue
Definition: pathtracer_queues.h:44

PSFPTVertexProcessor::CacheInfo
[PSFPTVertexProcessor::CacheInfo]
Definition: psfpt_vertex_processor.h:56

RenderingContext::compute_bbox
cugar::Bbox3f compute_bbox()

PTLoopStats::path_shade_time
float path_shade_time
time spent for shading path vertices
Definition: pathtracer_kernels.h:302