#pragma once
#include <psfpt.h>
#include <renderer.h>
#include <rt.h>
#include <mesh/MeshStorage.h>
#include <cugar/basic/timer.h>
#include <cugar/basic/primitives.h>
#include <cugar/basic/memory_arena.h>
#include <pathtracer_core.h>
#include <pathtracer_queues.h>
#include <pathtracer_kernels.h>
#include <psfpt_vertex_processor.h>
#define SHIFT_RES 256u
#define HASH_SIZE (64u * 1024u * 1024u)
namespace {
struct PSFRefQueue
{
float4* weights_d;
float4* weights_g;
uint2* pixels;
uint32* size;
FERMAT_DEVICE
{
weights_d[slot] = weight_d;
weights_g[slot] = weight_g;
pixels[slot] = make_uint2(pixel.packed, cache_slot.packed);
}
};
template <typename TDirectLightingSampler>
{
PSFRefQueue ref_queue;
HashMap psf_hashmap;
float4* psf_values;
TDirectLightingSampler dl;
};
{
vtls_rl->init(
VTL_RL_HASH_SIZE,
mesh_vtls->get_bvh_clusters_count(),
mesh_vtls->get_bvh_cluster_offsets());
}
{
vtls_rl->init(
VTL_RL_HASH_SIZE,
mesh_vtls->get_bvh_nodes(),
mesh_vtls->get_bvh_parents(),
mesh_vtls->get_bvh_ranges(),
mesh_vtls->get_bvh_clusters_count(),
mesh_vtls->get_bvh_clusters(),
mesh_vtls->get_bvh_cluster_offsets());
}
template <typename TDirectLightingSampler>
__global__
void psf_blending_kernel(
const uint32 in_queue_size, PSFPTContext<TDirectLightingSampler> context,
RenderingContextView renderer,
const float frame_weight)
{
const uint32 thread_id = threadIdx.x + blockIdx.x * blockDim.x;
if (thread_id < in_queue_size)
{
const PixelInfo pixel_info = context.ref_queue.pixels[thread_id].x;
const CacheInfo cache_info = context.ref_queue.pixels[thread_id].y;
if (cache_info.is_valid())
{
const uint32 cache_slot = cache_info.pixel;
cache_value /= cache_value.w;
((pixel_info.comp & Bsdf::kDiffuseMask) ? w_d.xyz() :
cugar::Vector3f(0.0f)) +
((pixel_info.comp & Bsdf::kGlossyMask) ? w_g.xyz() :
cugar::Vector3f(0.0f));
add_in<false>(renderer.fb(FBufferDesc::COMPOSITED_C), pixel_info.pixel, cugar::min( cache_value.xyz() * w, context.options.firefly_filter ), frame_weight);
if (pixel_info.comp & Bsdf::kDiffuseMask)
add_in<true>(renderer.fb(FBufferDesc::DIFFUSE_C), pixel_info.pixel, cache_value.xyz() * w_d.xyz(), frame_weight);
if (pixel_info.comp & Bsdf::kGlossyMask)
add_in<true>(renderer.fb(FBufferDesc::SPECULAR_C), pixel_info.pixel, cache_value.xyz() * w_g.xyz(), frame_weight);
}
}
}
template <typename TDirectLightingSampler>
void psf_blending(
const uint32 in_queue_size, PSFPTContext<TDirectLightingSampler> context,
RenderingContextView renderer)
{
if (!in_queue_size)
return;
const uint32 blockSize(128);
psf_blending_kernel << < gridSize, blockSize >> > (in_queue_size, context, renderer, 1.0f / float(renderer.instance + 1));
}
const uint32 n_pixels,
PSFRefQueue& ref_queue,
{
::alloc_queues( options, n_pixels, input_queue, scatter_queue, shadow_queue, arena );
ref_queue.weights_d = arena.
alloc<float4>(n_pixels * (options.max_path_length + 1));
ref_queue.weights_g = arena.
alloc<float4>(n_pixels * (options.max_path_length + 1));
ref_queue.pixels = arena.
alloc<uint2>(n_pixels * (options.max_path_length + 1));
ref_queue.size = arena.
alloc<uint32>(1);
}
}
PSFPT::PSFPT() :
m_generator(32,
cugar::LFSRGeneratorMatrix::GOOD_PROJECTIONS),
m_random(&m_generator, 1u, 1351u)
{
m_vtls_rl = new VTLRLStorage;
}
{
const uint2 res = renderer.
res();
const uint32 n_pixels = res.x * res.y;
m_options.
parse(argc, argv);
const char* nee_alg[] = { "mesh", "vpl", "rl" };
fprintf(stderr, " PSFPT settings:\n");
fprintf(stderr, " path-length : %u\n", m_options.max_path_length);
fprintf(stderr, " direct-nee : %u\n", m_options.direct_lighting_nee ? 1 : 0);
fprintf(stderr, " direct-bsdf : %u\n", m_options.direct_lighting_bsdf ? 1 : 0);
fprintf(stderr, " indirect-nee : %u\n", m_options.indirect_lighting_nee ? 1 : 0);
fprintf(stderr, " indirect-bsdf : %u\n", m_options.indirect_lighting_bsdf ? 1 : 0);
fprintf(stderr, " visible-lights : %u\n", m_options.visible_lights ? 1 : 0);
fprintf(stderr, " direct lighting : %u\n", m_options.direct_lighting ? 1 : 0);
fprintf(stderr, " diffuse : %u\n", m_options.diffuse_scattering ? 1 : 0);
fprintf(stderr, " glossy : %u\n", m_options.glossy_scattering ? 1 : 0);
fprintf(stderr, " indirect glossy : %u\n", m_options.indirect_glossy ? 1 : 0);
fprintf(stderr, " RR : %u\n", m_options.rr ? 1 : 0);
fprintf(stderr, " nee algorithm : %s\n", nee_alg[ m_options.nee_type ]);
fprintf(stderr, " filter width : %f\n", m_options.psf_width);
fprintf(stderr, " filter depth : %u\n", m_options.psf_depth);
fprintf(stderr, " filter min-dist : %f\n", m_options.psf_min_dist);
fprintf(stderr, " firefly filter : %f\n", m_options.firefly_filter);
m_psf_hash.resize(HASH_SIZE);
m_psf_values.alloc(HASH_SIZE);
{
PSFRefQueue ref_queue;
m_options,
n_pixels,
input_queue,
scatter_queue,
shadow_queue,
ref_queue,
arena );
arena.
alloc<int64>( 16 );
fprintf(stderr, " allocating queue storage: %.1f MB\n", float(arena.size) / (1024*1024));
m_memory_pool.alloc(arena.size);
}
const uint32 n_dimensions = 6 * (m_options.max_path_length + 1);
fprintf(stderr, " initializing sampler: %u dimensions\n", n_dimensions);
m_sequence.setup(n_dimensions, SHIFT_RES);
const uint32 n_light_paths = n_pixels;
fprintf(stderr, " creating mesh lights... started\n");
fprintf(stderr, " creating mesh lights... done\n");
m_options.nee_type = NEE_ALGORITHM_MESH;
if (m_options.nee_type == NEE_ALGORITHM_RL)
{
fprintf(stderr, " creating mesh VTLs... started\n");
m_mesh_vtls->init(n_light_paths, renderer, 0u );
fprintf(stderr, " creating mesh VTLs... done (%u VTLs, %u clusters)\n", m_mesh_vtls->get_vtl_count(), m_mesh_vtls->get_bvh_clusters_count());
fprintf(stderr, " initializing VTLs RL... started\n");
::init( m_vtls_rl, m_mesh_vtls );
fprintf(stderr, " initializing VTLs RL... done (%.1f MB)\n", m_vtls_rl->needed_bytes(VTL_RL_HASH_SIZE, m_mesh_vtls->get_bvh_clusters_count()) / float(1024*1024));
}
}
{
render_pass( instance, renderer, PSFPT::kFinalPass );
}
{
const uint2 res = renderer.
res();
const uint32 n_pixels = res.x * res.y;
PSFRefQueue ref_queue;
m_options,
n_pixels,
input_queue,
scatter_queue,
shadow_queue,
ref_queue,
arena );
uint64* device_timers = arena.
alloc<uint64>( 16 );
if (m_options.nee_type == NEE_ALGORITHM_RL)
{
if ((instance % 32) == 0)
{
m_vtls_rl->clear();
}
else
{
m_vtls_rl->update();
CUDA_CHECK(cugar::cuda::sync_and_check_error("vtl-rl update"));
}
}
m_sequence.set_instance(instance);
{
if (m_options.nee_type == NEE_ALGORITHM_RL)
{
PSFPTContext<DirectLightingRL> context;
context.options = m_options;
context.in_bounce = 0;
context.in_queue = input_queue;
context.scatter_queue = scatter_queue;
context.shadow_queue = shadow_queue;
context.sequence = m_sequence.view();
context.frame_weight = 1.0f / float(renderer_view.instance + 1);
context.device_timers = device_timers;
context.bbox = m_bbox;
view( *m_vtls_rl ),
m_mesh_vtls->view() );
context.ref_queue = ref_queue;
context.psf_hashmap = HashMap(
HASH_SIZE,
m_psf_hash.m_keys.ptr(),
m_psf_hash.m_unique.ptr(),
m_psf_hash.m_slots.ptr(),
m_psf_hash.m_size.ptr()
);
context.psf_values = m_psf_values.ptr();
if ((instance % m_options.psf_temporal_reuse) == 0)
m_psf_hash.clear();
cudaMemset(context.ref_queue.size, 0x00, sizeof(uint32));
CUDA_CHECK(cugar::cuda::sync_and_check_error("clear reference queue"));
path_trace_loop( context, vertex_processor, renderer, renderer_view, stats );
if (pass_type == PSFPT::kFinalPass)
{
uint32 ref_queue_size;
cudaMemcpy(&ref_queue_size, context.ref_queue.size, sizeof(uint32), cudaMemcpyDeviceToHost);
psf_blending(ref_queue_size, context, renderer_view);
CUDA_CHECK(cugar::cuda::sync_and_check_error("psf blending"));
}
}
else
{
MeshLight mesh_light = m_options.nee_type == NEE_ALGORITHM_VPL ? renderer_view.mesh_vpls : renderer_view.mesh_light;
PSFPTContext<DirectLightingMesh> context;
context.options = m_options;
context.in_bounce = 0;
context.in_queue = input_queue;
context.scatter_queue = scatter_queue;
context.shadow_queue = shadow_queue;
context.sequence = m_sequence.view();
context.frame_weight = 1.0f / float(renderer_view.instance + 1);
context.device_timers = device_timers;
context.bbox = m_bbox;
context.ref_queue = ref_queue;
context.psf_hashmap = HashMap(
HASH_SIZE,
m_psf_hash.m_keys.ptr(),
m_psf_hash.m_unique.ptr(),
m_psf_hash.m_slots.ptr(),
m_psf_hash.m_size.ptr()
);
context.psf_values = m_psf_values.ptr();
if ((instance % m_options.psf_temporal_reuse) == 0)
m_psf_hash.clear();
cudaMemset(context.ref_queue.size, 0x00, sizeof(uint32));
CUDA_CHECK(cugar::cuda::sync_and_check_error("clear reference queue"));
path_trace_loop( context, vertex_processor, renderer, renderer_view, stats );
if (pass_type == PSFPT::kFinalPass)
{
uint32 ref_queue_size;
cudaMemcpy(&ref_queue_size, context.ref_queue.size, sizeof(uint32), cudaMemcpyDeviceToHost);
psf_blending(ref_queue_size, context, renderer_view);
CUDA_CHECK(cugar::cuda::sync_and_check_error("psf blending"));
}
}
}
timer.stop();
const float time = timer.seconds();
if (instance == 0)
m_time = time;
else
m_time += time;
fprintf(stderr, "\r %.1fs (%.1fms = rt[%.1fms + %.1fms + %.1fms] + shade[%.1fms + %.1fms] - %uK cells) ",
m_time,
time * 1000.0f,
m_psf_hash.size() / 1000);
#if defined(DEVICE_TIMING) && DEVICE_TIMING
if (instance % 64 == 0)
print_timer_stats( device_timers, stats );
#endif
if (instance)
{
}
}