Merge branch 'master' into blender2.8
This commit is contained in:
@@ -195,7 +195,7 @@ endif()
|
||||
# Options
|
||||
|
||||
# First platform specific non-cached vars
|
||||
if(UNIX AND NOT APPLE)
|
||||
if(UNIX AND NOT (APPLE OR HAIKU))
|
||||
set(WITH_X11 ON)
|
||||
endif()
|
||||
|
||||
|
||||
@@ -42,7 +42,7 @@ FIND_PATH(SDL2_INCLUDE_DIR
|
||||
HINTS
|
||||
${_sdl2_SEARCH_DIRS}
|
||||
PATH_SUFFIXES
|
||||
include/SDL2 include
|
||||
include/SDL2 include SDL2
|
||||
)
|
||||
|
||||
FIND_LIBRARY(SDL2_LIBRARY
|
||||
|
||||
@@ -1519,6 +1519,7 @@ function(find_python_package
|
||||
PATH_SUFFIXES
|
||||
site-packages
|
||||
dist-packages
|
||||
vendor-packages
|
||||
NO_DEFAULT_PATH
|
||||
)
|
||||
|
||||
@@ -1529,6 +1530,8 @@ function(find_python_package
|
||||
"'${PYTHON_LIBPATH}/python${_PY_VER_MAJOR}/site-packages/${package}', "
|
||||
"'${PYTHON_LIBPATH}/python${PYTHON_VERSION}/dist-packages/${package}', "
|
||||
"'${PYTHON_LIBPATH}/python${_PY_VER_MAJOR}/dist-packages/${package}', "
|
||||
"'${PYTHON_LIBPATH}/python${PYTHON_VERSION}/vendor-packages/${package}', "
|
||||
"'${PYTHON_LIBPATH}/python${_PY_VER_MAJOR}/vendor-packages/${package}', "
|
||||
"\n"
|
||||
"The 'WITH_PYTHON_INSTALL_${_upper_package}' option will be ignored when installing Python.\n"
|
||||
"The build will be usable, only add-ons that depend on this package won't be functional."
|
||||
|
||||
@@ -358,7 +358,11 @@ if(WITH_OPENSUBDIV OR WITH_CYCLES_OPENSUBDIV)
|
||||
endif()
|
||||
|
||||
# OpenSuse needs lutil, ArchLinux not, for now keep, can avoid by using --as-needed
|
||||
list(APPEND PLATFORM_LINKLIBS -lutil -lc -lm)
|
||||
if(HAIKU)
|
||||
list(APPEND PLATFORM_LINKLIBS -lnetwork)
|
||||
else()
|
||||
list(APPEND PLATFORM_LINKLIBS -lutil -lc -lm)
|
||||
endif()
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
list(APPEND PLATFORM_LINKLIBS ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
2
extern/glog/src/config.h
vendored
2
extern/glog/src/config.h
vendored
@@ -14,4 +14,6 @@
|
||||
#include "windows/config.h"
|
||||
#elif defined(__GNU__)
|
||||
#include "config_hurd.h"
|
||||
#elif defined(__HAIKU__)
|
||||
#include "config_haiku.h"
|
||||
#endif
|
||||
|
||||
172
extern/glog/src/config_haiku.h
vendored
Normal file
172
extern/glog/src/config_haiku.h
vendored
Normal file
@@ -0,0 +1,172 @@
|
||||
/* src/config.h. Generated from config.h.in by configure. */
|
||||
/* src/config.h.in. Generated from configure.ac by autoheader. */
|
||||
|
||||
/* Namespace for Google classes */
|
||||
#define GOOGLE_NAMESPACE google
|
||||
|
||||
/* Define if you have the `dladdr' function */
|
||||
#define HAVE_DLADDR 1
|
||||
|
||||
/* Define to 1 if you have the <dlfcn.h> header file. */
|
||||
#define HAVE_DLFCN_H 1
|
||||
|
||||
/* Define to 1 if you have the <execinfo.h> header file. */
|
||||
#undef HAVE_EXECINFO_H
|
||||
|
||||
/* Define if you have the `fcntl' function */
|
||||
#define HAVE_FCNTL 1
|
||||
|
||||
/* Define to 1 if you have the <glob.h> header file. */
|
||||
#define HAVE_GLOB_H 1
|
||||
|
||||
/* Define to 1 if you have the <inttypes.h> header file. */
|
||||
#define HAVE_INTTYPES_H 1
|
||||
|
||||
/* Define to 1 if you have the `pthread' library (-lpthread). */
|
||||
#define HAVE_LIBPTHREAD 1
|
||||
|
||||
/* Define to 1 if you have the <libunwind.h> header file. */
|
||||
/* #undef HAVE_LIBUNWIND_H */
|
||||
|
||||
/* define if you have google gflags library */
|
||||
#define HAVE_LIB_GFLAGS 1
|
||||
|
||||
/* define if you have google gmock library */
|
||||
/* #undef HAVE_LIB_GMOCK */
|
||||
|
||||
/* define if you have google gtest library */
|
||||
/* #undef HAVE_LIB_GTEST */
|
||||
|
||||
/* define if you have libunwind */
|
||||
/* #undef HAVE_LIB_UNWIND */
|
||||
|
||||
/* Define to 1 if you have the <memory.h> header file. */
|
||||
#define HAVE_MEMORY_H 1
|
||||
|
||||
/* define if the compiler implements namespaces */
|
||||
#define HAVE_NAMESPACES 1
|
||||
|
||||
/* Define if you have the 'pread' function */
|
||||
#define HAVE_PREAD 1
|
||||
|
||||
/* Define if you have POSIX threads libraries and header files. */
|
||||
#define HAVE_PTHREAD 1
|
||||
|
||||
/* Define to 1 if you have the <pwd.h> header file. */
|
||||
#define HAVE_PWD_H 1
|
||||
|
||||
/* Define if you have the 'pwrite' function */
|
||||
#define HAVE_PWRITE 1
|
||||
|
||||
/* define if the compiler implements pthread_rwlock_* */
|
||||
#define HAVE_RWLOCK 1
|
||||
|
||||
/* Define if you have the `sigaltstack' function */
|
||||
#define HAVE_SIGALTSTACK 1
|
||||
|
||||
/* Define to 1 if you have the <stdint.h> header file. */
|
||||
#define HAVE_STDINT_H 1
|
||||
|
||||
/* Define to 1 if you have the <stdlib.h> header file. */
|
||||
#define HAVE_STDLIB_H 1
|
||||
|
||||
/* Define to 1 if you have the <strings.h> header file. */
|
||||
#define HAVE_STRINGS_H 1
|
||||
|
||||
/* Define to 1 if you have the <string.h> header file. */
|
||||
#define HAVE_STRING_H 1
|
||||
|
||||
/* Define to 1 if you have the <syscall.h> header file. */
|
||||
/* #undef HAVE_SYSCALL_H */
|
||||
|
||||
/* Define to 1 if you have the <syslog.h> header file. */
|
||||
#define HAVE_SYSLOG_H 1
|
||||
|
||||
/* Define to 1 if you have the <sys/stat.h> header file. */
|
||||
#define HAVE_SYS_STAT_H 1
|
||||
|
||||
/* Define to 1 if you have the <sys/syscall.h> header file. */
|
||||
/* #undef HAVE_SYS_SYSCALL_H */
|
||||
|
||||
/* Define to 1 if you have the <sys/time.h> header file. */
|
||||
#define HAVE_SYS_TIME_H 1
|
||||
|
||||
/* Define to 1 if you have the <sys/types.h> header file. */
|
||||
#define HAVE_SYS_TYPES_H 1
|
||||
|
||||
/* Define to 1 if you have the <sys/ucontext.h> header file. */
|
||||
/* #undef HAVE_SYS_UCONTEXT_H */
|
||||
|
||||
/* Define to 1 if you have the <sys/utsname.h> header file. */
|
||||
#define HAVE_SYS_UTSNAME_H 1
|
||||
|
||||
/* Define to 1 if you have the <ucontext.h> header file. */
|
||||
/* #undef HAVE_UCONTEXT_H */
|
||||
|
||||
/* Define to 1 if you have the <unistd.h> header file. */
|
||||
#define HAVE_UNISTD_H 1
|
||||
|
||||
/* define if the compiler supports using expression for operator */
|
||||
#define HAVE_USING_OPERATOR 1
|
||||
|
||||
/* define if your compiler has __attribute__ */
|
||||
#define HAVE___ATTRIBUTE__ 1
|
||||
|
||||
/* define if your compiler has __builtin_expect */
|
||||
#define HAVE___BUILTIN_EXPECT 1
|
||||
|
||||
/* define if your compiler has __sync_val_compare_and_swap */
|
||||
/* #undef HAVE___SYNC_VAL_COMPARE_AND_SWAP */
|
||||
|
||||
/* Name of package */
|
||||
#define PACKAGE "glog"
|
||||
|
||||
/* Define to the address where bug reports for this package should be sent. */
|
||||
#define PACKAGE_BUGREPORT "opensource@google.com"
|
||||
|
||||
/* Define to the full name of this package. */
|
||||
#define PACKAGE_NAME "glog"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "glog 0.3.4"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#define PACKAGE_TARNAME "glog"
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "0.3.4"
|
||||
|
||||
/* How to access the PC from a struct ucontext */
|
||||
/* #define PC_FROM_UCONTEXT uc_mcontext.gregs[REG_RIP] */
|
||||
|
||||
/* Define to necessary symbol if this constant uses a non-standard name on
|
||||
your system. */
|
||||
/* #undef PTHREAD_CREATE_JOINABLE */
|
||||
|
||||
/* The size of `void *', as computed by sizeof. */
|
||||
#define SIZEOF_VOID_P 8
|
||||
|
||||
/* Define to 1 if you have the ANSI C header files. */
|
||||
/* #undef STDC_HEADERS */
|
||||
|
||||
#define STDC_HEADERS 1
|
||||
/* the namespace where STL code like vector<> is defined */
|
||||
#define STL_NAMESPACE std
|
||||
|
||||
/* location of source code */
|
||||
#define TEST_SRC_DIR "."
|
||||
|
||||
/* Version number of package */
|
||||
#define VERSION "0.3.4"
|
||||
|
||||
/* Stops putting the code inside the Google namespace */
|
||||
#define _END_GOOGLE_NAMESPACE_ }
|
||||
|
||||
/* Puts following code inside the Google namespace */
|
||||
#define _START_GOOGLE_NAMESPACE_ namespace google {
|
||||
|
||||
/* isn't getting defined by configure script when clang compilers are used
|
||||
and cuases compilation errors in stactrace/unwind modules */
|
||||
#ifdef __clang__
|
||||
# define NO_FRAME_POINTER
|
||||
#endif
|
||||
@@ -190,9 +190,9 @@ public:
|
||||
KernelFunctions<void(*)(int, int, float*, float*, float*, float*, int*, int, int)> filter_nlm_update_output_kernel;
|
||||
KernelFunctions<void(*)(float*, float*, int*, int)> filter_nlm_normalize_kernel;
|
||||
|
||||
KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
|
||||
KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int, int)> filter_nlm_construct_gramian_kernel;
|
||||
KernelFunctions<void(*)(int, int, int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
|
||||
KernelFunctions<void(*)(float*, int, int, int, float*, int*, int*, int, int, float)> filter_construct_transform_kernel;
|
||||
KernelFunctions<void(*)(int, int, float*, float*, float*, int*, float*, float3*, int*, int*, int, int, int)> filter_nlm_construct_gramian_kernel;
|
||||
KernelFunctions<void(*)(int, int, int, float*, int*, float*, float3*, int*, int)> filter_finalize_kernel;
|
||||
|
||||
KernelFunctions<void(*)(KernelGlobals *, ccl_constant KernelData*, ccl_global void*, int, ccl_global char*,
|
||||
int, int, int, int, int, int, int, int, ccl_global int*, int,
|
||||
@@ -565,13 +565,13 @@ public:
|
||||
(float*) color_variance_ptr,
|
||||
difference,
|
||||
local_rect,
|
||||
task->buffer.w,
|
||||
task->buffer.stride,
|
||||
task->buffer.pass_stride,
|
||||
1.0f,
|
||||
task->nlm_k_2);
|
||||
filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
|
||||
filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.w, 4);
|
||||
filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.w, 4);
|
||||
filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
|
||||
filter_nlm_calc_weight_kernel()(blurDifference, difference, local_rect, task->buffer.stride, 4);
|
||||
filter_nlm_blur_kernel()(difference, blurDifference, local_rect, task->buffer.stride, 4);
|
||||
filter_nlm_construct_gramian_kernel()(dx, dy,
|
||||
blurDifference,
|
||||
(float*) task->buffer.mem.device_pointer,
|
||||
@@ -580,9 +580,8 @@ public:
|
||||
(float*) task->storage.XtWX.device_pointer,
|
||||
(float3*) task->storage.XtWY.device_pointer,
|
||||
local_rect,
|
||||
&task->reconstruction_state.filter_rect.x,
|
||||
task->buffer.w,
|
||||
task->buffer.h,
|
||||
&task->reconstruction_state.filter_window.x,
|
||||
task->buffer.stride,
|
||||
4,
|
||||
task->buffer.pass_stride);
|
||||
}
|
||||
@@ -591,8 +590,6 @@ public:
|
||||
filter_finalize_kernel()(x,
|
||||
y,
|
||||
y*task->filter_area.z + x,
|
||||
task->buffer.w,
|
||||
task->buffer.h,
|
||||
(float*) output_ptr,
|
||||
(int*) task->storage.rank.device_pointer,
|
||||
(float*) task->storage.XtWX.device_pointer,
|
||||
|
||||
@@ -1087,6 +1087,19 @@ public:
|
||||
threads, threads, 1, \
|
||||
0, 0, args, 0));
|
||||
|
||||
/* Similar as above, but for 1-dimensional blocks. */
|
||||
#define CUDA_GET_BLOCKSIZE_1D(func, w, h) \
|
||||
int threads_per_block; \
|
||||
cuda_assert(cuFuncGetAttribute(&threads_per_block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, func)); \
|
||||
int xblocks = ((w) + threads_per_block - 1)/threads_per_block; \
|
||||
int yblocks = h;
|
||||
|
||||
#define CUDA_LAUNCH_KERNEL_1D(func, args) \
|
||||
cuda_assert(cuLaunchKernel(func, \
|
||||
xblocks, yblocks, 1, \
|
||||
threads_per_block, 1, 1, \
|
||||
0, 0, args, 0));
|
||||
|
||||
bool denoising_non_local_means(device_ptr image_ptr, device_ptr guide_ptr, device_ptr variance_ptr, device_ptr out_ptr,
|
||||
DenoisingTask *task)
|
||||
{
|
||||
@@ -1095,60 +1108,65 @@ public:
|
||||
|
||||
CUDAContextScope scope(this);
|
||||
|
||||
int4 rect = task->rect;
|
||||
int w = align_up(rect.z-rect.x, 4);
|
||||
int h = rect.w-rect.y;
|
||||
int stride = task->buffer.stride;
|
||||
int w = task->buffer.width;
|
||||
int h = task->buffer.h;
|
||||
int r = task->nlm_state.r;
|
||||
int f = task->nlm_state.f;
|
||||
float a = task->nlm_state.a;
|
||||
float k_2 = task->nlm_state.k_2;
|
||||
|
||||
CUdeviceptr difference = task->nlm_state.temporary_1_ptr;
|
||||
CUdeviceptr blurDifference = task->nlm_state.temporary_2_ptr;
|
||||
CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
|
||||
|
||||
cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*w*h));
|
||||
cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*w*h));
|
||||
|
||||
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput, cuNLMNormalize;
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
|
||||
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
|
||||
|
||||
CUDA_GET_BLOCKSIZE(cuNLMCalcDifference, rect.z-rect.x, rect.w-rect.y);
|
||||
|
||||
int dx, dy;
|
||||
int4 local_rect;
|
||||
int shift_stride = stride*h;
|
||||
int num_shifts = (2*r+1)*(2*r+1);
|
||||
int mem_size = sizeof(float)*shift_stride*2*num_shifts;
|
||||
int channel_offset = 0;
|
||||
void *calc_difference_args[] = {&dx, &dy, &guide_ptr, &variance_ptr, &difference, &local_rect, &w, &channel_offset, &a, &k_2};
|
||||
void *blur_args[] = {&difference, &blurDifference, &local_rect, &w, &f};
|
||||
void *calc_weight_args[] = {&blurDifference, &difference, &local_rect, &w, &f};
|
||||
void *update_output_args[] = {&dx, &dy, &blurDifference, &image_ptr, &out_ptr, &weightAccum, &local_rect, &w, &f};
|
||||
|
||||
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
|
||||
dy = i / (2*r+1) - r;
|
||||
dx = i % (2*r+1) - r;
|
||||
local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
|
||||
CUdeviceptr temporary_mem;
|
||||
cuda_assert(cuMemAlloc(&temporary_mem, mem_size));
|
||||
CUdeviceptr difference = temporary_mem;
|
||||
CUdeviceptr blurDifference = temporary_mem + sizeof(float)*shift_stride * num_shifts;
|
||||
|
||||
CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
|
||||
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
|
||||
CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
|
||||
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
|
||||
CUDA_LAUNCH_KERNEL(cuNLMUpdateOutput, update_output_args);
|
||||
CUdeviceptr weightAccum = task->nlm_state.temporary_3_ptr;
|
||||
cuda_assert(cuMemsetD8(weightAccum, 0, sizeof(float)*shift_stride));
|
||||
cuda_assert(cuMemsetD8(out_ptr, 0, sizeof(float)*shift_stride));
|
||||
|
||||
{
|
||||
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMUpdateOutput;
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMUpdateOutput, cuFilterModule, "kernel_cuda_filter_nlm_update_output"));
|
||||
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMUpdateOutput, CU_FUNC_CACHE_PREFER_L1));
|
||||
|
||||
CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference, w*h, num_shifts);
|
||||
|
||||
void *calc_difference_args[] = {&guide_ptr, &variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &channel_offset, &a, &k_2};
|
||||
void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
|
||||
void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
|
||||
void *update_output_args[] = {&blurDifference, &image_ptr, &out_ptr, &weightAccum, &w, &h, &stride, &shift_stride, &r, &f};
|
||||
|
||||
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
|
||||
CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
|
||||
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
|
||||
CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
|
||||
CUDA_LAUNCH_KERNEL_1D(cuNLMUpdateOutput, update_output_args);
|
||||
}
|
||||
|
||||
local_rect = make_int4(0, 0, rect.z-rect.x, rect.w-rect.y);
|
||||
void *normalize_args[] = {&out_ptr, &weightAccum, &local_rect, &w};
|
||||
CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
cuMemFree(temporary_mem);
|
||||
|
||||
{
|
||||
CUfunction cuNLMNormalize;
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMNormalize, cuFilterModule, "kernel_cuda_filter_nlm_normalize"));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMNormalize, CU_FUNC_CACHE_PREFER_L1));
|
||||
void *normalize_args[] = {&out_ptr, &weightAccum, &w, &h, &stride};
|
||||
CUDA_GET_BLOCKSIZE(cuNLMNormalize, w, h);
|
||||
CUDA_LAUNCH_KERNEL(cuNLMNormalize, normalize_args);
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
}
|
||||
|
||||
return !have_error();
|
||||
}
|
||||
@@ -1194,91 +1212,81 @@ public:
|
||||
mem_zero(task->storage.XtWX);
|
||||
mem_zero(task->storage.XtWY);
|
||||
|
||||
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian, cuFinalize;
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
|
||||
cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
|
||||
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
|
||||
|
||||
CUDA_GET_BLOCKSIZE(cuNLMCalcDifference,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
|
||||
CUdeviceptr difference = task->reconstruction_state.temporary_1_ptr;
|
||||
CUdeviceptr blurDifference = task->reconstruction_state.temporary_2_ptr;
|
||||
|
||||
int r = task->radius;
|
||||
int f = 4;
|
||||
float a = 1.0f;
|
||||
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
|
||||
int dy = i / (2*r+1) - r;
|
||||
int dx = i % (2*r+1) - r;
|
||||
float k_2 = task->nlm_k_2;
|
||||
|
||||
int local_rect[4] = {max(0, -dx), max(0, -dy),
|
||||
task->reconstruction_state.source_w - max(0, dx),
|
||||
task->reconstruction_state.source_h - max(0, dy)};
|
||||
int w = task->reconstruction_state.source_w;
|
||||
int h = task->reconstruction_state.source_h;
|
||||
int stride = task->buffer.stride;
|
||||
|
||||
void *calc_difference_args[] = {&dx, &dy,
|
||||
&color_ptr,
|
||||
&color_variance_ptr,
|
||||
&difference,
|
||||
&local_rect,
|
||||
&task->buffer.w,
|
||||
&task->buffer.pass_stride,
|
||||
&a,
|
||||
&task->nlm_k_2};
|
||||
CUDA_LAUNCH_KERNEL(cuNLMCalcDifference, calc_difference_args);
|
||||
int shift_stride = stride*h;
|
||||
int num_shifts = (2*r+1)*(2*r+1);
|
||||
int mem_size = sizeof(float)*shift_stride*num_shifts;
|
||||
|
||||
void *blur_args[] = {&difference,
|
||||
&blurDifference,
|
||||
&local_rect,
|
||||
&task->buffer.w,
|
||||
&f};
|
||||
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
|
||||
CUdeviceptr temporary_mem;
|
||||
cuda_assert(cuMemAlloc(&temporary_mem, 2*mem_size));
|
||||
CUdeviceptr difference = temporary_mem;
|
||||
CUdeviceptr blurDifference = temporary_mem + mem_size;
|
||||
|
||||
void *calc_weight_args[] = {&blurDifference,
|
||||
&difference,
|
||||
&local_rect,
|
||||
&task->buffer.w,
|
||||
&f};
|
||||
CUDA_LAUNCH_KERNEL(cuNLMCalcWeight, calc_weight_args);
|
||||
{
|
||||
CUfunction cuNLMCalcDifference, cuNLMBlur, cuNLMCalcWeight, cuNLMConstructGramian;
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMCalcDifference, cuFilterModule, "kernel_cuda_filter_nlm_calc_difference"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMBlur, cuFilterModule, "kernel_cuda_filter_nlm_blur"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMCalcWeight, cuFilterModule, "kernel_cuda_filter_nlm_calc_weight"));
|
||||
cuda_assert(cuModuleGetFunction(&cuNLMConstructGramian, cuFilterModule, "kernel_cuda_filter_nlm_construct_gramian"));
|
||||
|
||||
/* Reuse previous arguments. */
|
||||
CUDA_LAUNCH_KERNEL(cuNLMBlur, blur_args);
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcDifference, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMBlur, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMCalcWeight, CU_FUNC_CACHE_PREFER_L1));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuNLMConstructGramian, CU_FUNC_CACHE_PREFER_SHARED));
|
||||
|
||||
void *construct_gramian_args[] = {&dx, &dy,
|
||||
&blurDifference,
|
||||
CUDA_GET_BLOCKSIZE_1D(cuNLMCalcDifference,
|
||||
task->reconstruction_state.source_w * task->reconstruction_state.source_h,
|
||||
num_shifts);
|
||||
|
||||
void *calc_difference_args[] = {&color_ptr, &color_variance_ptr, &difference, &w, &h, &stride, &shift_stride, &r, &task->buffer.pass_stride, &a, &k_2};
|
||||
void *blur_args[] = {&difference, &blurDifference, &w, &h, &stride, &shift_stride, &r, &f};
|
||||
void *calc_weight_args[] = {&blurDifference, &difference, &w, &h, &stride, &shift_stride, &r, &f};
|
||||
void *construct_gramian_args[] = {&blurDifference,
|
||||
&task->buffer.mem.device_pointer,
|
||||
&task->storage.transform.device_pointer,
|
||||
&task->storage.rank.device_pointer,
|
||||
&task->storage.XtWX.device_pointer,
|
||||
&task->storage.XtWY.device_pointer,
|
||||
&local_rect,
|
||||
&task->reconstruction_state.filter_rect,
|
||||
&task->buffer.w,
|
||||
&task->buffer.h,
|
||||
&task->reconstruction_state.filter_window,
|
||||
&w, &h, &stride,
|
||||
&shift_stride, &r,
|
||||
&f,
|
||||
&task->buffer.pass_stride};
|
||||
CUDA_LAUNCH_KERNEL(cuNLMConstructGramian, construct_gramian_args);
|
||||
|
||||
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcDifference, calc_difference_args);
|
||||
CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
|
||||
CUDA_LAUNCH_KERNEL_1D(cuNLMCalcWeight, calc_weight_args);
|
||||
CUDA_LAUNCH_KERNEL_1D(cuNLMBlur, blur_args);
|
||||
CUDA_LAUNCH_KERNEL_1D(cuNLMConstructGramian, construct_gramian_args);
|
||||
}
|
||||
|
||||
cuMemFree(temporary_mem);
|
||||
|
||||
{
|
||||
CUfunction cuFinalize;
|
||||
cuda_assert(cuModuleGetFunction(&cuFinalize, cuFilterModule, "kernel_cuda_filter_finalize"));
|
||||
cuda_assert(cuFuncSetCacheConfig(cuFinalize, CU_FUNC_CACHE_PREFER_L1));
|
||||
void *finalize_args[] = {&output_ptr,
|
||||
&task->storage.rank.device_pointer,
|
||||
&task->storage.XtWX.device_pointer,
|
||||
&task->storage.XtWY.device_pointer,
|
||||
&task->filter_area,
|
||||
&task->reconstruction_state.buffer_params.x,
|
||||
&task->render_buffer.samples};
|
||||
CUDA_GET_BLOCKSIZE(cuFinalize,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
|
||||
}
|
||||
|
||||
void *finalize_args[] = {&task->buffer.w,
|
||||
&task->buffer.h,
|
||||
&output_ptr,
|
||||
&task->storage.rank.device_pointer,
|
||||
&task->storage.XtWX.device_pointer,
|
||||
&task->storage.XtWY.device_pointer,
|
||||
&task->filter_area,
|
||||
&task->reconstruction_state.buffer_params.x,
|
||||
&task->render_buffer.samples};
|
||||
CUDA_LAUNCH_KERNEL(cuFinalize, finalize_args);
|
||||
cuda_assert(cuCtxSynchronize());
|
||||
|
||||
return !have_error();
|
||||
|
||||
@@ -57,10 +57,9 @@ void DenoisingTask::init_from_devicetask(const DeviceTask &task)
|
||||
render_buffer.denoising_clean_offset = task.pass_denoising_clean;
|
||||
|
||||
/* Expand filter_area by radius pixels and clamp the result to the extent of the neighboring tiles */
|
||||
rect = make_int4(max(tiles->x[0], filter_area.x - radius),
|
||||
max(tiles->y[0], filter_area.y - radius),
|
||||
min(tiles->x[3], filter_area.x + filter_area.z + radius),
|
||||
min(tiles->y[3], filter_area.y + filter_area.w + radius));
|
||||
rect = rect_from_shape(filter_area.x, filter_area.y, filter_area.z, filter_area.w);
|
||||
rect = rect_expand(rect, radius);
|
||||
rect = rect_clip(rect, make_int4(tiles->x[0], tiles->y[0], tiles->x[3], tiles->y[3]));
|
||||
}
|
||||
|
||||
void DenoisingTask::tiles_from_rendertiles(RenderTile *rtiles)
|
||||
@@ -93,9 +92,10 @@ bool DenoisingTask::run_denoising()
|
||||
{
|
||||
/* Allocate denoising buffer. */
|
||||
buffer.passes = 14;
|
||||
buffer.w = align_up(rect.z - rect.x, 4);
|
||||
buffer.width = rect.z - rect.x;
|
||||
buffer.stride = align_up(buffer.width, 4);
|
||||
buffer.h = rect.w - rect.y;
|
||||
buffer.pass_stride = align_up(buffer.w * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
|
||||
buffer.pass_stride = align_up(buffer.stride * buffer.h, divide_up(device->mem_address_alignment(), sizeof(float)));
|
||||
buffer.mem.alloc_to_device(buffer.pass_stride * buffer.passes, false);
|
||||
|
||||
device_ptr null_ptr = (device_ptr) 0;
|
||||
@@ -203,15 +203,17 @@ bool DenoisingTask::run_denoising()
|
||||
|
||||
functions.construct_transform();
|
||||
|
||||
storage.temporary_1.alloc_to_device(buffer.w*buffer.h, false);
|
||||
storage.temporary_2.alloc_to_device(buffer.w*buffer.h, false);
|
||||
reconstruction_state.temporary_1_ptr = storage.temporary_1.device_pointer;
|
||||
reconstruction_state.temporary_2_ptr = storage.temporary_2.device_pointer;
|
||||
device_only_memory<float> temporary_1(device, "Denoising NLM temporary 1");
|
||||
device_only_memory<float> temporary_2(device, "Denoising NLM temporary 2");
|
||||
temporary_1.alloc_to_device(buffer.pass_stride, false);
|
||||
temporary_2.alloc_to_device(buffer.pass_stride, false);
|
||||
reconstruction_state.temporary_1_ptr = temporary_1.device_pointer;
|
||||
reconstruction_state.temporary_2_ptr = temporary_2.device_pointer;
|
||||
|
||||
storage.XtWX.alloc_to_device(storage.w*storage.h*XTWX_SIZE, false);
|
||||
storage.XtWY.alloc_to_device(storage.w*storage.h*XTWY_SIZE, false);
|
||||
|
||||
reconstruction_state.filter_rect = make_int4(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
|
||||
reconstruction_state.filter_window = rect_from_shape(filter_area.x-rect.x, filter_area.y-rect.y, storage.w, storage.h);
|
||||
int tile_coordinate_offset = filter_area.y*render_buffer.stride + filter_area.x;
|
||||
reconstruction_state.buffer_params = make_int4(render_buffer.offset + tile_coordinate_offset,
|
||||
render_buffer.stride,
|
||||
|
||||
@@ -94,7 +94,7 @@ public:
|
||||
device_ptr temporary_1_ptr; /* There two images are used as temporary storage. */
|
||||
device_ptr temporary_2_ptr;
|
||||
|
||||
int4 filter_rect;
|
||||
int4 filter_window;
|
||||
int4 buffer_params;
|
||||
|
||||
int source_w;
|
||||
@@ -148,8 +148,9 @@ public:
|
||||
struct DenoiseBuffers {
|
||||
int pass_stride;
|
||||
int passes;
|
||||
int w;
|
||||
int stride;
|
||||
int h;
|
||||
int width;
|
||||
device_only_memory<float> mem;
|
||||
|
||||
DenoiseBuffers(Device *device)
|
||||
|
||||
@@ -353,7 +353,9 @@ public:
|
||||
void tex_free(device_memory& mem);
|
||||
|
||||
size_t global_size_round_up(int group_size, int global_size);
|
||||
void enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size = -1);
|
||||
void enqueue_kernel(cl_kernel kernel, size_t w, size_t h,
|
||||
bool x_workgroups = false,
|
||||
size_t max_workgroup_size = -1);
|
||||
void set_kernel_arg_mem(cl_kernel kernel, cl_uint *narg, const char *name);
|
||||
void set_kernel_arg_buffers(cl_kernel kernel, cl_uint *narg);
|
||||
|
||||
|
||||
@@ -560,7 +560,7 @@ size_t OpenCLDeviceBase::global_size_round_up(int group_size, int global_size)
|
||||
return global_size + ((r == 0)? 0: group_size - r);
|
||||
}
|
||||
|
||||
void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size_t max_workgroup_size)
|
||||
void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, bool x_workgroups, size_t max_workgroup_size)
|
||||
{
|
||||
size_t workgroup_size, max_work_items[3];
|
||||
|
||||
@@ -574,8 +574,15 @@ void OpenCLDeviceBase::enqueue_kernel(cl_kernel kernel, size_t w, size_t h, size
|
||||
}
|
||||
|
||||
/* Try to divide evenly over 2 dimensions. */
|
||||
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
|
||||
size_t local_size[2] = {sqrt_workgroup_size, sqrt_workgroup_size};
|
||||
size_t local_size[2];
|
||||
if(x_workgroups) {
|
||||
local_size[0] = workgroup_size;
|
||||
local_size[1] = 1;
|
||||
}
|
||||
else {
|
||||
size_t sqrt_workgroup_size = max((size_t)sqrt((double)workgroup_size), 1);
|
||||
local_size[0] = local_size[1] = sqrt_workgroup_size;
|
||||
}
|
||||
|
||||
/* Some implementations have max size 1 on 2nd dimension. */
|
||||
if(local_size[1] > max_work_items[1]) {
|
||||
@@ -731,17 +738,25 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
|
||||
device_ptr out_ptr,
|
||||
DenoisingTask *task)
|
||||
{
|
||||
int4 rect = task->rect;
|
||||
int w = rect.z-rect.x;
|
||||
int h = rect.w-rect.y;
|
||||
|
||||
int stride = task->buffer.stride;
|
||||
int w = task->buffer.width;
|
||||
int h = task->buffer.h;
|
||||
int r = task->nlm_state.r;
|
||||
int f = task->nlm_state.f;
|
||||
float a = task->nlm_state.a;
|
||||
float k_2 = task->nlm_state.k_2;
|
||||
|
||||
cl_mem difference = CL_MEM_PTR(task->nlm_state.temporary_1_ptr);
|
||||
cl_mem blurDifference = CL_MEM_PTR(task->nlm_state.temporary_2_ptr);
|
||||
cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
|
||||
int shift_stride = stride*h;
|
||||
int num_shifts = (2*r+1)*(2*r+1);
|
||||
int mem_size = sizeof(float)*shift_stride*num_shifts;
|
||||
|
||||
cl_mem weightAccum = CL_MEM_PTR(task->nlm_state.temporary_3_ptr);
|
||||
|
||||
cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
|
||||
opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
|
||||
cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
|
||||
opencl_assert_err(ciErr, "clCreateBuffer denoising_non_local_means");
|
||||
|
||||
cl_mem image_mem = CL_MEM_PTR(image_ptr);
|
||||
cl_mem guide_mem = CL_MEM_PTR(guide_ptr);
|
||||
@@ -757,31 +772,45 @@ bool OpenCLDeviceBase::denoising_non_local_means(device_ptr image_ptr,
|
||||
cl_kernel ckNLMUpdateOutput = denoising_program(ustring("filter_nlm_update_output"));
|
||||
cl_kernel ckNLMNormalize = denoising_program(ustring("filter_nlm_normalize"));
|
||||
|
||||
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
|
||||
int dy = i / (2*r+1) - r;
|
||||
int dx = i % (2*r+1) - r;
|
||||
int4 local_rect = make_int4(max(0, -dx), max(0, -dy), rect.z-rect.x - max(0, dx), rect.w-rect.y - max(0, dy));
|
||||
kernel_set_args(ckNLMCalcDifference, 0,
|
||||
dx, dy, guide_mem, variance_mem,
|
||||
difference, local_rect, w, 0, a, k_2);
|
||||
kernel_set_args(ckNLMBlur, 0,
|
||||
difference, blurDifference, local_rect, w, f);
|
||||
kernel_set_args(ckNLMCalcWeight, 0,
|
||||
blurDifference, difference, local_rect, w, f);
|
||||
kernel_set_args(ckNLMUpdateOutput, 0,
|
||||
dx, dy, blurDifference, image_mem,
|
||||
out_mem, weightAccum, local_rect, w, f);
|
||||
kernel_set_args(ckNLMCalcDifference, 0,
|
||||
guide_mem,
|
||||
variance_mem,
|
||||
difference,
|
||||
w, h, stride,
|
||||
shift_stride,
|
||||
r, 0, a, k_2);
|
||||
kernel_set_args(ckNLMBlur, 0,
|
||||
difference,
|
||||
blurDifference,
|
||||
w, h, stride,
|
||||
shift_stride,
|
||||
r, f);
|
||||
kernel_set_args(ckNLMCalcWeight, 0,
|
||||
blurDifference,
|
||||
difference,
|
||||
w, h, stride,
|
||||
shift_stride,
|
||||
r, f);
|
||||
kernel_set_args(ckNLMUpdateOutput, 0,
|
||||
blurDifference,
|
||||
image_mem,
|
||||
out_mem,
|
||||
weightAccum,
|
||||
w, h, stride,
|
||||
shift_stride,
|
||||
r, f);
|
||||
|
||||
enqueue_kernel(ckNLMCalcDifference, w, h);
|
||||
enqueue_kernel(ckNLMBlur, w, h);
|
||||
enqueue_kernel(ckNLMCalcWeight, w, h);
|
||||
enqueue_kernel(ckNLMBlur, w, h);
|
||||
enqueue_kernel(ckNLMUpdateOutput, w, h);
|
||||
}
|
||||
enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
|
||||
enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
|
||||
enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true);
|
||||
enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
|
||||
enqueue_kernel(ckNLMUpdateOutput, w*h, num_shifts, true);
|
||||
|
||||
opencl_assert(clReleaseMemObject(difference));
|
||||
opencl_assert(clReleaseMemObject(blurDifference));
|
||||
|
||||
int4 local_rect = make_int4(0, 0, w, h);
|
||||
kernel_set_args(ckNLMNormalize, 0,
|
||||
out_mem, weightAccum, local_rect, w);
|
||||
out_mem, weightAccum, w, h, stride);
|
||||
enqueue_kernel(ckNLMNormalize, w, h);
|
||||
|
||||
return true;
|
||||
@@ -837,81 +866,63 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
|
||||
cl_kernel ckNLMConstructGramian = denoising_program(ustring("filter_nlm_construct_gramian"));
|
||||
cl_kernel ckFinalize = denoising_program(ustring("filter_finalize"));
|
||||
|
||||
cl_mem difference = CL_MEM_PTR(task->reconstruction_state.temporary_1_ptr);
|
||||
cl_mem blurDifference = CL_MEM_PTR(task->reconstruction_state.temporary_2_ptr);
|
||||
int w = task->reconstruction_state.source_w;
|
||||
int h = task->reconstruction_state.source_h;
|
||||
int stride = task->buffer.stride;
|
||||
|
||||
int r = task->radius;
|
||||
int f = 4;
|
||||
float a = 1.0f;
|
||||
for(int i = 0; i < (2*r+1)*(2*r+1); i++) {
|
||||
int dy = i / (2*r+1) - r;
|
||||
int dx = i % (2*r+1) - r;
|
||||
int shift_stride = stride*h;
|
||||
int num_shifts = (2*task->radius + 1)*(2*task->radius + 1);
|
||||
int mem_size = sizeof(float)*shift_stride*num_shifts;
|
||||
|
||||
int local_rect[4] = {max(0, -dx), max(0, -dy),
|
||||
task->reconstruction_state.source_w - max(0, dx),
|
||||
task->reconstruction_state.source_h - max(0, dy)};
|
||||
cl_mem difference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
|
||||
opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
|
||||
cl_mem blurDifference = clCreateBuffer(cxContext, CL_MEM_READ_WRITE, mem_size, NULL, &ciErr);
|
||||
opencl_assert_err(ciErr, "clCreateBuffer denoising_reconstruct");
|
||||
|
||||
kernel_set_args(ckNLMCalcDifference, 0,
|
||||
dx, dy,
|
||||
color_mem,
|
||||
color_variance_mem,
|
||||
difference,
|
||||
local_rect,
|
||||
task->buffer.w,
|
||||
task->buffer.pass_stride,
|
||||
a, task->nlm_k_2);
|
||||
enqueue_kernel(ckNLMCalcDifference,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
kernel_set_args(ckNLMCalcDifference, 0,
|
||||
color_mem,
|
||||
color_variance_mem,
|
||||
difference,
|
||||
w, h, stride,
|
||||
shift_stride,
|
||||
task->radius,
|
||||
task->buffer.pass_stride,
|
||||
1.0f, task->nlm_k_2);
|
||||
kernel_set_args(ckNLMBlur, 0,
|
||||
difference,
|
||||
blurDifference,
|
||||
w, h, stride,
|
||||
shift_stride,
|
||||
task->radius, 4);
|
||||
kernel_set_args(ckNLMCalcWeight, 0,
|
||||
blurDifference,
|
||||
difference,
|
||||
w, h, stride,
|
||||
shift_stride,
|
||||
task->radius, 4);
|
||||
kernel_set_args(ckNLMConstructGramian, 0,
|
||||
blurDifference,
|
||||
buffer_mem,
|
||||
transform_mem,
|
||||
rank_mem,
|
||||
XtWX_mem,
|
||||
XtWY_mem,
|
||||
task->reconstruction_state.filter_window,
|
||||
w, h, stride,
|
||||
shift_stride,
|
||||
task->radius, 4,
|
||||
task->buffer.pass_stride);
|
||||
|
||||
kernel_set_args(ckNLMBlur, 0,
|
||||
difference,
|
||||
blurDifference,
|
||||
local_rect,
|
||||
task->buffer.w,
|
||||
f);
|
||||
enqueue_kernel(ckNLMBlur,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
enqueue_kernel(ckNLMCalcDifference, w*h, num_shifts, true);
|
||||
enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
|
||||
enqueue_kernel(ckNLMCalcWeight, w*h, num_shifts, true);
|
||||
enqueue_kernel(ckNLMBlur, w*h, num_shifts, true);
|
||||
enqueue_kernel(ckNLMConstructGramian, w*h, num_shifts, true, 256);
|
||||
|
||||
kernel_set_args(ckNLMCalcWeight, 0,
|
||||
blurDifference,
|
||||
difference,
|
||||
local_rect,
|
||||
task->buffer.w,
|
||||
f);
|
||||
enqueue_kernel(ckNLMCalcWeight,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
|
||||
/* Reuse previous arguments. */
|
||||
enqueue_kernel(ckNLMBlur,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
|
||||
kernel_set_args(ckNLMConstructGramian, 0,
|
||||
dx, dy,
|
||||
blurDifference,
|
||||
buffer_mem,
|
||||
transform_mem,
|
||||
rank_mem,
|
||||
XtWX_mem,
|
||||
XtWY_mem,
|
||||
local_rect,
|
||||
task->reconstruction_state.filter_rect,
|
||||
task->buffer.w,
|
||||
task->buffer.h,
|
||||
f,
|
||||
task->buffer.pass_stride);
|
||||
enqueue_kernel(ckNLMConstructGramian,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h,
|
||||
256);
|
||||
}
|
||||
opencl_assert(clReleaseMemObject(difference));
|
||||
opencl_assert(clReleaseMemObject(blurDifference));
|
||||
|
||||
kernel_set_args(ckFinalize, 0,
|
||||
task->buffer.w,
|
||||
task->buffer.h,
|
||||
output_mem,
|
||||
rank_mem,
|
||||
XtWX_mem,
|
||||
@@ -919,9 +930,7 @@ bool OpenCLDeviceBase::denoising_reconstruct(device_ptr color_ptr,
|
||||
task->filter_area,
|
||||
task->reconstruction_state.buffer_params,
|
||||
task->render_buffer.samples);
|
||||
enqueue_kernel(ckFinalize,
|
||||
task->reconstruction_state.source_w,
|
||||
task->reconstruction_state.source_h);
|
||||
enqueue_kernel(ckFinalize, w, h);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -254,6 +254,7 @@ set(SRC_UTIL_HEADERS
|
||||
../util/util_math_int3.h
|
||||
../util/util_math_int4.h
|
||||
../util/util_math_matrix.h
|
||||
../util/util_rect.h
|
||||
../util/util_static_assert.h
|
||||
../util/util_transform.h
|
||||
../util/util_texture.h
|
||||
|
||||
@@ -21,7 +21,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
|
||||
const float *ccl_restrict variance_image,
|
||||
float *difference_image,
|
||||
int4 rect,
|
||||
int w,
|
||||
int stride,
|
||||
int channel_offset,
|
||||
float a,
|
||||
float k_2)
|
||||
@@ -31,15 +31,15 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
|
||||
float diff = 0.0f;
|
||||
int numChannels = channel_offset? 3 : 1;
|
||||
for(int c = 0; c < numChannels; c++) {
|
||||
float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
|
||||
float pvar = variance_image[c*channel_offset + y*w+x];
|
||||
float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
|
||||
float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
|
||||
float pvar = variance_image[c*channel_offset + y*stride + x];
|
||||
float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
|
||||
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
|
||||
}
|
||||
if(numChannels > 1) {
|
||||
diff *= 1.0f/numChannels;
|
||||
}
|
||||
difference_image[y*w+x] = diff;
|
||||
difference_image[y*stride + x] = diff;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -47,7 +47,7 @@ ccl_device_inline void kernel_filter_nlm_calc_difference(int dx, int dy,
|
||||
ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict difference_image,
|
||||
float *out_image,
|
||||
int4 rect,
|
||||
int w,
|
||||
int stride,
|
||||
int f)
|
||||
{
|
||||
int aligned_lowx = rect.x / 4;
|
||||
@@ -56,17 +56,17 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
|
||||
const int low = max(rect.y, y-f);
|
||||
const int high = min(rect.w, y+f+1);
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
out_image[y*w+x] = 0.0f;
|
||||
out_image[y*stride + x] = 0.0f;
|
||||
}
|
||||
for(int y1 = low; y1 < high; y1++) {
|
||||
float4* out_image4 = (float4*)(out_image + y*w);
|
||||
float4* difference_image4 = (float4*)(difference_image + y1*w);
|
||||
float4* out_image4 = (float4*)(out_image + y*stride);
|
||||
float4* difference_image4 = (float4*)(difference_image + y1*stride);
|
||||
for(int x = aligned_lowx; x < aligned_highx; x++) {
|
||||
out_image4[x] += difference_image4[x];
|
||||
}
|
||||
}
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
out_image[y*w+x] *= 1.0f/(high - low);
|
||||
out_image[y*stride + x] *= 1.0f/(high - low);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -74,12 +74,12 @@ ccl_device_inline void kernel_filter_nlm_blur(const float *ccl_restrict differen
|
||||
ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
|
||||
float *out_image,
|
||||
int4 rect,
|
||||
int w,
|
||||
int stride,
|
||||
int f)
|
||||
{
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
out_image[y*w+x] = 0.0f;
|
||||
out_image[y*stride + x] = 0.0f;
|
||||
}
|
||||
}
|
||||
for(int dx = -f; dx <= f; dx++) {
|
||||
@@ -87,7 +87,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
|
||||
int neg_dx = min(0, dx);
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
for(int x = rect.x-neg_dx; x < rect.z-pos_dx; x++) {
|
||||
out_image[y*w+x] += difference_image[y*w+dx+x];
|
||||
out_image[y*stride + x] += difference_image[y*stride + x+dx];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -95,7 +95,7 @@ ccl_device_inline void kernel_filter_nlm_calc_weight(const float *ccl_restrict d
|
||||
for(int x = rect.x; x < rect.z; x++) {
|
||||
const int low = max(rect.x, x-f);
|
||||
const int high = min(rect.z, x+f+1);
|
||||
out_image[y*w+x] = fast_expf(-max(out_image[y*w+x] * (1.0f/(high - low)), 0.0f));
|
||||
out_image[y*stride + x] = fast_expf(-max(out_image[y*stride + x] * (1.0f/(high - low)), 0.0f));
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -106,7 +106,7 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
|
||||
float *out_image,
|
||||
float *accum_image,
|
||||
int4 rect,
|
||||
int w,
|
||||
int stride,
|
||||
int f)
|
||||
{
|
||||
for(int y = rect.y; y < rect.w; y++) {
|
||||
@@ -115,11 +115,11 @@ ccl_device_inline void kernel_filter_nlm_update_output(int dx, int dy,
|
||||
const int high = min(rect.z, x+f+1);
|
||||
float sum = 0.0f;
|
||||
for(int x1 = low; x1 < high; x1++) {
|
||||
sum += difference_image[y*w+x1];
|
||||
sum += difference_image[y*stride + x1];
|
||||
}
|
||||
float weight = sum * (1.0f/(high - low));
|
||||
accum_image[y*w+x] += weight;
|
||||
out_image[y*w+x] += weight*image[(y+dy)*w+(x+dx)];
|
||||
accum_image[y*stride + x] += weight;
|
||||
out_image[y*stride + x] += weight*image[(y+dy)*stride + (x+dx)];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -132,31 +132,31 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int dx, int dy,
|
||||
float *XtWX,
|
||||
float3 *XtWY,
|
||||
int4 rect,
|
||||
int4 filter_rect,
|
||||
int w, int h, int f,
|
||||
int4 filter_window,
|
||||
int stride, int f,
|
||||
int pass_stride)
|
||||
{
|
||||
int4 clip_area = rect_clip(rect, filter_window);
|
||||
/* fy and fy are in filter-window-relative coordinates, while x and y are in feature-window-relative coordinates. */
|
||||
for(int fy = max(0, rect.y-filter_rect.y); fy < min(filter_rect.w, rect.w-filter_rect.y); fy++) {
|
||||
int y = fy + filter_rect.y;
|
||||
for(int fx = max(0, rect.x-filter_rect.x); fx < min(filter_rect.z, rect.z-filter_rect.x); fx++) {
|
||||
int x = fx + filter_rect.x;
|
||||
for(int y = clip_area.y; y < clip_area.w; y++) {
|
||||
for(int x = clip_area.x; x < clip_area.z; x++) {
|
||||
const int low = max(rect.x, x-f);
|
||||
const int high = min(rect.z, x+f+1);
|
||||
float sum = 0.0f;
|
||||
for(int x1 = low; x1 < high; x1++) {
|
||||
sum += difference_image[y*w+x1];
|
||||
sum += difference_image[y*stride + x1];
|
||||
}
|
||||
float weight = sum * (1.0f/(high - low));
|
||||
|
||||
int storage_ofs = fy*filter_rect.z + fx;
|
||||
int storage_ofs = coord_to_local_index(filter_window, x, y);
|
||||
float *l_transform = transform + storage_ofs*TRANSFORM_SIZE;
|
||||
float *l_XtWX = XtWX + storage_ofs*XTWX_SIZE;
|
||||
float3 *l_XtWY = XtWY + storage_ofs*XTWY_SIZE;
|
||||
int *l_rank = rank + storage_ofs;
|
||||
|
||||
kernel_filter_construct_gramian(x, y, 1,
|
||||
dx, dy, w, h,
|
||||
dx, dy,
|
||||
stride,
|
||||
pass_stride,
|
||||
buffer,
|
||||
l_transform, l_rank,
|
||||
|
||||
@@ -16,57 +16,114 @@
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Determines pixel coordinates and offset for the current thread.
|
||||
* Returns whether the thread should do any work.
|
||||
*
|
||||
* All coordinates are relative to the denoising buffer!
|
||||
*
|
||||
* Window is the rect that should be processed.
|
||||
* co is filled with (x, y, dx, dy).
|
||||
*/
|
||||
ccl_device_inline bool get_nlm_coords_window(int w, int h, int r, int stride,
|
||||
int4 *rect, int4 *co, int *ofs,
|
||||
int4 window)
|
||||
{
|
||||
/* Determine the pixel offset that this thread should apply. */
|
||||
int s = 2*r+1;
|
||||
int si = ccl_global_id(1);
|
||||
int sx = si % s;
|
||||
int sy = si / s;
|
||||
if(sy >= s) {
|
||||
return false;
|
||||
}
|
||||
co->z = sx-r;
|
||||
co->w = sy-r;
|
||||
|
||||
/* Pixels still need to lie inside the denoising buffer after applying the offset,
|
||||
* so determine the area for which this is the case. */
|
||||
*rect = make_int4(max(0, -co->z), max(0, -co->w),
|
||||
w - max(0, co->z), h - max(0, co->w));
|
||||
|
||||
/* Find the intersection of the area that we want to process (window) and the area
|
||||
* that can be processed (rect) to get the final area for this offset. */
|
||||
int4 clip_area = rect_clip(window, *rect);
|
||||
|
||||
/* If the radius is larger than one of the sides of the window,
|
||||
* there will be shifts for which there is no usable pixel at all. */
|
||||
if(!rect_is_valid(clip_area)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Map the linear thread index to pixels inside the clip area. */
|
||||
int x, y;
|
||||
if(!local_index_to_coord(clip_area, ccl_global_id(0), &x, &y)) {
|
||||
return false;
|
||||
}
|
||||
co->x = x;
|
||||
co->y = y;
|
||||
|
||||
*ofs = (sy*s + sx) * stride;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
ccl_device_inline bool get_nlm_coords(int w, int h, int r, int stride,
|
||||
int4 *rect, int4 *co, int *ofs)
|
||||
{
|
||||
return get_nlm_coords_window(w, h, r, stride, rect, co, ofs, make_int4(0, 0, w, h));
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_calc_difference(int x, int y,
|
||||
int dx, int dy,
|
||||
const ccl_global float *ccl_restrict weight_image,
|
||||
const ccl_global float *ccl_restrict variance_image,
|
||||
ccl_global float *difference_image,
|
||||
int4 rect, int w,
|
||||
int4 rect, int stride,
|
||||
int channel_offset,
|
||||
float a, float k_2)
|
||||
{
|
||||
float diff = 0.0f;
|
||||
int numChannels = channel_offset? 3 : 1;
|
||||
for(int c = 0; c < numChannels; c++) {
|
||||
float cdiff = weight_image[c*channel_offset + y*w+x] - weight_image[c*channel_offset + (y+dy)*w+(x+dx)];
|
||||
float pvar = variance_image[c*channel_offset + y*w+x];
|
||||
float qvar = variance_image[c*channel_offset + (y+dy)*w+(x+dx)];
|
||||
float cdiff = weight_image[c*channel_offset + y*stride + x] - weight_image[c*channel_offset + (y+dy)*stride + (x+dx)];
|
||||
float pvar = variance_image[c*channel_offset + y*stride + x];
|
||||
float qvar = variance_image[c*channel_offset + (y+dy)*stride + (x+dx)];
|
||||
diff += (cdiff*cdiff - a*(pvar + min(pvar, qvar))) / (1e-8f + k_2*(pvar+qvar));
|
||||
}
|
||||
if(numChannels > 1) {
|
||||
diff *= 1.0f/numChannels;
|
||||
}
|
||||
difference_image[y*w+x] = diff;
|
||||
difference_image[y*stride + x] = diff;
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_blur(int x, int y,
|
||||
const ccl_global float *ccl_restrict difference_image,
|
||||
ccl_global float *out_image,
|
||||
int4 rect, int w, int f)
|
||||
int4 rect, int stride, int f)
|
||||
{
|
||||
float sum = 0.0f;
|
||||
const int low = max(rect.y, y-f);
|
||||
const int high = min(rect.w, y+f+1);
|
||||
for(int y1 = low; y1 < high; y1++) {
|
||||
sum += difference_image[y1*w+x];
|
||||
sum += difference_image[y1*stride + x];
|
||||
}
|
||||
sum *= 1.0f/(high-low);
|
||||
out_image[y*w+x] = sum;
|
||||
out_image[y*stride + x] = sum;
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_calc_weight(int x, int y,
|
||||
const ccl_global float *ccl_restrict difference_image,
|
||||
ccl_global float *out_image,
|
||||
int4 rect, int w, int f)
|
||||
int4 rect, int stride, int f)
|
||||
{
|
||||
float sum = 0.0f;
|
||||
const int low = max(rect.x, x-f);
|
||||
const int high = min(rect.z, x+f+1);
|
||||
for(int x1 = low; x1 < high; x1++) {
|
||||
sum += difference_image[y*w+x1];
|
||||
sum += difference_image[y*stride + x1];
|
||||
}
|
||||
sum *= 1.0f/(high-low);
|
||||
out_image[y*w+x] = fast_expf(-max(sum, 0.0f));
|
||||
out_image[y*stride + x] = fast_expf(-max(sum, 0.0f));
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
|
||||
@@ -75,25 +132,25 @@ ccl_device_inline void kernel_filter_nlm_update_output(int x, int y,
|
||||
const ccl_global float *ccl_restrict image,
|
||||
ccl_global float *out_image,
|
||||
ccl_global float *accum_image,
|
||||
int4 rect, int w, int f)
|
||||
int4 rect, int stride, int f)
|
||||
{
|
||||
float sum = 0.0f;
|
||||
const int low = max(rect.x, x-f);
|
||||
const int high = min(rect.z, x+f+1);
|
||||
for(int x1 = low; x1 < high; x1++) {
|
||||
sum += difference_image[y*w+x1];
|
||||
sum += difference_image[y*stride + x1];
|
||||
}
|
||||
sum *= 1.0f/(high-low);
|
||||
if(out_image) {
|
||||
accum_image[y*w+x] += sum;
|
||||
out_image[y*w+x] += sum*image[(y+dy)*w+(x+dx)];
|
||||
atomic_add_and_fetch_float(accum_image + y*stride + x, sum);
|
||||
atomic_add_and_fetch_float(out_image + y*stride + x, sum*image[(y+dy)*stride + (x+dx)]);
|
||||
}
|
||||
else {
|
||||
accum_image[y*w+x] = sum;
|
||||
accum_image[y*stride + x] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
|
||||
ccl_device_inline void kernel_filter_nlm_construct_gramian(int x, int y,
|
||||
int dx, int dy,
|
||||
const ccl_global float *ccl_restrict difference_image,
|
||||
const ccl_global float *ccl_restrict buffer,
|
||||
@@ -102,30 +159,31 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
|
||||
ccl_global float *XtWX,
|
||||
ccl_global float3 *XtWY,
|
||||
int4 rect,
|
||||
int4 filter_rect,
|
||||
int w, int h, int f,
|
||||
int4 filter_window,
|
||||
int stride, int f,
|
||||
int pass_stride,
|
||||
int localIdx)
|
||||
{
|
||||
int y = fy + filter_rect.y;
|
||||
int x = fx + filter_rect.x;
|
||||
const int low = max(rect.x, x-f);
|
||||
const int high = min(rect.z, x+f+1);
|
||||
float sum = 0.0f;
|
||||
for(int x1 = low; x1 < high; x1++) {
|
||||
sum += difference_image[y*w+x1];
|
||||
sum += difference_image[y*stride + x1];
|
||||
}
|
||||
float weight = sum * (1.0f/(high - low));
|
||||
|
||||
int storage_ofs = fy*filter_rect.z + fx;
|
||||
/* Reconstruction data is only stored for pixels inside the filter window,
|
||||
* so compute the pixels's index in there. */
|
||||
int storage_ofs = coord_to_local_index(filter_window, x, y);
|
||||
transform += storage_ofs;
|
||||
rank += storage_ofs;
|
||||
XtWX += storage_ofs;
|
||||
XtWY += storage_ofs;
|
||||
|
||||
kernel_filter_construct_gramian(x, y,
|
||||
filter_rect.z*filter_rect.w,
|
||||
dx, dy, w, h,
|
||||
rect_size(filter_window),
|
||||
dx, dy,
|
||||
stride,
|
||||
pass_stride,
|
||||
buffer,
|
||||
transform, rank,
|
||||
@@ -136,9 +194,9 @@ ccl_device_inline void kernel_filter_nlm_construct_gramian(int fx, int fy,
|
||||
ccl_device_inline void kernel_filter_nlm_normalize(int x, int y,
|
||||
ccl_global float *out_image,
|
||||
const ccl_global float *ccl_restrict accum_image,
|
||||
int4 rect, int w)
|
||||
int stride)
|
||||
{
|
||||
out_image[y*w+x] /= accum_image[y*w+x];
|
||||
out_image[y*stride + x] /= accum_image[y*stride + x];
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
@@ -19,7 +19,7 @@ CCL_NAMESPACE_BEGIN
|
||||
ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
|
||||
int storage_stride,
|
||||
int dx, int dy,
|
||||
int w, int h,
|
||||
int buffer_stride,
|
||||
int pass_stride,
|
||||
const ccl_global float *ccl_restrict buffer,
|
||||
const ccl_global float *ccl_restrict transform,
|
||||
@@ -33,8 +33,8 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
|
||||
return;
|
||||
}
|
||||
|
||||
int p_offset = y *w + x;
|
||||
int q_offset = (y+dy)*w + (x+dx);
|
||||
int p_offset = y * buffer_stride + x;
|
||||
int q_offset = (y+dy) * buffer_stride + (x+dx);
|
||||
|
||||
#ifdef __KERNEL_GPU__
|
||||
const int stride = storage_stride;
|
||||
@@ -65,7 +65,7 @@ ccl_device_inline void kernel_filter_construct_gramian(int x, int y,
|
||||
math_vec3_add_strided(XtWY, (*rank)+1, design_row, weight * q_color, stride);
|
||||
}
|
||||
|
||||
ccl_device_inline void kernel_filter_finalize(int x, int y, int w, int h,
|
||||
ccl_device_inline void kernel_filter_finalize(int x, int y,
|
||||
ccl_global float *buffer,
|
||||
ccl_global int *rank,
|
||||
int storage_stride,
|
||||
|
||||
@@ -74,7 +74,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
|
||||
float *variance,
|
||||
float *difference_image,
|
||||
int* rect,
|
||||
int w,
|
||||
int stride,
|
||||
int channel_offset,
|
||||
float a,
|
||||
float k_2);
|
||||
@@ -82,13 +82,13 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
|
||||
float *out_image,
|
||||
int* rect,
|
||||
int w,
|
||||
int stride,
|
||||
int f);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
|
||||
float *out_image,
|
||||
int* rect,
|
||||
int w,
|
||||
int stride,
|
||||
int f);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
|
||||
@@ -98,7 +98,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
|
||||
float *out_image,
|
||||
float *accum_image,
|
||||
int* rect,
|
||||
int w,
|
||||
int stride,
|
||||
int f);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
|
||||
@@ -110,22 +110,19 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
|
||||
float *XtWX,
|
||||
float3 *XtWY,
|
||||
int *rect,
|
||||
int *filter_rect,
|
||||
int w,
|
||||
int h,
|
||||
int *filter_window,
|
||||
int stride,
|
||||
int f,
|
||||
int pass_stride);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
|
||||
float *accum_image,
|
||||
int* rect,
|
||||
int w);
|
||||
int stride);
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
|
||||
int y,
|
||||
int storage_ofs,
|
||||
int w,
|
||||
int h,
|
||||
float *buffer,
|
||||
int *rank,
|
||||
float *XtWX,
|
||||
|
||||
@@ -150,7 +150,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
|
||||
float *variance,
|
||||
float *difference_image,
|
||||
int *rect,
|
||||
int w,
|
||||
int stride,
|
||||
int channel_offset,
|
||||
float a,
|
||||
float k_2)
|
||||
@@ -158,33 +158,33 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_difference)(int dx,
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_difference);
|
||||
#else
|
||||
kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), w, channel_offset, a, k_2);
|
||||
kernel_filter_nlm_calc_difference(dx, dy, weight_image, variance, difference_image, load_int4(rect), stride, channel_offset, a, k_2);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_blur)(float *difference_image,
|
||||
float *out_image,
|
||||
int *rect,
|
||||
int w,
|
||||
int stride,
|
||||
int f)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_nlm_blur);
|
||||
#else
|
||||
kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), w, f);
|
||||
kernel_filter_nlm_blur(difference_image, out_image, load_int4(rect), stride, f);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_calc_weight)(float *difference_image,
|
||||
float *out_image,
|
||||
int *rect,
|
||||
int w,
|
||||
int stride,
|
||||
int f)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_nlm_calc_weight);
|
||||
#else
|
||||
kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), w, f);
|
||||
kernel_filter_nlm_calc_weight(difference_image, out_image, load_int4(rect), stride, f);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -195,13 +195,13 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_update_output)(int dx,
|
||||
float *out_image,
|
||||
float *accum_image,
|
||||
int *rect,
|
||||
int w,
|
||||
int stride,
|
||||
int f)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_nlm_update_output);
|
||||
#else
|
||||
kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), w, f);
|
||||
kernel_filter_nlm_update_output(dx, dy, difference_image, image, out_image, accum_image, load_int4(rect), stride, f);
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -214,36 +214,33 @@ void KERNEL_FUNCTION_FULL_NAME(filter_nlm_construct_gramian)(int dx,
|
||||
float *XtWX,
|
||||
float3 *XtWY,
|
||||
int *rect,
|
||||
int *filter_rect,
|
||||
int w,
|
||||
int h,
|
||||
int *filter_window,
|
||||
int stride,
|
||||
int f,
|
||||
int pass_stride)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_nlm_construct_gramian);
|
||||
#else
|
||||
kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_rect), w, h, f, pass_stride);
|
||||
kernel_filter_nlm_construct_gramian(dx, dy, difference_image, buffer, transform, rank, XtWX, XtWY, load_int4(rect), load_int4(filter_window), stride, f, pass_stride);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_nlm_normalize)(float *out_image,
|
||||
float *accum_image,
|
||||
int *rect,
|
||||
int w)
|
||||
int stride)
|
||||
{
|
||||
#ifdef KERNEL_STUB
|
||||
STUB_ASSERT(KERNEL_ARCH, filter_nlm_normalize);
|
||||
#else
|
||||
kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), w);
|
||||
kernel_filter_nlm_normalize(out_image, accum_image, load_int4(rect), stride);
|
||||
#endif
|
||||
}
|
||||
|
||||
void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
|
||||
int y,
|
||||
int storage_ofs,
|
||||
int w,
|
||||
int h,
|
||||
float *buffer,
|
||||
int *rank,
|
||||
float *XtWX,
|
||||
@@ -257,7 +254,7 @@ void KERNEL_FUNCTION_FULL_NAME(filter_finalize)(int x,
|
||||
XtWX += storage_ofs*XTWX_SIZE;
|
||||
XtWY += storage_ofs*XTWY_SIZE;
|
||||
rank += storage_ofs;
|
||||
kernel_filter_finalize(x, y, w, h, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
|
||||
kernel_filter_finalize(x, y, buffer, rank, 1, XtWX, XtWY, load_int4(buffer_params), sample);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -134,95 +134,140 @@ kernel_cuda_filter_construct_transform(float const* __restrict__ buffer,
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_calc_difference(int dx, int dy,
|
||||
const float *ccl_restrict weight_image,
|
||||
kernel_cuda_filter_nlm_calc_difference(const float *ccl_restrict weight_image,
|
||||
const float *ccl_restrict variance_image,
|
||||
float *difference_image,
|
||||
int4 rect, int w,
|
||||
int w,
|
||||
int h,
|
||||
int stride,
|
||||
int shift_stride,
|
||||
int r,
|
||||
int channel_offset,
|
||||
float a, float k_2)
|
||||
float a,
|
||||
float k_2)
|
||||
{
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
|
||||
int4 co, rect;
|
||||
int ofs;
|
||||
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
|
||||
kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
|
||||
weight_image,
|
||||
variance_image,
|
||||
difference_image + ofs,
|
||||
rect, stride,
|
||||
channel_offset, a, k_2);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
|
||||
kernel_cuda_filter_nlm_blur(const float *ccl_restrict difference_image,
|
||||
float *out_image,
|
||||
int w,
|
||||
int h,
|
||||
int stride,
|
||||
int shift_stride,
|
||||
int r,
|
||||
int f)
|
||||
{
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
|
||||
int4 co, rect;
|
||||
int ofs;
|
||||
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
|
||||
kernel_filter_nlm_blur(co.x, co.y,
|
||||
difference_image + ofs,
|
||||
out_image + ofs,
|
||||
rect, stride, f);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image, float *out_image, int4 rect, int w, int f)
|
||||
kernel_cuda_filter_nlm_calc_weight(const float *ccl_restrict difference_image,
|
||||
float *out_image,
|
||||
int w,
|
||||
int h,
|
||||
int stride,
|
||||
int shift_stride,
|
||||
int r,
|
||||
int f)
|
||||
{
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
|
||||
int4 co, rect;
|
||||
int ofs;
|
||||
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
|
||||
kernel_filter_nlm_calc_weight(co.x, co.y,
|
||||
difference_image + ofs,
|
||||
out_image + ofs,
|
||||
rect, stride, f);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_update_output(int dx, int dy,
|
||||
const float *ccl_restrict difference_image,
|
||||
kernel_cuda_filter_nlm_update_output(const float *ccl_restrict difference_image,
|
||||
const float *ccl_restrict image,
|
||||
float *out_image, float *accum_image,
|
||||
int4 rect, int w,
|
||||
float *out_image,
|
||||
float *accum_image,
|
||||
int w,
|
||||
int h,
|
||||
int stride,
|
||||
int shift_stride,
|
||||
int r,
|
||||
int f)
|
||||
{
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
|
||||
int4 co, rect;
|
||||
int ofs;
|
||||
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
|
||||
kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
|
||||
difference_image + ofs,
|
||||
image,
|
||||
out_image,
|
||||
accum_image,
|
||||
rect, stride, f);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_normalize(float *out_image, const float *ccl_restrict accum_image, int4 rect, int w)
|
||||
kernel_cuda_filter_nlm_normalize(float *out_image,
|
||||
const float *ccl_restrict accum_image,
|
||||
int w,
|
||||
int h,
|
||||
int stride)
|
||||
{
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + rect.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
if(x < w && y < h) {
|
||||
kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
|
||||
}
|
||||
}
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
|
||||
const float *ccl_restrict difference_image,
|
||||
kernel_cuda_filter_nlm_construct_gramian(const float *ccl_restrict difference_image,
|
||||
const float *ccl_restrict buffer,
|
||||
float const* __restrict__ transform,
|
||||
int *rank,
|
||||
float *XtWX,
|
||||
float3 *XtWY,
|
||||
int4 rect,
|
||||
int4 filter_rect,
|
||||
int w, int h, int f,
|
||||
int4 filter_window,
|
||||
int w,
|
||||
int h,
|
||||
int stride,
|
||||
int shift_stride,
|
||||
int r,
|
||||
int f,
|
||||
int pass_stride)
|
||||
{
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x + max(0, rect.x-filter_rect.x);
|
||||
int y = blockDim.y*blockIdx.y + threadIdx.y + max(0, rect.y-filter_rect.y);
|
||||
if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
|
||||
kernel_filter_nlm_construct_gramian(x, y,
|
||||
dx, dy,
|
||||
difference_image,
|
||||
int4 co, rect;
|
||||
int ofs;
|
||||
if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
|
||||
kernel_filter_nlm_construct_gramian(co.x, co.y,
|
||||
co.z, co.w,
|
||||
difference_image + ofs,
|
||||
buffer,
|
||||
transform, rank,
|
||||
XtWX, XtWY,
|
||||
rect, filter_rect,
|
||||
w, h, f,
|
||||
rect, filter_window,
|
||||
stride, f,
|
||||
pass_stride,
|
||||
threadIdx.y*blockDim.x + threadIdx.x);
|
||||
}
|
||||
@@ -230,10 +275,12 @@ kernel_cuda_filter_nlm_construct_gramian(int dx, int dy,
|
||||
|
||||
extern "C" __global__ void
|
||||
CUDA_LAUNCH_BOUNDS(CUDA_THREADS_BLOCK_WIDTH, CUDA_KERNEL_MAX_REGISTERS)
|
||||
kernel_cuda_filter_finalize(int w, int h,
|
||||
float *buffer, int *rank,
|
||||
float *XtWX, float3 *XtWY,
|
||||
int4 filter_area, int4 buffer_params,
|
||||
kernel_cuda_filter_finalize(float *buffer,
|
||||
int *rank,
|
||||
float *XtWX,
|
||||
float3 *XtWY,
|
||||
int4 filter_area,
|
||||
int4 buffer_params,
|
||||
int sample)
|
||||
{
|
||||
int x = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
@@ -243,7 +290,10 @@ kernel_cuda_filter_finalize(int w, int h,
|
||||
rank += storage_ofs;
|
||||
XtWX += storage_ofs;
|
||||
XtWY += storage_ofs;
|
||||
kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
|
||||
kernel_filter_finalize(x, y, buffer, rank,
|
||||
filter_area.z*filter_area.w,
|
||||
XtWX, XtWY,
|
||||
buffer_params, sample);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -126,113 +126,136 @@ __kernel void kernel_ocl_filter_construct_transform(const ccl_global float *ccl_
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_calc_difference(int dx,
|
||||
int dy,
|
||||
const ccl_global float *ccl_restrict weight_image,
|
||||
__kernel void kernel_ocl_filter_nlm_calc_difference(const ccl_global float *ccl_restrict weight_image,
|
||||
const ccl_global float *ccl_restrict variance_image,
|
||||
ccl_global float *difference_image,
|
||||
int4 rect,
|
||||
int w,
|
||||
int h,
|
||||
int stride,
|
||||
int shift_stride,
|
||||
int r,
|
||||
int channel_offset,
|
||||
float a,
|
||||
float k_2)
|
||||
{
|
||||
int x = get_global_id(0) + rect.x;
|
||||
int y = get_global_id(1) + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_calc_difference(x, y, dx, dy, weight_image, variance_image, difference_image, rect, w, channel_offset, a, k_2);
|
||||
int4 co, rect;
|
||||
int ofs;
|
||||
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
|
||||
kernel_filter_nlm_calc_difference(co.x, co.y, co.z, co.w,
|
||||
weight_image,
|
||||
variance_image,
|
||||
difference_image + ofs,
|
||||
rect, stride,
|
||||
channel_offset, a, k_2);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_blur(const ccl_global float *ccl_restrict difference_image,
|
||||
ccl_global float *out_image,
|
||||
int4 rect,
|
||||
int w,
|
||||
int h,
|
||||
int stride,
|
||||
int shift_stride,
|
||||
int r,
|
||||
int f)
|
||||
{
|
||||
int x = get_global_id(0) + rect.x;
|
||||
int y = get_global_id(1) + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_blur(x, y, difference_image, out_image, rect, w, f);
|
||||
int4 co, rect;
|
||||
int ofs;
|
||||
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
|
||||
kernel_filter_nlm_blur(co.x, co.y,
|
||||
difference_image + ofs,
|
||||
out_image + ofs,
|
||||
rect, stride, f);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_calc_weight(const ccl_global float *ccl_restrict difference_image,
|
||||
ccl_global float *out_image,
|
||||
int4 rect,
|
||||
int w,
|
||||
int h,
|
||||
int stride,
|
||||
int shift_stride,
|
||||
int r,
|
||||
int f)
|
||||
{
|
||||
int x = get_global_id(0) + rect.x;
|
||||
int y = get_global_id(1) + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_calc_weight(x, y, difference_image, out_image, rect, w, f);
|
||||
int4 co, rect;
|
||||
int ofs;
|
||||
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
|
||||
kernel_filter_nlm_calc_weight(co.x, co.y,
|
||||
difference_image + ofs,
|
||||
out_image + ofs,
|
||||
rect, stride, f);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_update_output(int dx,
|
||||
int dy,
|
||||
const ccl_global float *ccl_restrict difference_image,
|
||||
__kernel void kernel_ocl_filter_nlm_update_output(const ccl_global float *ccl_restrict difference_image,
|
||||
const ccl_global float *ccl_restrict image,
|
||||
ccl_global float *out_image,
|
||||
ccl_global float *accum_image,
|
||||
int4 rect,
|
||||
int w,
|
||||
int h,
|
||||
int stride,
|
||||
int shift_stride,
|
||||
int r,
|
||||
int f)
|
||||
{
|
||||
int x = get_global_id(0) + rect.x;
|
||||
int y = get_global_id(1) + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_update_output(x, y, dx, dy, difference_image, image, out_image, accum_image, rect, w, f);
|
||||
int4 co, rect;
|
||||
int ofs;
|
||||
if(get_nlm_coords(w, h, r, shift_stride, &rect, &co, &ofs)) {
|
||||
kernel_filter_nlm_update_output(co.x, co.y, co.z, co.w,
|
||||
difference_image + ofs,
|
||||
image,
|
||||
out_image,
|
||||
accum_image,
|
||||
rect, stride, f);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_normalize(ccl_global float *out_image,
|
||||
const ccl_global float *ccl_restrict accum_image,
|
||||
int4 rect,
|
||||
int w)
|
||||
int w,
|
||||
int h,
|
||||
int stride)
|
||||
{
|
||||
int x = get_global_id(0) + rect.x;
|
||||
int y = get_global_id(1) + rect.y;
|
||||
if(x < rect.z && y < rect.w) {
|
||||
kernel_filter_nlm_normalize(x, y, out_image, accum_image, rect, w);
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
if(x < w && y < h) {
|
||||
kernel_filter_nlm_normalize(x, y, out_image, accum_image, stride);
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_nlm_construct_gramian(int dx,
|
||||
int dy,
|
||||
const ccl_global float *ccl_restrict difference_image,
|
||||
__kernel void kernel_ocl_filter_nlm_construct_gramian(const ccl_global float *ccl_restrict difference_image,
|
||||
const ccl_global float *ccl_restrict buffer,
|
||||
const ccl_global float *ccl_restrict transform,
|
||||
ccl_global int *rank,
|
||||
ccl_global float *XtWX,
|
||||
ccl_global float3 *XtWY,
|
||||
int4 rect,
|
||||
int4 filter_rect,
|
||||
int4 filter_window,
|
||||
int w,
|
||||
int h,
|
||||
int stride,
|
||||
int shift_stride,
|
||||
int r,
|
||||
int f,
|
||||
int pass_stride)
|
||||
{
|
||||
int x = get_global_id(0) + max(0, rect.x-filter_rect.x);
|
||||
int y = get_global_id(1) + max(0, rect.y-filter_rect.y);
|
||||
if(x < min(filter_rect.z, rect.z-filter_rect.x) && y < min(filter_rect.w, rect.w-filter_rect.y)) {
|
||||
kernel_filter_nlm_construct_gramian(x, y,
|
||||
dx, dy,
|
||||
difference_image,
|
||||
int4 co, rect;
|
||||
int ofs;
|
||||
if(get_nlm_coords_window(w, h, r, shift_stride, &rect, &co, &ofs, filter_window)) {
|
||||
kernel_filter_nlm_construct_gramian(co.x, co.y,
|
||||
co.z, co.w,
|
||||
difference_image + ofs,
|
||||
buffer,
|
||||
transform, rank,
|
||||
XtWX, XtWY,
|
||||
rect, filter_rect,
|
||||
w, h, f,
|
||||
rect, filter_window,
|
||||
stride, f,
|
||||
pass_stride,
|
||||
get_local_id(1)*get_local_size(0) + get_local_id(0));
|
||||
}
|
||||
}
|
||||
|
||||
__kernel void kernel_ocl_filter_finalize(int w,
|
||||
int h,
|
||||
ccl_global float *buffer,
|
||||
__kernel void kernel_ocl_filter_finalize(ccl_global float *buffer,
|
||||
ccl_global int *rank,
|
||||
ccl_global float *XtWX,
|
||||
ccl_global float3 *XtWY,
|
||||
@@ -247,7 +270,10 @@ __kernel void kernel_ocl_filter_finalize(int w,
|
||||
rank += storage_ofs;
|
||||
XtWX += storage_ofs;
|
||||
XtWY += storage_ofs;
|
||||
kernel_filter_finalize(x, y, w, h, buffer, rank, filter_area.z*filter_area.w, XtWX, XtWY, buffer_params, sample);
|
||||
kernel_filter_finalize(x, y, buffer, rank,
|
||||
filter_area.z*filter_area.w,
|
||||
XtWX, XtWY,
|
||||
buffer_params, sample);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -68,6 +68,7 @@ set(SRC_HEADERS
|
||||
util_path.h
|
||||
util_progress.h
|
||||
util_queue.h
|
||||
util_rect.h
|
||||
util_set.h
|
||||
util_simd.h
|
||||
util_sky_model.cpp
|
||||
|
||||
@@ -320,6 +320,8 @@ CCL_NAMESPACE_END
|
||||
#include "util/util_math_float3.h"
|
||||
#include "util/util_math_float4.h"
|
||||
|
||||
#include "util/util_rect.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
#ifndef __KERNEL_OPENCL__
|
||||
|
||||
@@ -98,7 +98,10 @@ ccl_device_inline void math_vec3_add(float3 *v, int n, float *x, float3 w)
|
||||
ccl_device_inline void math_vec3_add_strided(ccl_global float3 *v, int n, float *x, float3 w, int stride)
|
||||
{
|
||||
for(int i = 0; i < n; i++) {
|
||||
v[i*stride] += w*x[i];
|
||||
ccl_global float *elem = (ccl_global float*) (v + i*stride);
|
||||
atomic_add_and_fetch_float(elem+0, w.x*x[i]);
|
||||
atomic_add_and_fetch_float(elem+1, w.y*x[i]);
|
||||
atomic_add_and_fetch_float(elem+2, w.z*x[i]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -136,7 +139,7 @@ ccl_device_inline void math_trimatrix_add_gramian_strided(ccl_global float *A,
|
||||
{
|
||||
for(int row = 0; row < n; row++) {
|
||||
for(int col = 0; col <= row; col++) {
|
||||
MATHS(A, row, col, stride) += v[row]*v[col]*weight;
|
||||
atomic_add_and_fetch_float(&MATHS(A, row, col, stride), v[row]*v[col]*weight);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
73
intern/cycles/util/util_rect.h
Normal file
73
intern/cycles/util/util_rect.h
Normal file
@@ -0,0 +1,73 @@
|
||||
/*
|
||||
* Copyright 2017 Blender Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef __UTIL_RECT_H__
|
||||
#define __UTIL_RECT_H__
|
||||
|
||||
#include "util/util_types.h"
|
||||
|
||||
CCL_NAMESPACE_BEGIN
|
||||
|
||||
/* Rectangles are represented as a int4 containing the coordinates of the lower-left and
|
||||
* upper-right corners in the order (x0, y0, x1, y1). */
|
||||
|
||||
ccl_device_inline int4 rect_from_shape(int x0, int y0, int w, int h)
|
||||
{
|
||||
return make_int4(x0, y0, x0 + w, y0 + h);
|
||||
}
|
||||
|
||||
ccl_device_inline int4 rect_expand(int4 rect, int d)
|
||||
{
|
||||
return make_int4(rect.x - d, rect.y - d, rect.z + d, rect.w + d);
|
||||
}
|
||||
|
||||
/* Returns the intersection of two rects. */
|
||||
ccl_device_inline int4 rect_clip(int4 a, int4 b)
|
||||
{
|
||||
return make_int4(max(a.x, b.x), max(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
|
||||
}
|
||||
|
||||
ccl_device_inline bool rect_is_valid(int4 rect)
|
||||
{
|
||||
return (rect.z > rect.x) && (rect.w > rect.y);
|
||||
}
|
||||
|
||||
/* Returns the local row-major index of the pixel inside the rect. */
|
||||
ccl_device_inline int coord_to_local_index(int4 rect, int x, int y)
|
||||
{
|
||||
int w = rect.z - rect.x;
|
||||
return (y - rect.y) * w + (x - rect.x);
|
||||
}
|
||||
|
||||
/* Finds the coordinates of a pixel given by its row-major index in the rect,
|
||||
* and returns whether the pixel is inside it. */
|
||||
ccl_device_inline bool local_index_to_coord(int4 rect, int idx, int *x, int *y)
|
||||
{
|
||||
int w = rect.z - rect.x;
|
||||
*x = (idx % w) + rect.x;
|
||||
*y = (idx / w) + rect.y;
|
||||
return (*y < rect.w);
|
||||
}
|
||||
|
||||
ccl_device_inline int rect_size(int4 rect)
|
||||
{
|
||||
return (rect.z - rect.x) * (rect.w - rect.y);
|
||||
}
|
||||
|
||||
CCL_NAMESPACE_END
|
||||
|
||||
#endif /* __UTIL_RECT_H__ */
|
||||
|
||||
@@ -63,6 +63,9 @@
|
||||
#elif defined(WIN32)
|
||||
# include <malloc.h>
|
||||
# define malloc_usable_size _msize
|
||||
#elif defined(__HAIKU__)
|
||||
# include <malloc.h>
|
||||
size_t malloc_usable_size(void *ptr);
|
||||
#else
|
||||
# pragma message "We don't know how to use malloc_usable_size on your platform"
|
||||
# undef USE_MALLOC_USABLE_SIZE
|
||||
|
||||
@@ -35,7 +35,8 @@
|
||||
|
||||
#if !defined(__MINGW64__)
|
||||
# if defined(_WIN32) || defined(__APPLE__) || \
|
||||
defined(__FreeBSD__) || defined(__NetBSD__)
|
||||
defined(__FreeBSD__) || defined(__NetBSD__) || \
|
||||
defined(__HAIKU__)
|
||||
inline void sincos(double x, double *sinx, double *cosx) {
|
||||
*sinx = sin(x);
|
||||
*cosx = cos(x);
|
||||
|
||||
@@ -217,7 +217,8 @@ class DATA_PT_vertex_groups(MeshButtonsPanel, Panel):
|
||||
|
||||
col = row.column(align=True)
|
||||
col.operator("object.vertex_group_add", icon='ZOOMIN', text="")
|
||||
col.operator("object.vertex_group_remove", icon='ZOOMOUT', text="").all = False
|
||||
props = col.operator("object.vertex_group_remove", icon='ZOOMOUT', text="")
|
||||
props.all_unlocked = props.all = False
|
||||
col.menu("MESH_MT_vertex_group_specials", icon='DOWNARROW_HLT', text="")
|
||||
if group:
|
||||
col.separator()
|
||||
|
||||
@@ -1165,6 +1165,10 @@ void set_current_material_texture(Material *ma, Tex *newtex)
|
||||
ma->mtex[act] = BKE_texture_mtex_add();
|
||||
/* Reset this slot's ON/OFF toggle, for materials, when slot was empty. */
|
||||
ma->septex &= ~(1 << act);
|
||||
/* For volumes the default UV texture coordinates are not available. */
|
||||
if (ma->material_type == MA_TYPE_VOLUME) {
|
||||
ma->mtex[act]->texco = TEXCO_ORCO;
|
||||
}
|
||||
}
|
||||
|
||||
ma->mtex[act]->tex = newtex;
|
||||
|
||||
@@ -47,7 +47,9 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined(__linux__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__FreeBSD_kernel__) || defined(__GNU__)
|
||||
#if defined(__linux__) || defined(__GNU__) || \
|
||||
defined(__NetBSD__) || defined(__OpenBSD__) || defined(__FreeBSD_kernel__) || \
|
||||
defined(__HAIKU__)
|
||||
|
||||
/* Linux-i386, Linux-Alpha, Linux-ppc */
|
||||
#include <stdint.h>
|
||||
|
||||
@@ -621,7 +621,21 @@ static int recursive_operation(const char *startfrom, const char *startto,
|
||||
if (to)
|
||||
join_dirfile_alloc(&to_path, &to_alloc_len, to, dirent->d_name);
|
||||
|
||||
if (dirent->d_type == DT_DIR) {
|
||||
bool is_dir;
|
||||
|
||||
#ifdef __HAIKU__
|
||||
{
|
||||
struct stat st_dir;
|
||||
char filename[FILE_MAX];
|
||||
BLI_path_join(filename, sizeof(filename), startfrom, dirent->d_name, NULL);
|
||||
lstat(filename, &st_dir);
|
||||
is_dir = S_ISDIR(st_dir.st_mode);
|
||||
}
|
||||
#else
|
||||
is_dir = (dirent->d_type == DT_DIR);
|
||||
#endif
|
||||
|
||||
if (is_dir) {
|
||||
/* recursively dig into a subfolder */
|
||||
ret = recursive_operation(from_path, to_path, callback_dir_pre, callback_file, callback_dir_post);
|
||||
}
|
||||
|
||||
@@ -37,7 +37,7 @@
|
||||
|
||||
#include <sys/stat.h>
|
||||
|
||||
#if defined(__NetBSD__) || defined(__DragonFly__)
|
||||
#if defined(__NetBSD__) || defined(__DragonFly__) || defined(__HAIKU__)
|
||||
/* Other modern unix os's should probably use this also */
|
||||
# include <sys/statvfs.h>
|
||||
# define USE_STATFS_STATVFS
|
||||
|
||||
@@ -807,6 +807,7 @@ enum {
|
||||
SIMEDBONE_PREFIX,
|
||||
SIMEDBONE_SUFFIX,
|
||||
SIMEDBONE_LAYER,
|
||||
SIMEDBONE_GROUP,
|
||||
SIMEDBONE_SHAPE,
|
||||
};
|
||||
|
||||
@@ -819,6 +820,7 @@ static const EnumPropertyItem prop_similar_types[] = {
|
||||
{SIMEDBONE_PREFIX, "PREFIX", 0, "Prefix", ""},
|
||||
{SIMEDBONE_SUFFIX, "SUFFIX", 0, "Suffix", ""},
|
||||
{SIMEDBONE_LAYER, "LAYER", 0, "Layer", ""},
|
||||
{SIMEDBONE_GROUP, "GROUP", 0, "Group", ""},
|
||||
{SIMEDBONE_SHAPE, "SHAPE", 0, "Shape", ""},
|
||||
{0, NULL, 0, NULL, NULL}
|
||||
};
|
||||
@@ -1009,6 +1011,9 @@ static int armature_select_similar_exec(bContext *C, wmOperator *op)
|
||||
return OPERATOR_CANCELLED;
|
||||
}
|
||||
|
||||
#define STRUCT_SIZE_AND_OFFSET(_struct, _member) \
|
||||
sizeof(((_struct *)NULL)->_member), offsetof(_struct, _member)
|
||||
|
||||
switch (type) {
|
||||
case SIMEDBONE_CHILDREN:
|
||||
select_similar_children(arm, ebone_act);
|
||||
@@ -1034,13 +1039,20 @@ static int armature_select_similar_exec(bContext *C, wmOperator *op)
|
||||
case SIMEDBONE_LAYER:
|
||||
select_similar_layer(arm, ebone_act);
|
||||
break;
|
||||
case SIMEDBONE_GROUP:
|
||||
select_similar_data_pchan(
|
||||
arm, obedit, ebone_act,
|
||||
STRUCT_SIZE_AND_OFFSET(bPoseChannel, agrp_index));
|
||||
break;
|
||||
case SIMEDBONE_SHAPE:
|
||||
select_similar_data_pchan(
|
||||
arm, obedit, ebone_act,
|
||||
sizeof(void *), offsetof(bPoseChannel, custom));
|
||||
STRUCT_SIZE_AND_OFFSET(bPoseChannel, custom));
|
||||
break;
|
||||
}
|
||||
|
||||
#undef STRUCT_SIZE_AND_OFFSET
|
||||
|
||||
WM_event_add_notifier(C, NC_OBJECT | ND_BONE_SELECT, obedit);
|
||||
|
||||
return OPERATOR_FINISHED;
|
||||
|
||||
Reference in New Issue
Block a user