Compositor: speed up Kuwahara classic variation

Compute the luminance only once per pixel and accumulate all color channels of each region together (vectorized), instead of repeating the work for every channel.

Measured time improvements on an Intel i9 CPU with a 1920 x 3199 image:
- from 14.7s to 3.9s for full-frame compositor
- from 50.1s to 15.3s for tiled compositor

Pull Request: https://projects.blender.org/blender/blender/pulls/108859
This commit is contained in:
Habib Gahbiche
2023-06-13 22:22:01 +02:00
committed by Habib Gahbiche
parent 7d935f94f3
commit ab4286f940

View File

@@ -4,6 +4,7 @@
#include "COM_KuwaharaClassicOperation.h"
#include "BLI_math_vector_types.hh"
#include "IMB_colormanagement.h"
namespace blender::compositor {
@@ -32,9 +33,106 @@ void KuwaharaClassicOperation::execute_pixel_sampled(float output[4],
float y,
PixelSampler sampler)
{
for (int ch = 0; ch < 3; ch++) {
Vector<float3> mean(4, float3(0.0f));
float sum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
float var[4] = {0.0f, 0.0f, 0.0f, 0.0f};
int cnt[4] = {0, 0, 0, 0};
/* Split surroundings of pixel into 4 overlapping regions. */
for (int dy = -kernel_size_; dy <= kernel_size_; dy++) {
for (int dx = -kernel_size_; dx <= kernel_size_; dx++) {
int xx = x + dx;
int yy = y + dy;
if (xx >= 0 && yy >= 0 && xx < this->get_width() && yy < this->get_height()) {
float4 color;
image_reader_->read_sampled(color, xx, yy, sampler);
const float3 v = color.xyz();
const float lum = IMB_colormanagement_get_luminance(color);
if (dx <= 0 && dy <= 0) {
mean[0] += v;
sum[0] += lum;
var[0] += lum * lum;
cnt[0]++;
}
if (dx >= 0 && dy <= 0) {
mean[1] += v;
sum[1] += lum;
var[1] += lum * lum;
cnt[1]++;
}
if (dx <= 0 && dy >= 0) {
mean[2] += v;
sum[2] += lum;
var[2] += lum * lum;
cnt[2]++;
}
if (dx >= 0 && dy >= 0) {
mean[3] += v;
sum[3] += lum;
var[3] += lum * lum;
cnt[3]++;
}
}
}
}
/* Compute region variances. */
for (int i = 0; i < 4; i++) {
mean[i] = cnt[i] != 0 ? mean[i] / cnt[i] : float3{0.0f, 0.0f, 0.0f};
sum[i] = cnt[i] != 0 ? sum[i] / cnt[i] : 0.0f;
var[i] = cnt[i] != 0 ? var[i] / cnt[i] : 0.0f;
const float temp = sum[i] * sum[i];
var[i] = var[i] > temp ? sqrt(var[i] - temp) : 0.0f;
}
/* Choose the region with lowest variance. */
float min_var = FLT_MAX;
int min_index = 0;
for (int i = 0; i < 4; i++) {
if (var[i] < min_var) {
min_var = var[i];
min_index = i;
}
}
output[0] = mean[min_index].x;
output[1] = mean[min_index].y;
output[2] = mean[min_index].z;
/* No changes for alpha channel. */
float tmp[4];
image_reader_->read_sampled(tmp, x, y, sampler);
output[3] = tmp[3];
}
void KuwaharaClassicOperation::set_kernel_size(int kernel_size)
{
kernel_size_ = kernel_size;
}
int KuwaharaClassicOperation::get_kernel_size()
{
return kernel_size_;
}
void KuwaharaClassicOperation::update_memory_buffer_partial(MemoryBuffer *output,
const rcti &area,
Span<MemoryBuffer *> inputs)
{
MemoryBuffer *image = inputs[0];
for (BuffersIterator<float> it = output->iterate_with(inputs, area); !it.is_end(); ++it) {
const int x = it.x;
const int y = it.y;
Vector<float3> mean(4, float3(0.0f));
float sum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
float mean[4] = {0.0f, 0.0f, 0.0f, 0.0f};
float var[4] = {0.0f, 0.0f, 0.0f, 0.0f};
int cnt[4] = {0, 0, 0, 0};
@@ -44,10 +142,12 @@ void KuwaharaClassicOperation::execute_pixel_sampled(float output[4],
int xx = x + dx;
int yy = y + dy;
if (xx >= 0 && yy >= 0 && xx < this->get_width() && yy < this->get_height()) {
float color[4];
image_reader_->read_sampled(color, xx, yy, sampler);
const float v = color[ch];
if (xx >= 0 && yy >= 0 && xx < image->get_width() && yy < image->get_height()) {
float4 color;
image->read_elem(xx, yy, &color.x);
const float3 v = color.xyz();
const float lum = IMB_colormanagement_get_luminance(color);
if (dx <= 0 && dy <= 0) {
@@ -83,7 +183,7 @@ void KuwaharaClassicOperation::execute_pixel_sampled(float output[4],
/* Compute region variances. */
for (int i = 0; i < 4; i++) {
mean[i] = cnt[i] != 0 ? mean[i] / cnt[i] : 0.0f;
mean[i] = cnt[i] != 0 ? mean[i] / cnt[i] : float3{0.0f, 0.0f, 0.0f};
sum[i] = cnt[i] != 0 ? sum[i] / cnt[i] : 0.0f;
var[i] = cnt[i] != 0 ? var[i] / cnt[i] : 0.0f;
const float temp = sum[i] * sum[i];
@@ -99,105 +199,12 @@ void KuwaharaClassicOperation::execute_pixel_sampled(float output[4],
min_index = i;
}
}
output[ch] = mean[min_index];
}
it.out[0] = mean[min_index].x;
it.out[1] = mean[min_index].y;
it.out[2] = mean[min_index].z;
/* No changes for alpha channel. */
float tmp[4];
image_reader_->read_sampled(tmp, x, y, sampler);
output[3] = tmp[3];
}
void KuwaharaClassicOperation::set_kernel_size(int kernel_size)
{
kernel_size_ = kernel_size;
}
int KuwaharaClassicOperation::get_kernel_size()
{
return kernel_size_;
}
/**
 * Full-frame Kuwahara filter (classic variation).
 *
 * For every pixel of `area`, the square window of half-extent `kernel_size_` around it is split
 * into four overlapping quadrants. The pixel receives the mean color of the quadrant whose
 * luminance has the smallest standard deviation.
 *
 * The luminance of each neighbor is evaluated once and all three color channels are accumulated
 * in the same pass. The selected quadrant depends on luminance only, so it is the same for every
 * channel and the result equals filtering each channel separately.
 *
 * \param output: Buffer receiving the filtered RGB values; alpha is copied from the input image.
 * \param area: Region of `output` to compute.
 * \param inputs: Input buffers; `inputs[0]` is the image being filtered.
 */
void KuwaharaClassicOperation::update_memory_buffer_partial(MemoryBuffer *output,
                                                            const rcti &area,
                                                            Span<MemoryBuffer *> inputs)
{
  MemoryBuffer *image = inputs[0];
  /* The image bounds do not change while iterating, so query them once. */
  const int width = image->get_width();
  const int height = image->get_height();

  for (BuffersIterator<float> it = output->iterate_with(inputs, area); !it.is_end(); ++it) {
    const int x = it.x;
    const int y = it.y;

    /* No changes for alpha channel. */
    it.out[3] = image->get_value(x, y, 3);

    /* Per-region accumulators: RGB sum, luminance sum, squared luminance sum, sample count. */
    float mean[4][3] = {};
    float sum[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    float var[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    int cnt[4] = {0, 0, 0, 0};

    /* Split surroundings of pixel into 4 overlapping regions. */
    for (int dy = -kernel_size_; dy <= kernel_size_; dy++) {
      for (int dx = -kernel_size_; dx <= kernel_size_; dx++) {
        const int xx = x + dx;
        const int yy = y + dy;
        /* Neighbors outside of the image do not contribute. */
        if (xx < 0 || yy < 0 || xx >= width || yy >= height) {
          continue;
        }

        float color[4];
        image->read_elem(xx, yy, color);
        const float lum = IMB_colormanagement_get_luminance(color);

        /* Neighbors on the central row/column (dx == 0 or dy == 0) belong to several regions. */
        const bool in_region[4] = {
            dx <= 0 && dy <= 0,
            dx >= 0 && dy <= 0,
            dx <= 0 && dy >= 0,
            dx >= 0 && dy >= 0,
        };
        for (int i = 0; i < 4; i++) {
          if (!in_region[i]) {
            continue;
          }
          mean[i][0] += color[0];
          mean[i][1] += color[1];
          mean[i][2] += color[2];
          sum[i] += lum;
          var[i] += lum * lum;
          cnt[i]++;
        }
      }
    }

    /* Compute region variances. */
    for (int i = 0; i < 4; i++) {
      for (int ch = 0; ch < 3; ch++) {
        mean[i][ch] = cnt[i] != 0 ? mean[i][ch] / cnt[i] : 0.0f;
      }
      sum[i] = cnt[i] != 0 ? sum[i] / cnt[i] : 0.0f;
      var[i] = cnt[i] != 0 ? var[i] / cnt[i] : 0.0f;
      /* E[lum^2] - E[lum]^2, clamped so rounding errors never produce a negative radicand. */
      const float temp = sum[i] * sum[i];
      var[i] = var[i] > temp ? sqrt(var[i] - temp) : 0.0f;
    }

    /* Choose the region with lowest variance. */
    float min_var = FLT_MAX;
    int min_index = 0;
    for (int i = 0; i < 4; i++) {
      if (var[i] < min_var) {
        min_var = var[i];
        min_index = i;
      }
    }

    it.out[0] = mean[min_index][0];
    it.out[1] = mean[min_index][1];
    it.out[2] = mean[min_index][2];
  }
}