/* SPDX-FileCopyrightText: 2023 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
#pragma once
/** \file
* \ingroup bke
*
* Common field utilities and field definitions for geometry components.
*/
#include "BKE_anonymous_attribute_id.hh"
#include "BKE_geometry_set.hh"
#include "FN_field.hh"
struct Mesh;
struct PointCloud;
namespace blender::bke {
class CurvesGeometry;
class GeometryFieldInput;
namespace greasepencil {
class Drawing;
}
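/**
 * Field context for evaluating fields on the data of a single mesh, at a specific attribute
 * domain (points, edges, faces or face corners).
 */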
class MeshFieldContext : public fn::FieldContext {
private:
const Mesh &mesh_;
AttrDomain domain_;
public:
MeshFieldContext(const Mesh &mesh, AttrDomain domain);
const Mesh &mesh() const
{
return mesh_;
}
AttrDomain domain() const
{
return domain_;
}
};
class CurvesFieldContext : public fn::FieldContext {
private:
const CurvesGeometry &curves_;
AttrDomain domain_;
const Curves *curves_id_ = nullptr;
public:
CurvesFieldContext(const CurvesGeometry &curves, AttrDomain domain);
CurvesFieldContext(const Curves &curves_id, AttrDomain domain);
const CurvesGeometry &curves() const
{
return curves_;
}
const Curves *curves_id() const
{
return curves_id_;
}
AttrDomain domain() const
{
return domain_;
}
};
class PointCloudFieldContext : public fn::FieldContext {
private:
const PointCloud &pointcloud_;
public:
PointCloudFieldContext(const PointCloud &pointcloud) : pointcloud_(pointcloud) {}
const PointCloud &pointcloud() const
{
return pointcloud_;
}
};
class GreasePencilFieldContext : public fn::FieldContext {
private:
const GreasePencil &grease_pencil_;
public:
GreasePencilFieldContext(const GreasePencil &grease_pencil) : grease_pencil_(grease_pencil) {}
const GreasePencil &grease_pencil() const
{
return grease_pencil_;
}
};
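/**
 * Field context for evaluating fields on the points or curves of a single grease pencil layer.
 * Overrides #get_varray_for_input so that generic #GeometryFieldInput sub-classes can be
 * evaluated on the layer's drawing as well.
 */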
class GreasePencilLayerFieldContext : public fn::FieldContext {
private:
const GreasePencil &grease_pencil_;
AttrDomain domain_;
int layer_index_;
public:
GreasePencilLayerFieldContext(const GreasePencil &grease_pencil,
AttrDomain domain,
int layer_index)
: grease_pencil_(grease_pencil), domain_(domain), layer_index_(layer_index)
{
}
const GreasePencil &grease_pencil() const
{
return grease_pencil_;
}
AttrDomain domain() const
{
return domain_;
}
int layer_index() const
{
return layer_index_;
}
GVArray get_varray_for_input(const fn::FieldInput &field_input,
const IndexMask &mask,
ResourceScope &scope) const override;
};
class InstancesFieldContext : public fn::FieldContext {
private:
const Instances &instances_;
public:
InstancesFieldContext(const Instances &instances) : instances_(instances) {}
const Instances &instances() const
{
return instances_;
}
};
/**
* A field context that can represent meshes, curves, point clouds, instances or grease pencil
* layers, used for field inputs that can work for multiple geometry types.
*/
class GeometryFieldContext : public fn::FieldContext {
private:
/**
* Store the geometry as a void pointer instead of a #GeometryComponent to allow referencing data
* that doesn't correspond directly to a geometry component type, in this case #CurvesGeometry
* instead of #Curves.
*/
const void *geometry_;
const GeometryComponent::Type type_;
AttrDomain domain_;
const Curves *curves_id_ = nullptr;
/**
* Only used when the type is grease pencil and the domain is either points or curves
* (not layers).
*/
int grease_pencil_layer_index_;
friend GeometryFieldInput;
public:
GeometryFieldContext(const GeometryFieldContext &other, AttrDomain domain);
GeometryFieldContext(const GeometryComponent &component, AttrDomain domain);
GeometryFieldContext(const void *geometry,
GeometryComponent::Type type,
AttrDomain domain,
int grease_pencil_layer_index);
GeometryFieldContext(const Mesh &mesh, AttrDomain domain);
GeometryFieldContext(const CurvesGeometry &curves, AttrDomain domain);
GeometryFieldContext(const Curves &curves_id, AttrDomain domain);
GeometryFieldContext(const GreasePencil &grease_pencil);
GeometryFieldContext(const GreasePencil &grease_pencil, AttrDomain domain, int layer_index);
GeometryFieldContext(const PointCloud &points);
GeometryFieldContext(const Instances &instances);
const void *geometry() const
{
return geometry_;
}
GeometryComponent::Type type() const
{
return type_;
}
AttrDomain domain() const
{
return domain_;
}
int grease_pencil_layer_index() const
{
BLI_assert(this->type_ == GeometryComponent::Type::GreasePencil);
BLI_assert(ELEM(this->domain_, AttrDomain::Layer, AttrDomain::Curve, AttrDomain::Point));
return grease_pencil_layer_index_;
}
std::optional<AttributeAccessor> attributes() const;
const Mesh *mesh() const;
const CurvesGeometry *curves() const;
const PointCloud *pointcloud() const;
const GreasePencil *grease_pencil() const;
const greasepencil::Drawing *grease_pencil_layer_drawing() const;
const Instances *instances() const;
const CurvesGeometry *curves_or_strokes() const;
const Curves *curves_id() const;
};
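/**
 * Minimal usage sketch (assuming the common evaluation utilities from FN_field.hh; the mesh,
 * field and variable names are illustrative only):
 *
 *   const GeometryFieldContext context(mesh, AttrDomain::Point);
 *   fn::FieldEvaluator evaluator(context, mesh.verts_num);
 *   evaluator.add(some_float_field);
 *   evaluator.evaluate();
 *   const VArray<float> values = evaluator.get_evaluated<float>(0);
 */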
class GeometryFieldInput : public fn::FieldInput {
public:
using fn::FieldInput::FieldInput;
GVArray get_varray_for_context(const fn::FieldContext &context,
const IndexMask &mask,
ResourceScope &scope) const override;
virtual GVArray get_varray_for_context(const GeometryFieldContext &context,
const IndexMask &mask) const = 0;
virtual std::optional<AttrDomain> preferred_domain(const GeometryComponent &component) const;
};
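/**
 * Hypothetical sub-class sketch (not an input defined in this file): only the
 * #GeometryFieldContext overload has to be implemented, the base class takes care of
 * unwrapping the generic #fn::FieldContext.
 *
 *   class ConstantIntFieldInput final : public GeometryFieldInput {
 *    public:
 *     ConstantIntFieldInput() : GeometryFieldInput(CPPType::get<int>(), "Constant Int") {}
 *     GVArray get_varray_for_context(const GeometryFieldContext &context,
 *                                    const IndexMask &mask) const override
 *     {
 *       return VArray<int>::from_single(42, mask.min_array_size());
 *     }
 *   };
 */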
class MeshFieldInput : public fn::FieldInput {
public:
using fn::FieldInput::FieldInput;
GVArray get_varray_for_context(const fn::FieldContext &context,
const IndexMask &mask,
ResourceScope &scope) const override;
virtual GVArray get_varray_for_context(const Mesh &mesh,
AttrDomain domain,
const IndexMask &mask) const = 0;
virtual std::optional<AttrDomain> preferred_domain(const Mesh &mesh) const;
};
class CurvesFieldInput : public fn::FieldInput {
public:
using fn::FieldInput::FieldInput;
GVArray get_varray_for_context(const fn::FieldContext &context,
const IndexMask &mask,
ResourceScope &scope) const override;
virtual GVArray get_varray_for_context(const CurvesGeometry &curves,
AttrDomain domain,
const IndexMask &mask) const = 0;
virtual std::optional<AttrDomain> preferred_domain(const CurvesGeometry &curves) const;
};
class PointCloudFieldInput : public fn::FieldInput {
public:
using fn::FieldInput::FieldInput;
GVArray get_varray_for_context(const fn::FieldContext &context,
const IndexMask &mask,
ResourceScope &scope) const override;
virtual GVArray get_varray_for_context(const PointCloud &pointcloud,
const IndexMask &mask) const = 0;
};
class InstancesFieldInput : public fn::FieldInput {
public:
using fn::FieldInput::FieldInput;
GVArray get_varray_for_context(const fn::FieldContext &context,
const IndexMask &mask,
ResourceScope &scope) const override;
virtual GVArray get_varray_for_context(const Instances &instances,
const IndexMask &mask) const = 0;
};
class AttributeFieldInput : public GeometryFieldInput {
private:
std::string name_;
std::optional<std::string> socket_inspection_name_;
public:
AttributeFieldInput(std::string name,
const CPPType &type,
std::optional<std::string> socket_inspection_name = std::nullopt)
: GeometryFieldInput(type, name),
name_(std::move(name)),
socket_inspection_name_(std::move(socket_inspection_name))
{
category_ = attribute_name_is_anonymous(name_) ? Category::AnonymousAttribute :
Category::NamedAttribute;
}
static fn::GField from(std::string name,
const CPPType &type,
std::optional<std::string> socket_inspection_name = std::nullopt)
{
auto field_input = std::make_shared<AttributeFieldInput>(
std::move(name), type, std::move(socket_inspection_name));
return fn::GField(field_input);
}
template<typename T>
static fn::Field<T> from(std::string name,
std::optional<std::string> socket_inspection_name = std::nullopt)
{
return fn::Field<T>(
from(std::move(name), CPPType::get<T>(), std::move(socket_inspection_name)));
}
StringRefNull attribute_name() const
{
return name_;
}
GVArray get_varray_for_context(const GeometryFieldContext &context,
const IndexMask &mask) const override;
std::string socket_inspection_name() const override;
uint64_t hash() const override;
bool is_equal_to(const fn::FieldNode &other) const override;
std::optional<AttrDomain> preferred_domain(const GeometryComponent &component) const override;
};
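/**
 * Example using the factory methods above (the attribute name is illustrative only):
 *
 *   const fn::Field<float3> position = AttributeFieldInput::from<float3>("position");
 */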
class AttributeExistsFieldInput final : public bke::GeometryFieldInput {
private:
std::string name_;
public:
AttributeExistsFieldInput(std::string name, const CPPType &type)
: GeometryFieldInput(type, name), name_(std::move(name))
{
category_ = Category::Generated;
}
static fn::Field<bool> from(std::string name)
{
const CPPType &type = CPPType::get<bool>();
auto field_input = std::make_shared<AttributeExistsFieldInput>(std::move(name), type);
return fn::Field<bool>(field_input);
}
GVArray get_varray_for_context(const bke::GeometryFieldContext &context,
const IndexMask &mask) const final;
};
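/**
 * Example using the factory method above (the attribute name is illustrative only):
 *
 *   const fn::Field<bool> exists = AttributeExistsFieldInput::from("my_attribute");
 */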
class NamedLayerSelectionFieldInput final : public bke::GeometryFieldInput {
private:
std::string layer_name_;
public:
NamedLayerSelectionFieldInput(std::string layer_name)
: bke::GeometryFieldInput(CPPType::get<bool>(), "Named Layer node"),
layer_name_(std::move(layer_name))
{
category_ = Category::Generated;
}
GVArray get_varray_for_context(const bke::GeometryFieldContext &context,
const IndexMask &mask) const final;
uint64_t hash() const override;
bool is_equal_to(const fn::FieldNode &other) const override;
std::optional<AttrDomain> preferred_domain(const GeometryComponent &component) const override;
};
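/**
 * Field input for the built-in "id" attribute, falling back to the element index when the
 * attribute does not exist (hence #Category::Generated).
 */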
class IDAttributeFieldInput : public GeometryFieldInput {
public:
IDAttributeFieldInput() : GeometryFieldInput(CPPType::get<int>())
{
category_ = Category::Generated;
}
GVArray get_varray_for_context(const GeometryFieldContext &context,
const IndexMask &mask) const override;
std::string socket_inspection_name() const override;
uint64_t hash() const override;
bool is_equal_to(const fn::FieldNode &other) const override;
};
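/**
 * Access geometry normals as virtual arrays, mixed/interpolated to the requested domain when it
 * does not match the domain the normals are stored on.
 */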
VArray<float3> curve_normals_varray(const CurvesGeometry &curves, AttrDomain domain);
VArray<float3> mesh_normals_varray(const Mesh &mesh,
const IndexMask &mask,
AttrDomain domain,
bool no_corner_normals = false,
bool true_normals = false);
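
/**
 * A field input that provides normals for any supported geometry type, adapted to the domain
 * requested by the field context.
 *
 * \param legacy_corner_normals: Presumably restricts custom normals to the older
 *   corner-fan-space storage rather than the "free" custom normal vectors.
 * \param true_normals: Ignore custom normals and sharpness and output the geometric ("true")
 *   normals instead.
 */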
class NormalFieldInput : public GeometryFieldInput {
bool legacy_corner_normals_ = false;
Mesh: Add "free" custom normals Add a "dumb vector" storage option for custom normals, with the "custom_normal" attribute. Adjust the mesh normals caching to provide this attribute if it's available, and add a geometry node to store custom normals. ## Free Normals They're called "free" in the sense that they're just direction vectors in the object's local space, rather than the existing "smooth corner fan space" storage. They're also "free" in that they make further normals calculation very inexpensive, since we just use the custom normals instead. That's a big improvement from the existing custom normals storage, which usually significantly decreases viewport performance. For example, in a simple test file just storing the vertex normals on a UV sphere, using free normals gives 25 times better playback performance and 10% lower memory usage. Free normals are adjusted when applying a transformation to the entire mesh or when realizing instances, but in general they're not updated for vertex deformations. ## Set Mesh Normal Node The new geometry node allows storing free custom normals as well as the existing corner fan space normals. When free normals are chosen, free normals can be stored on vertices, faces, or face corners. Using the face corner domain is necessary to bake existing mixed sharp and smooth edges into the custom normal vectors. The node also has a mode for storing edge and mesh sharpness, meant as a "soft" replacement to the "Set Shade Smooth" node that's a bit more convenient. ## Normal Input Node The normal node outputs free custom normals mixed to whatever domain is requested. A "true normal" output that ignores custom normals and sharpness is added as well. Across Blender, custom normals are generally accessed via face and vertex normals, when "true normals" are not requested explicitly. In many cases that means they are mixed from the face corner domain. ## Future Work 1. There are many places where propagation of free normals could be improved. They should probably be normalized after mixing, and it may be useful to not just use 0 vectors for new elements. To keep the scope of this change smaller, that sort of thing generally isn't handled here. Searching `CD_NORMAL` gives a hint of where better propagation could be useful. 2. Free normals are displayed properly in edit mode, but the existing custom normal editing operators don't work with free normals yet. This will hopefully be fairly straightforward since custom normals are usually converted to `float3` for editing anyway. Edit mode changes aren't included here because they're unnecessary for the procedural custom normals use cases. 3. Most importers can probably switch to using free normals instead, or at least provide an option for it. That will give a significant import performance improvement, and an improvement of Blender's FPS for imported scenes too. Pull Request: https://projects.blender.org/blender/blender/pulls/132583
2025-04-04 19:16:51 +02:00
bool true_normals_ = false;
public:
Mesh: Add "free" custom normals Add a "dumb vector" storage option for custom normals, with the "custom_normal" attribute. Adjust the mesh normals caching to provide this attribute if it's available, and add a geometry node to store custom normals. ## Free Normals They're called "free" in the sense that they're just direction vectors in the object's local space, rather than the existing "smooth corner fan space" storage. They're also "free" in that they make further normals calculation very inexpensive, since we just use the custom normals instead. That's a big improvement from the existing custom normals storage, which usually significantly decreases viewport performance. For example, in a simple test file just storing the vertex normals on a UV sphere, using free normals gives 25 times better playback performance and 10% lower memory usage. Free normals are adjusted when applying a transformation to the entire mesh or when realizing instances, but in general they're not updated for vertex deformations. ## Set Mesh Normal Node The new geometry node allows storing free custom normals as well as the existing corner fan space normals. When free normals are chosen, free normals can be stored on vertices, faces, or face corners. Using the face corner domain is necessary to bake existing mixed sharp and smooth edges into the custom normal vectors. The node also has a mode for storing edge and mesh sharpness, meant as a "soft" replacement to the "Set Shade Smooth" node that's a bit more convenient. ## Normal Input Node The normal node outputs free custom normals mixed to whatever domain is requested. A "true normal" output that ignores custom normals and sharpness is added as well. Across Blender, custom normals are generally accessed via face and vertex normals, when "true normals" are not requested explicitly. In many cases that means they are mixed from the face corner domain. ## Future Work 1. There are many places where propagation of free normals could be improved. They should probably be normalized after mixing, and it may be useful to not just use 0 vectors for new elements. To keep the scope of this change smaller, that sort of thing generally isn't handled here. Searching `CD_NORMAL` gives a hint of where better propagation could be useful. 2. Free normals are displayed properly in edit mode, but the existing custom normal editing operators don't work with free normals yet. This will hopefully be fairly straightforward since custom normals are usually converted to `float3` for editing anyway. Edit mode changes aren't included here because they're unnecessary for the procedural custom normals use cases. 3. Most importers can probably switch to using free normals instead, or at least provide an option for it. That will give a significant import performance improvement, and an improvement of Blender's FPS for imported scenes too. Pull Request: https://projects.blender.org/blender/blender/pulls/132583
2025-04-04 19:16:51 +02:00
NormalFieldInput(const bool legacy_corner_normals = false, const bool true_normals = false)
: GeometryFieldInput(CPPType::get<float3>()),
legacy_corner_normals_(legacy_corner_normals),
true_normals_(true_normals)
{
category_ = Category::Generated;
}
GVArray get_varray_for_context(const GeometryFieldContext &context,
const IndexMask &mask) const override;
std::string socket_inspection_name() const override;
uint64_t hash() const override;
bool is_equal_to(const fn::FieldNode &other) const override;
};
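
/**
 * A field input providing the total length of each curve. As an assumption from the name: on
 * the curve domain this is one length per curve, and on the point domain the per-curve length
 * is presumably copied to every point of the curve.
 */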
class CurveLengthFieldInput final : public CurvesFieldInput {
public:
CurveLengthFieldInput();
GVArray get_varray_for_context(const CurvesGeometry &curves,
AttrDomain domain,
const IndexMask &mask) const final;
uint64_t hash() const override;
bool is_equal_to(const fn::FieldNode &other) const override;
std::optional<AttrDomain> preferred_domain(const bke::CurvesGeometry &curves) const final;
};
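
/**
 * Evaluates `value_field_` on `value_field_domain_` and outputs, for every element of the
 * context domain, the evaluated value at the index produced by `index_field_`.
 *
 * A sketch of building such a field; the input fields are illustrative assumptions:
 *
 *   fn::Field<int> indices = ...;
 *   fn::GField values = ...;
 *   fn::GField at_index{std::make_shared<EvaluateAtIndexInput>(
 *       std::move(indices), std::move(values), AttrDomain::Point)};
 */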
class EvaluateAtIndexInput final : public bke::GeometryFieldInput {
private:
fn::Field<int> index_field_;
fn::GField value_field_;
AttrDomain value_field_domain_;
public:
EvaluateAtIndexInput(fn::Field<int> index_field,
fn::GField value_field,
AttrDomain value_field_domain);
GVArray get_varray_for_context(const bke::GeometryFieldContext &context,
const IndexMask &mask) const final;
std::optional<AttrDomain> preferred_domain(const GeometryComponent & /*component*/) const final
{
return value_field_domain_;
}
};
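
/**
 * Copy `src[indices[i]]` to `dst[i]` for every `i` in the mask. "Checked" presumably means that
 * out-of-range indices are handled gracefully (e.g. by writing the type's default value) instead
 * of reading out of bounds.
 */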
void copy_with_checked_indices(const GVArray &src,
const VArray<int> &indices,
const IndexMask &mask,
GMutableSpan dst);
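
/**
 * Evaluates `src_field_` on `src_domain_` and interpolates the result to the domain requested
 * by the field context.
 */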
class EvaluateOnDomainInput final : public bke::GeometryFieldInput {
private:
fn::GField src_field_;
AttrDomain src_domain_;
public:
EvaluateOnDomainInput(fn::GField field, AttrDomain domain);
GVArray get_varray_for_context(const bke::GeometryFieldContext &context,
const IndexMask & /*mask*/) const final;
void for_each_field_input_recursive(FunctionRef<void(const FieldInput &)> fn) const override;
std::optional<AttrDomain> preferred_domain(
const GeometryComponent & /*component*/) const override;
};
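
/**
 * Evaluate `fields` in `field_context` on `domain`, restricted to `selection`, and store the
 * results as attributes named by `attribute_ids`. The return value presumably reports whether
 * all attributes could be created and written.
 *
 * A usage sketch for the single-field wrapper below; the mesh and value field are illustrative
 * assumptions:
 *
 *   Mesh &mesh = ...;
 *   const MeshFieldContext context(mesh, AttrDomain::Point);
 *   try_capture_field_on_geometry(mesh.attributes_for_write(),
 *                                 context,
 *                                 "my_attribute",
 *                                 AttrDomain::Point,
 *                                 fn::make_constant_field<bool>(true),
 *                                 my_value_field);
 */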
bool try_capture_fields_on_geometry(MutableAttributeAccessor attributes,
const fn::FieldContext &field_context,
Span<StringRef> attribute_ids,
AttrDomain domain,
const fn::Field<bool> &selection,
Span<fn::GField> fields);
inline bool try_capture_field_on_geometry(MutableAttributeAccessor attributes,
const fn::FieldContext &field_context,
const StringRef attribute_id,
AttrDomain domain,
const fn::Field<bool> &selection,
const fn::GField &field)
{
return try_capture_fields_on_geometry(
attributes, field_context, {attribute_id}, domain, selection, {field});
}
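
/**
 * Overload that builds the field context from the geometry component itself; without a
 * selection parameter it presumably captures the fields on all elements of the domain.
 */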
bool try_capture_fields_on_geometry(GeometryComponent &component,
Span<StringRef> attribute_ids,
AttrDomain domain,
Span<fn::GField> fields);
inline bool try_capture_field_on_geometry(GeometryComponent &component,
const StringRef attribute_id,
AttrDomain domain,
const fn::GField &field)
{
return try_capture_fields_on_geometry(component, {attribute_id}, domain, {field});
}
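
/**
 * Overload that builds the field context from the geometry component and restricts the capture
 * to the given selection.
 */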
bool try_capture_fields_on_geometry(GeometryComponent &component,
Span<StringRef> attribute_ids,
AttrDomain domain,
const fn::Field<bool> &selection,
Span<fn::GField> fields);
inline bool try_capture_field_on_geometry(GeometryComponent &component,
const StringRef attribute_id,
AttrDomain domain,
const fn::Field<bool> &selection,
const fn::GField &field)
{
return try_capture_fields_on_geometry(component, {attribute_id}, domain, selection, {field});
}
/**
* Try to find the geometry domain that the field should be evaluated on. If it is not obvious
* which domain is correct, none is returned.
*/
std::optional<AttrDomain> try_detect_field_domain(const GeometryComponent &component,
const fn::GField &field);
} // namespace blender::bke