Files
test2/source/blender/io/csv/importer/csv_reader.cc
Jacques Lucke 55e2fd2929 Cleanup: unify naming for named constructors
Previously, we used an inconsistent naming scheme for such "named constructors".
Now it always uses `from_*`.

Pull Request: https://projects.blender.org/blender/blender/pulls/142175
2025-07-17 09:09:16 +02:00

370 lines
13 KiB
C++

/* SPDX-FileCopyrightText: 2024 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/** \file
* \ingroup csv
*/
#include <atomic>
#include <charconv>
#include <optional>
#include <variant>
#include "BLI_array_utils.hh"
#include "fast_float.h"
#include "BKE_anonymous_attribute_id.hh"
#include "BKE_attribute.hh"
#include "BKE_pointcloud.hh"
#include "BKE_report.hh"
#include "BLI_csv_parse.hh"
#include "BLI_fileops.hh"
#include "BLI_implicit_sharing.hh"
#include "BLI_vector.hh"
#include "IO_csv.hh"
namespace blender::io::csv {
/**
 * Per-column metadata shared across all parsing chunks. The atomic flags let
 * chunks that are parsed on different threads communicate the detected value
 * type of the column without locking.
 */
struct ColumnInfo {
  /* Column name taken from the CSV header row. */
  StringRef name;
  /* True if the header name can't be used as an attribute name; the column is ignored. */
  bool has_invalid_name = false;
  /* Set once any chunk found a value that is neither an int nor a float; the column is dropped. */
  std::atomic<bool> found_invalid = false;
  /* Set once at least one chunk parsed the column fully as integers. */
  std::atomic<bool> found_int = false;
  /* Set once a value with a fractional part was seen; the column becomes a float attribute. */
  std::atomic<bool> found_float = false;
};
/** Parsed values of one column within one chunk; `std::monostate` means "no data". */
using ColumnData = std::variant<std::monostate, Vector<float>, Vector<int>>;

/** Result of parsing one chunk of CSV records. */
struct ChunkResult {
  /* Number of records (rows) contained in this chunk. */
  int rows_num;
  /* One entry per column; skipped/invalid columns stay `std::monostate`. */
  Vector<ColumnData> columns;
};
/** Result of attempting to parse an entire column of one chunk as floats. */
struct ParseFloatColumnResult {
  Vector<float> data;
  /* True if some value could not be parsed as a float; `data` is then incomplete. */
  bool found_invalid = false;
};

/** Result of attempting to parse an entire column of one chunk as integers. */
struct ParseIntColumnResult {
  Vector<int> data;
  /* True if some value could not be parsed as a number at all; `data` is then incomplete. */
  bool found_invalid = false;
  /* True if a value contained a decimal point; the column must be re-parsed as floats. */
  bool found_float = false;
};
/**
 * Parse the given column of every record in `records` as floats.
 *
 * Leading spaces and a leading plus sign are tolerated, as is trailing
 * white-space. On the first value that does not parse as a float,
 * `found_invalid` is set and the (incomplete) result is returned immediately.
 */
static ParseFloatColumnResult parse_column_as_floats(const csv_parse::CsvRecords &records,
                                                     const int column_i)
{
  ParseFloatColumnResult result;
  result.data.reserve(records.size());
  for (const int row_i : records.index_range()) {
    const Span<char> field = records.record(row_i).field(column_i);
    const char *ptr = field.begin();
    const char *const end = field.end();
    /* Skip leading white-space and plus sign, which the parser itself does not accept. */
    while (ptr < end && ELEM(*ptr, ' ', '+')) {
      ptr++;
    }
    float parsed_value;
    const fast_float::from_chars_result res = fast_float::from_chars(ptr, end, parsed_value);
    if (res.ec != std::errc()) {
      result.found_invalid = true;
      return result;
    }
    /* Only trailing white-space may follow the parsed number. */
    const char *rest = res.ptr;
    while (rest < end && *rest == ' ') {
      rest++;
    }
    if (rest < end) {
      result.found_invalid = true;
      return result;
    }
    result.data.append(parsed_value);
  }
  return result;
}
/**
 * Parse the given column of every record in `records` as integers.
 *
 * Leading spaces and a leading plus sign are tolerated, as is trailing
 * white-space. A decimal point directly after the digits sets `found_float`
 * (the caller should re-parse the column as floats); any other unparsable
 * value sets `found_invalid`. Either flag causes an immediate return with an
 * incomplete result.
 */
static ParseIntColumnResult parse_column_as_ints(const csv_parse::CsvRecords &records,
                                                 const int column_i)
{
  ParseIntColumnResult result;
  result.data.reserve(records.size());
  for (const int row_i : records.index_range()) {
    const Span<char> field = records.record(row_i).field(column_i);
    const char *ptr = field.begin();
    const char *const end = field.end();
    /* Skip leading white-space and plus sign, which `std::from_chars` does not accept. */
    while (ptr < end && ELEM(*ptr, ' ', '+')) {
      ptr++;
    }
    int parsed_value;
    const std::from_chars_result res = std::from_chars(ptr, end, parsed_value);
    if (res.ec != std::errc()) {
      result.found_invalid = true;
      return result;
    }
    const char *rest = res.ptr;
    if (rest < end) {
      /* A dot right after the digits means the value is really a float and the whole
       * column has to be parsed again as floats. */
      if (*rest == '.') {
        result.found_float = true;
        return result;
      }
      /* Only trailing white-space may follow the parsed number. */
      while (rest < end && *rest == ' ') {
        rest++;
      }
      if (rest < end) {
        result.found_invalid = true;
        return result;
      }
    }
    result.data.append(parsed_value);
  }
  return result;
}
/**
 * Parse all records of one chunk, determining per column whether the data is
 * integer, float or invalid. Type decisions are shared with the other chunks
 * through the atomic flags in `columns_info`, so later chunks can skip work
 * for columns that are already known to be float or invalid.
 */
static ChunkResult parse_records_chunk(const csv_parse::CsvRecords &records,
                                       MutableSpan<ColumnInfo> columns_info)
{
  ChunkResult chunk_result;
  chunk_result.rows_num = records.size();
  chunk_result.columns.resize(columns_info.size());

  /* Attempt to parse a column as floats, storing the data on success and
   * flagging the column as invalid otherwise. */
  const auto try_parse_floats = [&](const int column_i, ColumnInfo &info) {
    ParseFloatColumnResult float_result = parse_column_as_floats(records, column_i);
    if (float_result.found_invalid) {
      info.found_invalid.store(true, std::memory_order_relaxed);
      return;
    }
    chunk_result.columns[column_i] = std::move(float_result.data);
  };

  for (const int column_i : chunk_result.columns.index_range()) {
    ColumnInfo &info = columns_info[column_i];
    /* Columns with unusable names or previously detected invalid values are skipped. */
    if (info.has_invalid_name || info.found_invalid.load(std::memory_order_relaxed)) {
      continue;
    }
    if (info.found_float.load(std::memory_order_relaxed)) {
      /* A float was found in this column already, so parse everything as floats. */
      try_parse_floats(column_i, info);
      continue;
    }
    /* No float seen in this column so far, so optimistically try integers first. */
    ParseIntColumnResult int_result = parse_column_as_ints(records, column_i);
    if (int_result.found_invalid) {
      info.found_invalid.store(true, std::memory_order_relaxed);
      continue;
    }
    if (int_result.found_float) {
      /* A fractional value showed up while parsing as integers; redo the column as floats. */
      info.found_float.store(true, std::memory_order_relaxed);
      try_parse_floats(column_i, info);
      continue;
    }
    chunk_result.columns[column_i] = std::move(int_result.data);
    info.found_int.store(true, std::memory_order_relaxed);
  }
  return chunk_result;
}
/**
 * So far, the parsed data is still split into many chunks. This function flattens the chunks into
 * continuous buffers that can be used as attributes.
 *
 * \param columns_info: Per-column type information gathered while parsing.
 * \param chunk_offsets: Destination row range of each chunk in the flattened buffers.
 * \param chunks: Parsed per-chunk data; the per-column data is freed as it is consumed.
 * \return One optional array per column; columns that were invalid or had an unusable
 * name stay `std::nullopt`.
 */
static Array<std::optional<GArray<>>> flatten_valid_attribute_chunks(
    const Span<ColumnInfo> columns_info,
    OffsetIndices<int> chunk_offsets,
    MutableSpan<ChunkResult> chunks)
{
  const int points_num = chunk_offsets.total_size();
  Array<std::optional<GArray<>>> flattened_attributes(columns_info.size());
  threading::parallel_for(columns_info.index_range(), 1, [&](const IndexRange columns_range) {
    for (const int column_i : columns_range) {
      const ColumnInfo &column_info = columns_info[column_i];
      if (column_info.has_invalid_name || column_info.found_invalid) {
        /* Column can be ignored. */
        continue;
      }
      if (column_info.found_float) {
        /* Should read column as floats. */
        GArray<> attribute(CPPType::get<float>(), points_num);
        float *attribute_buffer = static_cast<float *>(attribute.data());
        threading::parallel_for(chunks.index_range(), 1, [&](const IndexRange chunks_range) {
          for (const int chunk_i : chunks_range) {
            const IndexRange dst_range = chunk_offsets[chunk_i];
            ChunkResult &chunk = chunks[chunk_i];
            ColumnData &column_data = chunk.columns[column_i];
            if (const auto *float_vec = std::get_if<Vector<float>>(&column_data)) {
              BLI_assert(float_vec->size() == dst_range.size());
              uninitialized_copy_n(
                  float_vec->data(), dst_range.size(), attribute_buffer + dst_range.first());
            }
            else if (const auto *int_vec = std::get_if<Vector<int>>(&column_data)) {
              /* This chunk was read entirely as integers, so it still has to be converted to
               * floats. Note that the conversion must write into this chunk's destination
               * range, not the start of the buffer. */
              BLI_assert(int_vec->size() == dst_range.size());
              uninitialized_convert_n(
                  int_vec->data(), dst_range.size(), attribute_buffer + dst_range.first());
            }
            else {
              /* Expected data to be available, because the `found_invalid` flag was not
               * set. */
              BLI_assert_unreachable();
            }
            /* Free data for chunk. */
            column_data = std::monostate{};
          }
        });
        flattened_attributes[column_i] = std::move(attribute);
        continue;
      }
      if (column_info.found_int) {
        /* Should read column as ints. */
        GArray<> attribute(CPPType::get<int>(), points_num);
        int *attribute_buffer = static_cast<int *>(attribute.data());
        threading::parallel_for(chunks.index_range(), 1, [&](const IndexRange chunks_range) {
          for (const int chunk_i : chunks_range) {
            const IndexRange dst_range = chunk_offsets[chunk_i];
            ChunkResult &chunk = chunks[chunk_i];
            ColumnData &column_data = chunk.columns[column_i];
            if (const auto *int_vec = std::get_if<Vector<int>>(&column_data)) {
              BLI_assert(int_vec->size() == dst_range.size());
              uninitialized_copy_n(
                  int_vec->data(), dst_range.size(), attribute_buffer + dst_range.first());
            }
            else {
              /* Expected data to be available, because the `found_invalid` and
               * `found_float` flags were not set. */
              BLI_assert_unreachable();
            }
            /* Free data for chunk. */
            column_data = std::monostate{};
          }
        });
        flattened_attributes[column_i] = std::move(attribute);
        continue;
      }
    }
  });
  return flattened_attributes;
}
/**
 * Read the CSV file referenced by `import_params` and convert it into a point cloud.
 *
 * Each numeric column becomes a point attribute (int or float); columns with invalid
 * names or non-numeric values are skipped. All point positions are zero-initialized.
 *
 * \return The new point cloud, or null on failure (an error is reported via
 * `import_params.reports`).
 */
PointCloud *import_csv_as_pointcloud(const CSVImportParams &import_params)
{
  size_t buffer_len;
  void *buffer = BLI_file_read_text_as_mem(import_params.filepath, 0, &buffer_len);
  if (buffer == nullptr) {
    BKE_reportf(import_params.reports,
                RPT_ERROR,
                "CSV Import: Cannot open file '%s'",
                import_params.filepath);
    return nullptr;
  }
  BLI_SCOPED_DEFER([&]() { MEM_freeN(buffer); });
  if (buffer_len == 0) {
    BKE_reportf(
        import_params.reports, RPT_ERROR, "CSV Import: empty file '%s'", import_params.filepath);
    return nullptr;
  }

  /* Owns the unescaped header names that `ColumnInfo::name` references. */
  LinearAllocator<> allocator;
  Array<ColumnInfo> columns_info;
  csv_parse::CsvParseOptions parse_options;
  parse_options.delimiter = import_params.delimiter;

  /* Validate column names from the header row; invalid ones are only flagged so that
   * column indices keep matching the records. */
  const auto parse_header = [&](const csv_parse::CsvRecord &record) {
    columns_info.reinitialize(record.size());
    for (const int i : record.index_range()) {
      ColumnInfo &column_info = columns_info[i];
      const StringRef name = csv_parse::unescape_field(
          record.field_str(i), parse_options, allocator);
      column_info.name = name;
      if (!bke::allow_procedural_attribute_access(name) ||
          bke::attribute_name_is_anonymous(name) || name.is_empty())
      {
        column_info.has_invalid_name = true;
        continue;
      }
    }
  };
  const auto parse_data_chunk = [&](const csv_parse::CsvRecords &records) {
    return parse_records_chunk(records, columns_info);
  };

  const Span<char> buffer_span{static_cast<char *>(buffer), int64_t(buffer_len)};
  std::optional<Vector<ChunkResult>> parsed_chunks = csv_parse::parse_csv_in_chunks<ChunkResult>(
      buffer_span, parse_options, parse_header, parse_data_chunk);
  if (!parsed_chunks.has_value()) {
    BKE_reportf(import_params.reports,
                RPT_ERROR,
                "CSV import: failed to parse file '%s'",
                import_params.filepath);
    return nullptr;
  }

  /* Count the total number of records and compute the offset of each chunk which is used when
   * flattening the parsed data. */
  Vector<int> chunk_offsets_vec;
  chunk_offsets_vec.append(0);
  for (const ChunkResult &chunk : *parsed_chunks) {
    chunk_offsets_vec.append(chunk_offsets_vec.last() + chunk.rows_num);
  }
  const OffsetIndices<int> chunk_offsets(chunk_offsets_vec);
  const int points_num = chunk_offsets_vec.last();

  PointCloud *pointcloud = BKE_pointcloud_new_nomain(points_num);
  Array<std::optional<GArray<>>> flattened_attributes;
  /* NOTE(review): the `* 16` appears to be a rough bytes-per-point estimate for the
   * bandwidth hint — confirm against `memory_bandwidth_bound_task` expectations. */
  threading::memory_bandwidth_bound_task(points_num * 16, [&]() {
    threading::parallel_invoke(
        [&]() {
          array_utils::copy(VArray<float3>::from_single(float3(0), points_num),
                            pointcloud->positions_for_write());
        },
        [&]() {
          flattened_attributes = flatten_valid_attribute_chunks(
              columns_info, chunk_offsets, *parsed_chunks);
        });
  });

  /* Add all valid attributes to the pointcloud. */
  bke::MutableAttributeAccessor attributes = pointcloud->attributes_for_write();
  for (const int column_i : columns_info.index_range()) {
    std::optional<GArray<>> &attribute = flattened_attributes[column_i];
    if (!attribute.has_value()) {
      continue;
    }
    const auto *data = new ImplicitSharedValue<GArray<>>(std::move(*attribute));
    /* Query the type from `data`: `*attribute` was just moved-from, so reading
     * `attribute->type()` here would be a use-after-move. */
    const bke::AttrType type = bke::cpp_type_to_attribute_type(data->data.type());
    const ColumnInfo &column_info = columns_info[column_i];
    attributes.add(column_info.name,
                   bke::AttrDomain::Point,
                   type,
                   bke::AttributeInitShared{data->data.data(), *data});
    /* The attribute now holds a user; drop the reference created by `new` above. */
    data->remove_user_and_delete_if_last();
  }

  /* Since all positions are set to zero, the bounding box can be updated eagerly to avoid
   * computing it later. */
  pointcloud->runtime->bounds_cache.ensure([](Bounds<float3> &r_bounds) {
    r_bounds.min = float3(0);
    r_bounds.max = float3(0);
  });
  return pointcloud;
}
} // namespace blender::io::csv