diff --git a/source/blender/blenlib/BLI_csv_parse.hh b/source/blender/blenlib/BLI_csv_parse.hh
new file mode 100644
index 00000000000..9f4138340d0
--- /dev/null
+++ b/source/blender/blenlib/BLI_csv_parse.hh
@@ -0,0 +1,231 @@
+/* SPDX-FileCopyrightText: 2025 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+#pragma once
+
+#include <optional>
+
+#include "BLI_any.hh"
+#include "BLI_function_ref.hh"
+#include "BLI_offset_indices.hh"
+#include "BLI_string_ref.hh"
+#include "BLI_vector.hh"
+
+namespace blender::csv_parse {
+
+/**
+ * Contains the fields of a single record of a .csv file. Usually that corresponds to a single
+ * line.
+ */
+class CsvRecord {
+ private:
+  Span<Span<char>> fields_;
+
+ public:
+  CsvRecord(Span<Span<char>> fields);
+
+  /** Number of fields in the record. */
+  int64_t size() const;
+  IndexRange index_range() const;
+
+  /** Get the field at the given index. Empty data is returned if the index is too large. */
+  Span<char> field(const int64_t index) const;
+  StringRef field_str(const int64_t index) const;
+};
+
+/**
+ * Contains the fields of multiple records.
+ */
+class CsvRecords {
+ private:
+  OffsetIndices<int64_t> offsets_;
+  Span<Span<char>> fields_;
+
+ public:
+  CsvRecords(OffsetIndices<int64_t> offsets, Span<Span<char>> fields);
+
+  /** Number of records (rows). */
+  int64_t size() const;
+  IndexRange index_range() const;
+
+  /** Get the record at the given index. */
+  CsvRecord record(const int64_t index) const;
+};
+
+struct CsvParseOptions {
+  /** The character that separates fields within a row. */
+  char delimiter = ',';
+  /**
+   * The character that can be used to enclose fields which contain the delimiter or span
+   * multiple lines.
+   */
+  char quote = '"';
+  /**
+   * Characters that can be used to escape the quote character. By default, "" and \" both
+   * represent an escaped quote.
+   */
+  Span<char> quote_escape_chars = Span<char>(StringRef("\"\\"));
+  /** Approximate number of bytes per chunk that the input is split into. */
+  int64_t chunk_size_bytes = 64 * 1024;
+};
+
+/**
+ * Parses a .csv file. There are two important aspects to the way this interface is designed:
+ * 1. It allows the file to be split into chunks that can be parsed in parallel.
+ * 2. Splitting the file into individual records and fields is separated from parsing the actual
+ *    content into e.g. floats. This simplifies the implementation of both parts because the
+ *    logical parsing does not have to worry about e.g. the delimiter or quote characters. It
+ *    also simplifies unit testing.
+ *
+ * \param buffer: The buffer containing the .csv file.
+ * \param options: Options that control how the file is parsed.
+ * \param process_header: A function that is called at most once, with the fields of the first
+ *   row/record.
+ * \param process_records: A function that is called potentially many times in parallel and that
+ *   processes a chunk of parsed records. Typically this function parses the raw byte fields into
+ *   e.g. ints or floats. The result of the parsing process has to be returned. Note that under
+ *   specific circumstances, this function may be called twice for the same records. That can
+ *   happen when the .csv file contains multi-line fields which were split incorrectly at first.
+ * \return A vector containing the return values of the `process_records` function in the correct
+ *   order. Nullopt is returned if the file was malformed, e.g. if it has a quoted field that is
+ *   not closed.
+ */
+std::optional<Vector<Any<>>> parse_csv_in_chunks(
+    const Span<char> buffer,
+    const CsvParseOptions &options,
+    FunctionRef<void(const CsvRecord &record)> process_header,
+    FunctionRef<Any<>(const CsvRecords &records)> process_records);
+
+/**
+ * Same as above, but uses a templated chunk type instead of #Any, which can be more convenient
+ * to use.
+ */
+template<typename ChunkT>
+inline std::optional<Vector<ChunkT>> parse_csv_in_chunks(
+    const Span<char> buffer,
+    const CsvParseOptions &options,
+    FunctionRef<void(const CsvRecord &record)> process_header,
+    FunctionRef<ChunkT(const CsvRecords &records)> process_records)
+{
+  std::optional<Vector<Any<>>> result = parse_csv_in_chunks(
+      buffer, options, process_header, [&](const CsvRecords &records) {
+        return Any<>(process_records(records));
+      });
+  if (!result.has_value()) {
+    return std::nullopt;
+  }
+  Vector<ChunkT> result_chunks;
+  result_chunks.reserve(result->size());
+  for (Any<> &value : *result) {
+    result_chunks.append(std::move(value.get<ChunkT>()));
+  }
+  return result_chunks;
+}
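+
+/* Example usage of the templated overload above. This is an illustrative sketch only; `MyChunk`
+ * and the per-chunk logic are hypothetical and not part of this API:
+ *
+ *   struct MyChunk {
+ *     Vector<std::string> first_column;
+ *   };
+ *
+ *   std::optional<Vector<MyChunk>> chunks = parse_csv_in_chunks<MyChunk>(
+ *       buffer,
+ *       CsvParseOptions(),
+ *       [&](const CsvRecord &header) { (void)header; },
+ *       [&](const CsvRecords &records) {
+ *         MyChunk chunk;
+ *         for (const int64_t i : records.index_range()) {
+ *           chunk.first_column.append(records.record(i).field_str(0));
+ *         }
+ *         return chunk;
+ *       });
+ */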
+
+/* -------------------------------------------------------------------- */
+/** \name #CsvRecord inline functions.
+ * \{ */
+
+inline CsvRecord::CsvRecord(Span<Span<char>> fields) : fields_(fields) {}
+
+inline int64_t CsvRecord::size() const
+{
+  return fields_.size();
+}
+
+inline IndexRange CsvRecord::index_range() const
+{
+  return fields_.index_range();
+}
+
+inline Span<char> CsvRecord::field(const int64_t index) const
+{
+  BLI_assert(index >= 0);
+  if (index >= fields_.size()) {
+    return {};
+  }
+  return fields_[index];
+}
+
+inline StringRef CsvRecord::field_str(const int64_t index) const
+{
+  const Span<char> value = this->field(index);
+  return StringRef(value.data(), value.size());
+}
+
+/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name #CsvRecords inline functions.
+ * \{ */
+
+inline CsvRecords::CsvRecords(const OffsetIndices<int64_t> offsets,
+                              const Span<Span<char>> fields)
+    : offsets_(offsets), fields_(fields)
+{
+}
+
+inline int64_t CsvRecords::size() const
+{
+  return offsets_.size();
+}
+
+inline IndexRange CsvRecords::index_range() const
+{
+  return offsets_.index_range();
+}
+
+inline CsvRecord CsvRecords::record(const int64_t index) const
+{
+  return CsvRecord(fields_.slice(offsets_[index]));
+}
+
+/** \} */
+
+/* -------------------------------------------------------------------- */
+/** \name Internal functions exposed for testing.
+ * \{ */
+
+namespace detail {
+
+/**
+ * Find the index that ends the current field, i.e. the index of the next delimiter or newline.
+ *
+ * \param start: The index of the first character in the field. This may also be the end of the
+ *   field already if it is empty.
+ * \param delimiter: The character that ends the field.
+ * \return Index of the next delimiter, a newline character or the end of the buffer.
+ */
+int64_t find_end_of_simple_field(Span<char> buffer, int64_t start, char delimiter);
+
+/**
+ * Find the index of the quote that ends the current field.
+ *
+ * \param start: The index after the opening quote.
+ * \param quote: The quote character that ends the field.
+ * \param escape_chars: The characters that may be used to escape the quote character.
+ * \return Index of the quote character that ends the field, or std::nullopt if the field is
+ *   malformed and does not have an end.
+ */
+std::optional<int64_t> find_end_of_quoted_field(Span<char> buffer,
+                                                int64_t start,
+                                                char quote,
+                                                Span<char> escape_chars);
+
+/**
+ * Finds all fields for the record starting at the given index. Typically, the record ends with
+ * a newline, but quoted multiline records are supported as well.
+ *
+ * \return Index of the start of the next record or the end of the buffer. Nullopt is returned
+ *   if the buffer has a malformed record at the end, i.e. a quoted field that is not closed.
+ */
+std::optional<int64_t> parse_record_fields(const Span<char> buffer,
+                                           const int64_t start,
+                                           const char delimiter,
+                                           const char quote,
+                                           const Span<char> quote_escape_chars,
+                                           Vector<Span<char>> &r_fields);
+
+}  // namespace detail
+
+/** \} */
+
+}  // namespace blender::csv_parse
diff --git a/source/blender/blenlib/CMakeLists.txt b/source/blender/blenlib/CMakeLists.txt
index 2301347f9df..1bfe1e330b8 100644
--- a/source/blender/blenlib/CMakeLists.txt
+++ b/source/blender/blenlib/CMakeLists.txt
@@ -58,6 +58,7 @@ set(SRC
   intern/convexhull_2d.cc
   intern/cpp_type.cc
   intern/cpp_types.cc
+  intern/csv_parse.cc
   intern/delaunay_2d.cc
   intern/dot_export.cc
   intern/easing.cc
@@ -215,6 +216,7 @@ set(SRC
   BLI_cpp_type_make.hh
   BLI_cpp_types.hh
   BLI_cpp_types_make.hh
+  BLI_csv_parse.hh
   BLI_delaunay_2d.hh
   BLI_devirtualize_parameters.hh
   BLI_dial_2d.h
@@ -527,6 +529,7 @@ if(WITH_GTESTS)
     tests/BLI_color_test.cc
     tests/BLI_convexhull_2d_test.cc
     tests/BLI_cpp_type_test.cc
+    tests/BLI_csv_parse_test.cc
     tests/BLI_delaunay_2d_test.cc
     tests/BLI_disjoint_set_test.cc
     tests/BLI_expr_pylike_eval_test.cc
diff --git a/source/blender/blenlib/intern/csv_parse.cc b/source/blender/blenlib/intern/csv_parse.cc
new file mode 100644
index 00000000000..5ae3ab82732
--- /dev/null
+++ b/source/blender/blenlib/intern/csv_parse.cc
@@ -0,0 +1,286 @@
+/* SPDX-FileCopyrightText: 2025 Blender Authors
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include <algorithm>
+#include <atomic>
+
+#include "BLI_csv_parse.hh"
+#include "BLI_enumerable_thread_specific.hh"
+#include "BLI_task.hh"
+
+namespace blender::csv_parse {
+
+/**
+ * Returns a guess for the start of the next record. Note that this could split up quoted
+ * fields. This case needs to be detected at a higher level.
+ */
+static int64_t guess_next_record_start(const Span<char> buffer, const int64_t start)
+{
+  int64_t i = start;
+  while (i < buffer.size()) {
+    const char c = buffer[i];
+    if (c == '\n') {
+      return i + 1;
+    }
+    i++;
+  }
+  return buffer.size();
+}
+
+/**
+ * Split the buffer into chunks of approximately the given size. The function attempts to align
+ * the chunks so that records are not split. This works in the majority of cases, but can fail
+ * with multi-line fields. This has to be detected at a higher level.
+ */
+static Vector<Span<char>> split_into_aligned_chunks(const Span<char> buffer,
+                                                    int64_t approximate_chunk_size)
+{
+  approximate_chunk_size = std::max<int64_t>(approximate_chunk_size, 1);
+  Vector<Span<char>> chunks;
+  int64_t start = 0;
+  while (start < buffer.size()) {
+    int64_t end = std::min(start + approximate_chunk_size, buffer.size());
+    end = guess_next_record_start(buffer, end);
+    chunks.append(buffer.slice(IndexRange::from_begin_end(start, end)));
+    start = end;
+  }
+  return chunks;
+}
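+
+/* Illustrative example of the alignment above (hypothetical input, not exercised here): with
+ * approximate_chunk_size = 4, the buffer "ab\ncd\nef\n" is split into the two chunks "ab\ncd\n"
+ * and "ef\n". The first chunk is extended past the 4-byte mark so that it ends at a record
+ * boundary. */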
+
+/**
+ * Parses the given buffer into records and their fields.
+ *
+ * \a r_data_offsets and \a r_data_fields are passed in so that their memory can be reused
+ * across calls.
+ */
+static std::optional<CsvRecords> parse_records(const Span<char> buffer,
+                                               const CsvParseOptions &options,
+                                               Vector<int64_t> &r_data_offsets,
+                                               Vector<Span<char>> &r_data_fields)
+{
+  using namespace detail;
+  /* Clear the data that may still be in there, but do not free the memory. */
+  r_data_offsets.clear();
+  r_data_fields.clear();
+
+  r_data_offsets.append(0);
+  int64_t start = 0;
+  while (start < buffer.size()) {
+    const std::optional<int64_t> next_record_start = parse_record_fields(
+        buffer,
+        start,
+        options.delimiter,
+        options.quote,
+        options.quote_escape_chars,
+        r_data_fields);
+    if (!next_record_start.has_value()) {
+      return std::nullopt;
+    }
+    r_data_offsets.append(r_data_fields.size());
+    start = *next_record_start;
+  }
+  return CsvRecords(OffsetIndices<int64_t>(r_data_offsets), r_data_fields);
+}
+
+std::optional<Vector<Any<>>> parse_csv_in_chunks(
+    const Span<char> buffer,
+    const CsvParseOptions &options,
+    FunctionRef<void(const CsvRecord &record)> process_header,
+    FunctionRef<Any<>(const CsvRecords &records)> process_records)
+{
+  using namespace detail;
+
+  /* First parse the first row to get the column names. */
+  Vector<Span<char>> header_fields;
+  const std::optional<int64_t> first_data_record_start = parse_record_fields(
+      buffer, 0, options.delimiter, options.quote, options.quote_escape_chars, header_fields);
+  if (!first_data_record_start.has_value()) {
+    return std::nullopt;
+  }
+  /* Call this before starting to process the remaining data. This allows the caller to do some
+   * preprocessing that is used during chunk parsing. */
+  process_header(CsvRecord(header_fields));
+
+  /* This buffer contains only the data records, without the header. */
+  const Span<char> data_buffer = buffer.drop_front(*first_data_record_start);
+  /* Split the buffer into chunks that can be processed in parallel. */
+  const Vector<Span<char>> data_buffer_chunks = split_into_aligned_chunks(
+      data_buffer, options.chunk_size_bytes);
+
+  /* It's not common, but it can happen that .csv files contain quoted multi-line values. In the
+   * unlucky case that we split the buffer in the middle of such a multi-line field, there will
+   * be malformed chunks. In this case we fall back to parsing the whole buffer with a single
+   * thread. If this case becomes more common, we could try to avoid splitting into malformed
+   * chunks by making the splitting logic a bit smarter. */
+  std::atomic<bool> found_malformed_chunk = false;
+  Vector<std::optional<Any<>>> chunk_results(data_buffer_chunks.size());
+  struct TLS {
+    Vector<int64_t> data_offsets;
+    Vector<Span<char>> data_fields;
+  };
+  threading::EnumerableThreadSpecific<TLS> all_tls;
+  threading::parallel_for(chunk_results.index_range(), 1, [&](const IndexRange range) {
+    TLS &tls = all_tls.local();
+    for (const int64_t i : range) {
+      if (found_malformed_chunk.load(std::memory_order_relaxed)) {
+        /* All work is cancelled when there was a malformed chunk. */
+        return;
+      }
+      const Span<char> chunk_buffer = data_buffer_chunks[i];
+      const std::optional<CsvRecords> records = parse_records(
+          chunk_buffer, options, tls.data_offsets, tls.data_fields);
+      if (!records.has_value()) {
+        found_malformed_chunk.store(true, std::memory_order_relaxed);
+        return;
+      }
+      chunk_results[i] = process_records(*records);
+    }
+  });
+
+  /* If there was a malformed chunk, process the data again in a single thread without splitting
+   * the input into chunks. This should happen quite rarely but is important for overall
+   * correctness. */
+  if (found_malformed_chunk) {
+    chunk_results.clear();
+    TLS &tls = all_tls.local();
+    const std::optional<CsvRecords> records = parse_records(
+        data_buffer, options, tls.data_offsets, tls.data_fields);
+    if (!records.has_value()) {
+      return std::nullopt;
+    }
+    chunk_results.append(process_records(*records));
+  }
+
+  /* Prepare the return value. */
+  Vector<Any<>> results;
+  for (std::optional<Any<>> &result : chunk_results) {
+    BLI_assert(result.has_value());
+    results.append(std::move(result.value()));
+  }
+  return results;
+}
+
+namespace detail {
+
+std::optional<int64_t> parse_record_fields(const Span<char> buffer,
+                                           const int64_t start,
+                                           const char delimiter,
+                                           const char quote,
+                                           const Span<char> quote_escape_chars,
+                                           Vector<Span<char>> &r_fields)
+{
+  using namespace detail;
+
+  /* If a record ends directly after a delimiter, it has a trailing empty field, e.g. "a,b,\n"
+   * has the three fields "a", "b" and "". */
+  const auto handle_potentially_trailing_delimiter = [&](const int64_t i) {
+    if (i <= buffer.size()) {
+      if (i < buffer.size()) {
+        if (ELEM(buffer[i], '\n', '\r')) {
+          r_fields.append({});
+        }
+      }
+      else {
+        r_fields.append({});
+      }
+    }
+  };
+
+  int64_t i = start;
+  while (i < buffer.size()) {
+    const char c = buffer[i];
+    if (c == '\n') {
+      return i + 1;
+    }
+    if (c == '\r') {
+      i++;
+      continue;
+    }
+    if (c == delimiter) {
+      r_fields.append({});
+      i++;
+      handle_potentially_trailing_delimiter(i);
+      continue;
+    }
+    if (c == quote) {
+      i++;
+      const std::optional<int64_t> end_of_field = find_end_of_quoted_field(
+          buffer, i, quote, quote_escape_chars);
+      if (!end_of_field.has_value()) {
+        return std::nullopt;
+      }
+      r_fields.append(buffer.slice(IndexRange::from_begin_end(i, *end_of_field)));
+      i = *end_of_field;
+      while (i < buffer.size()) {
+        const char inner_c = buffer[i];
+        if (inner_c == quote) {
+          i++;
+          continue;
+        }
+        if (inner_c == delimiter) {
+          i++;
+          handle_potentially_trailing_delimiter(i);
+          break;
+        }
+        if (ELEM(inner_c, '\n', '\r')) {
+          break;
+        }
+        i++;
+      }
+      continue;
+    }
+    const int64_t end_of_field = find_end_of_simple_field(buffer, i, delimiter);
+    r_fields.append(buffer.slice(IndexRange::from_begin_end(i, end_of_field)));
+    i = end_of_field;
+    while (i < buffer.size()) {
+      const char inner_c = buffer[i];
+      if (inner_c == delimiter) {
+        i++;
+        handle_potentially_trailing_delimiter(i);
+        break;
+      }
+      if (ELEM(inner_c, '\n', '\r')) {
+        break;
+      }
+      BLI_assert_unreachable();
+    }
+  }
+
+  return buffer.size();
+}
+
+int64_t find_end_of_simple_field(const Span<char> buffer,
+                                 const int64_t start,
+                                 const char delimiter)
+{
+  int64_t i = start;
+  while (i < buffer.size()) {
+    const char c = buffer[i];
+    if (ELEM(c, delimiter, '\n', '\r')) {
+      return i;
+    }
+    i++;
+  }
+  return buffer.size();
+}
+
+std::optional<int64_t> find_end_of_quoted_field(const Span<char> buffer,
+                                                const int64_t start,
+                                                const char quote,
+                                                const Span<char> escape_chars)
+{
+  int64_t i = start;
+  while (i < buffer.size()) {
+    const char c = buffer[i];
+    if (escape_chars.contains(c)) {
+      if (i + 1 < buffer.size() && buffer[i + 1] == quote) {
+        i += 2;
+        continue;
+      }
+    }
+    if (c == quote) {
+      return i;
+    }
+    i++;
+  }
+  return std::nullopt;
+}
+
+}  // namespace detail
+
+}  // namespace blender::csv_parse
diff --git a/source/blender/blenlib/tests/BLI_csv_parse_test.cc b/source/blender/blenlib/tests/BLI_csv_parse_test.cc
new file mode 100644
index 00000000000..effc2bb2994
--- /dev/null
+++ b/source/blender/blenlib/tests/BLI_csv_parse_test.cc
@@ -0,0 +1,259 @@
+/* SPDX-FileCopyrightText: 2025 Blender Authors
+ *
+ * SPDX-License-Identifier: Apache-2.0 */
+
+#include "testing/testing.h"
+
+#include "BLI_csv_parse.hh"
+#include "BLI_string_ref.hh"
+
+namespace blender::csv_parse::tests {
+
+static std::optional<int64_t> find_end_of_simple_field(const StringRef buffer,
+                                                       const int64_t start,
+                                                       const char delimiter = ',')
+{
+  return detail::find_end_of_simple_field(Span<char>(buffer), start, delimiter);
+}
+
+static std::optional<int64_t> find_end_of_quoted_field(
+    const StringRef buffer,
+    const int64_t start,
+    const char quote = '"',
+    const Span<char> escape_chars = Span<char>(StringRef("\"\\")))
+{
+  return detail::find_end_of_quoted_field(Span<char>(buffer), start, quote, escape_chars);
+}
+
+static std::optional<Vector<std::string>> parse_record_fields(
+    const StringRef buffer,
+    const int64_t start = 0,
+    const char delimiter = ',',
+    const char quote = '"',
+    const Span<char> quote_escape_chars = Span<char>{'"', '\\'})
+{
+  Vector<Span<char>> fields;
+  const std::optional<int64_t> end_of_record = detail::parse_record_fields(
+      Span<char>(buffer), start, delimiter, quote, quote_escape_chars, fields);
+  if (!end_of_record.has_value()) {
+    return std::nullopt;
+  }
+  Vector<std::string> result;
+  for (const Span<char> field : fields) {
+    result.append(std::string(field.begin(), field.end()));
+  }
+  return result;
+}
+
+struct StrParseResult {
+  bool success = false;
+  Vector<std::string> column_names;
+  Vector<Vector<std::string>> records;
+};
+
+static StrParseResult parse_csv_fields(const StringRef str, const CsvParseOptions &options)
+{
+  struct Chunk {
+    Vector<Vector<std::string>> fields;
+  };
+
+  StrParseResult result;
+  std::optional<Vector<Chunk>> chunks = parse_csv_in_chunks<Chunk>(
+      Span<char>(str),
+      options,
+      [&](const CsvRecord &record) {
+        for (const int64_t i : record.index_range()) {
+          result.column_names.append(record.field_str(i));
+        }
+      },
+      [&](const CsvRecords &records) {
+        Chunk chunk;
+        for (const int64_t record_i : records.index_range()) {
+          const CsvRecord record = records.record(record_i);
+          Vector<std::string> fields;
+          for (const int64_t column_i : record.index_range()) {
+            fields.append(record.field_str(column_i));
+          }
+          chunk.fields.append(std::move(fields));
+        }
+        return chunk;
+      });
+  if (!chunks.has_value()) {
+    result.success = false;
+    return result;
+  }
+  result.success = true;
+  for (Chunk &chunk : *chunks) {
+    result.records.extend(std::move(chunk.fields));
+  }
+  return result;
+}
+
+TEST(csv_parse, FindEndOfSimpleField)
+{
+  EXPECT_EQ(find_end_of_simple_field("123", 0), 3);
+  EXPECT_EQ(find_end_of_simple_field("123", 1), 3);
+  EXPECT_EQ(find_end_of_simple_field("123", 2), 3);
+  EXPECT_EQ(find_end_of_simple_field("123", 3), 3);
+  EXPECT_EQ(find_end_of_simple_field("1'3", 3), 3);
+  EXPECT_EQ(find_end_of_simple_field("123,", 0), 3);
+  EXPECT_EQ(find_end_of_simple_field("123,456", 0), 3);
+  EXPECT_EQ(find_end_of_simple_field("123,456,789", 0), 3);
+  EXPECT_EQ(find_end_of_simple_field(" 23", 0), 3);
+  EXPECT_EQ(find_end_of_simple_field("", 0), 0);
+  EXPECT_EQ(find_end_of_simple_field("\n", 0), 0);
+  EXPECT_EQ(find_end_of_simple_field("12\n", 0), 2);
+  EXPECT_EQ(find_end_of_simple_field("0,12\n", 0), 1);
+  EXPECT_EQ(find_end_of_simple_field("0,12\n", 2), 4);
+  EXPECT_EQ(find_end_of_simple_field("\r\n", 0), 0);
+  EXPECT_EQ(find_end_of_simple_field("12\r\n", 0), 2);
+  EXPECT_EQ(find_end_of_simple_field("0,12\r\n", 0), 1);
+  EXPECT_EQ(find_end_of_simple_field("0,12\r\n", 2), 4);
+  EXPECT_EQ(find_end_of_simple_field("0,\t12\r\n", 2), 5);
+  EXPECT_EQ(find_end_of_simple_field("0,\t12\r\n", 2, '\t'), 2);
+}
+
+TEST(csv_parse, FindEndOfQuotedField)
+{
+  EXPECT_EQ(find_end_of_quoted_field("", 0), std::nullopt);
+  EXPECT_EQ(find_end_of_quoted_field("123", 0), std::nullopt);
+  EXPECT_EQ(find_end_of_quoted_field("123\n", 0), std::nullopt);
+  EXPECT_EQ(find_end_of_quoted_field("123\r\n", 0), std::nullopt);
+  EXPECT_EQ(find_end_of_quoted_field("123\"", 0), 3);
+  EXPECT_EQ(find_end_of_quoted_field("\"", 0), 0);
+  EXPECT_EQ(find_end_of_quoted_field("\"\"", 0), std::nullopt);
+  EXPECT_EQ(find_end_of_quoted_field("\"\"\"", 0), 2);
+  EXPECT_EQ(find_end_of_quoted_field("123\"\"", 0), std::nullopt);
+  EXPECT_EQ(find_end_of_quoted_field("123\"\"\"", 0), 5);
+  EXPECT_EQ(find_end_of_quoted_field("123\"\"\"\"", 0), std::nullopt);
+  EXPECT_EQ(find_end_of_quoted_field("123\"\"\"\"\"", 0), 7);
+  EXPECT_EQ(find_end_of_quoted_field("123\"\"0\"\"\"", 0), 8);
+  EXPECT_EQ(find_end_of_quoted_field(",", 0), std::nullopt);
+  EXPECT_EQ(find_end_of_quoted_field(",\"", 0), 1);
+  EXPECT_EQ(find_end_of_quoted_field("0,1\"", 0), 3);
+  EXPECT_EQ(find_end_of_quoted_field("0,1\n", 0), std::nullopt);
+  EXPECT_EQ(find_end_of_quoted_field("0,1\"\"", 0), std::nullopt);
+  EXPECT_EQ(find_end_of_quoted_field("0,1\"\"\"", 0), 5);
+  EXPECT_EQ(find_end_of_quoted_field("0\n1\n\"", 0), 4);
+  EXPECT_EQ(find_end_of_quoted_field("\n\"", 0), 1);
+  EXPECT_EQ(find_end_of_quoted_field("\\\"", 0), std::nullopt);
+  EXPECT_EQ(find_end_of_quoted_field("\\\"\"", 0), 2);
+  EXPECT_EQ(find_end_of_quoted_field("\\\"\"\"", 0), std::nullopt);
+  EXPECT_EQ(find_end_of_quoted_field("\\\"\"\"\"", 0), 4);
+}
+
+TEST(csv_parse, ParseRecordFields)
+{
+  using StrVec = Vector<std::string>;
+  EXPECT_EQ(parse_record_fields(""), StrVec());
+  EXPECT_EQ(parse_record_fields("1"), StrVec{"1"});
+  EXPECT_EQ(parse_record_fields("1,2"), StrVec({"1", "2"}));
+  EXPECT_EQ(parse_record_fields("1,2,3"), StrVec({"1", "2", "3"}));
+  EXPECT_EQ(parse_record_fields("1\n,2,3"), StrVec({"1"}));
+  EXPECT_EQ(parse_record_fields("1, 2\n,3"), StrVec({"1", " 2"}));
+  EXPECT_EQ(parse_record_fields("1, 2\r\n,3"), StrVec({"1", " 2"}));
+  EXPECT_EQ(parse_record_fields("\"1,2,3\""), StrVec({"1,2,3"}));
+  EXPECT_EQ(parse_record_fields("\"1,2,3"), std::nullopt);
+  EXPECT_EQ(parse_record_fields("\"1,\n2\t\r\n,3\""), StrVec({"1,\n2\t\r\n,3"}));
+  EXPECT_EQ(parse_record_fields("\"1,2,3\",\"4,5\""), StrVec({"1,2,3", "4,5"}));
+  EXPECT_EQ(parse_record_fields(","), StrVec({"", ""}));
+  EXPECT_EQ(parse_record_fields(",,"), StrVec({"", "", ""}));
+  EXPECT_EQ(parse_record_fields(",,\n"), StrVec({"", "", ""}));
+  EXPECT_EQ(parse_record_fields("\r\n,,"), StrVec());
+  EXPECT_EQ(parse_record_fields("\"a\"\"b\""), StrVec({"a\"\"b"}));
+  EXPECT_EQ(parse_record_fields("\"a\\\"b\""), StrVec({"a\\\"b"}));
+  EXPECT_EQ(parse_record_fields("\"a\"\nb"), StrVec({"a"}));
+  EXPECT_EQ(parse_record_fields("\"a\" \nb"), StrVec({"a"}));
+}
+
+TEST(csv_parse, ParseCsvBasic)
+{
+  CsvParseOptions options;
+  options.chunk_size_bytes = 1;
+  StrParseResult result = parse_csv_fields("a,b,c\n1,2,3,4\n4\n77,88,99\n", options);
+
+  EXPECT_TRUE(result.success);
+
+  EXPECT_EQ(result.column_names.size(), 3);
+  EXPECT_EQ(result.column_names[0], "a");
+  EXPECT_EQ(result.column_names[1], "b");
+  EXPECT_EQ(result.column_names[2], "c");
+
+  EXPECT_EQ(result.records.size(), 3);
+  EXPECT_EQ(result.records[0].size(), 4);
+  EXPECT_EQ(result.records[1].size(), 1);
+  EXPECT_EQ(result.records[2].size(), 3);
+
+  EXPECT_EQ(result.records[0][0], "1");
+  EXPECT_EQ(result.records[0][1], "2");
+  EXPECT_EQ(result.records[0][2], "3");
+  EXPECT_EQ(result.records[0][3], "4");
+
+  EXPECT_EQ(result.records[1][0], "4");
+
+  EXPECT_EQ(result.records[2][0], "77");
+  EXPECT_EQ(result.records[2][1], "88");
+  EXPECT_EQ(result.records[2][2], "99");
+}
+
+TEST(csv_parse, ParseCsvMissingEnd)
+{
+  CsvParseOptions options;
+  options.chunk_size_bytes = 1;
+  StrParseResult result = parse_csv_fields("a,b,c\n1,\"2", options);
+  EXPECT_FALSE(result.success);
+}
+
+TEST(csv_parse, ParseCsvMultiLine)
+{
+  CsvParseOptions options;
+  options.chunk_size_bytes = 1;
+  StrParseResult result = parse_csv_fields("a,b,c\n1,\"2\n\n\",3,4", options);
+  EXPECT_TRUE(result.success);
+  EXPECT_EQ(result.records.size(), 1);
+  EXPECT_EQ(result.records[0].size(), 4);
+  EXPECT_EQ(result.records[0][0], "1");
+  EXPECT_EQ(result.records[0][1], "2\n\n");
+  EXPECT_EQ(result.records[0][2], "3");
+  EXPECT_EQ(result.records[0][3], "4");
+}
+
+TEST(csv_parse, ParseCsvEmpty)
+{
+  CsvParseOptions options;
+  options.chunk_size_bytes = 1;
+  StrParseResult result = parse_csv_fields("", options);
+  EXPECT_TRUE(result.success);
+  EXPECT_EQ(result.column_names.size(), 0);
+  EXPECT_EQ(result.records.size(), 0);
+}
+
+TEST(csv_parse, ParseCsvTitlesOnly)
+{
+  CsvParseOptions options;
+  options.chunk_size_bytes = 1;
+  StrParseResult result = parse_csv_fields("a,b,c", options);
+  EXPECT_TRUE(result.success);
+  EXPECT_EQ(result.column_names.size(), 3);
+  EXPECT_EQ(result.column_names[0], "a");
+  EXPECT_EQ(result.column_names[1], "b");
+  EXPECT_EQ(result.column_names[2], "c");
+  EXPECT_TRUE(result.records.is_empty());
+}
+
+TEST(csv_parse, ParseCsvTrailingNewline)
+{
+  CsvParseOptions options;
+  options.chunk_size_bytes = 1;
+  StrParseResult result = parse_csv_fields("a\n1\n2\n", options);
+  EXPECT_TRUE(result.success);
+  EXPECT_EQ(result.column_names.size(), 1);
+  EXPECT_EQ(result.column_names[0], "a");
+  EXPECT_EQ(result.records.size(), 2);
+  EXPECT_EQ(result.records[0].size(), 1);
+  EXPECT_EQ(result.records[0][0], "1");
+  EXPECT_EQ(result.records[1].size(), 1);
+  EXPECT_EQ(result.records[1][0], "2");
+}
+
+}  // namespace blender::csv_parse::tests
diff --git a/source/blender/io/csv/importer/csv_reader.cc b/source/blender/io/csv/importer/csv_reader.cc
index ffa53eb7d35..90476e944c4 100644
--- a/source/blender/io/csv/importer/csv_reader.cc
+++ b/source/blender/io/csv/importer/csv_reader.cc
@@ -6,207 +6,257 @@
  * \ingroup csv
  */
 
+#include <atomic>
+#include <charconv>
 #include <optional>
+#include <variant>
+
+#include "fast_float.h"
+
+#include "BKE_anonymous_attribute_id.hh"
 #include "BKE_attribute.hh"
 #include "BKE_pointcloud.hh"
 #include "BKE_report.hh"
 
+#include "BLI_csv_parse.hh"
 #include "BLI_fileops.hh"
-#include "BLI_generic_span.hh"
+#include "BLI_implicit_sharing.hh"
 #include "BLI_vector.hh"
 
 #include "IO_csv.hh"
-#include "IO_string_utils.hh"
 
 namespace blender::io::csv {
 
-static Vector<std::string> parse_column_names(const StringRef line)
-{
-  Vector<std::string> columns;
-  const char delim = ',';
-  const char *start = line.begin(), *end = line.end();
-  const char *cell_start = start, *cell_end = start;
-
-  int64_t delim_index = line.find_first_of(delim);
-
-  while (delim_index != StringRef::not_found) {
-    cell_end = start + delim_index;
-
-    columns.append_as(cell_start, cell_end);
-
-    cell_start = cell_end + 1;
-    delim_index = line.find_first_of(delim, delim_index + 1);
-  }
-
-  columns.append_as(cell_start, end);
-
-  return columns;
-}
-
-static std::optional<eCustomDataType> get_column_type(const char *start, const char *end)
-{
-  bool success = false;
-
-  int _val_int = 0;
-  try_parse_int(start, end, 0, success, _val_int);
-
-  if (success) {
-    return CD_PROP_INT32;
-  }
-
-  float _val_float = 0.0f;
-  try_parse_float(start, end, 0.0f, success, _val_float);
-
-  if (success) {
-    return CD_PROP_FLOAT;
-  }
-
-  return std::nullopt;
-}
-
-static bool get_column_types(const StringRef line, Vector<eCustomDataType> &column_types)
-{
-  const char delim = ',';
-  const char *start = line.begin(), *end = line.end();
-  const char *cell_start = start, *cell_end = start;
-
-  int64_t delim_index = line.find_first_of(delim);
-
-  while (delim_index != StringRef::not_found) {
-    cell_end = start + delim_index;
-
-    std::optional<eCustomDataType> column_type = get_column_type(cell_start, cell_end);
-    if (!column_type.has_value()) {
-      return false;
-    }
-    column_types.append(column_type.value());
-
-    cell_start = cell_end + 1;
-    delim_index = line.find_first_of(delim, delim_index + 1);
-  }
-
-  std::optional<eCustomDataType> column_type = get_column_type(cell_start, end);
-  if (!column_type.has_value()) {
-    return false;
-  }
-  column_types.append(column_type.value());
-
-  return true;
-}
-
-static int64_t get_row_count(StringRef buffer)
-{
-  int64_t row_count = 1;
-
-  while (!buffer.is_empty()) {
-    read_next_line(buffer);
-    row_count++;
-  }
-
-  return row_count;
-}
-
-static void parse_csv_cell(const Span<GMutableSpan> data,
-                           const Span<eCustomDataType> types,
-                           const Span<std::string> column_names,
-                           const int64_t row_index,
-                           const int64_t col_index,
-                           const char *start,
-                           const char *end,
-                           const CSVImportParams &import_params)
-{
-  bool success = false;
-
-  switch (types[col_index]) {
-    case CD_PROP_INT32: {
-      int value = 0;
-      try_parse_int(start, end, 0, success, value);
-      data[col_index].typed<int>()[row_index] = value;
-      if (!success) {
-        StringRef column_name = column_names[col_index];
-        BKE_reportf(import_params.reports,
-                    RPT_ERROR,
-                    "CSV Import: file '%s' has an unexpected value at row %d for column %s of "
-                    "type Integer",
-                    import_params.filepath,
-                    int(row_index),
-                    std::string(column_name).c_str());
-      }
-      break;
-    }
-    case CD_PROP_FLOAT: {
-      float value = 0.0f;
-      try_parse_float(start, end, 0.0f, success, value);
-      data[col_index].typed<float>()[row_index] = value;
-      if (!success) {
-        StringRef column_name = column_names[col_index];
-        BKE_reportf(import_params.reports,
-                    RPT_ERROR,
-                    "CSV Import: file '%s' has an unexpected value at row %d for column %s of "
-                    "type Float",
-                    import_params.filepath,
-                    int(row_index),
-                    std::string(column_name).c_str());
-      }
-      break;
-    }
-    default: {
-      StringRef column_name = column_names[col_index];
-      BKE_reportf(import_params.reports,
-                  RPT_ERROR,
-                  "CSV Import: file '%s' has an unsupported value at row %d for column %s",
-                  import_params.filepath,
-                  int(row_index),
-                  std::string(column_name).c_str());
-      break;
-    }
-  }
-}
-
-static void parse_csv_line(const Span<GMutableSpan> data,
-                           const Span<eCustomDataType> types,
-                           const Span<std::string> column_names,
-                           int64_t row_index,
-                           const StringRef line,
-                           const CSVImportParams &import_params)
-{
-  const char delim = ',';
-  const char *start = line.begin(), *end = line.end();
-  const char *cell_start = start, *cell_end = start;
-
-  int64_t col_index = 0;
-
-  int64_t delim_index = line.find_first_of(delim);
-
-  while (delim_index != StringRef::not_found) {
-    cell_end = start + delim_index;
-
-    parse_csv_cell(
-        data, types, column_names, row_index, col_index, cell_start, cell_end, import_params);
-    col_index++;
-
-    cell_start = cell_end + 1;
-    delim_index = line.find_first_of(delim, delim_index + 1);
-  }
-
-  parse_csv_cell(data, types, column_names, row_index, col_index, cell_start, end, import_params);
-}
-
-static void parse_csv_data(const Span<GMutableSpan> data,
-                           const Span<eCustomDataType> types,
-                           const Span<std::string> column_names,
-                           StringRef buffer,
-                           const CSVImportParams &import_params)
-{
-  int64_t row_index = 0;
-  while (!buffer.is_empty()) {
-    const StringRef line = read_next_line(buffer);
-
-    parse_csv_line(data, types, column_names, row_index, line, import_params);
-
-    row_index++;
-  }
-}
+struct ColumnInfo {
+  StringRef name;
+  bool has_invalid_name = false;
+  std::atomic<bool> found_invalid = false;
+  std::atomic<bool> found_int = false;
+  std::atomic<bool> found_float = false;
+};
+
+using ColumnData = std::variant<std::monostate, Vector<int>, Vector<float>>;
+
+struct ChunkResult {
+  int rows_num;
+  Vector<ColumnData> columns;
+};
+
+struct ParseFloatColumnResult {
+  Vector<float> data;
+  bool found_invalid = false;
+};
+
+struct ParseIntColumnResult {
+  Vector<int> data;
+  bool found_invalid = false;
+  bool found_float = false;
+};
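+
+/* Illustrative examples of the numeric parsing rules implemented below (hypothetical values):
+ * "42", " +1.5" and "2.0 " parse successfully, because leading spaces and a plus sign are
+ * skipped and trailing spaces are allowed; "", "abc" and "1.5x" mark the column as invalid. */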
+
+static ParseFloatColumnResult parse_column_as_floats(const csv_parse::CsvRecords &records,
+                                                     const int column_i)
+{
+  ParseFloatColumnResult result;
+  result.data.reserve(records.size());
+  for (const int row_i : records.index_range()) {
+    const Span<char> value_span = records.record(row_i).field(column_i);
+    const char *value_begin = value_span.begin();
+    const char *value_end = value_span.end();
+    /* Skip leading whitespace and plus sign. */
+    while (value_begin < value_end && ELEM(*value_begin, ' ', '+')) {
+      value_begin++;
+    }
+    float value;
+    fast_float::from_chars_result res = fast_float::from_chars(value_begin, value_end, value);
+    if (res.ec != std::errc()) {
+      result.found_invalid = true;
+      return result;
+    }
+    if (res.ptr < value_end) {
+      /* Allow trailing whitespace in the value. */
+      while (res.ptr < value_end && res.ptr[0] == ' ') {
+        res.ptr++;
+      }
+      if (res.ptr < value_end) {
+        result.found_invalid = true;
+        return result;
+      }
+    }
+    result.data.append(value);
+  }
+  return result;
+}
+
+static ParseIntColumnResult parse_column_as_ints(const csv_parse::CsvRecords &records,
+                                                 const int column_i)
+{
+  ParseIntColumnResult result;
+  result.data.reserve(records.size());
+  for (const int row_i : records.index_range()) {
+    const Span<char> value_span = records.record(row_i).field(column_i);
+    const char *value_begin = value_span.begin();
+    const char *value_end = value_span.end();
+    /* Skip leading whitespace and plus sign. */
+    while (value_begin < value_end && ELEM(*value_begin, ' ', '+')) {
+      value_begin++;
+    }
+    int value;
+    std::from_chars_result res = std::from_chars(value_begin, value_end, value);
+    if (res.ec != std::errc()) {
+      result.found_invalid = true;
+      return result;
+    }
+    if (res.ptr < value_end) {
+      /* If the next character after the value is a dot, it should be parsed again as float. */
+      if (res.ptr[0] == '.') {
+        result.found_float = true;
+        return result;
+      }
+      /* Allow trailing whitespace in the value. */
+      while (res.ptr < value_end && res.ptr[0] == ' ') {
+        res.ptr++;
+      }
+      if (res.ptr < value_end) {
+        result.found_invalid = true;
+        return result;
+      }
+    }
+    result.data.append(value);
+  }
+  return result;
+}
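+
+/* Illustrative example of the per-column type inference below (hypothetical values): a column
+ * containing "1", "2", "3" stays an integer column; a column containing "1", "2.5" hits the '.'
+ * while parsing ints, sets `found_float` and is re-parsed as floats; a column containing "x"
+ * sets `found_invalid` and is skipped entirely. */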
+
+static ChunkResult parse_records_chunk(const csv_parse::CsvRecords &records,
+                                       MutableSpan<ColumnInfo> columns_info)
+{
+  const int columns_num = columns_info.size();
+  ChunkResult chunk_result;
+  chunk_result.rows_num = records.size();
+  chunk_result.columns.resize(columns_num);
+  for (const int column_i : IndexRange(columns_num)) {
+    ColumnInfo &column_info = columns_info[column_i];
+    if (column_info.has_invalid_name) {
+      /* Column can be ignored. */
+      continue;
+    }
+    if (column_info.found_invalid.load(std::memory_order_relaxed)) {
+      /* Invalid values have been found in this column already, skip it. */
+      continue;
+    }
+    /* A float was found in this column already, so parse everything as floats. */
+    const bool found_float = column_info.found_float.load(std::memory_order_relaxed);
+    if (found_float) {
+      ParseFloatColumnResult float_column_result = parse_column_as_floats(records, column_i);
+      if (float_column_result.found_invalid) {
+        column_info.found_invalid.store(true, std::memory_order_relaxed);
+        continue;
+      }
+      chunk_result.columns[column_i] = std::move(float_column_result.data);
+      continue;
+    }
+    /* No float was found so far in this column, so attempt to parse it as integers. */
+    ParseIntColumnResult int_column_result = parse_column_as_ints(records, column_i);
+    if (int_column_result.found_invalid) {
+      column_info.found_invalid.store(true, std::memory_order_relaxed);
+      continue;
+    }
+    if (!int_column_result.found_float) {
+      chunk_result.columns[column_i] = std::move(int_column_result.data);
+      column_info.found_int.store(true, std::memory_order_relaxed);
+      continue;
+    }
+    /* While parsing it as integers, floats were detected. So parse it as floats again. */
+    column_info.found_float.store(true, std::memory_order_relaxed);
+    ParseFloatColumnResult float_column_result = parse_column_as_floats(records, column_i);
+    if (float_column_result.found_invalid) {
+      column_info.found_invalid.store(true, std::memory_order_relaxed);
+      continue;
+    }
+    chunk_result.columns[column_i] = std::move(float_column_result.data);
+  }
+  return chunk_result;
+}
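+
+/* Illustrative example of the chunk-to-buffer mapping used below (hypothetical sizes): if three
+ * chunks parsed 3, 2 and 4 rows, the chunk offsets are [0, 3, 5, 9]; chunk 1 then writes its 2
+ * values into rows [3, 5) of the flattened 9-element attribute buffer. */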
+
+/**
+ * So far, the parsed data is still split into many chunks. This function flattens the chunks
+ * into contiguous buffers that can be used as attributes.
+ */
+static Array<std::optional<GArray<>>> flatten_valid_attribute_chunks(
+    const Span<ColumnInfo> columns_info,
+    const OffsetIndices<int> chunk_offsets,
+    MutableSpan<ChunkResult> chunks)
+{
+  const int points_num = chunk_offsets.total_size();
+  Array<std::optional<GArray<>>> flattened_attributes(columns_info.size());
+
+  threading::parallel_for(columns_info.index_range(), 1, [&](const IndexRange columns_range) {
+    for (const int column_i : columns_range) {
+      const ColumnInfo &column_info = columns_info[column_i];
+      if (column_info.has_invalid_name || column_info.found_invalid) {
+        /* Column can be ignored. */
+        continue;
+      }
+      if (column_info.found_float) {
+        /* Read the column as floats. */
+        GArray<> attribute(CPPType::get<float>(), points_num);
+        float *attribute_buffer = static_cast<float *>(attribute.data());
+        threading::parallel_for(chunks.index_range(), 1, [&](const IndexRange chunks_range) {
+          for (const int chunk_i : chunks_range) {
+            const IndexRange dst_range = chunk_offsets[chunk_i];
+            ChunkResult &chunk = chunks[chunk_i];
+            ColumnData &column_data = chunk.columns[column_i];
+            if (const auto *float_vec = std::get_if<Vector<float>>(&column_data)) {
+              BLI_assert(float_vec->size() == dst_range.size());
+              uninitialized_copy_n(
+                  float_vec->data(), dst_range.size(), attribute_buffer + dst_range.first());
+            }
+            else if (const auto *int_vec = std::get_if<Vector<int>>(&column_data)) {
+              /* This chunk was read entirely as integers, so it still has to be converted to
+               * floats. */
+              BLI_assert(int_vec->size() == dst_range.size());
+              uninitialized_convert_n(
+                  int_vec->data(), dst_range.size(), attribute_buffer + dst_range.first());
+            }
+            else {
+              /* Expected data to be available, because the `found_invalid` flag was not set. */
+              BLI_assert_unreachable();
+            }
+            /* Free the data for this chunk. */
+            column_data = std::monostate{};
+          }
+        });
+        flattened_attributes[column_i] = std::move(attribute);
+        continue;
+      }
+      if (column_info.found_int) {
+        /* Read the column as ints. */
+        GArray<> attribute(CPPType::get<int>(), points_num);
+        int *attribute_buffer = static_cast<int *>(attribute.data());
+        threading::parallel_for(chunks.index_range(), 1, [&](const IndexRange chunks_range) {
+          for (const int chunk_i : chunks_range) {
+            const IndexRange dst_range = chunk_offsets[chunk_i];
+            ChunkResult &chunk = chunks[chunk_i];
+            ColumnData &column_data = chunk.columns[column_i];
+            if (const auto *int_vec = std::get_if<Vector<int>>(&column_data)) {
+              BLI_assert(int_vec->size() == dst_range.size());
+              uninitialized_copy_n(
+                  int_vec->data(), dst_range.size(), attribute_buffer + dst_range.first());
+            }
+            else {
+              /* Expected data to be available, because the `found_invalid` and `found_float`
+               * flags were not set. */
+              BLI_assert_unreachable();
+            }
+            /* Free the data for this chunk. */
+            column_data = std::monostate{};
+          }
+        });
+        flattened_attributes[column_i] = std::move(attribute);
+        continue;
+      }
+    }
+  });
+  return flattened_attributes;
+}
 
 PointCloud *import_csv_as_point_cloud(const CSVImportParams &import_params)
@@ -220,62 +270,82 @@ PointCloud *import_csv_as_point_cloud(const CSVImportParams &import_params)
                 import_params.filepath);
     return nullptr;
   }
   BLI_SCOPED_DEFER([&]() { MEM_freeN(buffer); });
-
-  StringRef buffer_str{static_cast<char *>(buffer), int64_t(buffer_len)};
-  if (buffer_str.is_empty()) {
+  if (buffer_len == 0) {
     BKE_reportf(
         import_params.reports, RPT_ERROR, "CSV Import: empty file '%s'", import_params.filepath);
     return nullptr;
   }
 
-  const StringRef header = read_next_line(buffer_str);
-  const Vector<std::string> names = parse_column_names(header);
+  Array<ColumnInfo> columns_info;
 
-  if (buffer_str.is_empty()) {
+  const auto parse_header = [&](const csv_parse::CsvRecord &record) {
+    columns_info.reinitialize(record.size());
+    for (const int i : record.index_range()) {
+      ColumnInfo &column_info = columns_info[i];
+      const StringRef name = record.field_str(i);
+      column_info.name = name;
+      if (!bke::allow_procedural_attribute_access(name) ||
+          bke::attribute_name_is_anonymous(name) || name.is_empty())
+      {
+        column_info.has_invalid_name = true;
+        continue;
+      }
+    }
+  };
+  const auto parse_data_chunk = [&](const csv_parse::CsvRecords &records) {
+    return parse_records_chunk(records, columns_info);
+  };
+
+  const Span<char> buffer_span{static_cast<char *>(buffer), int64_t(buffer_len)};
+  csv_parse::CsvParseOptions parse_options;
+  std::optional<Vector<ChunkResult>> parsed_chunks =
+      csv_parse::parse_csv_in_chunks<ChunkResult>(
+          buffer_span, parse_options, parse_header, parse_data_chunk);
+
+  if (!parsed_chunks.has_value()) {
     BKE_reportf(import_params.reports,
                 RPT_ERROR,
-                "CSV Import: no rows in file '%s'",
+                "CSV import: failed to parse file '%s'",
                 import_params.filepath);
     return nullptr;
   }
 
-  /* Shallow copy buffer to preserve pointers from first row for parsing */
-  const StringRef data_buffer(buffer_str.begin(), buffer_str.end());
-
-  const StringRef first_row = read_next_line(buffer_str);
-
-  Vector<eCustomDataType> column_types;
-  if (!get_column_types(first_row, column_types)) {
-    std::string column_name = names[column_types.size()];
-    BKE_reportf(import_params.reports,
-                RPT_ERROR,
-                "CSV Import: file '%s', Column %s is of unsupported data type",
-                import_params.filepath,
-                column_name.c_str());
-    return nullptr;
+  /* Count the total number of records and compute the offset of each chunk which is used when
+   * flattening the parsed data. */
+  Vector<int> chunk_offsets_vec;
+  chunk_offsets_vec.append(0);
+  for (const ChunkResult &chunk : *parsed_chunks) {
+    chunk_offsets_vec.append(chunk_offsets_vec.last() + chunk.rows_num);
   }
+  const OffsetIndices<int> chunk_offsets(chunk_offsets_vec);
+  const int points_num = chunk_offsets_vec.last();
 
-  const int64_t rows_num = get_row_count(buffer_str);
+  PointCloud *pointcloud = BKE_pointcloud_new_nomain(points_num);
 
-  PointCloud *pointcloud = BKE_pointcloud_new_nomain(rows_num);
-  pointcloud->positions_for_write().fill(float3(0));
-
-  Array<bke::GSpanAttributeWriter> attribute_writers(names.size());
-  Array<GMutableSpan> attribute_data(names.size());
+  Array<std::optional<GArray<>>> flattened_attributes;
+  threading::memory_bandwidth_bound_task(points_num * 16, [&]() {
+    threading::parallel_invoke([&]() { pointcloud->positions_for_write().fill(float3(0)); },
+                               [&]() {
+                                 flattened_attributes = flatten_valid_attribute_chunks(
+                                     columns_info, chunk_offsets, *parsed_chunks);
+                               });
+  });
 
+  /* Add all valid attributes to the pointcloud. */
   bke::MutableAttributeAccessor attributes = pointcloud->attributes_for_write();
-  for (const int i : names.index_range()) {
-    attribute_writers[i] = attributes.lookup_or_add_for_write_span(
-        names[i], bke::AttrDomain::Point, column_types[i]);
-    attribute_data[i] = attribute_writers[i].span;
-  }
-
-  parse_csv_data(attribute_data, column_types, names, data_buffer, import_params);
-
-  for (bke::GSpanAttributeWriter &attr : attribute_writers) {
-    attr.finish();
+  for (const int column_i : columns_info.index_range()) {
+    std::optional<GArray<>> &attribute = flattened_attributes[column_i];
+    if (!attribute.has_value()) {
+      continue;
+    }
+    const auto *data = new ImplicitSharedValue<GArray<>>(std::move(*attribute));
+    const eCustomDataType type = bke::cpp_type_to_custom_data_type(data->data.type());
+    const ColumnInfo &column_info = columns_info[column_i];
+    attributes.add(column_info.name,
+                   bke::AttrDomain::Point,
+                   type,
+                   bke::AttributeInitShared{data->data.data(), *data});
+    data->remove_user_and_delete_if_last();
   }
 
   return pointcloud;