/* SPDX-FileCopyrightText: 2025 Blender Authors
 *
 * SPDX-License-Identifier: GPL-2.0-or-later */

#pragma once

/** \file
 * \ingroup bli
 */

#include <cstdint>
#include <optional>
#include <utility>

#include "BLI_any.hh"
#include "BLI_function_ref.hh"
#include "BLI_linear_allocator.hh"
#include "BLI_offset_indices.hh"
#include "BLI_string_ref.hh"
#include "BLI_vector.hh"

namespace blender::csv_parse {

/**
 * Contains the fields of a single record of a .csv file. Usually that corresponds to a single
 * line.
 *
 * This is a non-owning view: it only stores spans, not the referenced bytes.
 */
class CsvRecord {
 private:
  Span<Span<char>> fields_;

 public:
  CsvRecord(Span<Span<char>> fields);

  /** Number of fields in the record. */
  int64_t size() const;
  /** Index range covering all fields of the record. */
  IndexRange index_range() const;

  /** Get the field at the given index. Empty data is returned if the index is too large. */
  Span<char> field(const int64_t index) const;
  /** Same as #field, but the bytes are returned as a #StringRef. */
  StringRef field_str(const int64_t index) const;
};

/**
 * Contains the fields of multiple records.
 *
 * The fields of all records are stored in one flat span, `offsets_` maps a record index to the
 * range of its fields within that span.
 */
class CsvRecords {
 private:
  /* NOTE(review): the offset type is assumed to be `int64_t`, matching the `int64_t` indices
   * used throughout this API -- confirm against the implementation file. */
  OffsetIndices<int64_t> offsets_;
  Span<Span<char>> fields_;

 public:
  CsvRecords(OffsetIndices<int64_t> offsets, Span<Span<char>> fields);

  /** Number of records (rows). */
  int64_t size() const;
  /** Index range covering all records. */
  IndexRange index_range() const;

  /** Get the record at the given index. */
  CsvRecord record(const int64_t index) const;
};

struct CsvParseOptions {
  /** The character that separates fields within a row. */
  char delimiter = ',';
  /**
   * The character that can be used to enclose fields which contain the delimiter or span multiple
   * lines.
   */
  char quote = '"';
  /**
   * Characters that can be used to escape the quote character. By default, "" or \" both represent
   * an escaped quote.
   */
  Span<char> quote_escape_chars = Span<char>(StringRef("\"\\"));
  /** Approximate number of bytes per chunk that the input is split into. */
  int64_t chunk_size_bytes = 64 * 1024;
};

/**
 * Parses a `.csv` file. There are two important aspects to the way this interface is designed:
 * 1. It allows the file to be split into chunks that can be parsed in parallel.
 * 2. Splitting the file into individual records and fields is separated from parsing the actual
 *    content into e.g. floats. This simplifies the implementation of both parts because the
 *    logical parsing does not have to worry about e.g. the delimiter or quote characters. It also
 *    simplifies unit testing.
 *
 * \param buffer: The buffer containing the `.csv` file.
 * \param options: Options that control how the file is parsed.
 * \param process_header: A function that is called at most once and contains the fields of the
 *   first row/record.
 * \param process_records: A function that is called potentially many times in parallel and that
 *   processes a chunk of parsed records. Typically this function parses raw byte fields into e.g.
 *   ints or floats. The result of the parsing process has to be returned. Note that under specific
 *   circumstances, this function may be called twice for the same records. That can happen when
 *   the `.csv` file contains multi-line fields which were split incorrectly at first.
 * \return A vector containing the return values of the `process_records` function in the correct
 *   order. #std::nullopt is returned if the file was malformed, e.g. if it has a quoted field that
 *   is not closed.
 */
std::optional<Vector<Any<>>> parse_csv_in_chunks(
    const Span<char> buffer,
    const CsvParseOptions &options,
    FunctionRef<void(const CsvRecord &record)> process_header,
    FunctionRef<Any<>(const CsvRecords &records)> process_records);

/**
 * Same as above, but uses a templated chunk type instead of using #Any which can be more
 * convenient to use.
 *
 * Every value returned by `process_records` is wrapped into an #Any for the type-erased overload
 * and moved back out into the typed result vector afterwards.
 */
template<typename ChunkT>
inline std::optional<Vector<ChunkT>> parse_csv_in_chunks(
    const Span<char> buffer,
    const CsvParseOptions &options,
    FunctionRef<void(const CsvRecord &record)> process_header,
    FunctionRef<ChunkT(const CsvRecords &records)> process_records)
{
  std::optional<Vector<Any<>>> result = parse_csv_in_chunks(
      buffer, options, process_header, [&](const CsvRecords &records) {
        return Any<>(process_records(records));
      });
  if (!result.has_value()) {
    /* Malformed input, forward the failure. */
    return std::nullopt;
  }
  Vector<ChunkT> result_chunks;
  result_chunks.reserve(result->size());
  for (Any<> &value : *result) {
    result_chunks.append(std::move(value.get<ChunkT>()));
  }
  return result_chunks;
}

/**
 * Fields in a CSV file may contain escaped quote characters (e.g. "" or \").
 * This function replaces these with just the quote character.
 * The returned string may reference the input string if it's the same.
 * Otherwise the returned string is allocated in the given allocator.
 */
StringRef unescape_field(const StringRef str,
                         const CsvParseOptions &options,
                         LinearAllocator<> &allocator);

/* -------------------------------------------------------------------- */
/** \name #CsvRecord inline functions.
 * \{ */

inline CsvRecord::CsvRecord(Span<Span<char>> fields) : fields_(fields) {}

inline int64_t CsvRecord::size() const
{
  return fields_.size();
}

inline IndexRange CsvRecord::index_range() const
{
  return fields_.index_range();
}

inline Span<char> CsvRecord::field(const int64_t index) const
{
  BLI_assert(index >= 0);
  /* Indices past the end are tolerated and yield an empty field. */
  if (index >= fields_.size()) {
    return {};
  }
  return fields_[index];
}

inline StringRef CsvRecord::field_str(const int64_t index) const
{
  const Span<char> value = this->field(index);
  return StringRef(value.data(), value.size());
}

/** \} */

/* -------------------------------------------------------------------- */
/** \name #CsvRecords inline functions.
 * \{ */

inline CsvRecords::CsvRecords(const OffsetIndices<int64_t> offsets,
                              const Span<Span<char>> fields)
    : offsets_(offsets), fields_(fields)
{
}

inline int64_t CsvRecords::size() const
{
  return offsets_.size();
}

inline IndexRange CsvRecords::index_range() const
{
  return offsets_.index_range();
}

inline CsvRecord CsvRecords::record(const int64_t index) const
{
  /* The offsets give the range of this record's fields within the flat field span. */
  return CsvRecord(fields_.slice(offsets_[index]));
}

/** \} */

/* -------------------------------------------------------------------- */
/** \name Internal functions exposed for testing.
 * \{ */

namespace detail {

/**
 * Find the index that ends the current field, i.e. the index of the next delimiter or newline.
 * The start index has to be the index of the first character in the field. It may also be the
 * end of the field already if it is empty.
 *
 * \param start: The index of the first character in the field. This may also be the end of the
 *   field already if it is empty.
 * \param delimiter: The character that ends the field.
 * \return Index of the next delimiter, a newline character or the end of the buffer.
 */
int64_t find_end_of_simple_field(Span<char> buffer, int64_t start, char delimiter);

/**
 * Find the index of the quote that ends the current field.
 *
 * \param start: The index after the opening quote.
 * \param quote: The quote character that ends the field.
 * \param escape_chars: The characters that may be used to escape the quote character.
 * \return Index of the quote character that ends the field, or std::nullopt if the field is
 *   malformed and does not have an end.
 */
std::optional<int64_t> find_end_of_quoted_field(Span<char> buffer,
                                                int64_t start,
                                                char quote,
                                                Span<char> escape_chars);

/**
 * Finds all fields for the record starting at the given index. Typically, the record ends with a
 * newline, but quoted multi-line records are supported as well.
 *
 * \param r_fields: Output vector for the fields that were found.
 * \return Index of the start of the next record or the end of the buffer. #std::nullopt is
 *   returned if the buffer has a malformed record at the end, i.e. a quoted field that is not
 *   closed.
 */
std::optional<int64_t> parse_record_fields(const Span<char> buffer,
                                           const int64_t start,
                                           const char delimiter,
                                           const char quote,
                                           const Span<char> quote_escape_chars,
                                           Vector<Span<char>> &r_fields);

}  // namespace detail

/** \} */

}  // namespace blender::csv_parse