Files
test/source/blender/blenlib/BLI_csv_parse.hh
2025-03-29 15:18:50 +11:00

248 lines
8.2 KiB
C++

/* SPDX-FileCopyrightText: 2025 Blender Authors
*
* SPDX-License-Identifier: GPL-2.0-or-later */
/** \file
* \ingroup bli
*/
#pragma once

#include <cstdint>
#include <optional>
#include <utility>

#include "BLI_any.hh"
#include "BLI_function_ref.hh"
#include "BLI_linear_allocator.hh"
#include "BLI_offset_indices.hh"
#include "BLI_string_ref.hh"
#include "BLI_vector.hh"
namespace blender::csv_parse {
/**
* Contains the fields of a single record of a .csv file. Usually that corresponds to a single
* line.
*
* This is a thin wrapper around a span of fields; each field is itself a span of characters.
*/
class CsvRecord {
private:
/** The fields of this record, exactly as passed to the constructor. */
Span<Span<char>> fields_;
public:
/** Create a record from the given span of fields. */
CsvRecord(Span<Span<char>> fields);
/** Number of fields in the record. */
int64_t size() const;
/** Index range of the stored fields. */
IndexRange index_range() const;
/**
* Get the field at the given index. Empty data is returned if the index is too large.
* The index must not be negative; this is checked with an assert.
*/
Span<char> field(const int64_t index) const;
/** Same lookup as #field, but the characters are returned as a #StringRef. */
StringRef field_str(const int64_t index) const;
};
/**
* Contains the fields of multiple records.
*
* The fields of all records are stored in a single span. The offsets determine which slice of
* that span belongs to which record.
*/
class CsvRecords {
private:
/** `offsets_[i]` is the range of #fields_ that makes up record `i`. */
OffsetIndices<int64_t> offsets_;
/** The fields of all records. */
Span<Span<char>> fields_;
public:
/** Create from the given per-record offsets and the span of fields they index into. */
CsvRecords(OffsetIndices<int64_t> offsets, Span<Span<char>> fields);
/** Number of records (rows). */
int64_t size() const;
/** Index range of the records, as reported by the offsets. */
IndexRange index_range() const;
/** Get the record at the given index. */
CsvRecord record(const int64_t index) const;
};
/**
* Options that control how a `.csv` buffer is parsed. Every member has a default, so a
* default-constructed instance can be used as is.
*/
struct CsvParseOptions {
/** The character that separates fields within a row. Defaults to a comma. */
char delimiter = ',';
/**
* The character that can be used to enclose fields which contain the delimiter or span multiple
* lines. Defaults to a double quote.
*/
char quote = '"';
/**
* Characters that can be used to escape the quote character. By default, "" or \" both represent
* an escaped quote.
*/
Span<char> quote_escape_chars = Span<char>(StringRef("\"\\"));
/** Approximate number of bytes per chunk that the input is split into. Defaults to 64 KiB. */
int64_t chunk_size_bytes = 64 * 1024;
};
/**
* Parses a `.csv` file. There are two important aspects to the way this interface is designed:
* 1. It allows the file to be split into chunks that can be parsed in parallel. The approximate
* chunk size is given by #CsvParseOptions::chunk_size_bytes.
* 2. Splitting the file into individual records and fields is separated from parsing the actual
* content into e.g. floats. This simplifies the implementation of both parts because the
* logical parsing does not have to worry about e.g. the delimiter or quote characters. It also
* simplifies unit testing.
*
* \param buffer: The buffer containing the `.csv` file.
* \param options: Options that control how the file is parsed.
* \param process_header: A function that is called at most once. It receives the fields of the
* first row/record.
* \param process_records: A function that is called potentially many times in parallel and that
* processes a chunk of parsed records. Typically this function parses raw byte fields into e.g.
* ints or floats. The result of the parsing process has to be returned. Note that under specific
* circumstances, this function may be called twice for the same records. That can happen when
* the `.csv` file contains multi-line fields which were split incorrectly at first. Because of
* both points, the callback has to be safe to call concurrently and repeatedly.
* \return A vector containing the return values of the `process_records` function in the correct
* order. #std::nullopt is returned if the file was malformed, e.g.
* if it has a quoted field that is not closed.
*/
std::optional<Vector<Any<>>> parse_csv_in_chunks(
const Span<char> buffer,
const CsvParseOptions &options,
FunctionRef<void(const CsvRecord &record)> process_header,
FunctionRef<Any<>(const CsvRecords &records)> process_records);
/**
 * Typed convenience overload of #parse_csv_in_chunks. Instead of #Any, the callback returns a
 * `ChunkT` for every chunk of records, which is usually more convenient.
 *
 * Internally every callback result is wrapped in #Any, passed through the type-erased overload
 * and moved back out into a `Vector<ChunkT>` afterwards.
 *
 * \return The per-chunk results in the order delivered by the type-erased overload, or
 * #std::nullopt if that overload returned no value.
 */
template<typename ChunkT>
inline std::optional<Vector<ChunkT>> parse_csv_in_chunks(
    const Span<char> buffer,
    const CsvParseOptions &options,
    FunctionRef<void(const CsvRecord &record)> process_header,
    FunctionRef<ChunkT(const CsvRecords &records)> process_records)
{
  /* Adapter that hides the concrete chunk type behind #Any. */
  auto process_records_erased = [&](const CsvRecords &records) {
    return Any<>(process_records(records));
  };
  std::optional<Vector<Any<>>> erased_chunks = parse_csv_in_chunks(
      buffer, options, process_header, process_records_erased);
  if (!erased_chunks) {
    return std::nullopt;
  }

  /* Unwrap the results again; the values are moved out of their #Any containers. */
  Vector<ChunkT> typed_chunks;
  typed_chunks.reserve(erased_chunks->size());
  for (Any<> &erased_chunk : *erased_chunks) {
    typed_chunks.append(std::move(erased_chunk.get<ChunkT>()));
  }
  return typed_chunks;
}
/**
* Fields in a CSV file may contain escaped quote characters (e.g. "" or \").
* This function replaces these with just the quote character.
*
* \param str: The field that may contain escaped quote characters.
* \param options: The parse options. Presumably the `quote` and `quote_escape_chars` members are
* the relevant ones; the implementation is not part of this header.
* \param allocator: The allocator that owns the result when a new string has to be created.
* \return The unescaped field. The returned string may reference the input string if it's the
* same. Otherwise the returned string is allocated in the given allocator.
*/
StringRef unescape_field(const StringRef str,
const CsvParseOptions &options,
LinearAllocator<> &allocator);
/* -------------------------------------------------------------------- */
/** \name #CsvRecord inline functions.
* \{ */
/** Stores the given span of fields; nothing else is done with it here. */
inline CsvRecord::CsvRecord(Span<Span<char>> fields) : fields_(fields)
{
}
/** The number of fields is the size of the stored span. */
inline int64_t CsvRecord::size() const
{
  const int64_t field_count = fields_.size();
  return field_count;
}
/** Forwards to the index range of the stored span of fields. */
inline IndexRange CsvRecord::index_range() const
{
  const IndexRange field_indices = fields_.index_range();
  return field_indices;
}
/**
 * Looks up a single field. An index past the last field yields an empty span instead of an
 * error. Negative indices are not allowed.
 */
inline Span<char> CsvRecord::field(const int64_t index) const
{
  BLI_assert(index >= 0);
  const bool is_in_range = index < fields_.size();
  return is_in_range ? fields_[index] : Span<char>();
}
/** Same lookup as #field, with the characters repackaged as a #StringRef. */
inline StringRef CsvRecord::field_str(const int64_t index) const
{
  const Span<char> characters = this->field(index);
  const char *first_char = characters.data();
  const int64_t length = characters.size();
  return StringRef(first_char, length);
}
/** \} */
/* -------------------------------------------------------------------- */
/** \name #CsvRecords inline functions.
* \{ */
/** Stores the per-record offsets together with the span of fields they index into. */
inline CsvRecords::CsvRecords(const OffsetIndices<int64_t> offsets,
                              const Span<Span<char>> fields)
    : offsets_(offsets), fields_(fields)
{
}
/** The number of records is the size reported by the offsets. */
inline int64_t CsvRecords::size() const
{
  const int64_t record_count = offsets_.size();
  return record_count;
}
/** Forwards to the index range reported by the offsets. */
inline IndexRange CsvRecords::index_range() const
{
  const IndexRange record_indices = offsets_.index_range();
  return record_indices;
}
/** Builds the record at `index` from the slice of fields that the offsets assign to it. */
inline CsvRecord CsvRecords::record(const int64_t index) const
{
  const IndexRange field_range = offsets_[index];
  const Span<Span<char>> record_fields = fields_.slice(field_range);
  return CsvRecord(record_fields);
}
/** \} */
/* -------------------------------------------------------------------- */
/** \name Internal functions exposed for testing.
* \{ */
namespace detail {
/**
* Find the index that ends the current field, i.e. the index of the next delimiter or newline.
*
* \param buffer: The characters that `start` and the returned index refer to.
* \param start: The index of the first character in the field. This may also be the end of the
* field already if it is empty.
* \param delimiter: The character that ends the field.
* \return Index of the next delimiter, a newline character or the end of the buffer.
*/
int64_t find_end_of_simple_field(Span<char> buffer, int64_t start, char delimiter);
/**
* Find the index of the quote that ends the current field.
*
* \param buffer: The characters that `start` and the returned index refer to.
* \param start: The index after the opening quote.
* \param quote: The quote character that ends the field.
* \param escape_chars: The characters that may be used to escape the quote character.
* \return Index of the quote character that ends the field, or std::nullopt if the field is
* malformed and does not have an end.
*/
std::optional<int64_t> find_end_of_quoted_field(Span<char> buffer,
int64_t start,
char quote,
Span<char> escape_chars);
/**
* Finds all fields for the record starting at the given index. Typically, the record ends with a
* newline, but quoted multi-line records are supported as well.
*
* \param buffer: The characters that `start` and the returned index refer to.
* \param start: The index at which the record starts.
* \param delimiter: Presumably the character that separates fields, like
* #CsvParseOptions::delimiter (implementation not visible in this header).
* \param quote: Presumably the character that encloses quoted fields, like
* #CsvParseOptions::quote.
* \param quote_escape_chars: Presumably the characters that may escape the quote character, like
* #CsvParseOptions::quote_escape_chars.
* \param r_fields: Output vector for the fields that were found. Whether previous content is kept
* or cleared cannot be told from this declaration.
* \return Index of the start of the next record or the end of the buffer. #std::nullopt is
* returned if the buffer has a malformed record at the end,
* i.e. a quoted field that is not closed.
*/
std::optional<int64_t> parse_record_fields(const Span<char> buffer,
const int64_t start,
const char delimiter,
const char quote,
const Span<char> quote_escape_chars,
Vector<Span<char>> &r_fields);
} // namespace detail
/** \} */
} // namespace blender::csv_parse