248 lines
8.2 KiB
C++
248 lines
8.2 KiB
C++
/* SPDX-FileCopyrightText: 2025 Blender Authors
|
|
*
|
|
* SPDX-License-Identifier: GPL-2.0-or-later */
|
|
|
|
/** \file
|
|
* \ingroup bli
|
|
*/
|
|
|
|
#include "BLI_any.hh"
|
|
#include "BLI_function_ref.hh"
|
|
#include "BLI_linear_allocator.hh"
|
|
#include "BLI_offset_indices.hh"
|
|
#include "BLI_string_ref.hh"
|
|
#include "BLI_vector.hh"
|
|
|
|
namespace blender::csv_parse {
|
|
|
|
/**
|
|
* Contains the fields of a single record of a .csv file. Usually that corresponds to a single
|
|
* line.
|
|
*/
|
|
class CsvRecord {
|
|
private:
|
|
Span<Span<char>> fields_;
|
|
|
|
public:
|
|
CsvRecord(Span<Span<char>> fields);
|
|
|
|
/** Number of fields in the record. */
|
|
int64_t size() const;
|
|
IndexRange index_range() const;
|
|
|
|
/** Get the field at the given index. Empty data is returned if the index is too large. */
|
|
Span<char> field(const int64_t index) const;
|
|
StringRef field_str(const int64_t index) const;
|
|
};
|
|
|
|
/**
|
|
* Contains the fields of multiple records.
|
|
*/
|
|
class CsvRecords {
|
|
private:
|
|
OffsetIndices<int64_t> offsets_;
|
|
Span<Span<char>> fields_;
|
|
|
|
public:
|
|
CsvRecords(OffsetIndices<int64_t> offsets, Span<Span<char>> fields);
|
|
|
|
/** Number of records (rows). */
|
|
int64_t size() const;
|
|
IndexRange index_range() const;
|
|
|
|
/** Get the record at the given index. */
|
|
CsvRecord record(const int64_t index) const;
|
|
};
|
|
|
|
struct CsvParseOptions {
|
|
/** The character that separates fields within a row. */
|
|
char delimiter = ',';
|
|
/**
|
|
* The character that can be used to enclose fields which contain the delimiter or span multiple
|
|
* lines.
|
|
*/
|
|
char quote = '"';
|
|
/**
|
|
* Characters that can be used to escape the quote character. By default, "" or \" both represent
|
|
* an escaped quote.
|
|
*/
|
|
Span<char> quote_escape_chars = Span<char>(StringRef("\"\\"));
|
|
/** Approximate number of bytes per chunk that the input is split into. */
|
|
int64_t chunk_size_bytes = 64 * 1024;
|
|
};
|
|
|
|
/**
|
|
* Parses a `.csv` file. There are two important aspects to the way this interface is designed:
|
|
* 1. It allows the file to be split into chunks that can be parsed in parallel.
|
|
* 2. Splitting the file into individual records and fields is separated from parsing the actual
|
|
* content into e.g. floats. This simplifies the implementation of both parts because the
|
|
* logical parsing does not have to worry about e.g. the delimiter or quote characters. It also
|
|
* simplifies unit testing.
|
|
*
|
|
* \param buffer: The buffer containing the `.csv` file.
|
|
* \param options: Options that control how the file is parsed.
|
|
* \param process_header: A function that is called at most once and contains the fields of the
|
|
* first row/record.
|
|
* \param process_records: A function that is called potentially many times in parallel and that
|
|
* processes a chunk of parsed records. Typically this function parses raw byte fields into e.g.
|
|
* ints or floats. The result of the parsing process has to be returned. Note that under specific
|
|
* circumstances, this function may be called twice for the same records. That can happen when
|
|
* the `.csv` file contains multi-line fields which were split incorrectly at first.
|
|
* \return A vector containing the return values of the `process_records` function in the correct
|
|
* order. #std::nullopt is returned if the file was malformed, e.g.
|
|
* if it has a quoted field that is not closed.
|
|
*/
|
|
std::optional<Vector<Any<>>> parse_csv_in_chunks(
|
|
const Span<char> buffer,
|
|
const CsvParseOptions &options,
|
|
FunctionRef<void(const CsvRecord &record)> process_header,
|
|
FunctionRef<Any<>(const CsvRecords &records)> process_records);
|
|
|
|
/**
|
|
* Same as above, but uses a templated chunk type instead of using #Any which can be more
|
|
* convenient to use.
|
|
*/
|
|
template<typename ChunkT>
|
|
inline std::optional<Vector<ChunkT>> parse_csv_in_chunks(
|
|
const Span<char> buffer,
|
|
const CsvParseOptions &options,
|
|
FunctionRef<void(const CsvRecord &record)> process_header,
|
|
FunctionRef<ChunkT(const CsvRecords &records)> process_records)
|
|
{
|
|
std::optional<Vector<Any<>>> result = parse_csv_in_chunks(
|
|
buffer, options, process_header, [&](const CsvRecords &records) {
|
|
return Any<>(process_records(records));
|
|
});
|
|
if (!result.has_value()) {
|
|
return std::nullopt;
|
|
}
|
|
Vector<ChunkT> result_chunks;
|
|
result_chunks.reserve(result->size());
|
|
for (Any<> &value : *result) {
|
|
result_chunks.append(std::move(value.get<ChunkT>()));
|
|
}
|
|
return result_chunks;
|
|
}
|
|
|
|
/**
|
|
* Fields in a CSV file may contain escaped quote characters (e.g. "" or \").
|
|
* This function replaces these with just the quote character.
|
|
* The returned string may be reference the input string if it's the same.
|
|
* Otherwise the returned string is allocated in the given allocator.
|
|
*/
|
|
StringRef unescape_field(const StringRef str,
|
|
const CsvParseOptions &options,
|
|
LinearAllocator<> &allocator);
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
/** \name #CsvRecord inline functions.
|
|
* \{ */
|
|
|
|
inline CsvRecord::CsvRecord(Span<Span<char>> fields) : fields_(fields) {}
|
|
|
|
inline int64_t CsvRecord::size() const
|
|
{
|
|
return fields_.size();
|
|
}
|
|
|
|
inline IndexRange CsvRecord::index_range() const
|
|
{
|
|
return fields_.index_range();
|
|
}
|
|
|
|
inline Span<char> CsvRecord::field(const int64_t index) const
|
|
{
|
|
BLI_assert(index >= 0);
|
|
if (index >= fields_.size()) {
|
|
return {};
|
|
}
|
|
return fields_[index];
|
|
}
|
|
|
|
inline StringRef CsvRecord::field_str(const int64_t index) const
|
|
{
|
|
const Span<char> value = this->field(index);
|
|
return StringRef(value.data(), value.size());
|
|
}
|
|
|
|
/** \} */
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
/** \name #CsvRecords inline functions.
|
|
* \{ */
|
|
|
|
inline CsvRecords::CsvRecords(const OffsetIndices<int64_t> offsets, const Span<Span<char>> fields)
|
|
: offsets_(offsets), fields_(fields)
|
|
{
|
|
}
|
|
|
|
inline int64_t CsvRecords::size() const
|
|
{
|
|
return offsets_.size();
|
|
}
|
|
|
|
inline IndexRange CsvRecords::index_range() const
|
|
{
|
|
return offsets_.index_range();
|
|
}
|
|
|
|
inline CsvRecord CsvRecords::record(const int64_t index) const
|
|
{
|
|
return CsvRecord(fields_.slice(offsets_[index]));
|
|
}
|
|
|
|
/** \} */
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
/** \name Internal functions exposed for testing.
|
|
* \{ */
|
|
|
|
namespace detail {
|
|
|
|
/**
|
|
* Find the index that ends the current field, i.e. the index of the next delimiter of newline.
|
|
* The start index has to be the index of the first character in the field. It may also be the
|
|
* end of the field already if it is empty.
|
|
*
|
|
* \param start: The index of the first character in the field. This may also be the end of the
|
|
* field already if it is empty.
|
|
* \param delimiter: The character that ends the field.
|
|
* \return Index of the next delimiter, a newline character or the end of the buffer.
|
|
*/
|
|
int64_t find_end_of_simple_field(Span<char> buffer, int64_t start, char delimiter);
|
|
|
|
/**
|
|
* Find the index of the quote that ends the current field.
|
|
*
|
|
* \param start: The index after the opening quote.
|
|
* \param quote: The quote character that ends the field.
|
|
* \param escape_chars: The characters that may be used to escape the quote character.
|
|
* \return Index of the quote character that ends the field, or std::nullopt if the field is
|
|
* malformed and does not have an end.
|
|
*/
|
|
std::optional<int64_t> find_end_of_quoted_field(Span<char> buffer,
|
|
int64_t start,
|
|
char quote,
|
|
Span<char> escape_chars);
|
|
|
|
/**
|
|
* Finds all fields for the record starting at the given index. Typically, the record ends with a
|
|
* newline, but quoted multi-line records are supported as well.
|
|
*
|
|
* \return Index of the start of the next record or the end of the buffer. #std::nullopt is
|
|
* returned if the buffer has a malformed record at the end,
|
|
* i.e. a quoted field that is not closed.
|
|
*/
|
|
std::optional<int64_t> parse_record_fields(const Span<char> buffer,
|
|
const int64_t start,
|
|
const char delimiter,
|
|
const char quote,
|
|
const Span<char> quote_escape_chars,
|
|
Vector<Span<char>> &r_fields);
|
|
|
|
} // namespace detail
|
|
|
|
/** \} */
|
|
|
|
} // namespace blender::csv_parse
|