/* SPDX-FileCopyrightText: 2025 Blender Authors * * SPDX-License-Identifier: GPL-2.0-or-later */ /** \file * \ingroup bli */ #include "BLI_csv_parse.hh" #include "BLI_enumerable_thread_specific.hh" #include "BLI_task.hh" #include namespace blender::csv_parse { /** * Returns a guess for the start of the next record. Note that this could split up quoted fields. * This case needs to be detected at a higher level. */ static int64_t guess_next_record_start(const Span buffer, const int64_t start) { int64_t i = start; while (i < buffer.size()) { const char c = buffer[i]; if (c == '\n') { return i + 1; } i++; } return buffer.size(); } /** * Split the buffer into chunks of approximately the given size. The function attempts to align the * chunks so that records are not split. This works in the majority of cases, but can fail with * multi-line fields. This has to be detected at a higher level. */ static Vector> split_into_aligned_chunks(const Span buffer, int64_t approximate_chunk_size) { approximate_chunk_size = std::max(approximate_chunk_size, 1); Vector> chunks; int64_t start = 0; while (start < buffer.size()) { int64_t end = std::min(start + approximate_chunk_size, buffer.size()); end = guess_next_record_start(buffer, end); chunks.append(buffer.slice(IndexRange::from_begin_end(start, end))); start = end; } return chunks; } /** * Parses the given buffer into records and their fields. * * r_data_offsets and r_data_fields are passed into to be able to reuse their memory. */ static std::optional parse_records(const Span buffer, const CsvParseOptions &options, Vector &r_data_offsets, Vector> &r_data_fields) { using namespace detail; /* Clear the data that may still be in there, but do not free the memory. */ r_data_offsets.clear(); r_data_fields.clear(); r_data_offsets.append(0); int64_t start = 0; while (start < buffer.size()) { const std::optional next_record_start = parse_record_fields( buffer, start, options.delimiter, options.quote, options.quote_escape_chars, r_data_fields); if (!next_record_start.has_value()) { return std::nullopt; } /* Ignore empty lines. While those are not great practice, they can occur in practice. */ if (r_data_fields.size() > r_data_offsets.last()) { r_data_offsets.append(r_data_fields.size()); } start = *next_record_start; } return CsvRecords(OffsetIndices(r_data_offsets), r_data_fields); } std::optional>> parse_csv_in_chunks( const Span buffer, const CsvParseOptions &options, FunctionRef process_header, FunctionRef(const CsvRecords &records)> process_records) { using namespace detail; /* First parse the first row to get the column names. */ Vector> header_fields; const std::optional first_data_record_start = parse_record_fields( buffer, 0, options.delimiter, options.quote, options.quote_escape_chars, header_fields); if (!first_data_record_start.has_value()) { return std::nullopt; } /* Call this before starting to process the remaining data. This allows the caller to do some * preprocessing that is used during chunk parsing. */ process_header(CsvRecord(header_fields)); /* This buffer contains only the data records, without the header. */ const Span data_buffer = buffer.drop_front(*first_data_record_start); /* Split the buffer into chunks that can be processed in parallel. */ const Vector> data_buffer_chunks = split_into_aligned_chunks( data_buffer, options.chunk_size_bytes); /* It's not common, but it can happen that .csv files contain quoted multi-line values. In the * unlucky case that we split the buffer in the middle of such a multi-line field, there will be * malformed chunks. In this case we fallback to parsing the whole buffer with a single thread. * If this case becomes more common, we could try to avoid splitting into malformed chunks by * making the splitting logic a bit smarter. */ std::atomic found_malformed_chunk = false; Vector>> chunk_results(data_buffer_chunks.size()); struct TLS { Vector data_offsets; Vector> data_fields; }; threading::EnumerableThreadSpecific all_tls; threading::parallel_for(chunk_results.index_range(), 1, [&](const IndexRange range) { TLS &tls = all_tls.local(); for (const int64_t i : range) { if (found_malformed_chunk.load(std::memory_order_relaxed)) { /* All work is cancelled when there was a malformed chunk. */ return; } const Span chunk_buffer = data_buffer_chunks[i]; const std::optional records = parse_records( chunk_buffer, options, tls.data_offsets, tls.data_fields); if (!records.has_value()) { found_malformed_chunk.store(true, std::memory_order_relaxed); return; } chunk_results[i] = process_records(*records); } }); /* If there was a malformed chunk, process the data again in a single thread without splitting * the input into chunks. This should happen quite rarely but is important for overall * correctness. */ if (found_malformed_chunk) { chunk_results.clear(); TLS &tls = all_tls.local(); const std::optional records = parse_records( data_buffer, options, tls.data_offsets, tls.data_fields); if (!records.has_value()) { return std::nullopt; } chunk_results.append(process_records(*records)); } /* Prepare the return value. */ Vector> results; for (std::optional> &result : chunk_results) { BLI_assert(result.has_value()); results.append(std::move(result.value())); } return results; } StringRef unescape_field(const StringRef str, const CsvParseOptions &options, LinearAllocator<> &allocator) { const StringRef escape_chars{options.quote_escape_chars}; if (str.find_first_of(escape_chars) == StringRef::not_found) { return str; } /* The actual unescaped string may be shorter, but not longer. */ MutableSpan unescaped_str = allocator.allocate_array(str.size()); int64_t i = 0; int64_t escaped_size = 0; while (i < str.size()) { const char c = str[i]; if (options.quote_escape_chars.contains(c)) { if (i + 1 < str.size() && str[i + 1] == options.quote) { /* Ignore the current escape character. */ unescaped_str[escaped_size++] = options.quote; i += 2; continue; } } unescaped_str[escaped_size++] = c; i++; } return StringRef(unescaped_str.take_front(escaped_size)); } namespace detail { std::optional parse_record_fields(const Span buffer, const int64_t start, const char delimiter, const char quote, const Span quote_escape_chars, Vector> &r_fields) { using namespace detail; const auto handle_potentially_trailing_delimiter = [&](const int64_t i) { if (i <= buffer.size()) { if (i < buffer.size()) { if (ELEM(buffer[i], '\n', '\r')) { r_fields.append({}); } } else { r_fields.append({}); } } }; int64_t i = start; while (i < buffer.size()) { const char c = buffer[i]; if (c == '\n') { return i + 1; } if (c == '\r') { i++; continue; } if (c == delimiter) { r_fields.append({}); i++; handle_potentially_trailing_delimiter(i); continue; } if (c == quote) { i++; const std::optional end_of_field = find_end_of_quoted_field( buffer, i, quote, quote_escape_chars); if (!end_of_field.has_value()) { return std::nullopt; } r_fields.append(buffer.slice(IndexRange::from_begin_end(i, *end_of_field))); i = *end_of_field; while (i < buffer.size()) { const char inner_c = buffer[i]; if (inner_c == quote) { i++; continue; } if (inner_c == delimiter) { i++; handle_potentially_trailing_delimiter(i); break; } if (ELEM(inner_c, '\n', '\r')) { break; } i++; } continue; } const int64_t end_of_field = find_end_of_simple_field(buffer, i, delimiter); r_fields.append(buffer.slice(IndexRange::from_begin_end(i, end_of_field))); i = end_of_field; while (i < buffer.size()) { const char inner_c = buffer[i]; if (inner_c == delimiter) { i++; handle_potentially_trailing_delimiter(i); break; } if (ELEM(inner_c, '\n', '\r')) { break; } BLI_assert_unreachable(); } } return buffer.size(); } int64_t find_end_of_simple_field(const Span buffer, const int64_t start, const char delimiter) { int64_t i = start; while (i < buffer.size()) { const char c = buffer[i]; if (ELEM(c, delimiter, '\n', '\r')) { return i; } i++; } return buffer.size(); } std::optional find_end_of_quoted_field(const Span buffer, const int64_t start, const char quote, const Span escape_chars) { int64_t i = start; while (i < buffer.size()) { const char c = buffer[i]; if (escape_chars.contains(c)) { if (i + 1 < buffer.size() && buffer[i + 1] == quote) { i += 2; continue; } } if (c == quote) { return i; } i++; } return std::nullopt; } } // namespace detail } // namespace blender::csv_parse