Vince's CSV Parser
Loading...
Searching...
No Matches
basic_csv_parser.hpp
Go to the documentation of this file.
1
5#pragma once
6#include <algorithm>
7#include <array>
8#include <fstream>
9#include <memory>
10#include <unordered_map>
11#include <unordered_set>
12#include <vector>
13
14#if !defined(__EMSCRIPTEN__)
15#include "../external/mio.hpp"
16#endif
18#include "col_names.hpp"
19#include "common.hpp"
20#include "csv_format.hpp"
21#include "csv_row.hpp"
22#include "row_deque.hpp"
23
24namespace csv {
25 namespace internals {
26 constexpr const int UNINITIALIZED_FIELD = -1;
27
30 template<typename OutArray, typename T = typename OutArray::type>
31 CSV_CONST CONSTEXPR_17 OutArray arrayToDefault(T&& value)
32 {
33 OutArray a {};
34 for (auto& e : a)
35 e = value;
36 return a;
37 }
38
// Result record produced by calculate_score() for one candidate CSVFormat
// evaluated against a sample of the CSV head.
39 struct GuessScore {
// NOTE(review): presumably the index of the guessed header row — confirm against calculate_score().
40 size_t header;
// NOTE(review): presumably the most common (mode) row length seen in the sample — confirm.
41 size_t mode_row_length;
// Numeric score for the candidate; how it is computed is not visible here.
42 double score;
43 };
44
45 CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format);
46
50 const std::vector<char>& delims = { ',', '|', '\t', ';', '^', '~' }
51 );
52
57 CSV_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter) {
58 auto ret = arrayToDefault<ParseFlagMap>(ParseFlags::NOT_SPECIAL);
59 ret[delimiter + CHAR_OFFSET] = ParseFlags::DELIMITER;
60 ret['\r' + CHAR_OFFSET] = ParseFlags::CARRIAGE_RETURN;
61 ret['\n' + CHAR_OFFSET] = ParseFlags::NEWLINE;
62 return ret;
63 }
64
69 CSV_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter, char quote_char) {
70 std::array<ParseFlags, 256> ret = make_parse_flags(delimiter);
71 ret[quote_char + CHAR_OFFSET] = ParseFlags::QUOTE;
72 return ret;
73 }
74
75 inline char infer_char_for_flag(
76 const ParseFlagMap& parse_flags,
77 ParseFlags target,
78 char fallback
79 ) noexcept {
80 for (size_t i = 0; i < parse_flags.size(); ++i) {
81 if (parse_flags[i] == target) {
82 return static_cast<char>(static_cast<int>(i) - CHAR_OFFSET);
83 }
84 }
85
86 return fallback;
87 }
88
89 inline char infer_delimiter(const ParseFlagMap& parse_flags) noexcept {
90 return infer_char_for_flag(parse_flags, ParseFlags::DELIMITER, ',');
91 }
92
93 // fallback is returned when no QUOTE flag exists in parse_flags (e.g. no_quote mode).
94 // Pass the delimiter so SIMD stops there instead of on a byte that is NOT_SPECIAL.
95 inline char infer_quote_char(const ParseFlagMap& parse_flags, char fallback = '"') noexcept {
96 return infer_char_for_flag(parse_flags, ParseFlags::QUOTE, fallback);
97 }
98
103 CSV_CONST CONSTEXPR_17 WhitespaceMap make_ws_flags(const char* ws_chars, size_t n_chars) {
104 auto ret = arrayToDefault<WhitespaceMap>(false);
105 for (size_t j = 0; j < n_chars; j++) {
106 ret[ws_chars[j] + CHAR_OFFSET] = true;
107 }
108 return ret;
109 }
110
111 inline WhitespaceMap make_ws_flags(const std::vector<char>& flags) {
112 return make_ws_flags(flags.data(), flags.size());
113 }
114
116 template<typename TStream,
117 csv::enable_if_t<std::is_base_of<std::istream, TStream>::value, int> = 0>
118 std::string get_csv_head_stream(TStream& source) {
119 const size_t limit = 500000;
120 std::string buf(limit, '\0');
121 source.read(&buf[0], (std::streamsize)limit);
122 buf.resize(static_cast<size_t>(source.gcount()));
123 return buf;
124 }
125
126 #if !defined(__EMSCRIPTEN__)
130 CSV_INLINE std::pair<std::string, size_t> get_csv_head_mmap(csv::string_view filename);
131 #endif
132
134 CSV_INLINE std::string get_csv_head(csv::string_view filename);
135
137 CSVFormat format;
138 size_t n_cols = 0;
139 };
140
141 class IBasicCSVParser;
142 }
143
162 const std::vector<char>& delims = { ',', '|', '\t', ';', '^', '~' }) {
163 auto head = internals::get_csv_head(filename);
164 return internals::guess_format(head, delims);
165 }
166
169
170 namespace internals {
178 public:
179 IBasicCSVParser() = default;
180 IBasicCSVParser(const CSVFormat&, const ColNamesPtr&);
182 const ParseFlagMap& parse_flags,
183 const WhitespaceMap& ws_flags
184 ) : parse_flags_(parse_flags), ws_flags_(ws_flags) {
185 const char d = internals::infer_delimiter(parse_flags);
186 simd_sentinels_ = SentinelVecs(d, internals::infer_quote_char(parse_flags, d));
187 has_ws_trimming_ = std::any_of(ws_flags.begin(), ws_flags.end(), [](bool b) { return b; });
188 }
189
190 virtual ~IBasicCSVParser() {}
191
193 bool eof() { return this->eof_; }
194
195 ResolvedFormat get_resolved_format() { return this->format; }
196
/**
 * Parse the next block of data.
 *
 * Pure virtual: each concrete parser decides how the block is obtained.
 *
 * @param bytes  Size of the block to process (StreamParser reads up to this
 *               many bytes from its stream).
 */
198 virtual void next(size_t bytes) = 0;
199
/**
 * Indicate the last block of data has been parsed.
 *
 * StreamParser::next() calls this right after setting `eof_`.
 */
201 void end_feed();
202
203 CONSTEXPR_17 ParseFlags parse_flag(const char ch) const noexcept {
204 return parse_flags_.data()[ch + CHAR_OFFSET];
205 }
206
207 CONSTEXPR_17 ParseFlags compound_parse_flag(const char ch) const noexcept {
208 return quote_escape_flag(parse_flag(ch), this->quote_escape_);
209 }
210
212 CONSTEXPR bool utf8_bom() const { return this->utf8_bom_; }
213
214 void set_output(RowCollection& rows) { this->records_ = &rows; }
215
216 protected:
219 CSVRow current_row_;
220 RawCSVDataPtr data_ptr_ = nullptr;
221 ColNamesPtr col_names_ = nullptr;
222 RawCSVFieldList* fields_ = nullptr;
223 int field_start_ = UNINITIALIZED_FIELD;
224 size_t field_length_ = 0;
225
228
232
235 bool eof_ = false;
236
237 ResolvedFormat format;
238
240 size_t source_size_ = 0;
242
/**
 * Return the leading bytes of the CSV source.
 *
 * Pure virtual: StreamParser reads them from its stream on demand, while
 * MmapParser returns the head it captured in its constructor.
 *
 * @return Reference to a string owned by the concrete parser.
 */
243 virtual std::string& get_csv_head() = 0;
244
246 CONSTEXPR bool no_chunk() const { return this->source_size_ < CSV_CHUNK_SIZE_DEFAULT; }
247
/**
 * Parse the current chunk of data and return the completed-row prefix length.
 *
 * StreamParser::next() uses the return value as the offset at which the
 * unparsed tail of the chunk begins.
 */
249 size_t parse();
250
/** Create a new RawCSVDataPtr for a new chunk of data. */
252 void reset_data_ptr();
253
254 void resolve_format_from_head(const CSVFormat& format);
255 private:
259 WhitespaceMap ws_flags_;
260
264 bool has_ws_trimming_ = false;
265 bool quote_escape_ = false;
266 bool field_has_double_quote_ = false;
267
269 size_t data_pos_ = 0;
270
272 bool unicode_bom_scan_ = false;
273 bool utf8_bom_ = false;
274
276 RowCollection* records_ = nullptr;
277
278 CONSTEXPR_17 bool ws_flag(const char ch) const noexcept {
279 return ws_flags_.data()[ch + CHAR_OFFSET];
280 }
281
282 size_t& current_row_start() {
283 return this->current_row_.data_start;
284 }
285
286 void parse_field() noexcept;
287
289 void push_field();
290
292 void push_row();
293
295 void trim_utf8_bom();
296 };
297
315 template<typename TStream>
318
319 public:
320 StreamParser(TStream& source,
321 const CSVFormat& format,
322 const ColNamesPtr& col_names = nullptr
323 ) : IBasicCSVParser(format, col_names),
324 source_(source) {
325 this->resolve_format_from_head(format);
326 }
327
329 TStream& source,
330 internals::ParseFlagMap parse_flags,
331 internals::WhitespaceMap ws_flags) :
332 IBasicCSVParser(parse_flags, ws_flags),
333 source_(source)
334 {}
335
336 ~StreamParser() {}
337
338 std::string& get_csv_head() override {
339 leftover_ = get_csv_head_stream(this->source_);
340 return this->leftover_;
341 }
342
343 void next(size_t bytes = CSV_CHUNK_SIZE_DEFAULT) override {
344 if (this->eof()) return;
345
346 // Reset parser state
347 this->field_start_ = UNINITIALIZED_FIELD;
348 this->field_length_ = 0;
349 this->reset_data_ptr();
350 this->data_ptr_->_data = std::make_shared<std::string>();
351
352 auto& chunk = *static_cast<std::string*>(this->data_ptr_->_data.get());
353
354 // Prepend leftover bytes from the previous chunk's incomplete
355 // trailing row, then read the next block from the stream.
356 // Uses a raw buffer to avoid std::string::resize() zero-fill
357 // on the full 10MB chunk size (critical for tiny inputs).
358 chunk = std::move(leftover_);
359 std::unique_ptr<char[]> buf(new char[bytes]);
360 source_.read(buf.get(), (std::streamsize)bytes);
361
362 const size_t n = static_cast<size_t>(source_.gcount());
363
364 if (n > 0) chunk.append(buf.get(), n);
365
366 // Check for real I/O errors only (bad bit indicates unrecoverable error).
367 // failbit alone is not fatal - it's set on EOF or when requesting bytes
368 // beyond available data, which is normal behavior for stringstreams.
369 if (source_.bad()) {
370 throw std::runtime_error("StreamParser read failure");
371 }
372
373 // Create string_view
374 this->data_ptr_->data = chunk;
375
376 // Parse
377 this->current_row_ = CSVRow(this->data_ptr_);
378 size_t remainder = this->parse();
379
380 if (source_.eof() || chunk.empty()) {
381 this->eof_ = true;
382 this->end_feed();
383 }
384 else {
385 // Save the tail bytes that begin an incomplete row so they
386 // are prepended to the next chunk (see class-level comment).
387 leftover_ = chunk.substr(remainder);
388 }
389 }
390
391 private:
392 // Bytes from the previous chunk that form the start of an incomplete
393 // row, plus the initial head buffer on the first call.
394 std::string leftover_;
395
396 TStream& source_;
397 };
398
399#if !defined(__EMSCRIPTEN__)
416 public:
418 const CSVFormat& format,
419 const ColNamesPtr& col_names = nullptr
420 ) : IBasicCSVParser(format, col_names) {
421 this->_filename = filename.data();
422 auto head_and_size = get_csv_head_mmap(filename);
423 this->head_ = std::move(head_and_size.first);
424 this->source_size_ = head_and_size.second;
425 this->resolve_format_from_head(format);
426 };
427
428 ~MmapParser() {}
429
430 std::string& get_csv_head() override {
431 // head_ was already populated in the constructor.
432 return this->head_;
433 }
434
435 void next(size_t bytes) override;
436
437 private:
438 void finalize_loaded_chunk(size_t length, bool eof_on_no_chunk = false);
439
440 std::string _filename;
441 size_t mmap_pos = 0;
442 std::string head_;
443 };
444#endif
445 }
446}
CSV_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter)
Create a vector v where each index i corresponds to the ASCII number for a character and,...
CSV_CONST CONSTEXPR_17 WhitespaceMap make_ws_flags(const char *ws_chars, size_t n_chars)
Create a vector v where each index i corresponds to the ASCII number for a character c and,...
CSVGuessResult guess_format(csv::string_view head, const std::vector< char > &delims={ ',', '|', '\t', ';', '^', '~' })
Guess the delimiter used by a delimiter-separated values file.
CSV_CONST CONSTEXPR_17 OutArray arrayToDefault(T &&value)
Helper constexpr function to initialize an array with all the elements set to value.
SIMD-accelerated skip for runs of non-special CSV bytes.
Stores information about how to parse a CSV file.
Data structure for representing CSV rows.
Definition csv_row.hpp:264
Abstract base class which provides CSV parsing logic.
CONSTEXPR bool no_chunk() const
Whether or not source needs to be read in chunks.
SentinelVecs simd_sentinels_
Precomputed SIMD broadcast vectors for find_next_non_special.
size_t source_size_
The size of the incoming CSV.
void reset_data_ptr()
Create a new RawCSVDataPtr for a new chunk of data.
bool eof()
Whether or not we have reached the end of source.
void end_feed()
Indicate the last block of data has been parsed.
size_t parse()
Parse the current chunk of data and return the completed-row prefix length.
virtual void next(size_t bytes)=0
Parse the next block of data.
CONSTEXPR bool utf8_bom() const
Whether or not this CSV has a UTF-8 byte order mark.
ParseFlagMap parse_flags_
An array where the (i + 128)th slot gives the ParseFlags for ASCII character i.
Parser for memory-mapped files.
A class for parsing CSV data from any std::istream, including non-seekable sources such as pipes and ...
void next(size_t bytes=CSV_CHUNK_SIZE_DEFAULT) override
Parse the next block of data.
A std::deque wrapper which allows multiple read and write threads to concurrently access it along wit...
A standalone header file containing shared code.
std::array< ParseFlags, 256 > ParseFlagMap
An array which maps ASCII chars to a parsing flag.
Definition common.hpp:289
std::array< bool, 256 > WhitespaceMap
An array which maps ASCII chars to a flag indicating if it is whitespace.
Definition common.hpp:292
ParseFlags
An enum used for describing the significance of each character with respect to CSV parsing.
Definition common.hpp:250
constexpr ParseFlags quote_escape_flag(ParseFlags flag, bool quote_escape) noexcept
Transform the ParseFlags given the context of whether or not the current field is quote escaped.
Definition common.hpp:261
#define CONSTEXPR
Expands to constexpr in decent compilers and inline otherwise.
Definition common.hpp:187
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition common.hpp:26
constexpr size_t CSV_CHUNK_SIZE_DEFAULT
Default chunk size for lazy-loading large CSV files.
Definition common.hpp:228
Defines an object used to store CSV format settings.
Defines the data type used for storing information about a CSV row.
The all encompassing namespace.
CSVGuessResult guess_format(csv::string_view filename, const std::vector< char > &delims={ ',', '|', '\t', ';', '^', '~' })
Guess the delimiter, header row, and mode column count of a CSV file.
constexpr unsigned CHAR_OFFSET
Offset to convert char into array index.
Definition common.hpp:299
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:135
Shared contracts for row deque implementations.
Stores the inferred format of a CSV file.