Vince's CSV Parser
Loading...
Searching...
No Matches
basic_csv_parser.cpp
2
3#include <system_error>
4
5// g++ is strict about implicit switch fallthrough, so intentional
// fallthroughs are marked explicitly with FALLTHROUGH_TO_NEXT_CASE.
//  - When CXX_CSV_HAS_17 is defined, the macro expands to the
//    [[fallthrough]] attribute statement.
//  - Otherwise it expands to `goto next_newline_case;`, so any switch that
//    uses the macro must define a `next_newline_case:` label on the case
//    being fallen into.
// Both expansions already include the trailing semicolon.
6#ifdef CXX_CSV_HAS_17
7#define FALLTHROUGH_TO_NEXT_CASE [[fallthrough]];
8#else
9#define FALLTHROUGH_TO_NEXT_CASE goto next_newline_case;
10#endif
11
12namespace csv {
13 namespace internals {
14 // Opens the file and delegates to the template overload to avoid duplicating the read/resize logic.
15 CSV_INLINE std::string get_csv_head_stream(csv::string_view filename) {
16 std::ifstream infile(std::string(filename), std::ios::binary);
17 if (!infile.is_open()) {
18 throw std::runtime_error("Cannot open file " + std::string(filename));
19 }
20 return get_csv_head_stream(infile);
21 }
22
23#if !defined(__EMSCRIPTEN__)
24 CSV_INLINE std::pair<std::string, size_t> get_csv_head_mmap(csv::string_view filename) {
25 const size_t bytes = 500000;
26 std::error_code error;
27 auto mmap = mio::make_mmap_source(std::string(filename), 0, mio::map_entire_file, error);
28 if (error) {
29 throw std::runtime_error("Cannot open file " + std::string(filename));
30 }
31 const size_t file_size = mmap.size();
32 const size_t length = std::min(file_size, bytes);
33 return { std::string(mmap.begin(), mmap.begin() + length), file_size };
34 }
35#endif
36
37 CSV_INLINE std::string get_csv_head(csv::string_view filename) {
38#if defined(__EMSCRIPTEN__)
39 return get_csv_head_stream(filename);
40#else
41 return get_csv_head_mmap(filename).first;
42#endif
43 }
44
45
46#ifdef _MSC_VER
47#pragma region IBasicCSVParser
48#endif
49 CSV_INLINE IBasicCSVParser::IBasicCSVParser(
50 const CSVFormat& source_format,
51 const ColNamesPtr& col_names
52 ) : col_names_(col_names) {
53 // Only initialize the fields that are stable before format resolution.
54 // parse_flags_ and simd_sentinels_ are always set by resolve_format_from_head,
55 // so there is no point computing them here with a placeholder delimiter.
56 ws_flags_ = internals::make_ws_flags(
57 source_format.trim_chars.data(), source_format.trim_chars.size()
58 );
59 has_ws_trimming_ = !source_format.trim_chars.empty();
60 }
61
62 CSV_INLINE void IBasicCSVParser::resolve_format_from_head(const CSVFormat& source_format) {
63 auto head = this->get_csv_head();
64
65 ResolvedFormat resolved;
66 resolved.format = source_format;
67
68 const bool infer_delimiter = source_format.guess_delim();
69 const bool infer_header = !source_format.header_explicitly_set_
70 && (infer_delimiter || !source_format.col_names_explicitly_set_);
71 const bool infer_n_cols = (source_format.get_header() < 0 && source_format.get_col_names().empty());
72
73 if (infer_delimiter || infer_header || infer_n_cols) {
74 auto guess_result = guess_format(head, source_format.get_possible_delims());
75 if (infer_delimiter) {
76 resolved.format.delimiter(guess_result.delim);
77 }
78
79 if (infer_header) {
80 // Inferred header should not clear user-provided column names.
81 resolved.format.header = guess_result.header_row;
82 }
83
84 resolved.n_cols = guess_result.n_cols;
85 }
86
87 if (resolved.format.no_quote) {
88 parse_flags_ = internals::make_parse_flags(resolved.format.get_delim());
89 }
90 else {
91 parse_flags_ = internals::make_parse_flags(resolved.format.get_delim(), resolved.format.quote_char);
92 }
93 const char resolved_eff_quote = resolved.format.no_quote
94 ? resolved.format.get_delim()
95 : resolved.format.quote_char;
96 simd_sentinels_ = SentinelVecs(resolved.format.get_delim(), resolved_eff_quote);
97
98 this->format = resolved;
99 }
100#ifdef _MSC_VER
101#pragma endregion
102#endif
103
104#ifdef _MSC_VER
105#pragma region IBasicCVParser: Core Parse Loop
106#endif
109
// NOTE(review): the signature line of this function is missing from this
// listing; presumably this is the body of IBasicCSVParser::end_feed() — confirm
// against the original file.
//
// Flushes whatever is still pending: the last field, then the last row.
//
// The last field counts as "intentionally empty" when a chunk is loaded, its
// data is non-empty, and the final character is flagged as a delimiter or a
// quote. In that case a zero-length field is still pushed.
110 bool empty_last_field = this->data_ptr_
111 && this->data_ptr_->_data
112 && !this->data_ptr_->data.empty()
113 && (parse_flag(this->data_ptr_->data.back()) == ParseFlags::DELIMITER
114 || parse_flag(this->data_ptr_->data.back()) == ParseFlags::QUOTE);
115
116 // Push field
117 if (this->field_length_ > 0 || empty_last_field) {
118 this->push_field();
119 }
120
121 // Push row
// Only rows that contain at least one field are emitted.
122 if (this->current_row_.size() > 0)
123 this->push_row();
124 }
125
// Consume a run of ordinary (NOT_SPECIAL) characters starting at data_pos_.
// On first entry for a field, its start offset is recorded relative to
// current_row_start(). When the run ends, field_length_ is set to the distance
// from that start to the new data_pos_.
// NOTE(review): listing line 127 is absent from this extract — confirm the
// full body against the original file.
126 CSV_INLINE void IBasicCSVParser::parse_field() noexcept {
128 auto& in = this->data_ptr_->data;
129
// Record where the field begins, unless an earlier step already did so.
130 if (field_start_ == UNINITIALIZED_FIELD)
131 field_start_ = (int)(data_pos_ - current_row_start());
132
133 // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
134 // sequences, use SIMD to skip long runs of them quickly.
135 // find_next_non_special processes complete SIMD lanes and returns pos
136 // unchanged for any tail shorter than one lane width.
137#if !defined(CSV_NO_SIMD)
138 data_pos_ = find_next_non_special(in, data_pos_, this->simd_sentinels_);
139#endif
140
141 // Scalar tail: handles remaining bytes after SIMD falls through, and
142 // handles any byte that SIMD stopped at conservatively (e.g. a delimiter
143 // inside a quoted field, which compound_parse_flag treats as NOT_SPECIAL).
144 while (data_pos_ < in.size() && compound_parse_flag(in[data_pos_]) == ParseFlags::NOT_SPECIAL)
145 data_pos_++;
146
// field_start_ is row-relative, so add the row start back to get the
// absolute offset before measuring the length.
147 field_length_ = data_pos_ - (field_start_ + current_row_start());
148
149 // Whitespace trimming is deferred to get_field_impl() so callers that never
150 // read field values (e.g. row counting) pay no trimming cost.
151 }
152
153 CSV_INLINE void IBasicCSVParser::push_field()
154 {
155 // Update
156 fields_->emplace_back(
157 field_start_ == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start_,
158 field_length_,
159 field_has_double_quote_
160 );
161
162 current_row_.row_length++;
163
164 // Reset field state
165 field_has_double_quote_ = false;
166 field_start_ = UNINITIALIZED_FIELD;
167 field_length_ = 0;
168 }
169
// NOTE(review): the signature line (and listing line 173) is missing from this
// extract; presumably this is the body of IBasicCSVParser::parse() — confirm
// against the original file.
//
// Main scanning loop over the currently loaded chunk. Each character is
// classified with compound_parse_flag() and dispatched to the matching case.
// Completed fields and rows are pushed as they are found.
// Returns current_row_start(), the offset at which the row still in progress
// begins.
172 {
174
// Per-chunk reset: leave quoted mode, rewind to the start of the data, and
// skip a UTF-8 BOM if trim_utf8_bom() finds one.
175 this->quote_escape_ = false;
176 this->data_pos_ = 0;
177 this->current_row_start() = 0;
178 this->trim_utf8_bom();
179
180 auto& in = this->data_ptr_->data;
181 while (this->data_pos_ < in.size()) {
182 switch (compound_parse_flag(in[this->data_pos_])) {
// A delimiter ends the current field (possibly an empty one).
183 case ParseFlags::DELIMITER:
184 this->push_field();
185 this->data_pos_++;
186 break;
187
188 case ParseFlags::CARRIAGE_RETURN:
189 // Handles CRLF (we do not advance by 2 here, the NEWLINE case will handle it)
190 if (this->data_pos_ + 1 < in.size() && parse_flag(in[this->data_pos_ + 1]) == ParseFlags::NEWLINE) {
191 this->data_pos_++;
192 }
193
// Expands to [[fallthrough]] or to `goto next_newline_case;` depending on
// CXX_CSV_HAS_17. The label below is the goto target.
194 FALLTHROUGH_TO_NEXT_CASE
195
196 next_newline_case:
197 case ParseFlags::NEWLINE:
198 this->data_pos_++;
199
200 // End of record. Preserve intentional empty fields such as
201 // trailing delimiters and quoted empty strings, but leave a
202 // truly blank line as an empty row.
203 if (this->field_length_ > 0
204 || this->field_start_ != UNINITIALIZED_FIELD
205 || !this->current_row_.empty()) {
206 this->push_field();
207 }
208 this->push_row();
209
210 // Reset
// The next row starts at the current position and at the current end of the
// field list.
211 this->current_row_ = CSVRow(data_ptr_, this->data_pos_, fields_->size());
212 break;
213
// Ordinary character: let parse_field() consume the whole run.
214 case ParseFlags::NOT_SPECIAL:
215 this->parse_field();
216 break;
217
// A quote character seen while quote_escape_ is active. Whether it closes the
// field or is half of an escaped pair depends on the character after it.
218 case ParseFlags::QUOTE_ESCAPE_QUOTE:
// The quote is the last byte of the chunk, so the following character is not
// available; stop here and return the start of the unfinished row.
219 if (data_pos_ + 1 == in.size()) return this->current_row_start();
220 else if (data_pos_ + 1 < in.size()) {
221 auto next_ch = parse_flag(in[data_pos_ + 1]);
// NOTE(review): this comparison relies on the ordering of the ParseFlags enum;
// presumably every value >= DELIMITER is a delimiter or line ending — confirm.
// The quote is treated as the closing quote and quoted mode ends.
222 if (next_ch >= ParseFlags::DELIMITER) {
223 quote_escape_ = false;
224 data_pos_++;
225 break;
226 }
227 else if (next_ch == ParseFlags::QUOTE) {
228 // Case: Escaped quote
// Both quote characters stay in the field; the flag records that the field
// contains a doubled quote.
229 data_pos_ += 2;
230 this->field_length_ += 2;
231 this->field_has_double_quote_ = true;
232 break;
233 }
234 }
235
236 // Case: Unescaped single quote => not strictly valid but we'll keep it
237 this->field_length_++;
238 data_pos_++;
239
240 break;
241
242 default: // Quote (currently not quote escaped)
// A quote before any field content opens a quoted field.
243 if (this->field_length_ == 0) {
244 quote_escape_ = true;
245 data_pos_++;
// Start the field just after the opening quote, unless the next character is
// flagged by ws_flag() (a trim character); in that case the start is left
// unset here.
246 if (field_start_ == UNINITIALIZED_FIELD && data_pos_ < in.size() && !ws_flag(in[data_pos_]))
247 field_start_ = (int)(data_pos_ - current_row_start());
248 break;
249 }
250
251 // Case: Unescaped quote
// A quote in the middle of an unquoted field is kept as field content.
252 this->field_length_++;
253 data_pos_++;
254
255 break;
256 }
257 }
258
259 return this->current_row_start();
260 }
261
262 CSV_INLINE void IBasicCSVParser::push_row() {
263 size_t row_len = fields_->size() - current_row_.fields_start;
264 // Set row_length before pushing (immutable once created)
265 current_row_.row_length = row_len;
266 this->records_->push_back(std::move(current_row_));
267 }
268
// NOTE(review): the signature line of this function is missing from this
// listing; presumably this is the body of IBasicCSVParser::reset_data_ptr() —
// confirm against the original file.
//
// Allocates a fresh RawCSVData block for the next chunk and copies the
// parser-wide settings into it: parse flags, whitespace flags, the trimming
// switch and the column names.
270 this->data_ptr_ = std::make_shared<RawCSVData>();
271 this->data_ptr_->parse_flags = this->parse_flags_;
272 this->data_ptr_->ws_flags = this->ws_flags_;
273 this->data_ptr_->has_ws_trimming = this->has_ws_trimming_;
274 this->data_ptr_->col_names = this->col_names_;
// fields_ is repointed at the new block's field list.
275 this->fields_ = &(this->data_ptr_->fields);
276 }
277
278 CSV_INLINE void IBasicCSVParser::trim_utf8_bom() {
279 auto& data = this->data_ptr_->data;
280
281 if (!this->unicode_bom_scan_ && data.size() >= 3) {
282 if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') {
283 this->data_pos_ += 3; // Remove BOM from input string
284 this->utf8_bom_ = true;
285 }
286
287 this->unicode_bom_scan_ = true;
288 }
289 }
290#ifdef _MSC_VER
291#pragma endregion
292#endif
293
294#ifdef _MSC_VER
295#pragma region Specializations
296#endif
297#if !defined(__EMSCRIPTEN__)
298 CSV_INLINE void MmapParser::finalize_loaded_chunk(size_t length, bool eof_on_no_chunk) {
299 // Parse the currently loaded chunk and advance/re-align mmap_pos so
300 // the next read resumes at the start of the incomplete trailing row.
301 this->current_row_ = CSVRow(this->data_ptr_);
302 size_t remainder = this->parse();
303
304 if (this->mmap_pos == this->source_size_ || (eof_on_no_chunk && no_chunk())) {
305 this->eof_ = true;
306 this->end_feed();
307 }
308
309 this->mmap_pos -= (length - remainder);
310 }
311
312 CSV_INLINE void MmapParser::next(size_t bytes = CSV_CHUNK_SIZE_DEFAULT) {
313 // CRITICAL SECTION: Chunk Transition Logic
314 // This function reads 10MB chunks and must correctly handle fields that span
315 // chunk boundaries. The 'remainder' calculation below ensures partial fields
316 // are preserved for the next chunk.
317 //
318 // Bug #280: Field corruption occurred here when chunk transitions incorrectly
319 // split multi-byte characters or field boundaries.
320
321 // Reset parser state
322 this->field_start_ = UNINITIALIZED_FIELD;
323 this->field_length_ = 0;
324 this->reset_data_ptr();
325
326 // Reuse the pre-read head buffer (if any) as the first chunk.
327 // This avoids re-reading the same bytes that were already consumed
328 // for delimiter/header guessing.
329 if (!head_.empty()) {
330 this->data_ptr_->_data = std::make_shared<std::string>(std::move(head_));
331 auto* head_ptr = static_cast<std::string*>(this->data_ptr_->_data.get());
332 const size_t length = head_ptr->size();
333 this->mmap_pos += length;
334
335 this->data_ptr_->data = *head_ptr;
336 this->finalize_loaded_chunk(length);
337 return;
338 }
339
340 // Create memory map
341 const size_t offset = this->mmap_pos;
342 const size_t remaining = (offset < this->source_size_)
343 ? (this->source_size_ - offset)
344 : 0;
345 const size_t length = std::min(remaining, bytes);
346 if (length == 0) {
347 // No more data to read; mark EOF and end feed
348 // (Prevent exception on empty mmap as reported by #267)
349 this->eof_ = true;
350 this->end_feed();
351 return;
352 }
353
354 std::error_code error;
355 auto mmap = mio::make_mmap_source(this->_filename, offset, length, error);
356 if (error) {
357 std::string msg = "Memory mapping failed during CSV parsing: file='" + this->_filename
358 + "' offset=" + std::to_string(offset)
359 + " length=" + std::to_string(length);
360 throw std::system_error(error, msg);
361 }
362 this->data_ptr_->_data = std::make_shared<mio::basic_mmap_source<char>>(std::move(mmap));
363 this->mmap_pos += length;
364
365 auto mmap_ptr = (mio::basic_mmap_source<char>*)(this->data_ptr_->_data.get());
366
367 // Create string view
368 this->data_ptr_->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length());
369 this->finalize_loaded_chunk(length, true);
370 }
371#endif
372#ifdef _MSC_VER
373#pragma endregion
374#endif
375 }
376}
Contains the main CSV parsing algorithm and various utility functions.
Stores information about how to parse a CSV file.
Data structure for representing CSV rows.
Definition csv_row.hpp:264
CONSTEXPR bool empty() const noexcept
Indicates whether row is empty or not.
Definition csv_row.hpp:278
CONSTEXPR size_t size() const noexcept
Return the number of fields in this row.
Definition csv_row.hpp:281
CONSTEXPR bool no_chunk() const
Whether or not source needs to be read in chunks.
SentinelVecs simd_sentinels_
Precomputed SIMD broadcast vectors for find_next_non_special.
size_t source_size_
The size of the incoming CSV.
void reset_data_ptr()
Create a new RawCSVDataPtr for a new chunk of data.
void end_feed()
Indicate the last block of data has been parsed.
size_t parse()
Parse the current chunk of data and return the completed-row prefix length.
ParseFlagMap parse_flags_
An array where the (i + 128)th slot gives the ParseFlags for ASCII character i.
void next(size_t bytes) override
Parse the next block of data.
ParseFlags
An enum used for describing the significance of each character with respect to CSV parsing.
Definition common.hpp:250
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition common.hpp:26
The all encompassing namespace.
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:135