Vince's CSV Parser
Loading...
Searching...
No Matches
basic_csv_parser.cpp
2
3#include <system_error>
4
5namespace csv {
6 namespace internals {
7 CSV_INLINE size_t get_file_size(csv::string_view filename) {
8 std::ifstream infile(std::string(filename), std::ios::binary);
9 const auto start = infile.tellg();
10 infile.seekg(0, std::ios::end);
11 const auto end = infile.tellg();
12
13 return end - start;
14 }
15
16 CSV_INLINE std::string get_csv_head(csv::string_view filename) {
17 return get_csv_head(filename, get_file_size(filename));
18 }
19
20 CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size) {
21 const size_t bytes = 500000;
22
23 std::error_code error;
24 size_t length = std::min((size_t)file_size, bytes);
25 auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error);
26
27 if (error) {
28 throw std::runtime_error("Cannot open file " + std::string(filename));
29 }
30
31 return std::string(mmap.begin(), mmap.end());
32 }
33
34#ifdef _MSC_VER
35#pragma region IBasicCVParser
36#endif
37 CSV_INLINE IBasicCSVParser::IBasicCSVParser(
38 const CSVFormat& format,
39 const ColNamesPtr& col_names
40 ) : _col_names(col_names) {
41 if (format.no_quote) {
42 _parse_flags = internals::make_parse_flags(format.get_delim());
43 }
44 else {
45 _parse_flags = internals::make_parse_flags(format.get_delim(), format.quote_char);
46 }
47
48 _ws_flags = internals::make_ws_flags(
49 format.trim_chars.data(), format.trim_chars.size()
50 );
51 }
52
53 CSV_INLINE void IBasicCSVParser::end_feed() {
55
56 bool empty_last_field = this->data_ptr
57 && this->data_ptr->_data
58 && !this->data_ptr->data.empty()
59 && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER
60 || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE);
61
62 // Push field
63 if (this->field_length > 0 || empty_last_field) {
64 this->push_field();
65 }
66
67 // Push row
68 if (this->current_row.size() > 0)
69 this->push_row();
70 }
71
72 CSV_INLINE void IBasicCSVParser::parse_field() noexcept {
74 auto& in = this->data_ptr->data;
75
76 // Trim off leading whitespace
77 while (data_pos < in.size() && ws_flag(in[data_pos]))
78 data_pos++;
79
80 if (field_start == UNINITIALIZED_FIELD)
81 field_start = (int)(data_pos - current_row_start());
82
83 // Optimization: Since NOT_SPECIAL characters tend to occur in contiguous
84 // sequences, use the loop below to avoid having to go through the outer
85 // switch statement as much as possible
86 while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL)
87 data_pos++;
88
89 field_length = data_pos - (field_start + current_row_start());
90
91 // Trim off trailing whitespace, this->field_length constraint matters
92 // when field is entirely whitespace
93 for (size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--)
94 this->field_length--;
95 }
96
97 CSV_INLINE void IBasicCSVParser::push_field()
98 {
99 // Update
100 if (field_has_double_quote) {
101 fields->emplace_back(
102 field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start,
103 field_length,
104 true
105 );
106 field_has_double_quote = false;
107
108 }
109 else {
110 fields->emplace_back(
111 field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start,
112 field_length
113 );
114 }
115
116 current_row.row_length++;
117
118 // Reset field state
119 field_start = UNINITIALIZED_FIELD;
120 field_length = 0;
121 }
122
124 CSV_INLINE size_t IBasicCSVParser::parse()
125 {
127
128 this->quote_escape = false;
129 this->data_pos = 0;
130 this->current_row_start() = 0;
131 this->trim_utf8_bom();
132
133 auto& in = this->data_ptr->data;
134 while (this->data_pos < in.size()) {
135 switch (compound_parse_flag(in[this->data_pos])) {
136 case ParseFlags::DELIMITER:
137 this->push_field();
138 this->data_pos++;
139 break;
140
141 case ParseFlags::NEWLINE:
142 this->data_pos++;
143
144 // Catches CRLF (or LFLF, CRCRLF, or any other non-sensical combination of newlines)
145 while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE)
146 this->data_pos++;
147
148 // End of record -> Write record
149 this->push_field();
150 this->push_row();
151
152 // Reset
153 this->current_row = CSVRow(data_ptr, this->data_pos, fields->size());
154 break;
155
156 case ParseFlags::NOT_SPECIAL:
157 this->parse_field();
158 break;
159
160 case ParseFlags::QUOTE_ESCAPE_QUOTE:
161 if (data_pos + 1 == in.size()) return this->current_row_start();
162 else if (data_pos + 1 < in.size()) {
163 auto next_ch = parse_flag(in[data_pos + 1]);
164 if (next_ch >= ParseFlags::DELIMITER) {
165 quote_escape = false;
166 data_pos++;
167 break;
168 }
169 else if (next_ch == ParseFlags::QUOTE) {
170 // Case: Escaped quote
171 data_pos += 2;
172 this->field_length += 2;
173 this->field_has_double_quote = true;
174 break;
175 }
176 }
177
178 // Case: Unescaped single quote => not strictly valid but we'll keep it
179 this->field_length++;
180 data_pos++;
181
182 break;
183
184 default: // Quote (currently not quote escaped)
185 if (this->field_length == 0) {
186 quote_escape = true;
187 data_pos++;
188 if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos]))
189 field_start = (int)(data_pos - current_row_start());
190 break;
191 }
192
193 // Case: Unescaped quote
194 this->field_length++;
195 data_pos++;
196
197 break;
198 }
199 }
200
201 return this->current_row_start();
202 }
203
204 CSV_INLINE void IBasicCSVParser::push_row() {
205 size_t row_len = fields->size() - current_row.fields_start;
206 // Set row_length before pushing (immutable once created)
207 current_row.row_length = row_len;
208 this->_records->push_back(std::move(current_row));
209 }
210
211 CSV_INLINE void IBasicCSVParser::reset_data_ptr() {
212 this->data_ptr = std::make_shared<RawCSVData>();
213 this->data_ptr->parse_flags = this->_parse_flags;
214 this->data_ptr->col_names = this->_col_names;
215 this->fields = &(this->data_ptr->fields);
216 }
217
218 CSV_INLINE void IBasicCSVParser::trim_utf8_bom() {
219 auto& data = this->data_ptr->data;
220
221 if (!this->unicode_bom_scan && data.size() >= 3) {
222 if (data[0] == '\xEF' && data[1] == '\xBB' && data[2] == '\xBF') {
223 this->data_pos += 3; // Remove BOM from input string
224 this->_utf8_bom = true;
225 }
226
227 this->unicode_bom_scan = true;
228 }
229 }
230#ifdef _MSC_VER
231#pragma endregion
232#endif
233
234#ifdef _MSC_VER
235#pragma region Specializations
236#endif
237 CSV_INLINE void MmapParser::next(size_t bytes = ITERATION_CHUNK_SIZE) {
238 // CRITICAL SECTION: Chunk Transition Logic
239 // This function reads 10MB chunks and must correctly handle fields that span
240 // chunk boundaries. The 'remainder' calculation below ensures partial fields
241 // are preserved for the next chunk.
242 //
243 // Bug #280: Field corruption occurred here when chunk transitions incorrectly
244 // split multi-byte characters or field boundaries.
245
246 // Reset parser state
247 this->field_start = UNINITIALIZED_FIELD;
248 this->field_length = 0;
249 this->reset_data_ptr();
250
251 // Create memory map
252 const size_t offset = this->mmap_pos;
253 const size_t remaining = (offset < this->source_size)
254 ? (this->source_size - offset)
255 : 0;
256 const size_t length = std::min(remaining, bytes);
257 if (length == 0) {
258 // No more data to read; mark EOF and end feed
259 // (Prevent exception on empty mmap as reported by #267)
260 this->_eof = true;
261 this->end_feed();
262 return;
263 }
264
265 std::error_code error;
266 auto mmap = mio::make_mmap_source(this->_filename, offset, length, error);
267 if (error) {
268 std::string msg = "Memory mapping failed during CSV parsing: file='" + this->_filename
269 + "' offset=" + std::to_string(offset)
270 + " length=" + std::to_string(length);
271 throw std::system_error(error, msg);
272 }
273 this->data_ptr->_data = std::make_shared<mio::basic_mmap_source<char>>(std::move(mmap));
274 this->mmap_pos += length;
275
276 auto mmap_ptr = (mio::basic_mmap_source<char>*)(this->data_ptr->_data.get());
277
278 // Create string view
279 this->data_ptr->data = csv::string_view(mmap_ptr->data(), mmap_ptr->length());
280
281 // Parse
282 this->current_row = CSVRow(this->data_ptr);
283 size_t remainder = this->parse();
284
285 if (this->mmap_pos == this->source_size || no_chunk()) {
286 this->_eof = true;
287 this->end_feed();
288 }
289
290 this->mmap_pos -= (length - remainder);
291 }
292#ifdef _MSC_VER
293#pragma endregion
294#endif
295 }
296}
Contains the main CSV parsing algorithm and various utility functions.
Stores information about how to parse a CSV file.
Data structure for representing CSV rows.
Definition csv_row.hpp:280
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition common.hpp:26
CSV_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter)
Create a vector v where each index i corresponds to the ASCII number for a character and,...
ParseFlags
An enum used for describing the significance of each character with respect to CSV parsing.
Definition common.hpp:205
CSV_CONST CONSTEXPR_17 WhitespaceMap make_ws_flags(const char *ws_chars, size_t n_chars)
Create a vector v where each index i corresponds to the ASCII number for a character c and,...
The all encompassing namespace.
CSVReader parse(csv::string_view in, CSVFormat format)
Shorthand function for parsing an in-memory CSV string.
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:99