Vince's CSV Parser
Loading...
Searching...
No Matches
csv_reader.cpp
Go to the documentation of this file.
1
5#include "csv_reader.hpp"
6
7namespace csv {
8 namespace internals {
9 CSV_INLINE std::string format_row(const std::vector<std::string>& row, csv::string_view delim) {
11 std::stringstream ret;
12 for (size_t i = 0; i < row.size(); i++) {
13 ret << row[i];
14 if (i + 1 < row.size()) ret << delim;
15 else ret << '\n';
16 }
17 ret.flush();
18
19 return ret.str();
20 }
21
// NOTE(review): the listing numbering jumps from 20 to 29 and from 32 to 35, so
// the signature of this function and the declaration of `parser` are not
// visible here. The comments below describe only the visible statements.
29 // Parse the CSV
// NOTE(review): `trim_chars` is not referenced by any visible line below;
// it may be consumed on the elided line 34 - confirm.
30 auto trim_chars = format.get_trim_chars();
// NOTE(review): head.data() reaches std::stringstream through the
// const char* constructor of std::string, which reads up to the first NUL
// rather than head.size() bytes - confirm the buffer is NUL-terminated.
31 std::stringstream source(head.data());
32 RowCollection rows;
33
// Direct the parser's output into `rows`, then parse one block of input
35 parser.set_output(rows);
36 parser.next();
37
// Move the row at the format's header index out of `rows` and return it
38 return CSVRow(std::move(rows[format.get_header()]));
39 }
40
// Score how well `format` fits the CSV text in `head`.
//
// The parsed rows are tallied by field count. The score is the largest
// (field count * number of rows with that count) product, and the candidate
// header row is the index at which that field count first appeared.
//
// NOTE(review): the listing numbering skips source lines 52, 83, 92 and 97, so
// the declaration of `parser`, the opening of the block closed on line 94 and
// the first member of the returned aggregate are not visible here.
41 CSV_INLINE GuessScore calculate_score(csv::string_view head, const CSVFormat& format) {
42 // Frequency counter of row length
43 std::unordered_map<size_t, size_t> row_tally = { { 0, 0 } };
44
45 // Map row lengths to row num where they first occurred
46 std::unordered_map<size_t, size_t> row_when = { { 0, 0 } };
47
48 // Parse the CSV
// NOTE(review): head.data() is converted through the const char* constructor
// of std::string, which stops at the first NUL instead of using head.size() -
// confirm callers always pass NUL-terminated data.
49 std::stringstream source(head.data());
50 RowCollection rows;
51
53 parser.set_output(rows);
54 parser.next();
55
56 for (size_t i = 0; i < rows.size(); i++) {
57 auto& row = rows[i];
58
59 // Ignore zero-length rows
60 if (row.size() > 0) {
61 if (row_tally.find(row.size()) != row_tally.end()) {
62 row_tally[row.size()]++;
63 }
64 else {
// First time this field count is seen: start its tally and remember the row index
65 row_tally[row.size()] = 1;
66 row_when[row.size()] = i;
67 }
68 }
69 }
70
71 double final_score = 0;
72 size_t header_row = 0;
73 size_t mode_row_length = 0;
74
75 // Final score is equal to the largest
76 // row size times rows of that size
// The seeded { 0, 0 } entry scores 0 and therefore never passes the strict `>` test
77 for (auto& pair : row_tally) {
78 auto row_size = pair.first;
79 auto row_count = pair.second;
80 double score = (double)(row_size * row_count);
81 if (score > final_score) {
82 final_score = score;
84 header_row = row_when[row_size];
85 }
86 }
87
88 // Heuristic: If first row has >= columns than mode, use it as header
89 // This handles headers with optional columns, trailing delimiters, etc.
90 // while still supporting CSVs with comment lines before the header
91 size_t first_row_length = rows.size() > 0 ? rows[0].size() : 0;
// NOTE(review): the condition guarding this assignment is on the elided line 92
93 header_row = 0;
94 }
95
96 return {
98 header_row
99 };
100 }
101
// NOTE(review): the listing numbering jumps from 100 to 114 and from 122 to 124,
// so the signature, the declaration of `format` and one statement inside the
// `if` below are not visible here. The comments describe only what is visible.
//
// Tries every candidate delimiter in `delims`, scores each with
// calculate_score(), and keeps the header row of the strictly best score.
114 size_t max_score = 0,
115 header = 0;
// NOTE(review): delims[0] is read without a visible emptiness check -
// confirm callers never pass an empty vector.
116 char current_delim = delims[0];
117
118 for (char cand_delim : delims) {
119 auto result = calculate_score(head, format.delimiter(cand_delim));
120
// Scores are compared after truncation to size_t
121 if ((size_t)result.score > max_score) {
122 max_score = (size_t)result.score;
124 header = result.header;
125 }
126 }
127
128 return { current_delim, (int)header };
129 }
130 }
131
138 CSV_INLINE std::vector<std::string> get_col_names(csv::string_view filename, CSVFormat format) {
139 auto head = internals::get_csv_head(filename);
140
142 if (format.guess_delim()) {
143 auto guess_result = guess_format(filename, format.get_possible_delims());
144 format.delimiter(guess_result.delim).header_row(guess_result.header_row);
145 }
146
147 return internals::_get_col_names(head, format);
148 }
149
151 CSV_INLINE CSVGuessResult guess_format(csv::string_view filename, const std::vector<char>& delims) {
152 auto head = internals::get_csv_head(filename);
153 return internals::_guess_format(head, delims);
154 }
155
167 CSV_INLINE CSVReader::CSVReader(csv::string_view filename, CSVFormat format) : _format(format) {
168 auto head = internals::get_csv_head(filename);
169 using Parser = internals::MmapParser;
170 // Apply chunk size from format before any reading occurs
171 this->_chunk_size = format.get_chunk_size();
173 if (format.guess_delim()) {
174 auto guess_result = internals::_guess_format(head, format.possible_delimiters);
175 format.delimiter(guess_result.delim);
176 // Only override header if user hasn't explicitly called no_header()
177 // Note: column_names() also sets header=-1, but it populates col_names,
178 // so we can distinguish: no_header() means header=-1 && col_names.empty()
179 if (format.header != -1 || !format.col_names.empty()) {
180 format.header = guess_result.header_row;
181 }
182
183 this->_format = format;
184 }
185
186 if (!format.col_names.empty())
187 this->set_col_names(format.col_names);
188
189 this->parser = std::unique_ptr<Parser>(new Parser(filename, format, this->col_names)); // For C++11
190 this->initial_read();
191 }
192
// NOTE(review): the listing numbering jumps from 192 to 195, so the signature
// of this function is not visible here.
//
// Builds a copy of the reader's stored format with the reader's current
// column names and header row filled in.
195 CSVFormat new_format = this->_format;
196
197 // Since users are normally not allowed to set
198 // column names and header row simultaneously,
199 // we will set the backing variables directly here
// NOTE(review): col_names is dereferenced without a null check here, although
// CSVReader::get_col_names() does test it - confirm it is never null.
200 new_format.col_names = this->col_names->get_col_names();
201 new_format.header = this->_format.header;
202
203 return new_format;
204 }
205
207 CSV_INLINE std::vector<std::string> CSVReader::get_col_names() const {
208 if (this->col_names) {
209 return this->col_names->get_col_names();
210 }
211
212 return std::vector<std::string>();
213 }
214
// NOTE(review): the listing numbering jumps from 214 to 219, so the signature
// of this function is not visible here.
//
// Linear search over a copy of the column names: returns the position of the
// first name equal to `col_name`, or CSV_NOT_FOUND when none matches.
219 auto _col_names = this->get_col_names();
220 for (size_t i = 0; i < _col_names.size(); i++)
221 if (_col_names[i] == col_name) return (int)i;
222
223 return CSV_NOT_FOUND;
224 }
225
226 CSV_INLINE void CSVReader::trim_header() {
227 if (!this->header_trimmed) {
228 for (int i = 0; i <= this->_format.header && !this->records->empty(); i++) {
229 if (i == this->_format.header && this->col_names->empty()) {
230 this->set_col_names(this->records->pop_front());
231 }
232 else {
233 this->records->pop_front();
234 }
235 }
236
237 this->header_trimmed = true;
238 }
239 }
240
244 CSV_INLINE void CSVReader::set_col_names(const std::vector<std::string>& names)
245 {
246 this->col_names->set_col_names(names);
247 this->n_cols = names.size();
248 }
249
261 CSV_INLINE bool CSVReader::read_csv(size_t bytes) {
262 // WORKER THREAD FUNCTION: Runs asynchronously to read CSV chunks
263 //
264 // Threading model:
265 // 1. notify_all() - signals read_row() that worker is active
266 // 2. parser->next() - reads and parses bytes (10MB chunks)
267 // 3. kill_all() - signals read_row() that worker is done
268 //
269 // Exception handling: Exceptions thrown here MUST propagate to the calling
270 // thread via std::exception_ptr. Bug #282 fixed cases where exceptions were
271 // swallowed, causing std::terminate() instead of proper error handling.
272
273 // Tell read_row() to listen for CSV rows
274 this->records->notify_all();
275
276 try {
277 this->parser->set_output(*this->records);
278 this->parser->next(bytes);
279
280 if (!this->header_trimmed) {
281 this->trim_header();
282 }
283 }
284 catch (...) {
285 // Never allow exceptions to escape the worker thread, or std::terminate will be invoked.
286 // Store the exception and rethrow from the consumer thread (read_row / iterator).
287 this->set_read_csv_exception(std::current_exception());
288 }
289
290 // Tell read_row() to stop waiting
291 this->records->kill_all();
292
293 return true;
294 }
295
// NOTE(review): the listing numbering jumps from 295 to 311, so the signature
// of this function is not visible here.
//
// Loops until a row can be handed to the caller through `row` (returns true),
// the parser reports end of file with nothing queued (returns false), or an
// error is thrown. When the queue is empty and no worker is active, a new
// worker thread running read_csv() is started to refill it.
311 while (true) {
312 if (this->records->empty()) {
313 if (this->records->is_waitable()) {
314 // Reading thread is currently active => wait for it to populate records
315 this->records->wait();
316 continue;
317 }
318
319 // Reading thread is not active
320 if (this->read_csv_worker.joinable())
321 this->read_csv_worker.join();
322
323 // If the worker thread failed, rethrow the error here
324 this->rethrow_read_csv_exception_if_any();
325
326 if (this->parser->eof())
327 // End of file and no more records
328 return false;
329
330 // Detect infinite loop: a previous read was requested but records are still empty.
331 // This fires when a single row spans more than 2 × _chunk_size bytes:
332 // - chunk N fills without finding '\n' → _read_requested set to true
333 // - chunk N+1 also fills without '\n' → guard fires here
334 // Default _chunk_size is ITERATION_CHUNK_SIZE (10 MB), so the threshold is
335 // rows > 20 MB. Use CSVFormat::chunk_size() to raise the limit.
336 if (this->_read_requested && this->records->empty()) {
337 throw std::runtime_error(
338 "End of file not reached and no more records parsed. "
339 "This likely indicates a CSV row larger than the chunk size of " +
340 std::to_string(this->_chunk_size) + " bytes. "
341 "Use CSVFormat::chunk_size() to increase the chunk size."
342 );
343 }
344
345 // Start another reading thread
346 // Mark as waitable before starting the thread to avoid a race where
347 // read_row() observes is_waitable()==false immediately after thread creation.
348 this->records->notify_all();
349 this->read_csv_worker = std::thread(&CSVReader::read_csv, this, this->_chunk_size);
350 this->_read_requested = true;
351 continue;
352 }
// The front row's width differs from n_cols and the policy is not KEEP:
// the row is removed from the queue, and an error is raised only under THROW.
353 else if (this->records->front().size() != this->n_cols &&
354 this->_format.variable_column_policy != VariableColumnPolicy::KEEP) {
355 auto errored_row = this->records->pop_front();
356
357 if (this->_format.variable_column_policy == VariableColumnPolicy::THROW) {
358 if (errored_row.size() < this->n_cols)
359 throw std::runtime_error("Line too short " + internals::format_row(errored_row));
360
361 throw std::runtime_error("Line too long " + internals::format_row(errored_row));
362 }
363 }
364 else {
365 row = this->records->pop_front();
366 this->_n_rows++;
367 this->_read_requested = false; // Reset flag on successful read
368 return true;
369 }
370 }
371
// Not reached: the loop above has no break and leaves only via return or throw
372 return false;
373 }
374}
Stores information about how to parse a CSV file.
CSVFormat & delimiter(char delim)
Sets the delimiter of the CSV file.
CSVFormat & header_row(int row)
Sets the header row.
CSVFormat get_format() const
Return the format of the original raw CSV.
int index_of(csv::string_view col_name) const
Return the index of the column name if found or csv::CSV_NOT_FOUND otherwise.
bool read_row(CSVRow &row)
Retrieve rows as CSVRow objects, returning true if more rows are available.
std::vector< std::string > get_col_names() const
Return the CSV's column names as a vector of strings.
CSVReader(csv::string_view filename, CSVFormat format=CSVFormat::guess_csv())
Construct CSVReader from filename using memory-mapped I/O.
Data structure for representing CSV rows.
Definition csv_row.hpp:280
Parser for memory-mapped files.
A class for parsing CSV data from a std::stringstream or a std::ifstream
void next(size_t bytes=ITERATION_CHUNK_SIZE) override
Parse the next block of data.
A std::deque wrapper which allows multiple read and write threads to concurrently access it along wit...
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition common.hpp:26
Defines functionality needed for basic CSV parsing.
std::unique_ptr< RowCollection > records
Queue of parsed CSV rows.
size_t _n_rows
How many rows (minus header) have been read so far.
bool read_csv(size_t bytes=internals::ITERATION_CHUNK_SIZE)
Read a chunk of CSV data.
internals::ColNamesPtr col_names
Pointer to an object containing column information.
void set_col_names(const std::vector< std::string > &)
Sets this reader's column names and associated data.
std::unique_ptr< internals::IBasicCSVParser > parser
Helper class which actually does the parsing.
size_t n_cols
The number of columns in this CSV.
std::vector< std::string > _get_col_names(csv::string_view head, CSVFormat format)
Return a CSV's column names.
std::string format_row(const std::vector< std::string > &row, csv::string_view delim)
Definition csv_reader.cpp:9
CSVGuessResult _guess_format(csv::string_view head, const std::vector< char > &delims)
Guess the delimiter used by a delimiter-separated values file.
CSV_CONST CONSTEXPR_17 OutArray arrayToDefault(T &&value)
Helper constexpr function to initialize an array with all the elements set to value.
The all encompassing namespace.
std::vector< std::string > get_col_names(csv::string_view filename, CSVFormat format)
Return a CSV's column names.
internals::ThreadSafeDeque< CSVRow > RowCollection
Standard type for storing collection of rows.
constexpr int CSV_NOT_FOUND
Integer indicating a requested column wasn't found.
Definition common.hpp:246
CSVGuessResult guess_format(csv::string_view filename, const std::vector< char > &delims)
Guess the delimiter used by a delimiter-separated values file.
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:99
Stores the inferred format of a CSV file.