Vince's CSV Parser
Loading...
Searching...
No Matches
csv_row.hpp
Go to the documentation of this file.
1
#pragma once
#include <cmath>
#include <iterator>
#include <limits> // For CSVField
#include <memory> // For CSVField
#include <mutex>
#include <sstream>
#include <stdexcept> // For std::runtime_error
#include <string>
#include <type_traits>
#include <unordered_map>
#include <unordered_set>
#include <utility> // For std::move
#include <vector>

#include "common.hpp"
#include "data_type.hpp"
#include "parse_hex.hpp"
#include "raw_csv_data.hpp"
20
21namespace csv {
22 namespace internals {
23 class IBasicCSVParser;
24
25 static const std::string ERROR_NAN = "Not a number.";
26 static const std::string ERROR_OVERFLOW = "Overflow error.";
27 static const std::string ERROR_FLOAT_TO_INT =
28 "Attempted to convert a floating point value to an integral type.";
29 static const std::string ERROR_NEG_TO_UNSIGNED = "Negative numbers cannot be converted to unsigned types.";
30
31 std::string json_escape_string(csv::string_view s) noexcept;
32 }
33
39 class CSVField {
40 public:
42 constexpr explicit CSVField(csv::string_view _sv) noexcept : sv(_sv) {}
43
44 operator std::string() const {
45 return std::string("<CSVField> ") + std::string(this->sv);
46 }
47
76 template<typename T = std::string> T get() {
77 IF_CONSTEXPR(std::is_arithmetic<T>::value) {
78 // Note: this->type() also converts the CSV value to float
79 if (this->type() <= DataType::CSV_STRING) {
80 throw std::runtime_error(internals::ERROR_NAN);
81 }
82 }
83
84 IF_CONSTEXPR(std::is_integral<T>::value) {
85 // Note: this->is_float() also converts the CSV value to float
86 if (this->is_float()) {
87 throw std::runtime_error(internals::ERROR_FLOAT_TO_INT);
88 }
89
90 IF_CONSTEXPR(std::is_unsigned<T>::value) {
91 if (this->value < 0) {
92 throw std::runtime_error(internals::ERROR_NEG_TO_UNSIGNED);
93 }
94 }
95 }
96
97 // Allow fallthrough from previous if branch
98 IF_CONSTEXPR(!std::is_floating_point<T>::value) {
99 IF_CONSTEXPR(std::is_unsigned<T>::value) {
100 // Quick hack to perform correct unsigned integer boundary checks
101 if (this->value > internals::get_uint_max<sizeof(T)>()) {
102 throw std::runtime_error(internals::ERROR_OVERFLOW);
103 }
104 }
105 else if (internals::type_num<T>() < this->_type) {
106 throw std::runtime_error(internals::ERROR_OVERFLOW);
107 }
108 }
109
110 return static_cast<T>(this->value);
111 }
112
148 template<typename T = std::string>
149 bool try_get(T& out) noexcept {
150 IF_CONSTEXPR(std::is_arithmetic<T>::value) {
151 // Check if value is numeric
152 if (this->type() <= DataType::CSV_STRING) {
153 return false;
154 }
155 }
156
157 IF_CONSTEXPR(std::is_integral<T>::value) {
158 // Check for float-to-int conversion
159 if (this->is_float()) {
160 return false;
161 }
162
163 IF_CONSTEXPR(std::is_unsigned<T>::value) {
164 if (this->value < 0) {
165 return false;
166 }
167 }
168 }
169
170 // Check for overflow
171 IF_CONSTEXPR(!std::is_floating_point<T>::value) {
172 IF_CONSTEXPR(std::is_unsigned<T>::value) {
173 if (this->value > internals::get_uint_max<sizeof(T)>()) {
174 return false;
175 }
176 }
177 else if (internals::type_num<T>() < this->_type) {
178 return false;
179 }
180 }
181
182 out = static_cast<T>(this->value);
183 return true;
184 }
185
189 template<typename T = long long>
190 bool try_parse_hex(T& parsedValue) {
191 static_assert(std::is_integral<T>::value,
192 "try_parse_hex only works with integral types (int, long, long long, etc.)");
193 return internals::try_parse_hex(this->sv, parsedValue);
194 }
195
202 bool try_parse_decimal(long double& dVal, const char decimalSymbol = '.');
203
217 template<typename T>
218 CONSTEXPR_14 bool operator==(T other) const noexcept
219 {
220 static_assert(std::is_arithmetic<T>::value,
221 "T should be a numeric value.");
222
223 if (this->_type != DataType::UNKNOWN) {
224 if (this->_type == DataType::CSV_STRING) {
225 return false;
226 }
227
228 return internals::is_equal(value, static_cast<long double>(other), 0.000001L);
229 }
230
231 long double out = 0;
232 if (internals::data_type(this->sv, &out) == DataType::CSV_STRING) {
233 return false;
234 }
235
236 return internals::is_equal(out, static_cast<long double>(other), 0.000001L);
237 }
238
240 CONSTEXPR csv::string_view get_sv() const noexcept { return this->sv; }
241
243 CONSTEXPR_14 bool is_null() noexcept { return type() == DataType::CSV_NULL; }
244
246 CONSTEXPR_14 bool is_str() noexcept { return type() == DataType::CSV_STRING; }
247
249 CONSTEXPR_14 bool is_num() noexcept { return type() >= DataType::CSV_INT8; }
250
252 CONSTEXPR_14 bool is_int() noexcept {
253 return (type() >= DataType::CSV_INT8) && (type() <= DataType::CSV_INT64);
254 }
255
257 CONSTEXPR_14 bool is_float() noexcept { return type() == DataType::CSV_DOUBLE; }
258
260 CONSTEXPR_14 DataType type() noexcept {
261 this->get_value();
262 return _type;
263 }
264
265 private:
266 long double value = 0;
267 csv::string_view sv = "";
268 DataType _type = DataType::UNKNOWN;
269 CONSTEXPR_14 void get_value() noexcept {
270 /* Check to see if value has been cached previously, if not
271 * evaluate it
272 */
273 if ((int)_type < 0) {
274 this->_type = internals::data_type(this->sv, &this->value);
275 }
276 }
277 };
278
280 class CSVRow {
281 public:
283
284 CSVRow() = default;
285
287 CSVRow(internals::RawCSVDataPtr _data) : data(_data) {}
288 CSVRow(internals::RawCSVDataPtr _data, size_t _data_start, size_t _field_bounds)
289 : data(_data), data_start(_data_start), fields_start(_field_bounds) {}
290 CSVRow(internals::RawCSVDataPtr _data, size_t _data_start, size_t _field_bounds, size_t _row_length)
291 : data(_data), data_start(_data_start), fields_start(_field_bounds), row_length(_row_length) {}
292
294 CONSTEXPR bool empty() const noexcept { return this->size() == 0; }
295
297 CONSTEXPR size_t size() const noexcept { return row_length; }
298
301 CSVField operator[](size_t n) const;
302 CSVField operator[](const std::string&) const;
303 std::string to_json(const std::vector<std::string>& subset = {}) const;
304 std::string to_json_array(const std::vector<std::string>& subset = {}) const;
305
307 std::vector<std::string> get_col_names() const {
308 return this->data->col_names->get_col_names();
309 }
310
314 std::unordered_map<std::string, std::string> to_unordered_map() const;
315
321 std::unordered_map<std::string, std::string> to_unordered_map(
322 const std::vector<std::string>& subset
323 ) const;
324
329 operator std::vector<std::string>() const;
331
335 class iterator {
336 public:
337#ifndef DOXYGEN_SHOULD_SKIP_THIS
338 using value_type = CSVField;
339 using difference_type = int;
340 using pointer = std::shared_ptr<CSVField>;
341 using reference = CSVField & ;
342 using iterator_category = std::random_access_iterator_tag;
343#endif
344 iterator(const CSVRow*, int i);
345
346 reference operator*() const;
347 pointer operator->() const;
348
349 iterator operator++(int);
350 iterator& operator++();
351 iterator operator--(int);
352 iterator& operator--();
353 iterator operator+(difference_type n) const;
354 iterator operator-(difference_type n) const;
355
357 CONSTEXPR bool operator==(const iterator& other) const noexcept {
358 return this->i == other.i;
359 };
360
361 CONSTEXPR bool operator!=(const iterator& other) const noexcept { return !operator==(other); }
362
363#ifndef NDEBUG
364 friend CSVRow;
365#endif
366
367 private:
368 const CSVRow * daddy = nullptr; // Pointer to parent
369 internals::RawCSVDataPtr data = nullptr; // Keep data alive for lifetime of iterator
370 std::shared_ptr<CSVField> field = nullptr; // Current field pointed at
371 int i = 0; // Index of current field
372 };
373
375 using reverse_iterator = std::reverse_iterator<iterator>;
376
381 iterator begin() const;
382 iterator end() const noexcept;
383 reverse_iterator rbegin() const noexcept;
384 reverse_iterator rend() const;
386
387 private:
389 inline csv::string_view get_field_impl(size_t index, const internals::RawCSVDataPtr& _data) const {
391
392 if (index >= this->size())
393 throw std::runtime_error("Index out of bounds.");
394
395 const size_t field_index = this->fields_start + index;
396 auto field = _data->fields[field_index];
397 auto field_str = csv::string_view(_data->data).substr(this->data_start + field.start);
398
399 if (field.has_double_quote) {
400 auto& value = _data->double_quote_fields[field_index];
401 // Double-check locking: minimize lock contention by checking before acquiring lock
402 if (value.empty()) {
403 std::lock_guard<std::mutex> lock(_data->double_quote_init_lock);
404
405 // Check again after acquiring lock in case another thread initialized it
406 if (value.empty()) {
407 bool prev_ch_quote = false;
408 for (size_t i = 0; i < field.length; i++) {
409 if (_data->parse_flags[field_str[i] + CHAR_OFFSET] == ParseFlags::QUOTE) {
410 if (prev_ch_quote) {
411 prev_ch_quote = false;
412 continue;
413 }
414 else {
415 prev_ch_quote = true;
416 }
417 }
418
419 value += field_str[i];
420 }
421 }
422 }
423
424 return csv::string_view(value);
425 }
426
427 return field_str.substr(0, field.length);
428 }
429
431 csv::string_view get_field(size_t index) const;
432
436 csv::string_view get_field_safe(size_t index, internals::RawCSVDataPtr _data) const;
437
438 internals::RawCSVDataPtr data;
439
441 size_t data_start = 0;
442
444 size_t fields_start = 0;
445
447 size_t row_length = 0;
448 };
449
450#ifdef _MSC_VER
451#pragma region CSVField::get Specializations
452#endif
454 template<>
455 inline std::string CSVField::get<std::string>() {
456 return std::string(this->sv);
457 }
458
464 template<>
465 CONSTEXPR_14 csv::string_view CSVField::get<csv::string_view>() {
466 return this->sv;
467 }
468
470 template<>
471 CONSTEXPR_14 long double CSVField::get<long double>() {
472 if (!is_num())
473 throw std::runtime_error(internals::ERROR_NAN);
474
475 return this->value;
476 }
477
479 template<>
480 inline bool CSVField::try_get<std::string>(std::string& out) noexcept {
481 out = std::string(this->sv);
482 return true;
483 }
484
486 template<>
487 CONSTEXPR_14 bool CSVField::try_get<csv::string_view>(csv::string_view& out) noexcept {
488 out = this->sv;
489 return true;
490 }
491
493 template<>
494 CONSTEXPR_14 bool CSVField::try_get<long double>(long double& out) noexcept {
495 if (!is_num())
496 return false;
497
498 out = this->value;
499 return true;
500 }
501#ifdef _MSC_VER
502#pragma endregion CSVField::get Specializations
503#endif
504
506 template<>
507 CONSTEXPR bool CSVField::operator==(const char * other) const noexcept
508 {
509 return this->sv == other;
510 }
511
513 template<>
515 {
516 return this->sv == other;
517 }
518}
519
520inline std::ostream& operator << (std::ostream& os, csv::CSVField const& value) {
521 os << std::string(value);
522 return os;
523}
Data type representing individual CSV values.
Definition csv_row.hpp:39
CONSTEXPR_14 bool is_num() noexcept
Returns true if field is an integer or float.
Definition csv_row.hpp:249
bool try_parse_decimal(long double &dVal, const char decimalSymbol='.')
Attempts to parse a decimal (or integer) value using the given symbol, returning true if the value is...
Definition csv_row.cpp:91
CONSTEXPR_14 bool is_str() noexcept
Returns true if field is a non-numeric, non-empty string.
Definition csv_row.hpp:246
CONSTEXPR_14 bool is_int() noexcept
Returns true if field is an integer.
Definition csv_row.hpp:252
CONSTEXPR_14 bool is_null() noexcept
Returns true if field is an empty string or string of whitespace characters.
Definition csv_row.hpp:243
constexpr CSVField(csv::string_view _sv) noexcept
Constructs a CSVField from a string_view.
Definition csv_row.hpp:42
CONSTEXPR_14 DataType type() noexcept
Return the type of the underlying CSV data.
Definition csv_row.hpp:260
T get()
Returns the value casted to the requested type, performing type checking before.
Definition csv_row.hpp:76
CONSTEXPR_14 bool operator==(T other) const noexcept
Compares the contents of this field to a numeric value.
Definition csv_row.hpp:218
bool try_get(T &out) noexcept
Attempts to retrieve the value as the requested type without throwing exceptions.
Definition csv_row.hpp:149
CONSTEXPR_14 bool is_float() noexcept
Returns true if field is a floating point value.
Definition csv_row.hpp:257
CONSTEXPR csv::string_view get_sv() const noexcept
Return a string view over the field's contents.
Definition csv_row.hpp:240
bool try_parse_hex(T &parsedValue)
Parse a hexadecimal value, returning false if the value is not hex.
Definition csv_row.hpp:190
A random access iterator over the contents of a CSV row.
Definition csv_row.hpp:335
CONSTEXPR bool operator==(const iterator &other) const noexcept
Two iterators are equal if they point to the same field.
Definition csv_row.hpp:357
Data structure for representing CSV rows.
Definition csv_row.hpp:280
iterator end() const noexcept
Return an iterator pointing to just after the end of the CSVRow.
Definition csv_row.cpp:125
std::reverse_iterator< iterator > reverse_iterator
A reverse iterator over the contents of a CSVRow.
Definition csv_row.hpp:375
std::string to_json(const std::vector< std::string > &subset={}) const
Convert a CSV row to a JSON object, i.e. {"col1":"value1","col2":"value2"}.
CONSTEXPR bool empty() const noexcept
Indicates whether row is empty or not.
Definition csv_row.hpp:294
std::string to_json_array(const std::vector< std::string > &subset={}) const
Convert a CSV row to a JSON array, i.e. ["value1","value2",...].
std::unordered_map< std::string, std::string > to_unordered_map() const
Convert this CSVRow into an unordered map.
Definition csv_row.cpp:52
CONSTEXPR size_t size() const noexcept
Return the number of fields in this row.
Definition csv_row.hpp:297
std::vector< std::string > get_col_names() const
Retrieve this row's associated column names.
Definition csv_row.hpp:307
CSVField operator[](size_t n) const
Return a CSVField object corresponding to the nth value in the row.
Definition csv_row.cpp:20
iterator begin() const
Return an iterator pointing to the first field.
Definition csv_row.cpp:116
CSVRow(internals::RawCSVDataPtr _data)
Construct a CSVRow from a RawCSVDataPtr.
Definition csv_row.hpp:287
Abstract base class which provides CSV parsing logic.
A standalone header file containing shared code.
#define IF_CONSTEXPR
Expands to if constexpr in C++17 and if otherwise.
Definition common.hpp:108
#define CONSTEXPR
Expands to constexpr in decent compilers and inline otherwise.
Definition common.hpp:149
Implements data type parsing functionality.
CONSTEXPR_14 DataType data_type(csv::string_view in, long double *const out, const char decimalSymbol)
Distinguishes numeric from other text values.
bool is_equal(T a, T b, T epsilon=0.001)
Definition common.hpp:193
ParseFlags
An enum used for describing the significance of each character with respect to CSV parsing.
Definition common.hpp:205
CSV_CONST CONSTEXPR_17 OutArray arrayToDefault(T &&value)
Helper constexpr function to initialize an array with all the elements set to value.
CONSTEXPR_14 long double get_uint_max()
Given a byte size, return the largest number that can be stored in an unsigned integer of that size.
The all encompassing namespace.
DataType
Enumerates the different CSV field types that are recognized by this library.
Definition data_type.hpp:20
@ CSV_INT64
64-bit integer (long long on MSVC/GCC)
@ CSV_DOUBLE
Floating point value.
@ CSV_NULL
Empty string.
@ CSV_INT8
8-bit integer
@ CSV_STRING
Non-numeric string.
constexpr unsigned CHAR_OFFSET
Offset to convert char into array index.
Definition common.hpp:249
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:99
Implements Functions related to hexadecimal parsing.
Internal data structures for CSV parsing.