Vince's CSV Parser
Loading...
Searching...
No Matches
basic_csv_parser.hpp
Go to the documentation of this file.
1
5#pragma once
6#include <algorithm>
7#include <array>
8#include <fstream>
9#include <memory>
10#include <unordered_map>
11#include <unordered_set>
12#include <thread>
13#include <vector>
14
15#include "../external/mio.hpp"
16#include "col_names.hpp"
17#include "common.hpp"
18#include "csv_format.hpp"
19#include "csv_row.hpp"
20#include "thread_safe_deque.hpp"
21
22namespace csv {
23 namespace internals {
24 constexpr const int UNINITIALIZED_FIELD = -1;
25
28 template<typename OutArray, typename T = typename OutArray::type>
29 CSV_CONST CONSTEXPR_17 OutArray arrayToDefault(T&& value)
30 {
31 OutArray a {};
32 for (auto& e : a)
33 e = value;
34 return a;
35 }
36
41 CSV_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter) {
46 return ret;
47 }
48
53 CSV_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter, char quote_char) {
54 std::array<ParseFlags, 256> ret = make_parse_flags(delimiter);
55 ret[quote_char + CHAR_OFFSET] = ParseFlags::QUOTE;
56 return ret;
57 }
58
63 CSV_CONST CONSTEXPR_17 WhitespaceMap make_ws_flags(const char* ws_chars, size_t n_chars) {
65 for (size_t j = 0; j < n_chars; j++) {
66 ret[ws_chars[j] + CHAR_OFFSET] = true;
67 }
68 return ret;
69 }
70
71 inline WhitespaceMap make_ws_flags(const std::vector<char>& flags) {
72 return make_ws_flags(flags.data(), flags.size());
73 }
74
75 CSV_INLINE size_t get_file_size(csv::string_view filename);
76
77 CSV_INLINE std::string get_csv_head(csv::string_view filename);
78 }
79
82
83 namespace internals {
91 public:
92 IBasicCSVParser() = default;
93 IBasicCSVParser(const CSVFormat&, const ColNamesPtr&);
94 IBasicCSVParser(const ParseFlagMap& parse_flags, const WhitespaceMap& ws_flags
95 ) : _parse_flags(parse_flags), _ws_flags(ws_flags) {}
96
97 virtual ~IBasicCSVParser() {}
98
100 bool eof() { return this->_eof; }
101
103 virtual void next(size_t bytes) = 0;
104
106 void end_feed();
107
108 CONSTEXPR_17 ParseFlags parse_flag(const char ch) const noexcept {
109 return _parse_flags.data()[ch + CHAR_OFFSET];
110 }
111
112 CONSTEXPR_17 ParseFlags compound_parse_flag(const char ch) const noexcept {
113 return quote_escape_flag(parse_flag(ch), this->quote_escape);
114 }
115
117 CONSTEXPR bool utf8_bom() const { return this->_utf8_bom; }
118
119 void set_output(RowCollection& rows) { this->_records = &rows; }
120
121 protected:
124 CSVRow current_row;
125 RawCSVDataPtr data_ptr = nullptr;
126 ColNamesPtr _col_names = nullptr;
127 CSVFieldList* fields = nullptr;
128 int field_start = UNINITIALIZED_FIELD;
129 size_t field_length = 0;
130
134
137 bool _eof = false;
138
140 size_t source_size = 0;
142
144 CONSTEXPR bool no_chunk() const { return this->source_size < ITERATION_CHUNK_SIZE; }
145
150 size_t parse();
151
153 void reset_data_ptr();
154 private:
158 WhitespaceMap _ws_flags;
159 bool quote_escape = false;
160 bool field_has_double_quote = false;
161
163 size_t data_pos = 0;
164
166 bool unicode_bom_scan = false;
167 bool _utf8_bom = false;
168
170 RowCollection* _records = nullptr;
171
172 CONSTEXPR_17 bool ws_flag(const char ch) const noexcept {
173 return _ws_flags.data()[ch + CHAR_OFFSET];
174 }
175
176 size_t& current_row_start() {
177 return this->current_row.data_start;
178 }
179
180 void parse_field() noexcept;
181
183 void push_field();
184
186 void push_row();
187
189 void trim_utf8_bom();
190 };
191
193 csv::enable_if_t<std::is_base_of<std::istream, TStream>::value, int> = 0>
194 std::string get_csv_head(TStream &source) {
195 auto tellg = source.tellg();
196 std::string head;
197 std::getline(source, head);
198 source.seekg(tellg);
199 return head;
200 }
201
203 CSV_INLINE std::string get_csv_head(csv::string_view filename, size_t file_size);
204
208 template<typename TStream>
211
212 public:
213 StreamParser(TStream& source,
214 const CSVFormat& format,
215 const ColNamesPtr& col_names = nullptr
216 ) : IBasicCSVParser(format, col_names), _source(source) {}
217
219 TStream& source,
220 internals::ParseFlagMap parse_flags,
221 internals::WhitespaceMap ws_flags) :
222 IBasicCSVParser(parse_flags, ws_flags),
223 _source(source)
224 {}
225
226 ~StreamParser() {}
227
228 void next(size_t bytes = ITERATION_CHUNK_SIZE) override {
229 if (this->eof()) return;
230
231 // Reset parser state
232 this->field_start = UNINITIALIZED_FIELD;
233 this->field_length = 0;
234 this->reset_data_ptr();
235 this->data_ptr->_data = std::make_shared<std::string>();
236
237 if (source_size == 0) {
238 const auto start = _source.tellg();
239 _source.seekg(0, std::ios::end);
240 const auto end = _source.tellg();
241 _source.seekg(0, std::ios::beg);
242
243 source_size = end - start;
244 }
245
246 // Read data into buffer
247 size_t length = std::min(source_size - stream_pos, bytes);
248 std::unique_ptr<char[]> buff(new char[length]);
249 _source.seekg(stream_pos, std::ios::beg);
250 _source.read(buff.get(), length);
251 stream_pos = _source.tellg();
252 ((std::string*)(this->data_ptr->_data.get()))->assign(buff.get(), length);
253
254 // Create string_view
255 this->data_ptr->data = *((std::string*)this->data_ptr->_data.get());
256
257 // Parse
258 this->current_row = CSVRow(this->data_ptr);
259 size_t remainder = this->parse();
260
261 if (stream_pos == source_size || no_chunk()) {
262 this->_eof = true;
263 this->end_feed();
264 }
265 else {
266 this->stream_pos -= (length - remainder);
267 }
268 }
269
270 private:
271 TStream& _source;
272 size_t stream_pos = 0;
273 };
274
285 public:
287 const CSVFormat& format,
288 const ColNamesPtr& col_names = nullptr
289 ) : IBasicCSVParser(format, col_names) {
290 this->_filename = filename.data();
291 this->source_size = get_file_size(filename);
292 };
293
294 ~MmapParser() {}
295
296 void next(size_t bytes) override;
297
298 private:
299 std::string _filename;
300 size_t mmap_pos = 0;
301 };
302 }
303}
Stores information about how to parse a CSV file.
Data structure for representing CSV rows.
Definition csv_row.hpp:280
Abstract base class which provides CSV parsing logic.
CONSTEXPR bool no_chunk() const
Whether or not source needs to be read in chunks.
ParseFlagMap _parse_flags
An array where the (i + 128)th slot gives the ParseFlags for ASCII character i.
void reset_data_ptr()
Create a new RawCSVDataPtr for a new chunk of data.
bool eof()
Whether or not we have reached the end of source.
void end_feed()
Indicate the last block of data has been parsed.
size_t parse()
Parse the current chunk of data.
virtual void next(size_t bytes)=0
Parse the next block of data.
size_t source_size
The size of the incoming CSV.
CONSTEXPR bool utf8_bom() const
Whether or not this CSV has a UTF-8 byte order mark.
Parser for memory-mapped files.
A class for parsing CSV data from a std::stringstream or a std::ifstream.
void next(size_t bytes=ITERATION_CHUNK_SIZE) override
Parse the next block of data.
A std::deque wrapper which allows multiple read and write threads to concurrently access it along wit...
A standalone header file containing shared code.
#define CONSTEXPR
Expands to constexpr in decent compilers and inline otherwise.
Definition common.hpp:149
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition common.hpp:26
Defines an object used to store CSV format settings.
Defines the data type used for storing information about a CSV row.
std::array< ParseFlags, 256 > ParseFlagMap
An array which maps ASCII chars to a parsing flag.
Definition common.hpp:239
std::array< bool, 256 > WhitespaceMap
An array which maps ASCII chars to a flag indicating if it is whitespace.
Definition common.hpp:242
constexpr size_t ITERATION_CHUNK_SIZE
Chunk size for lazy-loading large CSV files.
Definition common.hpp:190
CSV_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter)
Create a vector v where each index i corresponds to the ASCII number for a character and,...
ParseFlags
An enum used for describing the significance of each character with respect to CSV parsing.
Definition common.hpp:205
@ NOT_SPECIAL
Characters with no special meaning or escaped delimiters and newlines.
@ NEWLINE
Characters which signify a new row.
@ QUOTE
Characters which may signify a quote escape.
@ DELIMITER
Characters which signify a new field.
constexpr ParseFlags quote_escape_flag(ParseFlags flag, bool quote_escape) noexcept
Transform the ParseFlags given the context of whether or not the current field is quote escaped.
Definition common.hpp:215
CSV_CONST CONSTEXPR_17 WhitespaceMap make_ws_flags(const char *ws_chars, size_t n_chars)
Create a vector v where each index i corresponds to the ASCII number for a character c and,...
CSV_CONST CONSTEXPR_17 OutArray arrayToDefault(T &&value)
Helper constexpr function to initialize an array with all the elements set to value.
The all encompassing namespace.
CSVReader parse(csv::string_view in, CSVFormat format)
Shorthand function for parsing an in-memory CSV string.
constexpr unsigned CHAR_OFFSET
Offset to convert char into array index.
Definition common.hpp:249
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:99
Thread-safe deque for producer-consumer patterns.