Vince's CSV Parser
Loading...
Searching...
No Matches
csv_reader.hpp
Go to the documentation of this file.
1
5#pragma once
6
7#include <algorithm>
8#include <deque>
9#include <exception>
10#include <fstream>
11#include <functional>
12#include <iterator>
13#include <memory>
14#include <sstream>
15#include <string>
16#include <vector>
17
18#if !defined(CSV_ENABLE_THREADS) || CSV_ENABLE_THREADS
19#include <mutex>
20#include <thread>
21#endif
22
23#include "basic_csv_parser.hpp"
24#include "common.hpp"
25#include "data_type.hpp"
26#include "csv_format.hpp"
27
29namespace csv {
30#if CSV_ENABLE_THREADS
/** Block until `worker` finishes, if it currently owns a running thread.
 *
 *  Calling std::thread::join() on a thread that is not joinable throws,
 *  so the joinable() check makes this helper safe to call on
 *  default-constructed or already-joined threads.
 *
 *  @param[in,out] worker  Thread to wait for; not joinable on return.
 */
inline void join_worker(std::thread& worker) {
    if (!worker.joinable()) {
        return;
    }

    worker.join();
}
34
35 #define JOIN_WORKER(worker) join_worker(worker)
36#else
37 #define JOIN_WORKER(worker) ((void)0)
38#endif
39
61 class CSVReader {
62 public:
79 class iterator {
80 public:
81 #ifndef DOXYGEN_SHOULD_SKIP_THIS
82 using value_type = CSVRow;
83 using difference_type = std::ptrdiff_t;
84 using pointer = CSVRow * ;
85 using reference = CSVRow & ;
86 using iterator_category = std::input_iterator_tag;
87 #endif
88
89 iterator() = default;
90 iterator(CSVReader* reader) : daddy(reader) {}
92
94 CONSTEXPR_14 reference operator*() { return this->row; }
95 CONSTEXPR_14 reference operator*() const { return const_cast<reference>(this->row); }
96
98 CONSTEXPR_14 pointer operator->() { return &(this->row); }
99 CONSTEXPR_14 pointer operator->() const { return const_cast<pointer>(&(this->row)); }
100
101 iterator& operator++();
102 iterator operator++(int);
107 CONSTEXPR bool operator==(const iterator& other) const noexcept {
108 return (this->daddy == other.daddy) && (this->i == other.i);
109 }
110
111 CONSTEXPR bool operator!=(const iterator& other) const noexcept { return !operator==(other); }
112 private:
113 CSVReader * daddy = nullptr; // Pointer to parent
114 CSVRow row; // Current row
115 size_t i = 0; // Index of current row
116 };
117
122
134 CSVReader(csv::string_view filename, const CSVFormat& format = CSVFormat::guess_csv()) : _format(format) {
135#if defined(__EMSCRIPTEN__)
136 this->owned_stream = std::unique_ptr<std::istream>(
137 new std::ifstream(std::string(filename), std::ios::binary)
138 );
139
140 if (!(*this->owned_stream)) {
141 throw std::runtime_error("Cannot open file " + std::string(filename));
142 }
143
144 this->init_from_stream(*this->owned_stream, format);
145#else
146 // C4316: MmapParser may carry over-aligned SIMD members. Allocation
147 // alignment is handled by the allocator on supported platforms;
148 // suppress MSVC's false-positive warning at this site.
149 CSV_MSVC_PUSH_DISABLE(4316)
150 this->init_parser(std::unique_ptr<internals::IBasicCSVParser>(
151 new internals::MmapParser(filename, format, this->col_names)
152 ));
153 CSV_MSVC_POP
154#endif
155 }
156
169 template<typename TStream,
170 csv::enable_if_t<std::is_base_of<std::istream, TStream>::value, int> = 0>
171 CSVReader(TStream &source, CSVFormat format = CSVFormat::guess_csv()) : _format(format) {
172 this->init_from_stream(source, format);
173 }
174
180 CSVReader(std::unique_ptr<std::istream> source,
181 const CSVFormat& format = CSVFormat::guess_csv()) : _format(format), owned_stream(std::move(source)) {
182 if (!this->owned_stream) {
183 throw std::invalid_argument("CSVReader requires a non-null stream");
184 }
185
186 this->init_from_stream(*this->owned_stream, format);
187 }
189
190 CSVReader(const CSVReader&) = delete;
191 CSVReader& operator=(const CSVReader&) = delete;
192
201 CSVReader(CSVReader&& other) noexcept :
202 _format(std::move(other._format)),
203 col_names(std::move(other.col_names)),
204 parser(std::move(other.parser)),
205 records(std::move(other.records)),
206 owned_stream(std::move(other.owned_stream)),
207 n_cols(other.n_cols),
208 _n_rows(other._n_rows),
209 header_trimmed(other.header_trimmed),
210 _chunk_size(other._chunk_size),
211 _read_requested(other._read_requested),
212 read_csv_exception(other.take_read_csv_exception()) {
213 JOIN_WORKER(other.read_csv_worker);
214
215 other.n_cols = 0;
216 other._n_rows = 0;
217 other.header_trimmed = false;
218 other._read_requested = false;
219 other._chunk_size = internals::CSV_CHUNK_SIZE_DEFAULT;
220 }
221
226 CSVReader& operator=(CSVReader&& other) noexcept {
227 if (this == &other) {
228 return *this;
229 }
230
231 JOIN_WORKER(this->read_csv_worker);
232 JOIN_WORKER(other.read_csv_worker);
233
234 this->_format = std::move(other._format);
235 this->col_names = std::move(other.col_names);
236 this->parser = std::move(other.parser);
237 this->records = std::move(other.records);
238 this->owned_stream = std::move(other.owned_stream);
239 this->n_cols = other.n_cols;
240 this->_n_rows = other._n_rows;
241 this->header_trimmed = other.header_trimmed;
242 this->_chunk_size = other._chunk_size;
243 this->_read_requested = other._read_requested;
244 this->read_csv_exception = other.take_read_csv_exception();
245
246 other.n_cols = 0;
247 other._n_rows = 0;
248 other.header_trimmed = false;
249 other._read_requested = false;
250 other._chunk_size = internals::CSV_CHUNK_SIZE_DEFAULT;
251
252 return *this;
253 }
254
255 ~CSVReader() {
256 JOIN_WORKER(this->read_csv_worker);
257 }
258
261 bool read_row(CSVRow &row);
262 iterator begin();
263 CSV_CONST iterator end() const noexcept;
264
266 bool eof() const noexcept { return this->parser->eof(); }
268
271 CSVFormat get_format() const;
272 std::vector<std::string> get_col_names() const;
273 int index_of(csv::string_view col_name) const;
275
278
284 CONSTEXPR bool empty() const noexcept { return this->n_rows() == 0; }
285
287 CONSTEXPR size_t n_rows() const noexcept { return this->_n_rows; }
288
290 bool utf8_bom() const noexcept { return this->parser->utf8_bom(); }
292
293 protected:
302 void set_col_names(const std::vector<std::string>&);
303
306 CSVFormat _format;
308
311
312 internals::ColNamesPtr col_names = std::make_shared<internals::ColNames>();
313
315 std::unique_ptr<internals::IBasicCSVParser> parser = nullptr;
316
318 std::unique_ptr<RowCollection> records{new RowCollection(100)};
319
325 std::unique_ptr<std::istream> owned_stream = nullptr;
326
327 size_t n_cols = 0;
328 size_t _n_rows = 0;
332 bool read_csv(size_t bytes = internals::CSV_CHUNK_SIZE_DEFAULT);
334
337 private:
339 bool header_trimmed = false;
342 #if CSV_ENABLE_THREADS
343 std::thread read_csv_worker;
344 #endif
345 size_t _chunk_size = internals::CSV_CHUNK_SIZE_DEFAULT;
346 bool _read_requested = false;
348
350 std::exception_ptr read_csv_exception = nullptr;
351#if CSV_ENABLE_THREADS
352 std::mutex read_csv_exception_lock;
353#endif
354
355 void set_read_csv_exception(std::exception_ptr eptr) {
356#if CSV_ENABLE_THREADS
357 std::lock_guard<std::mutex> lock(this->read_csv_exception_lock);
358#endif
359 this->read_csv_exception = std::move(eptr);
360 }
361
362 std::exception_ptr take_read_csv_exception() {
363#if CSV_ENABLE_THREADS
364 std::lock_guard<std::mutex> lock(this->read_csv_exception_lock);
365#endif
366 auto eptr = this->read_csv_exception;
367 this->read_csv_exception = nullptr;
368 return eptr;
369 }
370
371 void rethrow_read_csv_exception_if_any() {
372 if (auto eptr = this->take_read_csv_exception()) {
373 std::rethrow_exception(eptr);
374 }
375 }
376
380 void init_parser(std::unique_ptr<internals::IBasicCSVParser> parser);
381
382 template<typename TStream,
383 csv::enable_if_t<std::is_base_of<std::istream, TStream>::value, int> = 0>
384 void init_from_stream(TStream& source, CSVFormat format) {
385 // C4316: StreamParser may have over-aligned SIMD members; heap allocation
386 // alignment is handled correctly at runtime via the allocator on supported
387 // platforms. Suppress the MSVC false-positive here.
388 CSV_MSVC_PUSH_DISABLE(4316)
389 this->init_parser(
390 std::unique_ptr<internals::IBasicCSVParser>(
391 new internals::StreamParser<TStream>(source, format, this->col_names)
392 )
393 );
394 CSV_MSVC_POP
395 }
396
398 void initial_read() {
399#if CSV_ENABLE_THREADS
400 this->read_csv_worker = std::thread(&CSVReader::read_csv, this, this->_chunk_size);
401 this->read_csv_worker.join();
402#else
403 this->read_csv(this->_chunk_size);
404#endif
405 this->rethrow_read_csv_exception_if_any();
406 }
407
408 void trim_header();
409 };
410}
Contains the main CSV parsing algorithm and various utility functions.
Stores information about how to parse a CSV file.
static CSVFormat guess_csv()
CSVFormat preset for delimiter inference with header/n_cols inference enabled.
An input iterator capable of handling large files.
CONSTEXPR bool operator==(const iterator &other) const noexcept
Returns true if iterators were constructed from the same CSVReader and point to the same row.
iterator & operator++()
Pre-increment iterator.
CONSTEXPR_14 reference operator*()
Access the CSVRow held by the iterator.
CONSTEXPR_14 pointer operator->()
Return a pointer to the CSVRow the iterator has stopped at.
Main class for parsing CSVs from files and in-memory sources.
CONSTEXPR bool empty() const noexcept
Returns true if no valid CSV rows (not including the header) have been read from the file or stream so far.
bool utf8_bom() const noexcept
Whether or not the CSV was prefixed with a UTF-8 BOM (byte order mark).
CSVFormat get_format() const
Return the format of the original raw CSV.
int index_of(csv::string_view col_name) const
Return the index of the column name if found or csv::CSV_NOT_FOUND otherwise.
CSVReader & operator=(const CSVReader &)=delete
Not copyable.
CSV_CONST iterator end() const noexcept
A placeholder for the imaginary past-the-end row in a CSV.
CONSTEXPR size_t n_rows() const noexcept
Retrieves the number of rows that have been read so far.
CSVReader(std::unique_ptr< std::istream > source, const CSVFormat &format=CSVFormat::guess_csv())
Construct CSVReader from an owned std::istream.
bool eof() const noexcept
Returns true if we have reached end of file.
CSVReader & operator=(CSVReader &&other) noexcept
Move assignment.
CSVReader(CSVReader &&other) noexcept
Move constructor.
bool read_row(CSVRow &row)
Retrieve rows as CSVRow objects, returning true if more rows are available.
std::vector< std::string > get_col_names() const
Return the CSV's column names as a vector of strings.
CSVReader(TStream &source, CSVFormat format=CSVFormat::guess_csv())
Construct CSVReader from std::istream.
CSVReader(const CSVReader &)=delete
Not copyable.
iterator begin()
Return an iterator to the first row in the reader.
CSVReader(csv::string_view filename, const CSVFormat &format=CSVFormat::guess_csv())
Construct CSVReader from filename.
Data structure for representing CSV rows.
Definition csv_row.hpp:264
Parser for memory-mapped files.
A standalone header file containing shared code.
#define CONSTEXPR
Expands to constexpr in decent compilers and inline otherwise.
Definition common.hpp:187
Defines an object used to store CSV format settings.
Implements data type parsing functionality.
std::unique_ptr< std::istream > owned_stream
Optional owned stream used by two paths: 1) the Emscripten filename-constructor fallback to stream parsing, and 2) the constructor that takes ownership of a std::unique_ptr<std::istream>.
std::unique_ptr< RowCollection > records
Queue of parsed CSV rows.
size_t _n_rows
How many rows (minus header) have been read so far.
bool read_csv(size_t bytes=internals::CSV_CHUNK_SIZE_DEFAULT)
Read a chunk of CSV data.
internals::ColNamesPtr col_names
Pointer to an object containing column information.
void set_col_names(const std::vector< std::string > &)
Sets this reader's column names and associated data.
std::unique_ptr< internals::IBasicCSVParser > parser
Helper class which actually does the parsing.
size_t n_cols
The number of columns in this CSV.
The all encompassing namespace.
internals::ThreadSafeDeque< CSVRow > RowCollection
Standard type for storing collection of rows.
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:135