Vince's CSV Parser
Loading...
Searching...
No Matches
csv_reader.hpp
Go to the documentation of this file.
1
5#pragma once
6
7#include <algorithm>
8#include <deque>
9#include <exception>
10#include <fstream>
11#include <functional>
12#include <iterator>
13#include <memory>
14#include <sstream>
15#include <string>
16#include <vector>
17
18#include "common.hpp"
19#include "csv_exceptions.hpp"
20#include "data_type.hpp"
21#include "csv_format.hpp"
22#include "parser/mmap.hpp"
23#include "parser/scheduler.hpp"
24#include "parser/stream.hpp"
25
27namespace csv {
49 class CSVReader {
50 public:
67 class iterator {
68 public:
69 #ifndef DOXYGEN_SHOULD_SKIP_THIS
70 using value_type = CSVRow;
71 using difference_type = std::ptrdiff_t;
72 using pointer = CSVRow * ;
73 using reference = CSVRow & ;
74 using iterator_category = std::input_iterator_tag;
75 #endif
76
77 iterator() = default;
78 iterator(CSVReader* reader) : daddy(reader) {}
80
82 CONSTEXPR_14 reference operator*() { return this->row; }
83 CONSTEXPR_14 reference operator*() const { return const_cast<reference>(this->row); }
84
86 CONSTEXPR_14 pointer operator->() { return &(this->row); }
87 CONSTEXPR_14 pointer operator->() const { return const_cast<pointer>(&(this->row)); }
88
89 iterator& operator++();
90 iterator operator++(int);
95 CONSTEXPR bool operator==(const iterator& other) const noexcept {
96 return (this->daddy == other.daddy) && (this->i == other.i);
97 }
98
99 CONSTEXPR bool operator!=(const iterator& other) const noexcept { return !operator==(other); }
100 private:
101 CSVReader * daddy = nullptr; // Pointer to parent
102 CSVRow row; // Current row
103 size_t i = 0; // Index of current row
104 };
105
110
123 : _format(format),
124 read_scheduler_(format.is_threading_enabled()) {
125#if defined(__EMSCRIPTEN__)
126 this->owned_stream = std::unique_ptr<std::istream>(
127 new std::ifstream(std::string(filename), std::ios::binary)
128 );
129
130 if (!(*this->owned_stream)) {
131 internals::throw_cannot_open_file(filename);
132 }
133
134 this->init_from_stream(*this->owned_stream, format);
135#else
136 this->init_parser(std::unique_ptr<internals::parser::CSVParserDriverBase>(
137 new internals::parser::MmapParser(filename, format, this->col_names)
138 ));
139#endif
140 }
141
154 template<typename TStream,
155 csv::enable_if_t<std::is_base_of<std::istream, TStream>::value, int> = 0>
156 CSVReader(TStream &source, CSVFormat format = CSVFormat::guess_csv())
157 : _format(format),
158 read_scheduler_(format.is_threading_enabled()) {
159 this->init_from_stream(source, format);
160 }
161
167 CSVReader(std::unique_ptr<std::istream> source,
168 const CSVFormat& format = CSVFormat::guess_csv())
169 : _format(format),
170 owned_stream(std::move(source)),
171 read_scheduler_(format.is_threading_enabled()) {
172 if (!this->owned_stream) {
173 throw std::invalid_argument(internals::ERROR_READER_NULL_STREAM);
174 }
175
176 this->init_from_stream(*this->owned_stream, format);
177 }
179
180 CSVReader(const CSVReader&) = delete;
181 CSVReader& operator=(const CSVReader&) = delete;
182
191 CSVReader(CSVReader&& other) noexcept :
192 read_scheduler_(other._format.is_threading_enabled()) {
193 other.read_scheduler_.join();
194 this->move_state_from(other);
195 }
196
201 CSVReader& operator=(CSVReader&& other) noexcept {
202 if (this == &other) {
203 return *this;
204 }
205
206 this->read_scheduler_.join();
207 other.read_scheduler_.join();
208 this->move_state_from(other);
209
210 return *this;
211 }
212
213 ~CSVReader() {
214 this->read_scheduler_.join();
215 }
216
219
237 bool read_row(CSVRow &row);
238
260 bool read_chunk(std::vector<CSVRow>& out, size_t max_rows);
261 iterator begin();
262 CSV_CONST iterator end() const noexcept;
263
265 bool eof() const noexcept { return this->parser->eof(); }
267
270
275 CSVFormat get_format() const;
276
278 const std::vector<std::string>& get_col_names() const{
279 static const std::vector<std::string> empty_col_names;
280 return (this->col_names) ? this->col_names->get_col_names() : empty_col_names;
281 }
282
284 internals::ConstColNamesPtr col_names_ptr() const noexcept {
285 return this->col_names;
286 }
287
289 int index_of(csv::string_view col_name) const {
290 return this->col_names->index_of(col_name);
291 }
293
296
302 CONSTEXPR bool empty() const noexcept { return this->n_rows() == 0; }
303
305 CONSTEXPR size_t n_rows() const noexcept { return this->_n_rows; }
306
308 bool utf8_bom() const noexcept { return this->parser->utf8_bom(); }
309
311 internals::SpeculativeParseDiagnostics speculative_diagnostics() const noexcept {
312 return this->parser ? this->parser->speculative_diagnostics()
313 : internals::SpeculativeParseDiagnostics();
314 }
315
317 size_t parse_worker_count() const noexcept {
318 return this->parser ? this->parser->parse_worker_count() : 1;
319 }
321
322 protected:
331 void set_col_names(const std::vector<std::string>&);
332
335 CSVFormat _format;
337
340
341 internals::ColNamesPtr col_names = std::make_shared<internals::ColNames>();
342
344 std::unique_ptr<internals::parser::CSVParserDriverBase> parser = nullptr;
345
347 std::unique_ptr<RowCollection> records{new RowCollection(100)};
348
354 std::unique_ptr<std::istream> owned_stream = nullptr;
355
356 size_t n_cols = 0;
357 size_t _n_rows = 0;
364 bool read_csv(size_t bytes = internals::CSV_CHUNK_SIZE_DEFAULT);
366
369 private:
371 bool header_trimmed = false;
372
375 size_t _chunk_size = internals::CSV_CHUNK_SIZE_DEFAULT;
376 bool _read_requested = false;
377 internals::CSVReadScheduler read_scheduler_;
379
380 void move_state_from(CSVReader& other) noexcept {
381 this->_format = std::move(other._format);
382 this->col_names = std::move(other.col_names);
383 this->parser = std::move(other.parser);
384 this->records = std::move(other.records);
385 this->owned_stream = std::move(other.owned_stream);
386 this->n_cols = other.n_cols;
387 this->_n_rows = other._n_rows;
388 this->header_trimmed = other.header_trimmed;
389 this->_chunk_size = other._chunk_size;
390 this->_read_requested = other._read_requested;
391 this->read_scheduler_.set_threading_enabled(this->_format.is_threading_enabled());
392 this->read_scheduler_.adopt_exception(other.read_scheduler_.take_exception());
393 other.reset_after_move();
394 }
395
396 void reset_after_move() noexcept {
397 this->n_cols = 0;
398 this->_n_rows = 0;
399 this->header_trimmed = false;
400 this->_read_requested = false;
401 this->_chunk_size = internals::CSV_CHUNK_SIZE_DEFAULT;
402 }
403
409
412 void init_parser(std::unique_ptr<internals::parser::CSVParserDriverBase> parser);
413
414 template<typename TStream,
415 csv::enable_if_t<std::is_base_of<std::istream, TStream>::value, int> = 0>
416 void init_from_stream(TStream& source, CSVFormat format) {
417 this->init_parser(
418 std::unique_ptr<internals::parser::CSVParserDriverBase>(
419 new internals::parser::StreamParser<TStream>(source, format, this->col_names)
420 )
421 );
422 }
423
425 void initial_read() {
426 this->read_scheduler_.run([this] { this->read_csv(this->_chunk_size); });
427 this->read_scheduler_.join();
428 this->read_scheduler_.rethrow_exception_if_any();
429 }
430
431 void trim_header();
433
439
440 bool accept_row(CSVRow&& candidate, CSVRow* single_row, std::vector<CSVRow>* batch_rows);
441
448 bool check_for_rows();
449
455 void drain_rows_into_chunk(std::vector<CSVRow>& out, size_t max_rows);
457 };
458
459 #ifdef CSV_HAS_CXX20
460 static_assert(
461 internals::csv_write_rows_input_range<CSVReader>,
462 "CSVReader must remain compatible with csv::DelimWriter::write_rows()."
463 );
464 #endif
465}
Stores information about how to parse a CSV file.
static CSVFormat guess_csv()
CSVFormat preset for delimiter inference with header/n_cols inference enabled.
An input iterator capable of handling large files.
CONSTEXPR bool operator==(const iterator &other) const noexcept
Returns true if iterators were constructed from the same CSVReader and point to the same row.
iterator & operator++()
Pre-increment iterator.
CONSTEXPR_14 reference operator*()
Access the CSVRow held by the iterator.
CONSTEXPR_14 pointer operator->()
Return a pointer to the CSVRow the iterator has stopped at.
Main class for parsing CSVs from files and in-memory sources.
CONSTEXPR bool empty() const noexcept
Whether or not the file or stream contains valid CSV rows, not including the header.
bool read_chunk(std::vector< CSVRow > &out, size_t max_rows)
Read up to max_rows rows into a caller-owned batch buffer.
bool utf8_bom() const noexcept
Whether or not the CSV was prefixed with a UTF-8 BOM.
internals::SpeculativeParseDiagnostics speculative_diagnostics() const noexcept
Return speculative-parsing counters for filename-backed readers.
CSVFormat get_format() const
Return the resolved parsing format for this CSV source.
internals::ConstColNamesPtr col_names_ptr() const noexcept
Internal accessor for preserving resolved column-name lookup policy across helper types.
int index_of(csv::string_view col_name) const
Return the index of col_name, or csv::CSV_NOT_FOUND if absent.
size_t parse_worker_count() const noexcept
Return the number of parser worker threads used by the active parser.
const std::vector< std::string > & get_col_names() const
Return the active column names in CSV order.
CSVReader & operator=(const CSVReader &)=delete
Not copyable.
CSV_CONST iterator end() const noexcept
A placeholder for the imaginary past-the-end row in a CSV.
CONSTEXPR size_t n_rows() const noexcept
Retrieves the number of rows that have been read so far.
CSVReader(std::unique_ptr< std::istream > source, const CSVFormat &format=CSVFormat::guess_csv())
Construct CSVReader from an owned std::istream.
bool eof() const noexcept
Returns true if we have reached end of file.
CSVReader & operator=(CSVReader &&other) noexcept
Move assignment.
CSVReader(CSVReader &&other) noexcept
Move constructor.
bool read_row(CSVRow &row)
Retrieve the next CSV row, returning true while more rows are available.
CSVReader(TStream &source, CSVFormat format=CSVFormat::guess_csv())
Construct CSVReader from std::istream.
CSVReader(const CSVReader &)=delete
Not copyable.
iterator begin()
Return an iterator to the first row in the reader.
CSVReader(csv::string_view filename, const CSVFormat &format=CSVFormat::guess_csv())
Construct CSVReader from filename.
Data structure for representing CSV rows.
Definition csv_row.hpp:544
A standalone header file containing shared code.
#define CONSTEXPR
Expands to constexpr in decent compilers and inline otherwise.
Definition common.hpp:251
Shared exception message templates and throw helpers.
Defines an object used to store CSV format settings.
CSV scalar type classification adapter.
std::unique_ptr< std::istream > owned_stream
Optional owned stream used by two paths: 1) Emscripten filename-constructor fallback to stream parsin...
std::unique_ptr< RowCollection > records
Queue of parsed CSV rows.
size_t _n_rows
How many rows (minus header) have been read so far.
bool read_csv(size_t bytes=internals::CSV_CHUNK_SIZE_DEFAULT)
Read a chunk of CSV data.
internals::ColNamesPtr col_names
Pointer to an object containing column information.
void set_col_names(const std::vector< std::string > &)
Sets this reader's column names and associated data.
std::unique_ptr< internals::parser::CSVParserDriverBase > parser
Helper class which actually does the parsing.
size_t n_cols
The number of columns in this CSV.
The all-encompassing namespace.
std::string_view string_view
The string_view class used by this library.
Definition common.hpp:174