Vince's CSV Parser
Loading...
Searching...
No Matches
csv_reader.hpp
Go to the documentation of this file.
1
5#pragma once
6
7#include <algorithm>
8#include <deque>
9#include <exception>
10#include <fstream>
11#include <iterator>
12#include <memory>
13#include <mutex>
14#include <thread>
15#include <sstream>
16#include <string>
17#include <vector>
18
19#include "../external/mio.hpp"
20#include "basic_csv_parser.hpp"
21#include "common.hpp"
22#include "data_type.hpp"
23#include "csv_format.hpp"
24
26namespace csv {
// Internal helpers used when sniffing a CSV's format; only declarations are visible here.
28 namespace internals {
// Declared here, defined in csv_reader.cpp. `delim` defaults to ", ".
// NOTE(review): presumably joins the fields of `row` using `delim` — confirm against the definition.
29 std::string format_row(const std::vector<std::string>& row, csv::string_view delim = ", ");
30
// Return a CSV's column names.
// `head` is a view over CSV text (presumably the leading part of the data — TODO confirm);
// `format` defaults to CSVFormat::guess_csv(), the format used for guessing the delimiter.
31 std::vector<std::string> _get_col_names( csv::string_view head, const CSVFormat format = CSVFormat::guess_csv());
32
// Pairs a numeric score with a header index.
// NOTE(review): presumably an intermediate result of format guessing; it is not used
// anywhere in the visible code — confirm its role in the implementation file.
33 struct GuessScore {
34 double score;
35 size_t header;
36 };
37
39
// Guess the delimiter used by a delimiter-separated values file.
// `delims` lists the candidate delimiters; by default: comma, pipe, tab, semicolon,
// caret and tilde.
40 CSVGuessResult _guess_format(csv::string_view head, const std::vector<char>& delims = { ',', '|', '\t', ';', '^', '~' });
41 }
42
// Return a CSV's column names.
// `filename` identifies the CSV to inspect; `format` defaults to CSVFormat::guess_csv(),
// the format used for guessing the delimiter. Only the declaration is visible here.
43 std::vector<std::string> get_col_names(
44 csv::string_view filename,
45 const CSVFormat format = CSVFormat::guess_csv());
46
// Guess the delimiter used by a delimiter-separated values file.
// `delims` lists the candidate delimiters; by default: comma, pipe, tab, semicolon,
// caret and tilde. The result is a CSVGuessResult, which stores the inferred format.
67 CSVGuessResult guess_format(csv::string_view filename,
68 const std::vector<char>& delims = { ',', '|', '\t', ';', '^', '~' });
69
// Main class for parsing CSVs from files and in-memory sources.
// Copy construction, move construction and copy assignment are deleted below.
77 class CSVReader {
78 public:
// An input iterator capable of handling large files.
// Holds a pointer to its parent reader, the current row, and that row's index.
117 class iterator {
118 public:
// Standard iterator member types; the category is std::input_iterator_tag.
119 #ifndef DOXYGEN_SHOULD_SKIP_THIS
120 using value_type = CSVRow;
121 using difference_type = std::ptrdiff_t;
122 using pointer = CSVRow * ;
123 using reference = CSVRow & ;
124 using iterator_category = std::input_iterator_tag;
125 #endif
126
// Default construction leaves the parent pointer null and the index at 0.
127 iterator() = default;
// Bind the iterator to `reader`; the row stays default-constructed and the index stays 0.
128 iterator(CSVReader* reader) : daddy(reader) {}
130
// Access the CSVRow held by the iterator.
132 CONSTEXPR_14 reference operator*() { return this->row; }
// Const overload: casts away constness so a mutable reference is still returned.
133 CONSTEXPR_14 reference operator*() const { return const_cast<reference>(this->row); }
134
// Return a pointer to the CSVRow the iterator has stopped at.
136 CONSTEXPR_14 pointer operator->() { return &(this->row); }
// Const overload: casts away constness so a mutable pointer is still returned.
137 CONSTEXPR_14 pointer operator->() const { return const_cast<pointer>(&(this->row)); }
138
// Pre-increment iterator (defined outside this listing).
139 iterator& operator++();
// Post-increment iterator (defined outside this listing).
140 iterator operator++(int);
// Returns true if iterators were constructed from the same CSVReader and point to the same row.
145 CONSTEXPR bool operator==(const iterator& other) const noexcept {
146 return (this->daddy == other.daddy) && (this->i == other.i);
147 }
148
// Negation of operator==.
149 CONSTEXPR bool operator!=(const iterator& other) const noexcept { return !operator==(other); }
150 private:
151 CSVReader * daddy = nullptr; // Pointer to parent
152 CSVRow row; // Current row
153 size_t i = 0; // Index of current row
154 };
155
160
// Construct a reader for the CSV named by `filename` (defined outside this listing).
// `format` defaults to CSVFormat::guess_csv(), the format used for guessing the delimiter.
169 CSVReader(csv::string_view filename, CSVFormat format = CSVFormat::guess_csv());
170
// Construct CSVReader from std::istream.
// The enable_if_t default template argument restricts TStream to types derived from
// std::istream. `format` is taken by value: _format is first initialised from the
// caller's settings and, when delimiter guessing is enabled, overwritten with the
// adjusted copy below.
181 template<typename TStream,
182 csv::enable_if_t<std::is_base_of<std::istream, TStream>::value, int> = 0>
183 CSVReader(TStream &source, CSVFormat format = CSVFormat::guess_csv()) : _format(format) {
// NOTE(review): `head` is presumably the leading portion of the stream — confirm
// against internals::get_csv_head.
184 auto head = internals::get_csv_head(source);
// NOTE(review): `Parser`, used further down, is not declared anywhere in this listing
// (the numbering skips from 184 to 186) — confirm against the real header.
186
187 // Apply chunk size from format before any reading occurs
188 this->_chunk_size = format.get_chunk_size();
189
// When guessing is enabled, adopt the guessed delimiter and, unless the user asked
// for no header, the guessed header row; then store the adjusted format.
190 if (format.guess_delim()) {
191 auto guess_result = internals::_guess_format(head, format.possible_delimiters);
192 format.delimiter(guess_result.delim);
193 // Only override header if user hasn't explicitly called no_header()
194 // Note: column_names() also sets header=-1, but it populates col_names,
195 // so we can distinguish: no_header() means header=-1 && col_names.empty()
196 if (format.header != -1 || !format.col_names.empty()) {
197 format.header = guess_result.header_row;
198 }
199 this->_format = format;
200 }
201
// User-supplied column names are installed before the parser is created.
202 if (!format.col_names.empty())
203 this->set_col_names(format.col_names);
204
// The parser receives the stream, the (possibly adjusted) format and the
// col_names member.
205 this->parser = std::unique_ptr<Parser>(
206 new Parser(source, format, col_names)); // For C++11
// Read the first chunk before the constructor returns (see initial_read()).
207 this->initial_read();
208 }
210
// Not copyable.
211 CSVReader(const CSVReader&) = delete;
// Not movable: contains std::mutex.
212 CSVReader(CSVReader&&) = delete;
// Not copyable.
213 CSVReader& operator=(const CSVReader&) = delete;
// Join the worker thread if it is still joinable, so the std::thread member is
// never destroyed in a joinable state.
215 ~CSVReader() {
216 if (this->read_csv_worker.joinable()) {
217 this->read_csv_worker.join();
218 }
219 }
220
// Retrieve rows as CSVRow objects, returning true if more rows are available.
223 bool read_row(CSVRow &row);
// Return an iterator to the first row in the reader.
224 iterator begin();
// A placeholder for the imaginary past-the-end row in a CSV.
225 CSV_CONST iterator end() const noexcept;
226
// Returns true if we have reached end of file (delegates to the parser).
228 bool eof() const noexcept { return this->parser->eof(); }
230
// Return the format of the original raw CSV.
233 CSVFormat get_format() const;
// Return the CSV's column names as a vector of strings.
234 std::vector<std::string> get_col_names() const;
// Return the index of the column name if found or csv::CSV_NOT_FOUND otherwise.
235 int index_of(csv::string_view col_name) const;
237
240
// Whether or not the file or stream contains valid CSV rows, not including the header.
// Implemented as n_rows() == 0, so it reflects only the rows read so far.
246 CONSTEXPR bool empty() const noexcept { return this->n_rows() == 0; }
247
// Retrieves the number of rows that have been read so far.
249 CONSTEXPR size_t n_rows() const noexcept { return this->_n_rows; }
250
// Whether or not CSV was prefixed with a UTF-8 BOM (delegates to the parser).
252 bool utf8_bom() const noexcept { return this->parser->utf8_bom(); }
254
255 protected:
// Sets this reader's column names and associated data.
264 void set_col_names(const std::vector<std::string>&);
265
// Format settings: initialised from the constructor argument and, in the stream
// constructor, replaced by the adjusted copy after delimiter guessing.
268 CSVFormat _format;
270
273
// Pointer to an object containing column information.
274 internals::ColNamesPtr col_names = std::make_shared<internals::ColNames>();
275
// Helper class which actually does the parsing.
277 std::unique_ptr<internals::IBasicCSVParser> parser = nullptr;
278
// Queue of parsed CSV rows.
// NOTE(review): the meaning of the constructor argument 100 is not visible here — confirm.
280 std::unique_ptr<RowCollection> records{new RowCollection(100)};
281
// The number of columns in this CSV.
282 size_t n_cols = 0;
// How many rows (minus header) have been read so far.
283 size_t _n_rows = 0;
// Read a chunk of CSV data. `bytes` defaults to internals::ITERATION_CHUNK_SIZE,
// the chunk size for lazy-loading large CSV files.
287 bool read_csv(size_t bytes = internals::ITERATION_CHUNK_SIZE);
289
292 private:
// NOTE(review): presumably records whether trim_header() has already run — not
// read or written in the visible code; confirm in the implementation file.
294 bool header_trimmed = false;
295
// Thread on which read_csv() is run (see initial_read()); joined in the destructor.
298 std::thread read_csv_worker;
// Size passed to read_csv() by initial_read(); the stream constructor sets it
// from format.get_chunk_size().
299 size_t _chunk_size = internals::ITERATION_CHUNK_SIZE;
// NOTE(review): not referenced in the visible code — confirm its purpose.
300 bool _read_requested = false;
302
// Stored exception, and the mutex that the set/take helpers below lock around it.
// NOTE(review): presumably captured on the worker thread — confirm in read_csv().
304 std::exception_ptr read_csv_exception = nullptr;
305 std::mutex read_csv_exception_lock;
306
// Store `eptr` as the pending exception while holding the lock.
307 void set_read_csv_exception(std::exception_ptr eptr) {
308 std::lock_guard<std::mutex> lock(this->read_csv_exception_lock);
309 this->read_csv_exception = std::move(eptr);
310 }
311
// Return the pending exception (null if none) and clear it, both under the lock.
312 std::exception_ptr take_read_csv_exception() {
313 std::lock_guard<std::mutex> lock(this->read_csv_exception_lock);
314 auto eptr = this->read_csv_exception;
315 this->read_csv_exception = nullptr;
316 return eptr;
317 }
318
// Rethrow the pending exception on the calling thread, if there is one; taking it
// also clears it, so it is rethrown at most once.
319 void rethrow_read_csv_exception_if_any() {
320 if (auto eptr = this->take_read_csv_exception()) {
321 std::rethrow_exception(eptr);
322 }
323 }
324
// Run read_csv(_chunk_size) on the worker thread and join it immediately, so the
// first read has finished before this returns; then rethrow any stored exception.
326 void initial_read() {
327 this->read_csv_worker = std::thread(&CSVReader::read_csv, this, this->_chunk_size);
328 this->read_csv_worker.join();
329 this->rethrow_read_csv_exception_if_any();
330 }
331
// Defined outside this listing.
// NOTE(review): presumably discards rows up to and including the header — confirm.
332 void trim_header();
333 };
334}
Contains the main CSV parsing algorithm and various utility functions.
Stores information about how to parse a CSV file.
static CSVFormat guess_csv()
CSVFormat for guessing the delimiter.
An input iterator capable of handling large files.
CONSTEXPR bool operator==(const iterator &other) const noexcept
Returns true if iterators were constructed from the same CSVReader and point to the same row.
iterator & operator++()
Pre-increment iterator.
CONSTEXPR_14 reference operator*()
Access the CSVRow held by the iterator.
CONSTEXPR_14 pointer operator->()
Return a pointer to the CSVRow the iterator has stopped at.
Main class for parsing CSVs from files and in-memory sources.
CONSTEXPR bool empty() const noexcept
Whether or not the file or stream contains valid CSV rows, not including the header.
bool utf8_bom() const noexcept
Whether or not the CSV was prefixed with a UTF-8 BOM.
CSVFormat get_format() const
Return the format of the original raw CSV.
CSVReader & operator=(CSVReader &&)=delete
Not movable: contains std::mutex.
int index_of(csv::string_view col_name) const
Return the index of the column name if found or csv::CSV_NOT_FOUND otherwise.
CSVReader & operator=(const CSVReader &)=delete
Not copyable.
CSV_CONST iterator end() const noexcept
A placeholder for the imaginary past-the-end row in a CSV.
CONSTEXPR size_t n_rows() const noexcept
Retrieves the number of rows that have been read so far.
bool eof() const noexcept
Returns true if we have reached end of file.
bool read_row(CSVRow &row)
Retrieve rows as CSVRow objects, returning true if more rows are available.
std::vector< std::string > get_col_names() const
Return the CSV's column names as a vector of strings.
CSVReader(TStream &source, CSVFormat format=CSVFormat::guess_csv())
Construct CSVReader from std::istream.
CSVReader(CSVReader &&)=delete
Not movable: contains std::mutex.
CSVReader(const CSVReader &)=delete
Not copyable.
iterator begin()
Return an iterator to the first row in the reader.
Data structure for representing CSV rows.
Definition csv_row.hpp:280
A class for parsing CSV data from a std::stringstream or a std::ifstream.
A standalone header file containing shared code.
#define CONSTEXPR
Expands to constexpr in decent compilers and inline otherwise.
Definition common.hpp:149
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition common.hpp:26
Defines an object used to store CSV format settings.
Implements data type parsing functionality.
std::unique_ptr< RowCollection > records
Queue of parsed CSV rows.
size_t _n_rows
How many rows (minus header) have been read so far.
bool read_csv(size_t bytes=internals::ITERATION_CHUNK_SIZE)
Read a chunk of CSV data.
internals::ColNamesPtr col_names
Pointer to an object containing column information.
void set_col_names(const std::vector< std::string > &)
Sets this reader's column names and associated data.
std::unique_ptr< internals::IBasicCSVParser > parser
Helper class which actually does the parsing.
size_t n_cols
The number of columns in this CSV.
constexpr size_t ITERATION_CHUNK_SIZE
Chunk size for lazy-loading large CSV files.
Definition common.hpp:190
std::vector< std::string > _get_col_names(csv::string_view head, CSVFormat format)
Return a CSV's column names.
std::string format_row(const std::vector< std::string > &row, csv::string_view delim)
Definition csv_reader.cpp:9
CSVGuessResult _guess_format(csv::string_view head, const std::vector< char > &delims)
Guess the delimiter used by a delimiter-separated values file.
CSV_CONST CONSTEXPR_17 OutArray arrayToDefault(T &&value)
Helper constexpr function to initialize an array with all the elements set to value.
The all-encompassing namespace.
std::vector< std::string > get_col_names(csv::string_view filename, CSVFormat format)
Return a CSV's column names.
internals::ThreadSafeDeque< CSVRow > RowCollection
Standard type for storing collection of rows.
CSVGuessResult guess_format(csv::string_view filename, const std::vector< char > &delims)
Guess the delimiter used by a delimiter-separated values file.
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:99
Stores the inferred format of a CSV file.