Vince's CSV Parser
Loading...
Searching...
No Matches
csv_format.hpp
Go to the documentation of this file.
1
5#pragma once
6#include <iterator>
7#include <stdexcept>
8#include <string>
9#include <vector>
10
11#include "common.hpp"
12#include "csv_exceptions.hpp"
13
14namespace csv {
namespace internals {
    // Forward declaration of the parser core template. CSVFormat befriends it
    // using this exact template parameter list.
    template<typename RowSink, typename ParsePolicy, typename FieldPolicy, typename RowPolicy>
    class CSVParserCore;

    namespace parser {
        // Forward declaration; CSVFormat befriends this class as well.
        class CSVParserDriverBase;
    }
}
22
23 class CSVReader;
24
/**
 * Determines how to handle rows that are shorter or longer than the majority.
 *
 * The numeric values are part of the contract: CSVFormat::variable_columns(bool)
 * converts its argument directly to this enum, so `false` maps to IGNORE_ROW (0)
 * and `true` maps to KEEP (1).
 */
enum class VariableColumnPolicy {
    THROW = -1,
    IGNORE_ROW = 0,
    KEEP = 1,
    KEEP_NON_EMPTY = 2
};
32
/** Determines how column name lookups are performed. */
enum class ColumnNamePolicy {
    EXACT = 0,             ///< Case-sensitive match (default).
    CASE_INSENSITIVE = 1   ///< Case-insensitive match.
};
38
/** Stores the inferred format of a CSV file. */
struct CSVGuessResult {
    char delim;       // NOTE(review): presumably the inferred delimiter -- confirm against the guesser
    int header_row;   // NOTE(review): presumably the inferred header row index -- confirm
    size_t n_cols;    // NOTE(review): presumably the inferred column count -- confirm
};
45
49 class CSVFormat {
50 public:
52 CSVFormat() = default;
53
62 CSVFormat& delimiter(char delim);
63
70 CSVFormat& delimiter(const std::vector<char> & delim);
71
76 CSVFormat& trim(const std::vector<char> & ws);
77
82 CSVFormat& quote(char quote);
83
88 CSVFormat& column_names(const std::vector<std::string>& names);
89
95 CSVFormat& header_row(int row);
96
103 this->header_row(-1);
104 return *this;
105 }
106
108 CSVFormat& quote(bool use_quote) {
109 this->no_quote = !use_quote;
110 return *this;
111 }
112
114 CONSTEXPR_14 CSVFormat& variable_columns(VariableColumnPolicy policy = VariableColumnPolicy::IGNORE_ROW) {
115 this->variable_column_policy = policy;
116 return *this;
117 }
118
120 CONSTEXPR_14 CSVFormat& variable_columns(bool policy) {
121 this->variable_column_policy = (VariableColumnPolicy)policy;
122 return *this;
123 }
124
132 this->_column_name_policy = policy;
133 return *this;
134 }
135
145 CSVFormat& chunk_size(size_t size);
146
156 CONSTEXPR_14 CSVFormat& threading(bool enabled = true) {
157 this->_threading = enabled;
158 return *this;
159 }
160
165 CONSTEXPR_14 CSVFormat& speculative_parallel_threads(size_t n_threads) {
166 this->_speculative_parallel_threads = n_threads;
167 return *this;
168 }
169
171 CONSTEXPR_14 CSVFormat& speculative_parallel_min_bytes(size_t bytes) {
172 this->_speculative_parallel_min_bytes = bytes;
173 return *this;
174 }
175
181 CONSTEXPR_14 CSVFormat& eager_field_classification(bool enabled = true) {
182 this->_eager_field_classification = enabled;
183 return *this;
184 }
185
186#ifndef DOXYGEN_SHOULD_SKIP_THIS
187 char get_delim() const {
188 // This error should never be received by end users.
189 if (this->possible_delimiters.size() > 1) {
190 throw std::runtime_error(internals::ERROR_MULTIPLE_DELIMITERS);
191 }
192
193 return this->possible_delimiters.at(0);
194 }
195
196 CONSTEXPR bool is_quoting_enabled() const { return !this->no_quote; }
197 CONSTEXPR char get_quote_char() const { return this->quote_char; }
198 CONSTEXPR int get_header() const { return this->header; }
199 std::vector<char> get_possible_delims() const { return this->possible_delimiters; }
200 std::vector<char> get_trim_chars() const { return this->trim_chars; }
201 const std::vector<std::string>& get_col_names() const { return this->col_names; }
202 CONSTEXPR VariableColumnPolicy get_variable_column_policy() const { return this->variable_column_policy; }
203 CONSTEXPR ColumnNamePolicy get_column_name_policy() const { return this->_column_name_policy; }
204 CONSTEXPR size_t get_chunk_size() const { return this->_chunk_size; }
205 CONSTEXPR bool is_threading_enabled() const {
206#if CSV_ENABLE_THREADS
207 return this->_threading;
208#else
209 return false;
210#endif
211 }
212 CONSTEXPR size_t get_speculative_parallel_threads() const { return this->_speculative_parallel_threads; }
213 CONSTEXPR size_t get_speculative_parallel_min_bytes() const { return this->_speculative_parallel_min_bytes; }
214 CONSTEXPR bool is_eager_field_classification_enabled() const { return this->_eager_field_classification; }
215 CONSTEXPR bool should_use_speculative_parallel(size_t source_size, size_t n_threads) const {
216#if CSV_ENABLE_THREADS
217 return this->_threading
218 && n_threads > 1
219 && source_size >= this->_speculative_parallel_min_bytes;
220#else
221 (void)source_size;
222 (void)n_threads;
223 return false;
224#endif
225 }
226#endif
227
230 CSVFormat format;
231 format.delimiter({ ',', '|', '\t', ';', '^' })
232 .quote('"');
233 // Assign header directly rather than via header_row() so that
234 // header_explicitly_set_ remains false — the guesser must be free
235 // to detect the real header row at construction time.
236 format.header = 0;
237
238 return format;
239 }
240
241 bool guess_delim() const {
242 return this->possible_delimiters.size() > 1;
243 }
244
245 friend CSVReader;
246 template<typename RowSink, typename ParsePolicy, typename FieldPolicy, typename RowPolicy>
247 friend class internals::CSVParserCore;
248 friend internals::parser::CSVParserDriverBase;
249
250 private:
252 void assert_no_char_overlap();
253
255 std::vector<char> possible_delimiters = { ',' };
256
258 std::vector<char> trim_chars = {};
259
261 int header = 0;
262
264 bool header_explicitly_set_ = false;
265
267 bool no_quote = false;
268
270 char quote_char = '"';
271
273 std::vector<std::string> col_names = {};
274
276 bool col_names_explicitly_set_ = false;
277
279 VariableColumnPolicy variable_column_policy = VariableColumnPolicy::IGNORE_ROW;
280
282 ColumnNamePolicy _column_name_policy = ColumnNamePolicy::EXACT;
283
285 size_t _chunk_size = internals::CSV_CHUNK_SIZE_DEFAULT;
286
288 bool _threading = true;
289
291 size_t _speculative_parallel_threads = 0;
292
294 size_t _speculative_parallel_min_bytes = internals::CSV_SPECULATIVE_PARALLEL_MIN_BYTES;
295
297 bool _eager_field_classification = false;
298 };
299}
Stores information about how to parse a CSV file.
CSVFormat & column_names(const std::vector< std::string > &names)
Sets the column names.
CONSTEXPR_14 CSVFormat & variable_columns(VariableColumnPolicy policy=VariableColumnPolicy::IGNORE_ROW)
Tells the parser how to handle rows whose column count differs from the others.
CONSTEXPR_14 CSVFormat & column_names_policy(ColumnNamePolicy policy)
Sets the column name lookup policy.
CSVFormat()=default
Settings for parsing a RFC 4180 CSV file.
static CSVFormat guess_csv()
CSVFormat preset for delimiter inference with header/n_cols inference enabled.
CSVFormat & chunk_size(size_t size)
Sets the chunk size used when reading the CSV.
CONSTEXPR_14 CSVFormat & eager_field_classification(bool enabled=true)
Enable parser-time scalar classification for typed consumers.
CSVFormat & trim(const std::vector< char > &ws)
Sets the whitespace characters to be trimmed.
CSVFormat & delimiter(char delim)
Sets the delimiter of the CSV file.
CONSTEXPR_14 CSVFormat & speculative_parallel_threads(size_t n_threads)
Set the worker count used by speculative parallel parsing.
CSVFormat & quote(bool use_quote)
Turn quoting on or off.
CONSTEXPR_14 CSVFormat & threading(bool enabled=true)
Enable or disable parser threading at runtime.
CONSTEXPR_14 CSVFormat & speculative_parallel_min_bytes(size_t bytes)
Set the minimum source size required for speculative parallel parsing.
CONSTEXPR_14 CSVFormat & variable_columns(bool policy)
Tells the parser how to handle rows whose column count differs from the others.
CSVFormat & no_header()
Tells the parser that this CSV has no header row.
CSVFormat & header_row(int row)
Sets the header row.
CSVFormat & quote(char quote)
Sets the quote character.
Main class for parsing CSVs from files and in-memory sources.
A standalone header file containing shared code.
#define CONSTEXPR
Expands to constexpr in decent compilers and inline otherwise.
Definition common.hpp:251
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition common.hpp:31
Shared exception message templates and throw helpers.
The all-encompassing namespace.
ColumnNamePolicy
Determines how column name lookups are performed.
@ CASE_INSENSITIVE
Case-insensitive match.
@ EXACT
Case-sensitive match (default).
VariableColumnPolicy
Determines how to handle rows that are shorter or longer than the majority.
std::vector< std::string > get_col_names(csv::string_view filename, const CSVFormat &format=CSVFormat::guess_csv())
Get the column names of a CSV file using just the first 500KB.
Stores the inferred format of a CSV file.