10#include <unordered_map>
11#include <unordered_set>
14#if !defined(__EMSCRIPTEN__)
15#include "../external/mio.hpp"
18#include "col_names.hpp"
/** Sentinel assigned to field_start_ on construction and again by the stream
 *  parser's next() before each chunk is parsed.
 *
 *  `constexpr` already makes the object const, so no extra `const` is needed.
 */
constexpr int UNINITIALIZED_FIELD = -1;
30 template<
typename OutArray,
typename T =
typename OutArray::type>
41 size_t mode_row_length;
50 const std::vector<char>& delims = {
',',
'|',
'\t',
';',
'^',
'~' }
58 auto ret = arrayToDefault<ParseFlagMap>(ParseFlags::NOT_SPECIAL);
59 ret[delimiter +
CHAR_OFFSET] = ParseFlags::DELIMITER;
60 ret[
'\r' +
CHAR_OFFSET] = ParseFlags::CARRIAGE_RETURN;
75 inline char infer_char_for_flag(
76 const ParseFlagMap& parse_flags,
80 for (
size_t i = 0; i < parse_flags.size(); ++i) {
81 if (parse_flags[i] == target) {
82 return static_cast<char>(
static_cast<int>(i) -
CHAR_OFFSET);
89 inline char infer_delimiter(
const ParseFlagMap& parse_flags)
noexcept {
90 return infer_char_for_flag(parse_flags, ParseFlags::DELIMITER,
',');
95 inline char infer_quote_char(
const ParseFlagMap& parse_flags,
char fallback =
'"') noexcept {
96 return infer_char_for_flag(parse_flags, ParseFlags::QUOTE, fallback);
104 auto ret = arrayToDefault<WhitespaceMap>(
false);
105 for (
size_t j = 0; j < n_chars; j++) {
111 inline WhitespaceMap make_ws_flags(
const std::vector<char>& flags) {
112 return make_ws_flags(flags.data(), flags.size());
116 template<
typename TStream,
117 csv::enable_if_t<std::is_base_of<std::istream, TStream>::value,
int> = 0>
118 std::string get_csv_head_stream(TStream& source) {
119 const size_t limit = 500000;
120 std::string buf(limit,
'\0');
121 source.read(&buf[0], (std::streamsize)limit);
122 buf.resize(
static_cast<size_t>(source.gcount()));
126 #if !defined(__EMSCRIPTEN__)
162 const std::vector<char>& delims = {
',',
'|',
'\t',
';',
'^',
'~' }) {
163 auto head = internals::get_csv_head(filename);
164 return internals::guess_format(head, delims);
170 namespace internals {
185 const char d = internals::infer_delimiter(parse_flags);
187 has_ws_trimming_ = std::any_of(ws_flags.begin(), ws_flags.end(), [](
bool b) { return b; });
193 bool eof() {
return this->eof_; }
198 virtual void next(
size_t bytes) = 0;
203 CONSTEXPR_17
ParseFlags parse_flag(
const char ch)
const noexcept {
207 CONSTEXPR_17
ParseFlags compound_parse_flag(
const char ch)
const noexcept {
214 void set_output(
RowCollection& rows) { this->records_ = &rows; }
220 RawCSVDataPtr data_ptr_ =
nullptr;
221 ColNamesPtr col_names_ =
nullptr;
222 RawCSVFieldList* fields_ =
nullptr;
223 int field_start_ = UNINITIALIZED_FIELD;
224 size_t field_length_ = 0;
243 virtual std::string& get_csv_head() = 0;
254 void resolve_format_from_head(
const CSVFormat& format);
/** True when at least one entry of the whitespace flag table is set
 *  (computed with std::any_of over ws_flags).
 */
bool has_ws_trimming_ = false;

/** NOTE(review): presumably tracks whether the current field is quote
 *  escaped (cf. quote_escape_flag) -- confirm against parse().
 */
bool quote_escape_ = false;

/** Defaults to false. NOTE(review): presumably set when the current field
 *  contains a doubled quote -- confirm.
 */
bool field_has_double_quote_ = false;

/** Defaults to 0. NOTE(review): presumably the parse position within the
 *  current chunk -- confirm.
 */
size_t data_pos_ = 0;

/** Defaults to false. NOTE(review): presumably records that the BOM check
 *  has already run -- confirm against trim_utf8_bom().
 */
bool unicode_bom_scan_ = false;

/** Defaults to false. NOTE(review): presumably the value reported by
 *  utf8_bom() -- confirm.
 */
bool utf8_bom_ = false;
278 CONSTEXPR_17
bool ws_flag(
const char ch)
const noexcept {
282 size_t& current_row_start() {
283 return this->current_row_.data_start;
286 void parse_field() noexcept;
295 void trim_utf8_bom();
315 template<typename TStream>
322 const ColNamesPtr& col_names =
nullptr
325 this->resolve_format_from_head(format);
338 std::string& get_csv_head()
override {
339 leftover_ = get_csv_head_stream(this->source_);
340 return this->leftover_;
343 void next(
size_t bytes = CSV_CHUNK_SIZE_DEFAULT)
override {
344 if (this->
eof())
return;
347 this->field_start_ = UNINITIALIZED_FIELD;
348 this->field_length_ = 0;
350 this->data_ptr_->_data = std::make_shared<std::string>();
352 auto& chunk = *
static_cast<std::string*
>(this->data_ptr_->_data.get());
358 chunk = std::move(leftover_);
359 std::unique_ptr<char[]> buf(
new char[bytes]);
360 source_.read(buf.get(), (std::streamsize)bytes);
362 const size_t n =
static_cast<size_t>(source_.gcount());
364 if (n > 0) chunk.append(buf.get(), n);
370 throw std::runtime_error(
"StreamParser read failure");
374 this->data_ptr_->data = chunk;
377 this->current_row_ =
CSVRow(this->data_ptr_);
378 size_t remainder = this->
parse();
380 if (source_.eof() || chunk.empty()) {
387 leftover_ = chunk.substr(remainder);
394 std::string leftover_;
399#if !defined(__EMSCRIPTEN__)
419 const ColNamesPtr& col_names =
nullptr
421 this->_filename = filename.data();
422 auto head_and_size = get_csv_head_mmap(filename);
423 this->head_ = std::move(head_and_size.first);
424 this->source_size_ = head_and_size.second;
425 this->resolve_format_from_head(format);
430 std::string& get_csv_head()
override {
435 void next(
size_t bytes)
override;
438 void finalize_loaded_chunk(
size_t length,
bool eof_on_no_chunk =
false);
440 std::string _filename;
CSV_CONST CONSTEXPR_17 ParseFlagMap make_parse_flags(char delimiter)
Create an array v where, for each ASCII character code i, v[i + CHAR_OFFSET] holds the ParseFlags value for that character.
CSV_CONST CONSTEXPR_17 WhitespaceMap make_ws_flags(const char *ws_chars, size_t n_chars)
Create an array v where, for each ASCII character c, the entry for c is true if c is one of the given whitespace characters and false otherwise.
CSVGuessResult guess_format(csv::string_view head, const std::vector< char > &delims={ ',', '|', '\t', ';', '^', '~' })
Guess the delimiter used by a delimiter-separated values file.
CSV_CONST CONSTEXPR_17 OutArray arrayToDefault(T &&value)
Helper constexpr function to initialize an array with all the elements set to value.
SIMD-accelerated skip for runs of non-special CSV bytes.
Data structure for representing CSV rows.
Abstract base class which provides CSV parsing logic.
CONSTEXPR bool no_chunk() const
Whether or not source needs to be read in chunks.
SentinelVecs simd_sentinels_
Precomputed SIMD broadcast vectors for find_next_non_special.
size_t source_size_
The size of the incoming CSV.
void reset_data_ptr()
Create a new RawCSVDataPtr for a new chunk of data.
bool eof()
Whether or not we have reached the end of source.
void end_feed()
Indicate the last block of data has been parsed.
size_t parse()
Parse the current chunk of data and return the completed-row prefix length.
virtual void next(size_t bytes)=0
Parse the next block of data.
CONSTEXPR bool utf8_bom() const
Whether or not this CSV has a UTF-8 byte order mark.
ParseFlagMap parse_flags_
An array where the (i + 128)th slot gives the ParseFlags for ASCII character i.
Parser for memory-mapped files.
A class for parsing CSV data from any std::istream, including non-seekable sources such as pipes and ...
void next(size_t bytes=CSV_CHUNK_SIZE_DEFAULT) override
Parse the next block of data.
A std::deque wrapper which allows multiple read and write threads to concurrently access it along wit...
A standalone header file containing shared code.
std::array< ParseFlags, 256 > ParseFlagMap
An array which maps ASCII chars to a parsing flag.
std::array< bool, 256 > WhitespaceMap
An array which maps ASCII chars to a flag indicating if it is whitespace.
ParseFlags
An enum used for describing the significance of each character with respect to CSV parsing.
constexpr ParseFlags quote_escape_flag(ParseFlags flag, bool quote_escape) noexcept
Transform the ParseFlags given the context of whether or not the current field is quote escaped.
#define CONSTEXPR
Expands to constexpr in decent compilers and inline otherwise.
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
constexpr size_t CSV_CHUNK_SIZE_DEFAULT
Default chunk size for lazy-loading large CSV files.
Defines the data type used for storing information about a CSV row.
The all encompassing namespace.
CSVGuessResult guess_format(csv::string_view filename, const std::vector< char > &delims={ ',', '|', '\t', ';', '^', '~' })
Guess the delimiter, header row, and mode column count of a CSV file.
constexpr unsigned CHAR_OFFSET
Offset to convert char into array index.
nonstd::string_view string_view
The string_view class used by this library.
Shared contracts for row deque implementations.
Stores the inferred format of a CSV file.