// Filename-based entry point (fragment): opens the file as a binary stream
// and delegates to the overload that reads from an already-open stream.
16 std::ifstream infile(std::string(filename), std::ios::binary);
17 if (!infile.is_open()) {
// An unopenable path is reported to the caller as std::runtime_error.
18 throw std::runtime_error(
"Cannot open file " + std::string(filename));
// The stream overload does the actual reading.
20 return get_csv_head_stream(infile);
// Memory-mapped head reader (fragment); compiled only when __EMSCRIPTEN__ is not defined.
23#if !defined(__EMSCRIPTEN__)
// Upper bound on how many bytes of the file are copied into the returned string.
25 const size_t bytes = 500000;
26 std::error_code error;
// Map the whole file; mio reports failure through `error`.
27 auto mmap = mio::make_mmap_source(std::string(filename), 0, mio::map_entire_file, error);
// NOTE(review): the guard for this throw is not visible in this excerpt —
// presumably it tests `error`; confirm against the full file.
29 throw std::runtime_error(
"Cannot open file " + std::string(filename));
31 const size_t file_size = mmap.size();
// Never copy more than the file actually contains.
32 const size_t length = std::min(file_size, bytes);
// Result pairs the copied prefix of the file with the total file size.
33 return { std::string(mmap.begin(), mmap.begin() + length), file_size };
// Head-reader dispatcher (fragment): Emscripten builds use the stream-based reader.
38#if defined(__EMSCRIPTEN__)
39 return get_csv_head_stream(filename);
// Alternative path: take only the string (`.first`) from the mmap reader's result.
// NOTE(review): the #else / #endif lines are not visible in this excerpt.
41 return get_csv_head_mmap(filename).first;
47#pragma region IBasicCSVParser
// Constructor tail (fragment): the supplied column-names handle is stored in col_names_.
51 const ColNamesPtr& col_names
52 ) : col_names_(col_names) {
// Format resolution (fragment): derives whitespace-trimming state, fills in
// format properties the caller left unspecified by inspecting the head of the
// input, then builds the parse-flag table and SIMD sentinels from the result.

// Build the whitespace flags from the configured trim characters.
56 ws_flags_ = internals::make_ws_flags(
57 source_format.trim_chars.data(), source_format.trim_chars.size()
// Trimming is active only when at least one trim character is configured.
59 has_ws_trimming_ = !source_format.trim_chars.empty();
// Sample of the input that guess_format() inspects below.
63 auto head = this->get_csv_head();
// Start from the caller's format; inferred values overwrite it below.
65 ResolvedFormat resolved;
66 resolved.format = source_format;
68 const bool infer_delimiter = source_format.guess_delim();
// Infer the header only when it was not explicitly set AND either the
// delimiter is being guessed or column names were not explicitly set.
69 const bool infer_header = !source_format.header_explicitly_set_
70 && (infer_delimiter || !source_format.col_names_explicitly_set_);
// Column count is inferred when the header index is negative and no column names were given.
71 const bool infer_n_cols = (source_format.get_header() < 0 && source_format.get_col_names().empty());
// guess_format() is run only if at least one property needs inference.
73 if (infer_delimiter || infer_header || infer_n_cols) {
74 auto guess_result = guess_format(head, source_format.get_possible_delims());
75 if (infer_delimiter) {
76 resolved.format.delimiter(guess_result.delim);
// NOTE(review): the conditions guarding the next two assignments are not
// visible in this excerpt — presumably infer_header and infer_n_cols; confirm.
81 resolved.format.header = guess_result.header_row;
84 resolved.n_cols = guess_result.n_cols;
// With quoting disabled, the flag table is built from the delimiter alone.
87 if (resolved.format.no_quote) {
88 parse_flags_ = internals::make_parse_flags(resolved.format.get_delim());
// Variant that also passes the quote character (presumably the else branch;
// the branch line itself is not visible here).
91 parse_flags_ = internals::make_parse_flags(resolved.format.get_delim(), resolved.format.quote_char);
// Quote value handed to SentinelVecs: the delimiter is reused when quoting is
// disabled, otherwise the configured quote character.
93 const char resolved_eff_quote = resolved.format.no_quote
94 ? resolved.format.get_delim()
95 : resolved.format.quote_char;
96 simd_sentinels_ = SentinelVecs(resolved.format.get_delim(), resolved_eff_quote);
// Publish the resolved format on the parser.
98 this->format = resolved;
105#pragma region IBasicCSVParser: Core Parse Loop
// End-of-input flush (fragment).
// empty_last_field is true when a non-empty chunk is loaded and its final
// character is flagged as a delimiter or a quote.
110 bool empty_last_field = this->data_ptr_
111 && this->data_ptr_->_data
112 && !this->data_ptr_->data.empty()
113 && (parse_flag(this->data_ptr_->data.back()) == ParseFlags::DELIMITER
114 || parse_flag(this->data_ptr_->data.back()) == ParseFlags::QUOTE);
// A pending field exists if characters were accumulated or the chunk ended on
// a delimiter/quote. NOTE(review): the bodies of both ifs below are not
// visible in this excerpt.
117 if (this->field_length_ > 0 || empty_last_field) {
122 if (this->current_row_.
size() > 0)
// Advances data_pos_ through the current chunk and records the start offset
// and length of the field being read. Declared noexcept.
126 CSV_INLINE void IBasicCSVParser::parse_field() noexcept {
128 auto& in = this->data_ptr_->data;
// First visit for this field: remember its offset relative to the current row start.
130 if (field_start_ == UNINITIALIZED_FIELD)
131 field_start_ = (int)(data_pos_ - current_row_start());
// SIMD path, compiled unless CSV_NO_SIMD is defined. The exact contract of
// find_next_non_special is defined elsewhere — presumably it skips ahead over
// ordinary bytes; confirm against its definition.
137#if !defined(CSV_NO_SIMD)
138 data_pos_ = find_next_non_special(in, data_pos_, this->
simd_sentinels_);
// Scalar loop: runs while the current character is flagged NOT_SPECIAL
// (the loop body is not visible in this excerpt).
144 while (data_pos_ < in.size() && compound_parse_flag(in[data_pos_]) == ParseFlags::NOT_SPECIAL)
// Length = current position minus the field's absolute start (row start + field offset).
147 field_length_ = data_pos_ - (field_start_ + current_row_start());
// Field push (fragment): appends a field record to the current chunk's field
// list, then resets per-field state.
156 fields_->emplace_back(
// A field whose start was never initialised is stored with offset 0.
157 field_start_ == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start_,
159 field_has_double_quote_
// One more field belongs to the row being built.
162 current_row_.row_length++;
// Clear per-field state so the next field starts clean.
165 field_has_double_quote_ =
false;
166 field_start_ = UNINITIALIZED_FIELD;
// Core parse loop (fragment): walks the loaded chunk one flagged character at
// a time and returns current_row_start() when it stops.
// NOTE(review): many case bodies are missing from this excerpt; comments
// below describe only the visible statements.

// Reset per-chunk state before scanning.
175 this->quote_escape_ =
false;
177 this->current_row_start() = 0;
// Skip a leading UTF-8 byte-order mark, if present.
178 this->trim_utf8_bom();
180 auto& in = this->data_ptr_->data;
181 while (this->data_pos_ < in.size()) {
// Dispatch on the compound parse flag of the current character.
182 switch (compound_parse_flag(in[this->data_pos_])) {
183 case ParseFlags::DELIMITER:
188 case ParseFlags::CARRIAGE_RETURN:
// Detect a carriage return immediately followed by a newline (CRLF).
190 if (this->data_pos_ + 1 < in.size() && parse_flag(in[this->data_pos_ + 1]) == ParseFlags::NEWLINE) {
// Carriage return deliberately shares the newline handling below.
194 FALLTHROUGH_TO_NEXT_CASE
197 case ParseFlags::NEWLINE:
// Something is pending for this row if characters were accumulated, a field
// start was recorded, or the row is already non-empty.
203 if (this->field_length_ > 0
204 || this->field_start_ != UNINITIALIZED_FIELD
205 || !this->current_row_.
empty()) {
// Begin a fresh row at the current position; its fields start at the current
// end of the field list.
211 this->current_row_ =
CSVRow(data_ptr_, this->data_pos_, fields_->size());
214 case ParseFlags::NOT_SPECIAL:
218 case ParseFlags::QUOTE_ESCAPE_QUOTE:
// A quote that is the last character of the chunk cannot be classified without
// the next character, so parsing stops and the row start is returned.
219 if (data_pos_ + 1 == in.size())
return this->current_row_start();
220 else if (data_pos_ + 1 < in.size()) {
221 auto next_ch = parse_flag(in[data_pos_ + 1]);
// Next character's flag is DELIMITER or ordered after it: leave quote-escape mode.
222 if (next_ch >= ParseFlags::DELIMITER) {
223 quote_escape_ =
false;
// Two quotes in a row: count both characters and remember that the field
// contains a doubled quote.
227 else if (next_ch == ParseFlags::QUOTE) {
230 this->field_length_ += 2;
231 this->field_has_double_quote_ =
true;
237 this->field_length_++;
// NOTE(review): the case label for the block below is not visible —
// presumably the plain QUOTE case; confirm.
// A quote seen before any field characters enters quote-escape mode.
243 if (this->field_length_ == 0) {
244 quote_escape_ =
true;
// Record the field start only if the current character is not a trim character.
246 if (field_start_ == UNINITIALIZED_FIELD && data_pos_ < in.size() && !ws_flag(in[data_pos_]))
247 field_start_ = (int)(data_pos_ - current_row_start());
252 this->field_length_++;
// Final result: start offset of the current row.
259 return this->current_row_start();
// Row push (fragment): the row's length is the number of fields appended
// since the row's first field index.
263 size_t row_len = fields_->size() - current_row_.fields_start;
265 current_row_.row_length = row_len;
// Hand the finished row to the records container by move.
266 this->records_->push_back(std::move(current_row_));
// Chunk reset (fragment): allocate a fresh RawCSVData for the next chunk.
270 this->data_ptr_ = std::make_shared<RawCSVData>();
// Copy the parser's whitespace flags, trimming switch and column names onto the new chunk.
272 this->data_ptr_->ws_flags = this->ws_flags_;
273 this->data_ptr_->has_ws_trimming = this->has_ws_trimming_;
274 this->data_ptr_->col_names = this->col_names_;
// Field records are appended to the new chunk's own field list.
275 this->fields_ = &(this->data_ptr_->fields);
// Skips a UTF-8 byte-order mark (bytes EF BB BF) at the start of the chunk.
278 CSV_INLINE void IBasicCSVParser::trim_utf8_bom() {
279 auto& data = this->data_ptr_->data;
// Only checked while the scan flag is unset and at least three bytes are available.
281 if (!this->unicode_bom_scan_ && data.size() >= 3) {
282 if (data[0] ==
'\xEF' && data[1] ==
'\xBB' && data[2] ==
'\xBF') {
// Move the read position past the three BOM bytes and record that a BOM was seen.
283 this->data_pos_ += 3;
284 this->utf8_bom_ =
true;
// Mark the BOM scan as done.
287 this->unicode_bom_scan_ =
true;
295#pragma region Specializations
297#if !defined(__EMSCRIPTEN__)
// Parses the chunk that was just loaded into data_ptr_ and repositions the
// mmap cursor for the next load.
//   length          - number of bytes in the loaded chunk
//   eof_on_no_chunk - not used in the lines visible here
298 CSV_INLINE void MmapParser::finalize_loaded_chunk(
size_t length,
bool eof_on_no_chunk) {
// Start a new row bound to this chunk, then parse it.
301 this->current_row_ = CSVRow(this->data_ptr_);
// parse() returns current_row_start(), i.e. where the current row begins.
302 size_t remainder = this->
parse();
// Move mmap_pos back by the bytes from `remainder` to the end of the chunk.
// NOTE(review): the condition guarding this rewind is not visible in this excerpt.
309 this->mmap_pos -= (length - remainder);
// Reset field tracking.
322 this->field_start_ = UNINITIALIZED_FIELD;
323 this->field_length_ = 0;
// Chunk loader (fragment): a non-empty head_ buffer is consumed as the chunk;
// the remaining lines load a memory-mapped window of the file.

329 if (!head_.empty()) {
// Move head_ into a shared string stored in the chunk's _data.
330 this->data_ptr_->_data = std::make_shared<std::string>(std::move(head_));
331 auto* head_ptr =
static_cast<std::string*
>(this->data_ptr_->_data.get());
332 const size_t length = head_ptr->size();
// Advance the file cursor by the size of the head buffer.
333 this->mmap_pos += length;
// Point the chunk's data at the string now held by _data.
335 this->data_ptr_->data = *head_ptr;
336 this->finalize_loaded_chunk(length);
// Mapped-window path: map at most `bytes` starting at the current cursor.
341 const size_t offset = this->mmap_pos;
345 const size_t length = std::min(remaining, bytes);
354 std::error_code error;
355 auto mmap = mio::make_mmap_source(this->_filename, offset, length, error);
// Failure message names the file, offset and length.
// NOTE(review): the `if (error)` guard is not visible in this excerpt.
357 std::string msg =
"Memory mapping failed during CSV parsing: file='" + this->_filename
358 +
"' offset=" + std::to_string(offset)
359 +
" length=" + std::to_string(length);
360 throw std::system_error(error, msg);
// Move the mapping into a shared object stored in the chunk's _data.
362 this->data_ptr_->_data = std::make_shared<mio::basic_mmap_source<char>>(std::move(mmap));
363 this->mmap_pos += length;
// NOTE(review): C-style cast; the head path above uses static_cast for the same purpose.
365 auto mmap_ptr = (mio::basic_mmap_source<char>*)(this->data_ptr_->_data.get());
// View the mapped bytes without copying.
368 this->data_ptr_->data =
csv::string_view(mmap_ptr->data(), mmap_ptr->length());
369 this->finalize_loaded_chunk(length,
true);