// Open the file in binary mode and sample the stream position before and
// after seeking to the end of the stream.
// NOTE(review): the return statement is not visible here — presumably the
// size is derived from `end` and `start`; confirm.
8 std::ifstream infile(std::string(filename), std::ios::binary);
9 const auto start = infile.tellg();
// Move the read position to the end of the stream.
10 infile.seekg(0, std::ios::end);
11 const auto end = infile.tellg();
// Forward to the two-argument get_csv_head(), passing the result of
// get_file_size(filename) as the second argument.
17 return get_csv_head(filename, get_file_size(filename));
// Upper bound on the number of bytes mapped from the start of the file.
21 const size_t bytes = 500000;
23 std::error_code error;
// Map the smaller of the file size and the `bytes` cap.
24 size_t length = std::min((
size_t)file_size, bytes);
// Map `length` bytes starting at offset 0; `error` is handed to mio to
// receive the failure code.
25 auto mmap = mio::make_mmap_source(std::string(filename), 0, length, error);
// NOTE(review): the condition guarding this throw is not visible here —
// presumably a check on `error`; confirm.
28 throw std::runtime_error(
"Cannot open file " + std::string(filename));
// Copy the mapped byte range into an owning std::string.
31 return std::string(mmap.begin(), mmap.end());
35#pragma region IBasicCSVParser
// Constructor fragment: the member initializer stores the caller's
// `col_names` in `_col_names`.
39 const ColNamesPtr& col_names
40 ) : _col_names(col_names) {
// Branch on format.no_quote (branch body not visible here).
41 if (format.no_quote) {
// The trim characters are passed on as a pointer/length pair.
49 format.trim_chars.data(), format.trim_chars.size()
// empty_last_field is true only when data_ptr is set, data_ptr->_data is set,
// the data view is non-empty, and its last character is flagged either
// DELIMITER or QUOTE by parse_flag().
56 bool empty_last_field = this->data_ptr
57 && this->data_ptr->_data
58 && !this->data_ptr->data.empty()
59 && (parse_flag(this->data_ptr->data.back()) == ParseFlags::DELIMITER
60 || parse_flag(this->data_ptr->data.back()) == ParseFlags::QUOTE);
// Taken when a field is still in progress or the trailing-field condition
// above holds (body not visible here).
63 if (this->field_length > 0 || empty_last_field) {
// Taken when current_row.size() is non-zero (body not visible here).
68 if (this->current_row.size() > 0)
// Scans forward from data_pos over one run of NOT_SPECIAL characters, records
// the field's start and length, and appends an entry to `fields`.
72 CSV_INLINE void IBasicCSVParser::parse_field() noexcept {
74 auto& in = this->data_ptr->data;
// Advance past leading characters matched by ws_flag().
77 while (data_pos < in.size() && ws_flag(in[data_pos]))
// Record the field start as an offset from the current row start, unless a
// start has already been set.
80 if (field_start == UNINITIALIZED_FIELD)
81 field_start = (int)(data_pos - current_row_start());
// Advance while the compound flag of the current character is NOT_SPECIAL.
86 while (data_pos < in.size() && compound_parse_flag(in[data_pos]) == ParseFlags::NOT_SPECIAL)
// Length is the distance from the field's absolute start to data_pos.
89 field_length = data_pos - (field_start + current_row_start());
// Walk backwards from the last consumed character while ws_flag() matches and
// field_length is still positive (loop body not visible — presumably it
// shrinks field_length).
// NOTE(review): in[j] is read before the field_length > 0 check and j is an
// unsigned size_t; confirm data_pos cannot be 0 when this loop is reached.
93 for (
size_t j = data_pos - 1; ws_flag(in[j]) && this->field_length > 0; j--)
// Fields flagged as containing a doubled quote take this branch, which also
// resets the flag; other fields go through the second emplace_back. In both
// calls an unset field_start is passed as 0.
100 if (field_has_double_quote) {
101 fields->emplace_back(
102 field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start,
106 field_has_double_quote =
false;
110 fields->emplace_back(
111 field_start == UNINITIALIZED_FIELD ? 0 : (unsigned int)field_start,
// Count the new field on the current row and reset the start marker.
116 current_row.row_length++;
119 field_start = UNINITIALIZED_FIELD;
// Reset state for this pass: clear quote_escape, set the row-start offset to
// zero, and run the UTF-8 BOM check.
128 this->quote_escape =
false;
130 this->current_row_start() = 0;
131 this->trim_utf8_bom();
133 auto& in = this->data_ptr->data;
// Dispatch on the compound parse flag of each character until the end of the
// buffer is reached.
134 while (this->data_pos < in.size()) {
135 switch (compound_parse_flag(in[this->data_pos])) {
136 case ParseFlags::DELIMITER:
141 case ParseFlags::NEWLINE:
// Advance over consecutive characters flagged NEWLINE.
145 while (this->data_pos < in.size() && parse_flag(in[this->data_pos]) == ParseFlags::NEWLINE)
// Start a new row at data_pos whose fields begin at the current end of
// `fields`.
153 this->current_row =
CSVRow(data_ptr, this->data_pos, fields->size());
156 case ParseFlags::NOT_SPECIAL:
160 case ParseFlags::QUOTE_ESCAPE_QUOTE:
// The quote is the last character in the buffer, so the following character
// cannot be inspected: return the current row start.
161 if (data_pos + 1 == in.size())
return this->current_row_start();
162 else if (data_pos + 1 < in.size()) {
163 auto next_ch = parse_flag(in[data_pos + 1]);
// A next flag that compares >= DELIMITER clears quote_escape.
164 if (next_ch >= ParseFlags::DELIMITER) {
165 quote_escape =
false;
// Two quotes in a row: count both characters and mark the field as
// containing a doubled quote.
169 else if (next_ch == ParseFlags::QUOTE) {
172 this->field_length += 2;
173 this->field_has_double_quote =
true;
179 this->field_length++;
185 if (this->field_length == 0) {
// Set the field start only if it is unset and the current character is not
// matched by ws_flag().
188 if (field_start == UNINITIALIZED_FIELD && data_pos < in.size() && !ws_flag(in[data_pos]))
189 field_start = (int)(data_pos - current_row_start());
194 this->field_length++;
// Report the offset at which the current row starts.
201 return this->current_row_start();
// Row length is the number of entries added to `fields` since the index
// stored in current_row.fields_start.
205 size_t row_len = fields->size() - current_row.fields_start;
207 current_row.row_length = row_len;
// Move the finished row into _records.
208 this->_records->push_back(std::move(current_row));
// Replace data_ptr with a newly allocated RawCSVData and copy this parser's
// parse flags and column names onto it.
212 this->data_ptr = std::make_shared<RawCSVData>();
213 this->data_ptr->parse_flags = this->_parse_flags;
214 this->data_ptr->col_names = this->_col_names;
// Point `fields` at the new buffer's field container.
215 this->fields = &(this->data_ptr->fields);
// Checks whether the buffer begins with the three bytes EF BB BF (the UTF-8
// byte-order mark) and records the result in _utf8_bom.
218 CSV_INLINE void IBasicCSVParser::trim_utf8_bom() {
219 auto& data = this->data_ptr->data;
// The check runs only while unicode_bom_scan is false and at least three
// bytes are available.
221 if (!this->unicode_bom_scan && data.size() >= 3) {
222 if (data[0] ==
'\xEF' && data[1] ==
'\xBB' && data[2] ==
'\xBF') {
224 this->_utf8_bom =
true;
// Setting unicode_bom_scan makes the guard above false on later calls.
227 this->unicode_bom_scan =
true;
235#pragma region Specializations
// Maps the next window of the file (at most `bytes` bytes, starting at
// mmap_pos), runs parse() over it, and adjusts mmap_pos.
237 CSV_INLINE void MmapParser::next(
size_t bytes = ITERATION_CHUNK_SIZE) {
// Reset per-field state and allocate a fresh data buffer.
247 this->field_start = UNINITIALIZED_FIELD;
248 this->field_length = 0;
249 this->reset_data_ptr();
// Clamp the mapping length to what remains of the source past `offset`.
252 const size_t offset = this->mmap_pos;
253 const size_t remaining = (offset < this->source_size)
254 ? (this->source_size - offset)
256 const size_t length = std::min(remaining, bytes);
265 std::error_code error;
266 auto mmap = mio::make_mmap_source(this->_filename, offset, length, error);
// Failure path: the message carries the file name, offset and length, and is
// thrown as std::system_error together with the error code.
// NOTE(review): the condition guarding this throw is not visible here —
// presumably a check on `error`; confirm.
268 std::string msg =
"Memory mapping failed during CSV parsing: file='" + this->_filename
269 +
"' offset=" + std::to_string(offset)
270 +
" length=" + std::to_string(length);
271 throw std::system_error(error, msg);
// Move the mapping into a shared_ptr stored on data_ptr, then advance
// mmap_pos past the mapped window.
273 this->data_ptr->_data = std::make_shared<mio::basic_mmap_source<char>>(std::move(mmap));
274 this->mmap_pos += length;
276 auto mmap_ptr = (mio::basic_mmap_source<char>*)(this->data_ptr->_data.get());
// Expose the mapped bytes as a non-owning string view.
279 this->data_ptr->data =
csv::string_view(mmap_ptr->data(), mmap_ptr->length());
// Start a fresh row on the new buffer and parse it; `remainder` is the value
// parse() returns.
282 this->current_row =
CSVRow(this->data_ptr);
283 size_t remainder = this->
parse();
// Taken when the whole source has been consumed or no_chunk() is true (body
// not visible here).
285 if (this->mmap_pos == this->source_size || no_chunk()) {
// Moves mmap_pos back to offset + remainder.
// NOTE(review): whether this sits in an else branch is not visible; confirm.
290 this->mmap_pos -= (length - remainder);