Vince's CSV Parser
Loading...
Searching...
No Matches
data_frame.hpp
1#pragma once
2
3#include <algorithm>
4#include <functional>
5#include <iterator>
6#include <memory>
7#include <stdexcept>
8#include <string>
9#include <type_traits>
10#include <unordered_map>
11#include <utility>
12#include <vector>
13
14#include "csv_reader.hpp"
15
16namespace csv {
17 namespace internals {
18 template<typename T>
20 private:
21 template<typename U>
22 static auto test(int) -> decltype(
23 std::hash<U>{}(std::declval<const U&>()),
24 std::true_type{}
25 );
26
27 template<typename>
28 static std::false_type test(...);
29
30 public:
31 static constexpr bool value = decltype(test<T>(0))::value;
32 };
33
34 template<typename T>
36 private:
37 template<typename U>
38 static auto test(int) -> decltype(
39 std::declval<const U&>() == std::declval<const U&>(),
40 std::true_type{}
41 );
42
43 template<typename>
44 static std::false_type test(...);
45
46 public:
47 static constexpr bool value = decltype(test<T>(0))::value;
48 };
49 }
50
53 public:
54 DataFrameOptions() = default;
55
/** Policy for handling duplicate keys when creating a keyed DataFrame. */
enum class DuplicateKeyPolicy {
    /** Throw an error if a duplicate key is encountered. */
    THROW,
    /** Overwrite the existing value with the new value. */
    OVERWRITE,
    /** Ignore the new value and keep the existing value. */
    KEEP_FIRST
};
62
63 DataFrameOptions& set_duplicate_key_policy(DuplicateKeyPolicy value) {
64 this->duplicate_key_policy = value;
65 return *this;
66 }
67
68 DuplicateKeyPolicy get_duplicate_key_policy() const {
69 return this->duplicate_key_policy;
70 }
71
72 DataFrameOptions& set_key_column(const std::string& value) {
73 this->key_column = value;
74 return *this;
75 }
76
77 const std::string& get_key_column() const {
78 return this->key_column;
79 }
80
81 DataFrameOptions& set_throw_on_missing_key(bool value) {
82 this->throw_on_missing_key = value;
83 return *this;
84 }
85
86 bool get_throw_on_missing_key() const {
87 return this->throw_on_missing_key;
88 }
89
90 private:
91 std::string key_column;
92
93 DuplicateKeyPolicy duplicate_key_policy = DuplicateKeyPolicy::OVERWRITE;
94
96 bool throw_on_missing_key = true;
97 };
98
103 template<typename KeyType>
105 public:
107 DataFrameRow() : row(nullptr), row_edits(nullptr), key_ptr(nullptr) {}
108
111 const CSVRow* _row,
112 const std::unordered_map<size_t, std::string>* _edits,
113 const KeyType* _key
114 ) : row(_row), row_edits(_edits), key_ptr(_key) {}
115
117 CSVField operator[](const std::string& col) const {
118 // Find column index by searching column names
119 auto col_names = row->get_col_names();
120 auto it = std::find(col_names.begin(), col_names.end(), col);
121 if (it == col_names.end()) {
122 // Column not found, let row handle the error
123 return (*row)[col];
124 }
125
126 size_t col_index = std::distance(col_names.begin(), it);
127
128 if (row_edits) {
129 auto edit_it = row_edits->find(col_index);
130 if (edit_it != row_edits->end()) {
131 return CSVField(csv::string_view(edit_it->second));
132 }
133 }
134 return (*row)[col];
135 }
136
138 CSVField operator[](size_t n) const {
139 return (*row)[n];
140 }
141
143 size_t size() const { return row->size(); }
144
146 bool empty() const { return row->empty(); }
147
149 std::vector<std::string> get_col_names() const { return row->get_col_names(); }
150
152 const CSVRow& get_underlying_row() const { return *row; }
153
155 const KeyType& get_key() const { return *key_ptr; }
156
158 operator std::vector<std::string>() const {
159 std::vector<std::string> result;
160 result.reserve(row->size());
161
162 for (size_t i = 0; i < row->size(); i++) {
163 // Check if this column has an edit
164 if (row_edits) {
165 auto it = row_edits->find(i);
166 if (it != row_edits->end()) {
167 result.push_back(it->second);
168 continue;
169 }
170 }
171 // Use original value
172 result.push_back((*row)[i].get<std::string>());
173 }
174 return result;
175 }
176
178 std::string to_json(const std::vector<std::string>& subset = {}) const {
179 return row->to_json(subset);
180 }
181
183 std::string to_json_array(const std::vector<std::string>& subset = {}) const {
184 return row->to_json_array(subset);
185 }
186
187 #ifdef CSV_HAS_CXX20
191 auto to_sv_range() const {
192 return std::views::iota(size_t{0}, this->size())
193 | std::views::transform([this](size_t i) {
194 // Check if this column has an edit
195 if (row_edits) {
196 auto it = row_edits->find(i);
197 if (it != row_edits->end()) {
198 return csv::string_view(it->second);
199 }
200 }
201 // Use original value
202 return (*row)[i].template get<csv::string_view>();
203 });
204 }
205 #endif
206
207 private:
208 const CSVRow* row;
209 const std::unordered_map<size_t, std::string>* row_edits;
210 const KeyType* key_ptr;
211 };
212
213 template<typename KeyType = std::string>
214 class DataFrame {
215 public:
217 using row_entry = std::pair<KeyType, CSVRow>;
218
220 class iterator {
221 public:
223 using difference_type = std::ptrdiff_t;
224 using pointer = const DataFrameRow<KeyType>*;
225 using reference = const DataFrameRow<KeyType>&;
226 using iterator_category = std::random_access_iterator_tag;
227
228 iterator() = default;
229 iterator(
230 typename std::vector<row_entry>::iterator it,
231 const std::unordered_map<KeyType, std::unordered_map<size_t, std::string>>* edits
232 ) : iter(it), edits_map(edits) {}
233
234 reference operator*() const {
235 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
236 if (edits_map) {
237 auto it = edits_map->find(iter->first);
238 if (it != edits_map->end()) {
239 row_edits = &it->second;
240 }
241 }
242 cached_row = DataFrameRow<KeyType>(&iter->second, row_edits, &iter->first);
243 return cached_row;
244 }
245
246 pointer operator->() const {
247 // Ensure cached_row is populated
248 operator*();
249 return &cached_row;
250 }
251
252 iterator& operator++() { ++iter; return *this; }
253 iterator operator++(int) { auto tmp = *this; ++iter; return tmp; }
254 iterator& operator--() { --iter; return *this; }
255 iterator operator--(int) { auto tmp = *this; --iter; return tmp; }
256
257 iterator operator+(difference_type n) const { return iterator(iter + n, edits_map); }
258 iterator operator-(difference_type n) const { return iterator(iter - n, edits_map); }
259 difference_type operator-(const iterator& other) const { return iter - other.iter; }
260
261 bool operator==(const iterator& other) const { return iter == other.iter; }
262 bool operator!=(const iterator& other) const { return iter != other.iter; }
263
264 private:
265 typename std::vector<row_entry>::iterator iter;
266 const std::unordered_map<KeyType, std::unordered_map<size_t, std::string>>* edits_map = nullptr;
267 mutable DataFrameRow<KeyType> cached_row;
268 };
269
272 public:
274 using difference_type = std::ptrdiff_t;
275 using pointer = const DataFrameRow<KeyType>*;
276 using reference = const DataFrameRow<KeyType>&;
277 using iterator_category = std::random_access_iterator_tag;
278
279 const_iterator() = default;
281 typename std::vector<row_entry>::const_iterator it,
282 const std::unordered_map<KeyType, std::unordered_map<size_t, std::string>>* edits
283 ) : iter(it), edits_map(edits) {}
284
285 reference operator*() const {
286 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
287 if (edits_map) {
288 auto it = edits_map->find(iter->first);
289 if (it != edits_map->end()) {
290 row_edits = &it->second;
291 }
292 }
293 cached_row = DataFrameRow<KeyType>(&iter->second, row_edits, &iter->first);
294 return cached_row;
295 }
296
297 pointer operator->() const {
298 // Ensure cached_row is populated
299 operator*();
300 return &cached_row;
301 }
302
303 const_iterator& operator++() { ++iter; return *this; }
304 const_iterator operator++(int) { auto tmp = *this; ++iter; return tmp; }
305 const_iterator& operator--() { --iter; return *this; }
306 const_iterator operator--(int) { auto tmp = *this; --iter; return tmp; }
307
308 const_iterator operator+(difference_type n) const { return const_iterator(iter + n, edits_map); }
309 const_iterator operator-(difference_type n) const { return const_iterator(iter - n, edits_map); }
310 difference_type operator-(const const_iterator& other) const { return iter - other.iter; }
311
312 bool operator==(const const_iterator& other) const { return iter == other.iter; }
313 bool operator!=(const const_iterator& other) const { return iter != other.iter; }
314
315 private:
316 typename std::vector<row_entry>::const_iterator iter;
317 const std::unordered_map<KeyType, std::unordered_map<size_t, std::string>>* edits_map = nullptr;
318 mutable DataFrameRow<KeyType> cached_row;
319 };
320
321 static_assert(
323 "DataFrame<KeyType> requires KeyType to be hashable (std::hash<KeyType> specialization required)."
324 );
325
326 static_assert(
328 "DataFrame<KeyType> requires KeyType to be equality comparable (operator== required)."
329 );
330
331 static_assert(
332 std::is_default_constructible<KeyType>::value,
333 "DataFrame<KeyType> requires KeyType to be default-constructible."
334 );
335
337
339 DataFrame() = default;
340
345 explicit DataFrame(CSVReader& reader) {
346 this->init_unkeyed_from_reader(reader);
347 }
348
353 explicit DataFrame(CSVReader& reader, const DataFrameOptions& options) {
354 this->init_from_reader(reader, options);
355 }
356
362 csv::string_view filename,
363 const DataFrameOptions& options,
365 ) {
366 CSVReader reader(filename, format);
367 this->init_from_reader(reader, options);
368 }
369
375 CSVReader& reader,
376 const std::string& _key_column,
377 DuplicateKeyPolicy policy = DuplicateKeyPolicy::OVERWRITE,
378 bool throw_on_missing_key = true
379 ) : DataFrame(
380 reader,
382 .set_key_column(_key_column)
383 .set_duplicate_key_policy(policy)
384 .set_throw_on_missing_key(throw_on_missing_key)
385 ) {}
386
391 template<
392 typename KeyFunc,
393 typename ResultType = invoke_result_t<KeyFunc, const CSVRow&>,
394 csv::enable_if_t<std::is_convertible<ResultType, KeyType>::value, int> = 0
395 >
397 CSVReader& reader,
398 KeyFunc key_func,
399 DuplicateKeyPolicy policy = DuplicateKeyPolicy::OVERWRITE
400 ) : col_names_(reader.get_col_names()) {
401 this->is_keyed = true;
402 this->build_from_key_function(reader, key_func, policy);
403 }
404
406 template<
407 typename KeyFunc,
408 typename ResultType = invoke_result_t<KeyFunc, const CSVRow&>,
409 csv::enable_if_t<std::is_convertible<ResultType, KeyType>::value, int> = 0
410 >
412 CSVReader& reader,
413 KeyFunc key_func,
414 const DataFrameOptions& options
415 ) : DataFrame(reader, key_func, options.get_duplicate_key_policy()) {}
416
418 size_t size() const noexcept {
419 return rows.size();
420 }
421
423 bool empty() const noexcept {
424 return rows.empty();
425 }
426
428 size_t n_rows() const noexcept { return rows.size(); }
429
431 size_t n_cols() const noexcept { return col_names_.size(); }
432
434 bool has_column(const std::string& name) const {
435 return std::find(col_names_.begin(), col_names_.end(), name) != col_names_.end();
436 }
437
439 int index_of(const std::string& name) const {
440 auto it = std::find(col_names_.begin(), col_names_.end(), name);
441 if (it == col_names_.end())
442 return CSV_NOT_FOUND;
443 return static_cast<int>(std::distance(col_names_.begin(), it));
444 }
445
447 const std::vector<std::string>& columns() const noexcept {
448 return col_names_;
449 }
450
452 const std::string& key_name() const noexcept {
453 return key_column;
454 }
455
465 template<typename K = KeyType,
466 csv::enable_if_t<!std::is_integral<K>::value, int> = 0>
468 static_assert(std::is_same<K, KeyType>::value,
469 "Do not explicitly instantiate this template. Use iloc() for positional access.");
470 return this->iloc(i);
471 }
472
475 template<typename K = KeyType,
476 csv::enable_if_t<!std::is_integral<K>::value, int> = 0>
478 static_assert(std::is_same<K, KeyType>::value,
479 "Do not explicitly instantiate this template. Use iloc() for positional access.");
480 return this->iloc(i);
481 }
482
489 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
490 if (is_keyed) {
491 auto it = edits.find(rows.at(i).first);
492 if (it != edits.end()) row_edits = &it->second;
493 }
494 return DataFrameRow<KeyType>(&rows.at(i).second, row_edits, &rows.at(i).first);
495 }
496
498 DataFrameRow<KeyType> at(size_t i) const {
499 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
500 if (is_keyed) {
501 auto it = edits.find(rows.at(i).first);
502 if (it != edits.end()) row_edits = &it->second;
503 }
504 return DataFrameRow<KeyType>(&rows.at(i).second, row_edits, &rows.at(i).first);
505 }
506
513 DataFrameRow<KeyType> operator[](const KeyType& key) {
514 this->require_keyed_frame();
515 auto position = this->position_of(key);
516 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
517 auto it = edits.find(key);
518 if (it != edits.end()) row_edits = &it->second;
519 return DataFrameRow<KeyType>(&rows[position].second, row_edits, &rows[position].first);
520 }
521
523 DataFrameRow<KeyType> operator[](const KeyType& key) const {
524 this->require_keyed_frame();
525 auto position = this->position_of(key);
526 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
527 auto it = edits.find(key);
528 if (it != edits.end()) row_edits = &it->second;
529 return DataFrameRow<KeyType>(&rows[position].second, row_edits, &rows[position].first);
530 }
531
538 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
539 if (is_keyed) {
540 auto it = edits.find(rows.at(i).first);
541 if (it != edits.end()) row_edits = &it->second;
542 }
543 return DataFrameRow<KeyType>(&rows.at(i).second, row_edits, &rows.at(i).first);
544 }
545
547 DataFrameRow<KeyType> iloc(size_t i) const {
548 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
549 if (is_keyed) {
550 auto it = edits.find(rows.at(i).first);
551 if (it != edits.end()) row_edits = &it->second;
552 }
553 return DataFrameRow<KeyType>(&rows.at(i).second, row_edits, &rows.at(i).first);
554 }
555
557 bool try_get(size_t i, DataFrameRow<KeyType>& out) {
558 if (i >= rows.size()) {
559 return false;
560 }
561 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
562 if (is_keyed) {
563 auto it = edits.find(rows[i].first);
564 if (it != edits.end()) row_edits = &it->second;
565 }
566 out = DataFrameRow<KeyType>(&rows[i].second, row_edits, &rows[i].first);
567 return true;
568 }
569
571 bool try_get(size_t i, DataFrameRow<KeyType>& out) const {
572 if (i >= rows.size()) {
573 return false;
574 }
575 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
576 if (is_keyed) {
577 auto it = edits.find(rows[i].first);
578 if (it != edits.end()) row_edits = &it->second;
579 }
580 out = DataFrameRow<KeyType>(&rows[i].second, row_edits, &rows[i].first);
581 return true;
582 }
583
590 const KeyType& key_at(size_t i) const {
591 this->require_keyed_frame();
592 return rows.at(i).first;
593 }
594
600 bool contains(const KeyType& key) const {
601 this->require_keyed_frame();
602 this->ensure_key_index();
603 return key_index->find(key) != key_index->end();
604 }
605
612 DataFrameRow<KeyType> at(const KeyType& key) {
613 this->require_keyed_frame();
614 auto position = this->position_of(key);
615 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
616 auto it = edits.find(key);
617 if (it != edits.end()) row_edits = &it->second;
618 return DataFrameRow<KeyType>(&rows.at(position).second, row_edits, &rows.at(position).first);
619 }
620
622 DataFrameRow<KeyType> at(const KeyType& key) const {
623 this->require_keyed_frame();
624 auto position = this->position_of(key);
625 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
626 auto it = edits.find(key);
627 if (it != edits.end()) row_edits = &it->second;
628 return DataFrameRow<KeyType>(&rows.at(position).second, row_edits, &rows.at(position).first);
629 }
630
636 bool try_get(const KeyType& key, DataFrameRow<KeyType>& out) {
637 this->require_keyed_frame();
638 this->ensure_key_index();
639 auto it = key_index->find(key);
640 if (it == key_index->end()) {
641 return false;
642 }
643 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
644 auto edit_it = edits.find(key);
645 if (edit_it != edits.end()) row_edits = &edit_it->second;
646 out = DataFrameRow<KeyType>(&rows[it->second].second, row_edits, &rows[it->second].first);
647 return true;
648 }
649
651 bool try_get(const KeyType& key, DataFrameRow<KeyType>& out) const {
652 this->require_keyed_frame();
653 this->ensure_key_index();
654 auto it = key_index->find(key);
655 if (it == key_index->end()) {
656 return false;
657 }
658 const std::unordered_map<size_t, std::string>* row_edits = nullptr;
659 auto edit_it = edits.find(key);
660 if (edit_it != edits.end()) row_edits = &edit_it->second;
661 out = DataFrameRow<KeyType>(&rows[it->second].second, row_edits, &rows[it->second].first);
662 return true;
663 }
664
671 std::string get(const KeyType& key, const std::string& column) const {
672 this->require_keyed_frame();
673
674 auto col_names = (*this)[key].get_col_names();
675 auto col_it = std::find(col_names.begin(), col_names.end(), column);
676 if (col_it == col_names.end()) {
677 throw std::out_of_range("Column '" + column + "' not found");
678 }
679 size_t col_idx = std::distance(col_names.begin(), col_it);
680
681 auto row_edits = this->edits.find(key);
682 if (row_edits != this->edits.end()) {
683 auto value = row_edits->second.find(col_idx);
684 if (value != row_edits->second.end()) {
685 return value->second;
686 }
687 }
688
689 return (*this)[key][column].template get<std::string>();
690 }
691
698 void set(const KeyType& key, const std::string& column, const std::string& value) {
699 this->require_keyed_frame();
700 size_t row_idx = this->position_of(key);
701
702 // Find column index
703 auto col_names = rows[row_idx].second.get_col_names();
704 auto it = std::find(col_names.begin(), col_names.end(), column);
705 if (it == col_names.end()) {
706 throw std::out_of_range("Column '" + column + "' not found");
707 }
708 size_t col_idx = std::distance(col_names.begin(), it);
709
710 edits[key][col_idx] = value;
711 }
712
718 bool erase_row(const KeyType& key) {
719 this->require_keyed_frame();
720 this->ensure_key_index();
721
722 auto it = key_index->find(key);
723 if (it == key_index->end()) {
724 return false;
725 }
726
727 rows.erase(rows.begin() + it->second);
728 edits.erase(key);
729 this->invalidate_key_index();
730 return true;
731 }
732
734 bool erase_row_at(size_t i) {
735 if (i >= rows.size()) return false;
736 if (is_keyed) edits.erase(rows[i].first);
737
738 rows.erase(rows.begin() + i);
739 this->invalidate_key_index();
740 return true;
741 }
742
749 void set_at(size_t i, const std::string& column, const std::string& value) {
750 if (!is_keyed) {
751 throw std::runtime_error("This DataFrame was created without a key column.");
752 }
753 if (i >= rows.size()) {
754 throw std::out_of_range("Row index out of bounds.");
755 }
756
757 // Find column index
758 auto col_names = rows[i].second.get_col_names();
759 auto it = std::find(col_names.begin(), col_names.end(), column);
760 if (it == col_names.end()) {
761 throw std::out_of_range("Column '" + column + "' not found");
762 }
763 size_t col_idx = std::distance(col_names.begin(), it);
764
765 edits[rows[i].first][col_idx] = value;
766 }
767
775 template<typename T = std::string>
776 std::vector<T> column(const std::string& name) const {
777 auto col_it = std::find(col_names_.begin(), col_names_.end(), name);
778 if (col_it == col_names_.end()) {
779 throw std::runtime_error("Column not found: " + name);
780 }
781 size_t col_idx = std::distance(col_names_.begin(), col_it);
782
783 std::vector<T> values;
784 values.reserve(rows.size());
785
786 for (const auto& entry : rows) {
787 auto row_edits = this->edits.find(entry.first);
788 if (row_edits != this->edits.end()) {
789 auto value = row_edits->second.find(col_idx);
790 if (value != row_edits->second.end()) {
791 // Reuse CSVField parsing/validation on edited string values.
792 CSVField edited_field(csv::string_view(value->second));
793 values.push_back(edited_field.template get<T>());
794 continue;
795 }
796 }
797
798 values.push_back(entry.second[name].template get<T>());
799 }
800
801 return values;
802 }
803
809 template<
810 typename GroupFunc,
811 typename GroupKey = invoke_result_t<GroupFunc, const CSVRow&>,
812 csv::enable_if_t<
815 int
816 > = 0
817 >
818 std::unordered_map<GroupKey, std::vector<size_t>> group_by(GroupFunc group_func) const {
819 std::unordered_map<GroupKey, std::vector<size_t>> grouped;
820
821 for (size_t i = 0; i < rows.size(); i++) {
822 GroupKey group_key = group_func(rows[i].second);
823 grouped[group_key].push_back(i);
824 }
825
826 return grouped;
827 }
828
834 std::unordered_map<std::string, std::vector<size_t>> group_by(
835 const std::string& name,
836 bool use_edits = true
837 ) const {
838 auto col_it = std::find(col_names_.begin(), col_names_.end(), name);
839 if (col_it == col_names_.end()) {
840 throw std::runtime_error("Column not found: " + name);
841 }
842 size_t col_idx = std::distance(col_names_.begin(), col_it);
843
844 std::unordered_map<std::string, std::vector<size_t>> grouped;
845
846 for (size_t i = 0; i < rows.size(); i++) {
847 std::string group_key;
848 bool has_group_key = false;
849
850 if (use_edits) {
851 auto row_edits = this->edits.find(rows[i].first);
852 if (row_edits != this->edits.end()) {
853 auto edited_value = row_edits->second.find(col_idx);
854 if (edited_value != row_edits->second.end()) {
855 group_key = edited_value->second;
856 has_group_key = true;
857 }
858 }
859 }
860
861 if (!has_group_key) {
862 group_key = rows[i].second[name].template get<std::string>();
863 }
864
865 grouped[group_key].push_back(i);
866 }
867
868 return grouped;
869 }
870
872 iterator begin() { return iterator(rows.begin(), is_keyed ? &edits : nullptr); }
873
875 iterator end() { return iterator(rows.end(), is_keyed ? &edits : nullptr); }
876
878 const_iterator begin() const { return const_iterator(rows.begin(), is_keyed ? &edits : nullptr); }
879
881 const_iterator end() const { return const_iterator(rows.end(), is_keyed ? &edits : nullptr); }
882
884 const_iterator cbegin() const { return const_iterator(rows.begin(), is_keyed ? &edits : nullptr); }
885
887 const_iterator cend() const { return const_iterator(rows.end(), is_keyed ? &edits : nullptr); }
888
889 private:
891 std::string key_column;
892
894 bool is_keyed = false;
895
897 std::vector<std::string> col_names_;
898
900 std::vector<row_entry> rows;
901
903 mutable std::unique_ptr<std::unordered_map<KeyType, size_t>> key_index;
904
909 std::unordered_map<KeyType, std::unordered_map<size_t, std::string>> edits;
910
912 void init_unkeyed_from_reader(CSVReader& reader) {
913 this->col_names_ = reader.get_col_names();
914 for (auto& row : reader) {
915 rows.push_back(row_entry{KeyType(), row});
916 }
917 }
918
920 void init_from_reader(CSVReader& reader, const DataFrameOptions& options) {
921 this->is_keyed = true;
922 this->key_column = options.get_key_column();
923 this->col_names_ = reader.get_col_names();
924
925 if (key_column.empty()) {
926 throw std::runtime_error("Key column cannot be empty.");
927 }
928
929 if (std::find(col_names_.begin(), col_names_.end(), key_column) == col_names_.end()) {
930 throw std::runtime_error("Key column not found: " + key_column);
931 }
932
933 const bool throw_on_missing_key = options.get_throw_on_missing_key();
934
935 this->build_from_key_function(
936 reader,
937 [this, throw_on_missing_key](const CSVRow& row) -> KeyType {
938 try {
939 return row[this->key_column].template get<KeyType>();
940 }
941 catch (const std::exception& e) {
942 if (throw_on_missing_key) {
943 throw std::runtime_error("Error retrieving key column value: " + std::string(e.what()));
944 }
945
946 return KeyType();
947 }
948 },
949 options.get_duplicate_key_policy()
950 );
951 }
952
954 template<typename KeyFunc>
955 void build_from_key_function(
956 CSVReader& reader,
957 KeyFunc key_func,
958 DuplicateKeyPolicy policy
959 ) {
960 std::unordered_map<KeyType, size_t> key_to_pos;
961
962 for (auto& row : reader) {
963 KeyType key = key_func(row);
964
965 auto existing = key_to_pos.find(key);
966 if (existing != key_to_pos.end()) {
967 if (policy == DuplicateKeyPolicy::THROW) {
968 throw std::runtime_error("Duplicate key encountered.");
969 }
970
971 if (policy == DuplicateKeyPolicy::OVERWRITE) {
972 rows[existing->second].second = row;
973 }
974
975 continue;
976 }
977
978 rows.push_back(row_entry{key, row});
979 key_to_pos[key] = rows.size() - 1;
980 }
981 }
982
984 void require_keyed_frame() const {
985 if (!is_keyed) {
986 throw std::runtime_error("This DataFrame was created without a key column.");
987 }
988 }
989
991 void invalidate_key_index() {
992 key_index.reset();
993 }
994
996 void ensure_key_index() const {
997 if (key_index) {
998 return;
999 }
1000
1001 key_index = std::unique_ptr<std::unordered_map<KeyType, size_t>>(
1002 new std::unordered_map<KeyType, size_t>()
1003 );
1004
1005 for (size_t i = 0; i < rows.size(); i++) {
1006 (*key_index)[rows[i].first] = i;
1007 }
1008 }
1009
1011 size_t position_of(const KeyType& key) const {
1012 this->ensure_key_index();
1013 auto it = key_index->find(key);
1014 if (it == key_index->end()) {
1015 throw std::out_of_range("Key not found.");
1016 }
1017
1018 return it->second;
1019 }
1020 };
1021}
Data type representing individual CSV values.
Definition csv_row.hpp:67
Stores information about how to parse a CSV file.
static CSVFormat guess_csv()
CSVFormat preset for delimiter inference with header/n_cols inference enabled.
Main class for parsing CSVs from files and in-memory sources.
std::vector< std::string > get_col_names() const
Return the CSV's column names as a vector of strings.
Data structure for representing CSV rows.
Definition csv_row.hpp:264
std::string to_json(const std::vector< std::string > &subset={}) const
Convert a CSV row to a JSON object, i.e.
CONSTEXPR bool empty() const noexcept
Indicates whether row is empty or not.
Definition csv_row.hpp:278
std::string to_json_array(const std::vector< std::string > &subset={}) const
Convert a CSV row to a JSON array, i.e.
CONSTEXPR size_t size() const noexcept
Return the number of fields in this row.
Definition csv_row.hpp:281
std::vector< std::string > get_col_names() const
Retrieve this row's associated column names.
Definition csv_row.hpp:291
Allows configuration of DataFrame behavior.
DuplicateKeyPolicy
Policy for handling duplicate keys when creating a keyed DataFrame.
Proxy class that wraps a CSVRow and intercepts field access to check for edits.
std::string to_json(const std::vector< std::string > &subset={}) const
Convert to JSON.
std::vector< std::string > get_col_names() const
Get column names.
CSVField operator[](size_t n) const
Access a field by position (positional access never checks edits).
auto to_sv_range() const
Convert this DataFrameRow into a std::ranges::input_range of string_views, respecting the sparse overlay of edits.
CSVField operator[](const std::string &col) const
Access a field by column name, checking edits first.
DataFrameRow()
Default constructor (creates an unbound proxy).
std::string to_json_array(const std::vector< std::string > &subset={}) const
Convert to JSON array.
size_t size() const
Get the number of fields in the row.
bool empty() const
Check if the row is empty.
const CSVRow & get_underlying_row() const
Get the underlying CSVRow for compatibility.
const KeyType & get_key() const
Get the key for this row (only valid for keyed DataFrames).
DataFrameRow(const CSVRow *_row, const std::unordered_map< size_t, std::string > *_edits, const KeyType *_key)
Construct a DataFrameRow wrapper.
Row-wise const iterator over DataFrameRow entries.
Row-wise iterator over DataFrameRow entries.
std::unordered_map< std::string, std::vector< size_t > > group_by(const std::string &name, bool use_edits=true) const
Group row positions by the value of a column.
bool empty() const noexcept
Check if the DataFrame is empty (has no rows).
DataFrame(CSVReader &reader, const std::string &_key_column, DuplicateKeyPolicy policy=DuplicateKeyPolicy::OVERWRITE, bool throw_on_missing_key=true)
Construct a keyed DataFrame using a column name as the key.
DataFrameRow< KeyType > operator[](size_t i)
Access a row by position (unchecked).
bool erase_row_at(size_t i)
Remove a row by its position.
DataFrame(CSVReader &reader, const DataFrameOptions &options)
Construct a keyed DataFrame from a CSV reader with options.
bool has_column(const std::string &name) const
Check if a column exists in the DataFrame.
iterator end()
Get iterator past the last row.
DataFrameRow< KeyType > operator[](const KeyType &key)
Access a row by its key.
std::string get(const KeyType &key, const std::string &column) const
Get a cell value as a string, accounting for edits.
const_iterator cend() const
Get const iterator past the last row (explicit).
std::vector< T > column(const std::string &name) const
Extract all values from a column with type conversion.
size_t n_cols() const noexcept
Get the number of columns in the DataFrame.
const KeyType & key_at(size_t i) const
Get the key for a row at a given position.
std::unordered_map< GroupKey, std::vector< size_t > > group_by(GroupFunc group_func) const
Group row positions using an arbitrary grouping function.
iterator begin()
Get iterator to the first row.
DataFrame(CSVReader &reader, KeyFunc key_func, const DataFrameOptions &options)
Construct a keyed DataFrame using a custom key function with options.
bool erase_row(const KeyType &key)
Remove a row by its key.
DataFrameRow< KeyType > iloc(size_t i)
Access a row by position (iloc-style, pandas naming).
DataFrame(csv::string_view filename, const DataFrameOptions &options, CSVFormat format=CSVFormat::guess_csv())
Construct a keyed DataFrame directly from a CSV file.
void set_at(size_t i, const std::string &column, const std::string &value)
Set a cell value by position (stored in edit overlay).
size_t n_rows() const noexcept
Get the number of rows in the DataFrame.
const std::vector< std::string > & columns() const noexcept
Get the column names in order.
const_iterator cbegin() const
Get const iterator to the first row (explicit).
DataFrameRow< KeyType > at(const KeyType &key) const
Access a row by its key with bounds checking (const version).
DataFrameRow< KeyType > operator[](size_t i) const
Access a row by position (unchecked, const version).
bool try_get(size_t i, DataFrameRow< KeyType > &out)
Attempt to access a row by position without throwing.
DataFrameRow< KeyType > at(const KeyType &key)
Access a row by its key with bounds checking.
const_iterator begin() const
Get const iterator to the first row.
DataFrameRow< KeyType > at(size_t i) const
Access a row by position with bounds checking (const version).
DataFrameRow< KeyType > operator[](const KeyType &key) const
Access a row by its key (const version).
bool try_get(const KeyType &key, DataFrameRow< KeyType > &out)
Attempt to access a row by key without throwing.
DataFrameRow< KeyType > iloc(size_t i) const
Access a row by position (const version).
std::pair< KeyType, CSVRow > row_entry
Type alias for internal row storage: pair of key and CSVRow.
bool contains(const KeyType &key) const
Check if a key exists in the DataFrame.
const_iterator end() const
Get const iterator past the last row.
int index_of(const std::string &name) const
Get the index of a column by name.
DataFrameRow< KeyType > at(size_t i)
Access a row by position with bounds checking.
DataFrame(CSVReader &reader, KeyFunc key_func, DuplicateKeyPolicy policy=DuplicateKeyPolicy::OVERWRITE)
Construct a keyed DataFrame using a custom key function.
bool try_get(const KeyType &key, DataFrameRow< KeyType > &out) const
Attempt to access a row by key without throwing (const version).
void set(const KeyType &key, const std::string &column, const std::string &value)
Set a cell value (stored in edit overlay).
const std::string & key_name() const noexcept
Get the name of the key column (empty string if unkeyed).
size_t size() const noexcept
Get the number of rows in the DataFrame.
bool try_get(size_t i, DataFrameRow< KeyType > &out) const
Attempt to access a row by position without throwing (const version).
DataFrame()=default
Construct an empty DataFrame.
DataFrame(CSVReader &reader)
Construct an unkeyed DataFrame from a CSV reader.
Defines functionality needed for basic CSV parsing.
The all encompassing namespace.
CONSTEXPR_14 csv::string_view CSVField::get< csv::string_view >()
Retrieve a view over this field's string.
Definition csv_row.hpp:465
std::string CSVField::get< std::string >()
Retrieve this field's original string.
Definition csv_row.hpp:455
std::vector< std::string > get_col_names(csv::string_view filename, const CSVFormat &format=CSVFormat::guess_csv())
Get the column names of a CSV file using just the first 500KB.
constexpr int CSV_NOT_FOUND
Integer indicating a requested column wasn't found.
Definition common.hpp:296
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:135