Vince's CSV Parser
Loading...
Searching...
No Matches
data_frame.hpp
1#pragma once
2
3#include <algorithm>
4#include <functional>
5#include <iterator>
6#include <memory>
7#include <stdexcept>
8#include <string>
9#include <type_traits>
10#include <unordered_map>
11#include <utility>
12#include <vector>
13
14#include "csv_reader.hpp"
15
16namespace csv {
17 namespace internals {
18 template<typename T>
20 private:
21 template<typename U>
22 static auto test(int) -> decltype(
23 std::hash<U>{}(std::declval<const U&>()),
24 std::true_type{}
25 );
26
27 template<typename>
28 static std::false_type test(...);
29
30 public:
31 static constexpr bool value = decltype(test<T>(0))::value;
32 };
33
34 template<typename T>
36 private:
37 template<typename U>
38 static auto test(int) -> decltype(
39 std::declval<const U&>() == std::declval<const U&>(),
40 std::true_type{}
41 );
42
43 template<typename>
44 static std::false_type test(...);
45
46 public:
47 static constexpr bool value = decltype(test<T>(0))::value;
48 };
49 }
50
53 public:
54 DataFrameOptions() = default;
55
/** Strategy applied when two rows of a keyed DataFrame yield the same key. */
enum class DuplicateKeyPolicy {
    /** Raise an error as soon as a repeated key is encountered. */
    THROW,
    /** Replace the previously stored row with the newly read one. */
    OVERWRITE,
    /** Discard the newly read row and retain the one already stored. */
    KEEP_FIRST
};
62
63 DataFrameOptions& set_duplicate_key_policy(DuplicateKeyPolicy value) {
64 this->duplicate_key_policy = value;
65 return *this;
66 }
67
68 DuplicateKeyPolicy get_duplicate_key_policy() const {
69 return this->duplicate_key_policy;
70 }
71
72 DataFrameOptions& set_key_column(const std::string& value) {
73 this->key_column = value;
74 return *this;
75 }
76
77 const std::string& get_key_column() const {
78 return this->key_column;
79 }
80
81 DataFrameOptions& set_throw_on_missing_key(bool value) {
82 this->throw_on_missing_key = value;
83 return *this;
84 }
85
86 bool get_throw_on_missing_key() const {
87 return this->throw_on_missing_key;
88 }
89
90 private:
91 std::string key_column;
92
93 DuplicateKeyPolicy duplicate_key_policy = DuplicateKeyPolicy::OVERWRITE;
94
96 bool throw_on_missing_key = true;
97 };
98
103 template<typename KeyType>
105 public:
107 DataFrameRow() : row(nullptr), row_edits(nullptr), key_ptr(nullptr) {}
108
111 const CSVRow* _row,
112 const std::unordered_map<std::string, std::string>* _edits,
113 const KeyType* _key
114 ) : row(_row), row_edits(_edits), key_ptr(_key) {}
115
122 CSVField operator[](const std::string& col) const {
123 if (row_edits) {
124 auto it = row_edits->find(col);
125 if (it != row_edits->end()) {
126 return CSVField(csv::string_view(it->second));
127 }
128 }
129 return (*row)[col];
130 }
131
133 CSVField operator[](size_t n) const {
134 return (*row)[n];
135 }
136
138 size_t size() const { return row->size(); }
139
141 bool empty() const { return row->empty(); }
142
144 std::vector<std::string> get_col_names() const { return row->get_col_names(); }
145
147 const CSVRow& get_underlying_row() const { return *row; }
148
150 const KeyType& get_key() const { return *key_ptr; }
151
153 operator std::vector<std::string>() const {
154 std::vector<std::string> result;
155 result.reserve(row->size());
156
157 auto col_names = row->get_col_names();
158 for (size_t i = 0; i < row->size(); i++) {
159 // Check if this column has an edit
160 if (row_edits && i < col_names.size()) {
161 auto it = row_edits->find(col_names[i]);
162 if (it != row_edits->end()) {
163 result.push_back(it->second);
164 continue;
165 }
166 }
167 // Use original value
168 result.push_back((*row)[i].get<std::string>());
169 }
170 return result;
171 }
172
174 std::string to_json(const std::vector<std::string>& subset = {}) const {
175 return row->to_json(subset);
176 }
177
179 std::string to_json_array(const std::vector<std::string>& subset = {}) const {
180 return row->to_json_array(subset);
181 }
182
183 private:
184 const CSVRow* row;
185 const std::unordered_map<std::string, std::string>* row_edits;
186 const KeyType* key_ptr;
187 };
188
189 template<typename KeyType = std::string>
190 class DataFrame {
191 public:
193 using row_entry = std::pair<KeyType, CSVRow>;
194
196 class iterator {
197 public:
199 using difference_type = std::ptrdiff_t;
200 using pointer = const DataFrameRow<KeyType>*;
201 using reference = const DataFrameRow<KeyType>&;
202 using iterator_category = std::random_access_iterator_tag;
203
204 iterator() = default;
205 iterator(
206 typename std::vector<row_entry>::iterator it,
207 const std::unordered_map<KeyType, std::unordered_map<std::string, std::string>>* edits
208 ) : iter(it), edits_map(edits) {}
209
210 reference operator*() const {
211 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
212 if (edits_map) {
213 auto it = edits_map->find(iter->first);
214 if (it != edits_map->end()) {
215 row_edits = &it->second;
216 }
217 }
218 cached_row = DataFrameRow<KeyType>(&iter->second, row_edits, &iter->first);
219 return cached_row;
220 }
221
222 pointer operator->() const {
223 // Ensure cached_row is populated
224 operator*();
225 return &cached_row;
226 }
227
228 iterator& operator++() { ++iter; return *this; }
229 iterator operator++(int) { auto tmp = *this; ++iter; return tmp; }
230 iterator& operator--() { --iter; return *this; }
231 iterator operator--(int) { auto tmp = *this; --iter; return tmp; }
232
233 iterator operator+(difference_type n) const { return iterator(iter + n, edits_map); }
234 iterator operator-(difference_type n) const { return iterator(iter - n, edits_map); }
235 difference_type operator-(const iterator& other) const { return iter - other.iter; }
236
237 bool operator==(const iterator& other) const { return iter == other.iter; }
238 bool operator!=(const iterator& other) const { return iter != other.iter; }
239
240 private:
241 typename std::vector<row_entry>::iterator iter;
242 const std::unordered_map<KeyType, std::unordered_map<std::string, std::string>>* edits_map = nullptr;
243 mutable DataFrameRow<KeyType> cached_row;
244 };
245
248 public:
250 using difference_type = std::ptrdiff_t;
251 using pointer = const DataFrameRow<KeyType>*;
252 using reference = const DataFrameRow<KeyType>&;
253 using iterator_category = std::random_access_iterator_tag;
254
255 const_iterator() = default;
257 typename std::vector<row_entry>::const_iterator it,
258 const std::unordered_map<KeyType, std::unordered_map<std::string, std::string>>* edits
259 ) : iter(it), edits_map(edits) {}
260
261 reference operator*() const {
262 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
263 if (edits_map) {
264 auto it = edits_map->find(iter->first);
265 if (it != edits_map->end()) {
266 row_edits = &it->second;
267 }
268 }
269 cached_row = DataFrameRow<KeyType>(&iter->second, row_edits, &iter->first);
270 return cached_row;
271 }
272
273 pointer operator->() const {
274 // Ensure cached_row is populated
275 operator*();
276 return &cached_row;
277 }
278
279 const_iterator& operator++() { ++iter; return *this; }
280 const_iterator operator++(int) { auto tmp = *this; ++iter; return tmp; }
281 const_iterator& operator--() { --iter; return *this; }
282 const_iterator operator--(int) { auto tmp = *this; --iter; return tmp; }
283
284 const_iterator operator+(difference_type n) const { return const_iterator(iter + n, edits_map); }
285 const_iterator operator-(difference_type n) const { return const_iterator(iter - n, edits_map); }
286 difference_type operator-(const const_iterator& other) const { return iter - other.iter; }
287
288 bool operator==(const const_iterator& other) const { return iter == other.iter; }
289 bool operator!=(const const_iterator& other) const { return iter != other.iter; }
290
291 private:
292 typename std::vector<row_entry>::const_iterator iter;
293 const std::unordered_map<KeyType, std::unordered_map<std::string, std::string>>* edits_map = nullptr;
294 mutable DataFrameRow<KeyType> cached_row;
295 };
296
297 static_assert(
299 "DataFrame<KeyType> requires KeyType to be hashable (std::hash<KeyType> specialization required)."
300 );
301
302 static_assert(
304 "DataFrame<KeyType> requires KeyType to be equality comparable (operator== required)."
305 );
306
307 static_assert(
308 std::is_default_constructible<KeyType>::value,
309 "DataFrame<KeyType> requires KeyType to be default-constructible."
310 );
311
313
315 DataFrame() = default;
316
321 explicit DataFrame(CSVReader& reader) {
322 this->init_unkeyed_from_reader(reader);
323 }
324
332 explicit DataFrame(CSVReader& reader, const DataFrameOptions& options) {
333 this->init_from_reader(reader, options);
334 }
335
345 csv::string_view filename,
346 const DataFrameOptions& options,
348 ) {
349 CSVReader reader(filename, format);
350 this->init_from_reader(reader, options);
351 }
352
363 CSVReader& reader,
364 const std::string& _key_column,
365 DuplicateKeyPolicy policy = DuplicateKeyPolicy::OVERWRITE,
366 bool throw_on_missing_key = true
367 ) : DataFrame(
368 reader,
370 .set_key_column(_key_column)
371 .set_duplicate_key_policy(policy)
372 .set_throw_on_missing_key(throw_on_missing_key)
373 ) {}
374
383 template<
384 typename KeyFunc,
385 typename ResultType = invoke_result_t<KeyFunc, const CSVRow&>,
386 csv::enable_if_t<std::is_convertible<ResultType, KeyType>::value, int> = 0
387 >
389 CSVReader& reader,
390 KeyFunc key_func,
391 DuplicateKeyPolicy policy = DuplicateKeyPolicy::OVERWRITE
392 ) : col_names(reader.get_col_names()) {
393 this->is_keyed = true;
394 this->build_from_key_function(reader, key_func, policy);
395 }
396
404 template<
405 typename KeyFunc,
406 typename ResultType = invoke_result_t<KeyFunc, const CSVRow&>,
407 csv::enable_if_t<std::is_convertible<ResultType, KeyType>::value, int> = 0
408 >
410 CSVReader& reader,
411 KeyFunc key_func,
412 const DataFrameOptions& options
413 ) : DataFrame(reader, key_func, options.get_duplicate_key_policy()) {}
414
416 size_t size() const noexcept {
417 return rows.size();
418 }
419
421 bool empty() const noexcept {
422 return rows.empty();
423 }
424
426 size_t n_rows() const noexcept { return rows.size(); }
427
429 size_t n_cols() const noexcept { return col_names.size(); }
430
437 bool has_column(const std::string& name) const {
438 return std::find(col_names.begin(), col_names.end(), name) != col_names.end();
439 }
440
447 int index_of(const std::string& name) const {
448 auto it = std::find(col_names.begin(), col_names.end(), name);
449 if (it == col_names.end())
450 return CSV_NOT_FOUND;
451 return static_cast<int>(std::distance(col_names.begin(), it));
452 }
453
455 const std::vector<std::string>& columns() const noexcept {
456 return col_names;
457 }
458
460 const std::string& key_name() const noexcept {
461 return key_column;
462 }
463
475 template<typename K = KeyType,
476 csv::enable_if_t<!std::is_integral<K>::value, int> = 0>
478 static_assert(std::is_same<K, KeyType>::value,
479 "Do not explicitly instantiate this template. Use iloc() for positional access.");
480 return this->iloc(i);
481 }
482
485 template<typename K = KeyType,
486 csv::enable_if_t<!std::is_integral<K>::value, int> = 0>
488 static_assert(std::is_same<K, KeyType>::value,
489 "Do not explicitly instantiate this template. Use iloc() for positional access.");
490 return this->iloc(i);
491 }
492
501 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
502 if (is_keyed) {
503 auto it = edits.find(rows.at(i).first);
504 if (it != edits.end()) row_edits = &it->second;
505 }
506 return DataFrameRow<KeyType>(&rows.at(i).second, row_edits, &rows.at(i).first);
507 }
508
510 DataFrameRow<KeyType> at(size_t i) const {
511 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
512 if (is_keyed) {
513 auto it = edits.find(rows.at(i).first);
514 if (it != edits.end()) row_edits = &it->second;
515 }
516 return DataFrameRow<KeyType>(&rows.at(i).second, row_edits, &rows.at(i).first);
517 }
518
527 DataFrameRow<KeyType> operator[](const KeyType& key) {
528 this->require_keyed_frame();
529 auto position = this->position_of(key);
530 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
531 auto it = edits.find(key);
532 if (it != edits.end()) row_edits = &it->second;
533 return DataFrameRow<KeyType>(&rows[position].second, row_edits, &rows[position].first);
534 }
535
537 DataFrameRow<KeyType> operator[](const KeyType& key) const {
538 this->require_keyed_frame();
539 auto position = this->position_of(key);
540 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
541 auto it = edits.find(key);
542 if (it != edits.end()) row_edits = &it->second;
543 return DataFrameRow<KeyType>(&rows[position].second, row_edits, &rows[position].first);
544 }
545
554 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
555 if (is_keyed) {
556 auto it = edits.find(rows.at(i).first);
557 if (it != edits.end()) row_edits = &it->second;
558 }
559 return DataFrameRow<KeyType>(&rows.at(i).second, row_edits, &rows.at(i).first);
560 }
561
563 DataFrameRow<KeyType> iloc(size_t i) const {
564 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
565 if (is_keyed) {
566 auto it = edits.find(rows.at(i).first);
567 if (it != edits.end()) row_edits = &it->second;
568 }
569 return DataFrameRow<KeyType>(&rows.at(i).second, row_edits, &rows.at(i).first);
570 }
571
579 bool try_get(size_t i, DataFrameRow<KeyType>& out) {
580 if (i >= rows.size()) {
581 return false;
582 }
583 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
584 if (is_keyed) {
585 auto it = edits.find(rows[i].first);
586 if (it != edits.end()) row_edits = &it->second;
587 }
588 out = DataFrameRow<KeyType>(&rows[i].second, row_edits, &rows[i].first);
589 return true;
590 }
591
593 bool try_get(size_t i, DataFrameRow<KeyType>& out) const {
594 if (i >= rows.size()) {
595 return false;
596 }
597 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
598 if (is_keyed) {
599 auto it = edits.find(rows[i].first);
600 if (it != edits.end()) row_edits = &it->second;
601 }
602 out = DataFrameRow<KeyType>(&rows[i].second, row_edits, &rows[i].first);
603 return true;
604 }
605
614 const KeyType& key_at(size_t i) const {
615 this->require_keyed_frame();
616 return rows.at(i).first;
617 }
618
626 bool contains(const KeyType& key) const {
627 this->require_keyed_frame();
628 this->ensure_key_index();
629 return key_index->find(key) != key_index->end();
630 }
631
640 DataFrameRow<KeyType> at(const KeyType& key) {
641 this->require_keyed_frame();
642 auto position = this->position_of(key);
643 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
644 auto it = edits.find(key);
645 if (it != edits.end()) row_edits = &it->second;
646 return DataFrameRow<KeyType>(&rows.at(position).second, row_edits, &rows.at(position).first);
647 }
648
650 DataFrameRow<KeyType> at(const KeyType& key) const {
651 this->require_keyed_frame();
652 auto position = this->position_of(key);
653 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
654 auto it = edits.find(key);
655 if (it != edits.end()) row_edits = &it->second;
656 return DataFrameRow<KeyType>(&rows.at(position).second, row_edits, &rows.at(position).first);
657 }
658
667 bool try_get(const KeyType& key, DataFrameRow<KeyType>& out) {
668 this->require_keyed_frame();
669 this->ensure_key_index();
670 auto it = key_index->find(key);
671 if (it == key_index->end()) {
672 return false;
673 }
674 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
675 auto edit_it = edits.find(key);
676 if (edit_it != edits.end()) row_edits = &edit_it->second;
677 out = DataFrameRow<KeyType>(&rows[it->second].second, row_edits, &rows[it->second].first);
678 return true;
679 }
680
682 bool try_get(const KeyType& key, DataFrameRow<KeyType>& out) const {
683 this->require_keyed_frame();
684 this->ensure_key_index();
685 auto it = key_index->find(key);
686 if (it == key_index->end()) {
687 return false;
688 }
689 const std::unordered_map<std::string, std::string>* row_edits = nullptr;
690 auto edit_it = edits.find(key);
691 if (edit_it != edits.end()) row_edits = &edit_it->second;
692 out = DataFrameRow<KeyType>(&rows[it->second].second, row_edits, &rows[it->second].first);
693 return true;
694 }
695
705 std::string get(const KeyType& key, const std::string& column) const {
706 this->require_keyed_frame();
707
708 auto row_edits = this->edits.find(key);
709 if (row_edits != this->edits.end()) {
710 auto value = row_edits->second.find(column);
711 if (value != row_edits->second.end()) {
712 return value->second;
713 }
714 }
715
716 return (*this)[key][column].template get<std::string>();
717 }
718
728 void set(const KeyType& key, const std::string& column, const std::string& value) {
729 this->require_keyed_frame();
730 (void)this->position_of(key);
731 edits[key][column] = value;
732 }
733
741 bool erase_row(const KeyType& key) {
742 this->require_keyed_frame();
743 this->ensure_key_index();
744
745 auto it = key_index->find(key);
746 if (it == key_index->end()) {
747 return false;
748 }
749
750 rows.erase(rows.begin() + it->second);
751 edits.erase(key);
752 this->invalidate_key_index();
753 return true;
754 }
755
762 bool erase_row_at(size_t i) {
763 if (i >= rows.size()) return false;
764 if (is_keyed) edits.erase(rows[i].first);
765
766 rows.erase(rows.begin() + i);
767 this->invalidate_key_index();
768 return true;
769 }
770
780 void set_at(size_t i, const std::string& column, const std::string& value) {
781 if (!is_keyed) {
782 throw std::runtime_error("This DataFrame was created without a key column.");
783 }
784 if (i >= rows.size()) {
785 throw std::out_of_range("Row index out of bounds.");
786 }
787 edits[rows[i].first][column] = value;
788 }
789
799 template<typename T = std::string>
800 std::vector<T> column(const std::string& name) const {
801 if (std::find(col_names.begin(), col_names.end(), name) == col_names.end()) {
802 throw std::runtime_error("Column not found: " + name);
803 }
804
805 std::vector<T> values;
806 values.reserve(rows.size());
807
808 for (const auto& entry : rows) {
809 auto row_edits = this->edits.find(entry.first);
810 if (row_edits != this->edits.end()) {
811 auto value = row_edits->second.find(name);
812 if (value != row_edits->second.end()) {
813 // Reuse CSVField parsing/validation on edited string values.
814 CSVField edited_field(csv::string_view(value->second));
815 values.push_back(edited_field.template get<T>());
816 continue;
817 }
818 }
819
820 values.push_back(entry.second[name].template get<T>());
821 }
822
823 return values;
824 }
825
833 template<
834 typename GroupFunc,
835 typename GroupKey = invoke_result_t<GroupFunc, const CSVRow&>,
836 csv::enable_if_t<
839 int
840 > = 0
841 >
842 std::unordered_map<GroupKey, std::vector<size_t>> group_by(GroupFunc group_func) const {
843 std::unordered_map<GroupKey, std::vector<size_t>> grouped;
844
845 for (size_t i = 0; i < rows.size(); i++) {
846 GroupKey group_key = group_func(rows[i].second);
847 grouped[group_key].push_back(i);
848 }
849
850 return grouped;
851 }
852
861 std::unordered_map<std::string, std::vector<size_t>> group_by(
862 const std::string& name,
863 bool use_edits = true
864 ) const {
865 if (std::find(col_names.begin(), col_names.end(), name) == col_names.end()) {
866 throw std::runtime_error("Column not found: " + name);
867 }
868
869 std::unordered_map<std::string, std::vector<size_t>> grouped;
870
871 for (size_t i = 0; i < rows.size(); i++) {
872 std::string group_key;
873 bool has_group_key = false;
874
875 if (use_edits) {
876 auto row_edits = this->edits.find(rows[i].first);
877 if (row_edits != this->edits.end()) {
878 auto edited_value = row_edits->second.find(name);
879 if (edited_value != row_edits->second.end()) {
880 group_key = edited_value->second;
881 has_group_key = true;
882 }
883 }
884 }
885
886 if (!has_group_key) {
887 group_key = rows[i].second[name].template get<std::string>();
888 }
889
890 grouped[group_key].push_back(i);
891 }
892
893 return grouped;
894 }
895
897 iterator begin() { return iterator(rows.begin(), is_keyed ? &edits : nullptr); }
898
900 iterator end() { return iterator(rows.end(), is_keyed ? &edits : nullptr); }
901
903 const_iterator begin() const { return const_iterator(rows.begin(), is_keyed ? &edits : nullptr); }
904
906 const_iterator end() const { return const_iterator(rows.end(), is_keyed ? &edits : nullptr); }
907
909 const_iterator cbegin() const { return const_iterator(rows.begin(), is_keyed ? &edits : nullptr); }
910
912 const_iterator cend() const { return const_iterator(rows.end(), is_keyed ? &edits : nullptr); }
913
914 private:
916 std::string key_column;
917
919 bool is_keyed = false;
920
922 std::vector<std::string> col_names;
923
925 std::vector<row_entry> rows;
926
928 mutable std::unique_ptr<std::unordered_map<KeyType, size_t>> key_index;
929
934 std::unordered_map<KeyType, std::unordered_map<std::string, std::string>> edits;
935
937 void init_unkeyed_from_reader(CSVReader& reader) {
938 this->col_names = reader.get_col_names();
939 for (auto& row : reader) {
940 rows.push_back(row_entry{KeyType(), row});
941 }
942 }
943
945 void init_from_reader(CSVReader& reader, const DataFrameOptions& options) {
946 this->is_keyed = true;
947 this->key_column = options.get_key_column();
948 this->col_names = reader.get_col_names();
949
950 if (key_column.empty()) {
951 throw std::runtime_error("Key column cannot be empty.");
952 }
953
954 if (std::find(col_names.begin(), col_names.end(), key_column) == col_names.end()) {
955 throw std::runtime_error("Key column not found: " + key_column);
956 }
957
958 const bool throw_on_missing_key = options.get_throw_on_missing_key();
959
960 this->build_from_key_function(
961 reader,
962 [this, throw_on_missing_key](const CSVRow& row) -> KeyType {
963 try {
964 return row[this->key_column].template get<KeyType>();
965 }
966 catch (const std::exception& e) {
967 if (throw_on_missing_key) {
968 throw std::runtime_error("Error retrieving key column value: " + std::string(e.what()));
969 }
970
971 return KeyType();
972 }
973 },
974 options.get_duplicate_key_policy()
975 );
976 }
977
979 template<typename KeyFunc>
980 void build_from_key_function(
981 CSVReader& reader,
982 KeyFunc key_func,
983 DuplicateKeyPolicy policy
984 ) {
985 std::unordered_map<KeyType, size_t> key_to_pos;
986
987 for (auto& row : reader) {
988 KeyType key = key_func(row);
989
990 auto existing = key_to_pos.find(key);
991 if (existing != key_to_pos.end()) {
992 if (policy == DuplicateKeyPolicy::THROW) {
993 throw std::runtime_error("Duplicate key encountered.");
994 }
995
996 if (policy == DuplicateKeyPolicy::OVERWRITE) {
997 rows[existing->second].second = row;
998 }
999
1000 continue;
1001 }
1002
1003 rows.push_back(row_entry{key, row});
1004 key_to_pos[key] = rows.size() - 1;
1005 }
1006 }
1007
1009 void require_keyed_frame() const {
1010 if (!is_keyed) {
1011 throw std::runtime_error("This DataFrame was created without a key column.");
1012 }
1013 }
1014
1016 void invalidate_key_index() {
1017 key_index.reset();
1018 }
1019
1021 void ensure_key_index() const {
1022 if (key_index) {
1023 return;
1024 }
1025
1026 key_index = std::unique_ptr<std::unordered_map<KeyType, size_t>>(
1027 new std::unordered_map<KeyType, size_t>()
1028 );
1029
1030 for (size_t i = 0; i < rows.size(); i++) {
1031 (*key_index)[rows[i].first] = i;
1032 }
1033 }
1034
1036 size_t position_of(const KeyType& key) const {
1037 this->ensure_key_index();
1038 auto it = key_index->find(key);
1039 if (it == key_index->end()) {
1040 throw std::out_of_range("Key not found.");
1041 }
1042
1043 return it->second;
1044 }
1045 };
1046}
Data type representing individual CSV values.
Definition csv_row.hpp:39
Stores information about how to parse a CSV file.
static CSVFormat guess_csv()
CSVFormat for guessing the delimiter.
Main class for parsing CSVs from files and in-memory sources.
std::vector< std::string > get_col_names() const
Return the CSV's column names as a vector of strings.
Data structure for representing CSV rows.
Definition csv_row.hpp:280
std::string to_json(const std::vector< std::string > &subset={}) const
Convert a CSV row to a JSON object, i.e. {"col1":"value1","col2":"value2"}.
CONSTEXPR bool empty() const noexcept
Indicates whether row is empty or not.
Definition csv_row.hpp:294
std::string to_json_array(const std::vector< std::string > &subset={}) const
Convert a CSV row to a JSON array, i.e. ["value1","value2",...].
CONSTEXPR size_t size() const noexcept
Return the number of fields in this row.
Definition csv_row.hpp:297
std::vector< std::string > get_col_names() const
Retrieve this row's associated column names.
Definition csv_row.hpp:307
Allows configuration of DataFrame behavior.
DuplicateKeyPolicy
Policy for handling duplicate keys when creating a keyed DataFrame.
Proxy class that wraps a CSVRow and intercepts field access to check for edits.
std::string to_json(const std::vector< std::string > &subset={}) const
Convert to JSON.
std::vector< std::string > get_col_names() const
Get column names.
CSVField operator[](size_t n) const
Access a field by position (positional access never checks edits).
DataFrameRow(const CSVRow *_row, const std::unordered_map< std::string, std::string > *_edits, const KeyType *_key)
Construct a DataFrameRow wrapper.
CSVField operator[](const std::string &col) const
Access a field by column name, checking edits first.
DataFrameRow()
Default constructor (creates an unbound proxy).
std::string to_json_array(const std::vector< std::string > &subset={}) const
Convert to JSON array.
size_t size() const
Get the number of fields in the row.
bool empty() const
Check if the row is empty.
const CSVRow & get_underlying_row() const
Get the underlying CSVRow for compatibility.
const KeyType & get_key() const
Get the key for this row (only valid for keyed DataFrames).
Row-wise const iterator over DataFrameRow entries.
Row-wise iterator over DataFrameRow entries.
std::unordered_map< std::string, std::vector< size_t > > group_by(const std::string &name, bool use_edits=true) const
Group row positions by the value of a column.
bool empty() const noexcept
Check if the DataFrame is empty (has no rows).
DataFrame(CSVReader &reader, const std::string &_key_column, DuplicateKeyPolicy policy=DuplicateKeyPolicy::OVERWRITE, bool throw_on_missing_key=true)
Construct a keyed DataFrame using a column name as the key.
DataFrameRow< KeyType > operator[](size_t i)
Access a row by position (unchecked).
bool erase_row_at(size_t i)
Remove a row by its position.
DataFrame(CSVReader &reader, const DataFrameOptions &options)
Construct a keyed DataFrame from a CSV reader with options.
bool has_column(const std::string &name) const
Check if a column exists in the DataFrame.
iterator end()
Get iterator past the last row.
DataFrameRow< KeyType > operator[](const KeyType &key)
Access a row by its key.
std::string get(const KeyType &key, const std::string &column) const
Get a cell value as a string, accounting for edits.
const_iterator cend() const
Get const iterator past the last row (explicit).
std::vector< T > column(const std::string &name) const
Extract all values from a column with type conversion.
size_t n_cols() const noexcept
Get the number of columns in the DataFrame.
const KeyType & key_at(size_t i) const
Get the key for a row at a given position.
std::unordered_map< GroupKey, std::vector< size_t > > group_by(GroupFunc group_func) const
Group row positions using an arbitrary grouping function.
iterator begin()
Get iterator to the first row.
DataFrame(CSVReader &reader, KeyFunc key_func, const DataFrameOptions &options)
Construct a keyed DataFrame using a custom key function with options.
bool erase_row(const KeyType &key)
Remove a row by its key.
DataFrameRow< KeyType > iloc(size_t i)
Access a row by position (iloc-style, pandas naming).
DataFrame(csv::string_view filename, const DataFrameOptions &options, CSVFormat format=CSVFormat::guess_csv())
Construct a keyed DataFrame directly from a CSV file.
void set_at(size_t i, const std::string &column, const std::string &value)
Set a cell value by position (stored in edit overlay).
size_t n_rows() const noexcept
Get the number of rows in the DataFrame.
const std::vector< std::string > & columns() const noexcept
Get the column names in order.
const_iterator cbegin() const
Get const iterator to the first row (explicit).
DataFrameRow< KeyType > at(const KeyType &key) const
Access a row by its key with bounds checking (const version).
DataFrameRow< KeyType > operator[](size_t i) const
Access a row by position (unchecked, const version).
bool try_get(size_t i, DataFrameRow< KeyType > &out)
Attempt to access a row by position without throwing.
DataFrameRow< KeyType > at(const KeyType &key)
Access a row by its key with bounds checking.
const_iterator begin() const
Get const iterator to the first row.
DataFrameRow< KeyType > at(size_t i) const
Access a row by position with bounds checking (const version).
DataFrameRow< KeyType > operator[](const KeyType &key) const
Access a row by its key (const version).
bool try_get(const KeyType &key, DataFrameRow< KeyType > &out)
Attempt to access a row by key without throwing.
DataFrameRow< KeyType > iloc(size_t i) const
Access a row by position (const version).
std::pair< KeyType, CSVRow > row_entry
Type alias for internal row storage: pair of key and CSVRow.
bool contains(const KeyType &key) const
Check if a key exists in the DataFrame.
const_iterator end() const
Get const iterator past the last row.
int index_of(const std::string &name) const
Get the index of a column by name.
DataFrameRow< KeyType > at(size_t i)
Access a row by position with bounds checking.
DataFrame(CSVReader &reader, KeyFunc key_func, DuplicateKeyPolicy policy=DuplicateKeyPolicy::OVERWRITE)
Construct a keyed DataFrame using a custom key function.
bool try_get(const KeyType &key, DataFrameRow< KeyType > &out) const
Attempt to access a row by key without throwing (const version).
void set(const KeyType &key, const std::string &column, const std::string &value)
Set a cell value (stored in edit overlay).
const std::string & key_name() const noexcept
Get the name of the key column (empty string if unkeyed).
size_t size() const noexcept
Get the number of rows in the DataFrame.
bool try_get(size_t i, DataFrameRow< KeyType > &out) const
Attempt to access a row by position without throwing (const version).
DataFrame()=default
Construct an empty DataFrame.
DataFrame(CSVReader &reader)
Construct an unkeyed DataFrame from a CSV reader.
Defines functionality needed for basic CSV parsing.
CSV_CONST CONSTEXPR_17 OutArray arrayToDefault(T &&value)
Helper constexpr function to initialize an array with all the elements set to value.
The all encompassing namespace.
std::vector< std::string > get_col_names(csv::string_view filename, CSVFormat format)
Return a CSV's column names.
std::string CSVField::get< std::string >()
Retrieve this field's original string.
Definition csv_row.hpp:455
constexpr int CSV_NOT_FOUND
Integer indicating a requested column wasn't found.
Definition common.hpp:246
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:99