Vince's CSV Parser
Loading...
Searching...
No Matches
data_frame.hpp
1#pragma once
2
3#include <algorithm>
4#include <atomic>
5#include <condition_variable>
6#include <exception>
7#include <functional>
8#include <iterator>
9#include <memory>
10#include <mutex>
11#include <stdexcept>
12#include <string>
13#include <type_traits>
14#include <unordered_map>
15#include <utility>
16#include <vector>
17
18#include "csv_reader.hpp"
19#include "csv_exceptions.hpp"
20#include "json_converter.hpp"
21
22namespace csv {
23 template<typename KeyType>
24 class DataFrame;
25 template<typename KeyType>
26 class DataFrameColumn;
27 template<typename KeyType>
28 class DataFrameRow;
29
30 struct RowOverlay {
31 RowOverlay() = default;
32 RowOverlay(const RowOverlay&) = delete;
33 RowOverlay& operator=(const RowOverlay&) = delete;
34
35 RowOverlay(RowOverlay&& other) noexcept : values(std::move(other.values)) {
36 busy.clear(std::memory_order_release);
37 }
38
39 RowOverlay& operator=(RowOverlay&& other) noexcept {
40 if (this != &other) {
41 values = std::move(other.values);
42 busy.clear(std::memory_order_release);
43 }
44 return *this;
45 }
46
47 bool try_get_copy(size_t col_index, std::string& out) const {
48 row_overlay_lock_guard lock(this);
49 auto it = values.find(col_index);
50 if (it == values.end()) {
51 return false;
52 }
53
54 out = it->second;
55 return true;
56 }
57
66 bool try_get_view(size_t col_index, csv::string_view& out) const {
67 row_overlay_lock_guard lock(this);
68 auto it = values.find(col_index);
69 if (it == values.end()) {
70 return false;
71 }
72
73 out = csv::string_view(it->second);
74 return true;
75 }
76
77 void set(size_t col_index, std::string value) {
78 row_overlay_lock_guard lock(this);
79 values[col_index] = std::move(value);
80 }
81
82 bool empty() const {
83 row_overlay_lock_guard lock(this);
84 return values.empty();
85 }
86
87 private:
88 struct row_overlay_lock_guard {
89 explicit row_overlay_lock_guard(const RowOverlay* overlay)
90 : busy(const_cast<std::atomic_flag&>(overlay->busy)) {
91 while (busy.test_and_set(std::memory_order_acquire)) {}
92 }
93
94 ~row_overlay_lock_guard() {
95 busy.clear(std::memory_order_release);
96 }
97
98 std::atomic_flag& busy;
99 };
100
101 mutable std::atomic_flag busy = ATOMIC_FLAG_INIT;
102 std::unordered_map<size_t, std::string> values;
103 };
104
106 RowOverlaySlot() noexcept : ptr(nullptr) {}
107 RowOverlaySlot(const RowOverlaySlot&) = delete;
108 RowOverlaySlot& operator=(const RowOverlaySlot&) = delete;
109
110 RowOverlaySlot(RowOverlaySlot&& other) noexcept
111 : ptr(nullptr),
112 owned(std::move(other.owned)) {
113 RowOverlay* overlay = owned.get();
114 ptr.store(overlay, std::memory_order_release);
115 other.ptr.store(nullptr, std::memory_order_release);
116 }
117
118 RowOverlaySlot& operator=(RowOverlaySlot&& other) noexcept {
119 if (this != &other) {
120 owned = std::move(other.owned);
121 RowOverlay* overlay = owned.get();
122 ptr.store(overlay, std::memory_order_release);
123 other.ptr.store(nullptr, std::memory_order_release);
124 }
125
126 return *this;
127 }
128
129 RowOverlay* get() noexcept {
130 return ptr.load(std::memory_order_acquire);
131 }
132
133 const RowOverlay* get() const noexcept {
134 return ptr.load(std::memory_order_acquire);
135 }
136
137 RowOverlay* ensure() {
138 RowOverlay* overlay = this->get();
139 if (!overlay) {
140 owned.reset(new RowOverlay());
141 overlay = owned.get();
142 ptr.store(overlay, std::memory_order_release);
143 }
144
145 return overlay;
146 }
147
148 private:
149 std::atomic<RowOverlay*> ptr;
150 std::unique_ptr<RowOverlay> owned;
151 };
152
namespace internals {
    /**
     * Random-access iterator over an index range that builds a proxy object on
     * each dereference.
     *
     * @tparam Owner    Container-like type the iterator points into.
     * @tparam Proxy    Value produced per position; must be default-constructible
     *                  and assignable because the last proxy is cached.
     * @tparam Accessor Callable invoked as `accessor(owner, index)` to build a Proxy.
     *
     * operator* and operator-> return a reference/pointer to a proxy cached
     * inside the iterator, so the result is only valid until that iterator is
     * dereferenced again or destroyed.
     */
    template<typename Owner, typename Proxy, typename Accessor>
    class indexed_proxy_iterator {
    public:
        using value_type = Proxy;
        using difference_type = std::ptrdiff_t;
        using pointer = const Proxy*;
        using reference = const Proxy&;
        using iterator_category = std::random_access_iterator_tag;

        indexed_proxy_iterator() = default;
        indexed_proxy_iterator(Owner* owner, size_t index, Accessor accessor = Accessor())
            : owner_(owner), index_(index), accessor_(accessor) {}

        /** Builds the proxy for the current position and returns the cached copy. */
        reference operator*() const {
            cached_proxy_ = accessor_(owner_, index_);
            return cached_proxy_;
        }

        /** Refreshes the cached proxy and returns its address. */
        pointer operator->() const {
            operator*();
            return &cached_proxy_;
        }

        /** Builds the proxy `n` positions away; returned by value since it is a temporary. */
        value_type operator[](difference_type n) const {
            return accessor_(owner_, static_cast<size_t>(index_ + n));
        }

        indexed_proxy_iterator& operator++() { ++index_; return *this; }
        indexed_proxy_iterator operator++(int) { auto tmp = *this; ++index_; return tmp; }
        indexed_proxy_iterator& operator--() { --index_; return *this; }
        indexed_proxy_iterator operator--(int) { auto tmp = *this; --index_; return tmp; }

        // Compound assignment is what std::advance / std::next use for
        // random-access iterators; without these overloads they fail to compile.
        indexed_proxy_iterator& operator+=(difference_type n) {
            index_ = static_cast<size_t>(index_ + n);
            return *this;
        }

        indexed_proxy_iterator& operator-=(difference_type n) {
            index_ = static_cast<size_t>(index_ - n);
            return *this;
        }

        indexed_proxy_iterator operator+(difference_type n) const {
            return indexed_proxy_iterator(owner_, static_cast<size_t>(index_ + n), accessor_);
        }

        /** Symmetric form `n + it`. */
        friend indexed_proxy_iterator operator+(difference_type n, const indexed_proxy_iterator& it) {
            return it + n;
        }

        indexed_proxy_iterator operator-(difference_type n) const {
            return indexed_proxy_iterator(owner_, static_cast<size_t>(index_ - n), accessor_);
        }

        /** Signed distance between two positions. */
        difference_type operator-(const indexed_proxy_iterator& other) const {
            return static_cast<difference_type>(index_) - static_cast<difference_type>(other.index_);
        }

        /** Equal when both iterators refer to the same owner and the same index. */
        bool operator==(const indexed_proxy_iterator& other) const {
            return owner_ == other.owner_ && index_ == other.index_;
        }

        bool operator!=(const indexed_proxy_iterator& other) const {
            return !(*this == other);
        }

        // Ordering compares positions only; it is meaningful for iterators
        // into the same owner.
        bool operator<(const indexed_proxy_iterator& other) const { return index_ < other.index_; }
        bool operator>(const indexed_proxy_iterator& other) const { return other < *this; }
        bool operator<=(const indexed_proxy_iterator& other) const { return !(other < *this); }
        bool operator>=(const indexed_proxy_iterator& other) const { return !(*this < other); }

    private:
        Owner* owner_ = nullptr;
        size_t index_ = 0;
        Accessor accessor_;
        // mutable: refreshed from the const dereference operators.
        mutable Proxy cached_proxy_;
    };
}
209
212 public:
213 DataFrameOptions() = default;
214
217 THROW, // Throw an error if a duplicate key is encountered
218 OVERWRITE, // Overwrite the existing value with the new value
219 KEEP_FIRST // Ignore the new value and keep the existing value
220 };
221
222 DataFrameOptions& set_duplicate_key_policy(DuplicateKeyPolicy value) {
223 this->duplicate_key_policy = value;
224 return *this;
225 }
226
227 DuplicateKeyPolicy get_duplicate_key_policy() const {
228 return this->duplicate_key_policy;
229 }
230
231 DataFrameOptions& set_key_column(const std::string& value) {
232 this->key_column = value;
233 return *this;
234 }
235
236 const std::string& get_key_column() const {
237 return this->key_column;
238 }
239
240 DataFrameOptions& set_throw_on_missing_key(bool value) {
241 this->throw_on_missing_key = value;
242 return *this;
243 }
244
245 bool get_throw_on_missing_key() const {
246 return this->throw_on_missing_key;
247 }
248
249 private:
250 std::string key_column;
251
252 DuplicateKeyPolicy duplicate_key_policy = DuplicateKeyPolicy::OVERWRITE;
253
255 bool throw_on_missing_key = true;
256 };
257
260 public:
261 explicit DataFrameExecutor(size_t worker_count = default_worker_count()) {
262 this->start_workers(worker_count);
263 }
264
265 DataFrameExecutor(const DataFrameExecutor&) = delete;
266 DataFrameExecutor& operator=(const DataFrameExecutor&) = delete;
267
269 this->stop_workers();
270 }
271
272 size_t worker_count() const noexcept {
273#if CSV_ENABLE_THREADS
274 return workers_.size();
275#else
276 return 0;
277#endif
278 }
279
// Invokes fn(i) for task indices i in [0, task_count).
//
// With threads enabled the indices are handed out to the worker threads; the
// call blocks until the workers report the current generation as complete.
// The work runs on the calling thread instead when threads are disabled, when
// there are no workers, or when task_count does not exceed the worker count.
// If a task throws on a worker, the first captured exception is rethrown here.
280 template<typename Fn>
281 void parallel_for(size_t task_count, Fn&& fn) {
282 if (task_count == 0) {
283 return;
284 }
285
286#if CSV_ENABLE_THREADS
// Small batches are not dispatched to the pool.
287 if (workers_.empty() || task_count <= workers_.size()) {
288 this->run_serial(task_count, std::forward<Fn>(fn));
289 return;
290 }
291
292 std::exception_ptr captured_exception;
293 {
// Publish the new job under the mutex; bumping generation_ is what the
// workers' wait predicate watches for.
294 std::unique_lock<std::mutex> lock(mutex_);
295 current_task_ = std::forward<Fn>(fn);
296 task_exception_ = nullptr;
297 next_task_.store(0);
298 task_count_ = task_count;
// Every worker decrements this once per generation (see worker_loop).
299 active_workers_ = workers_.size();
300 ++generation_;
301 }
302
303 task_ready_.notify_all();
304
// Block until the last worker marks this generation as completed.
305 std::unique_lock<std::mutex> lock(mutex_);
306 task_done_.wait(lock, [this]() {
307 return completed_generation_ == generation_;
308 });
309
// Take the exception (if any) and drop the stored callable before leaving.
310 captured_exception = task_exception_;
311 current_task_ = std::function<void(size_t)>();
312 task_exception_ = nullptr;
313
314 if (captured_exception) {
315 std::rethrow_exception(captured_exception);
316 }
317#else
318 this->run_serial(task_count, std::forward<Fn>(fn));
319#endif
320 }
321
322 private:
/** Calls fn(i) on the current thread for i = 0 .. task_count - 1, in ascending order. */
template<typename Fn>
void run_serial(size_t task_count, Fn&& fn) {
    size_t next = 0;
    while (next != task_count) {
        fn(next);
        ++next;
    }
}
329
/**
 * Default pool size: the value of std::thread::hardware_concurrency(), or 1
 * when that call returns 0. Always 0 when CSV_ENABLE_THREADS is off.
 */
static size_t default_worker_count() {
#if CSV_ENABLE_THREADS
    const unsigned int detected = std::thread::hardware_concurrency();
    if (detected == 0) {
        return 1;
    }
    return static_cast<size_t>(detected);
#else
    return 0;
#endif
}
338
339#if CSV_ENABLE_THREADS
340 void start_workers(size_t worker_count) {
341 workers_.reserve(worker_count);
342 for (size_t i = 0; i < worker_count; ++i) {
343 workers_.push_back(std::thread(&DataFrameExecutor::worker_loop, this));
344 }
345 }
346
347 void stop_workers() {
348 {
349 std::lock_guard<std::mutex> lock(mutex_);
350 stop_ = true;
351 }
352
353 task_ready_.notify_all();
354 for (auto& worker : workers_) {
355 if (worker.joinable())
356 worker.join();
357 }
358 }
359
// Body of each worker thread. Waits for a new generation (or the stop flag),
// pulls task indices from next_task_ until they run out, then reports back.
// The worker that brings active_workers_ to zero marks the generation as
// completed and wakes the thread waiting on task_done_.
360 void worker_loop() {
361 size_t seen_generation = 0;
362 std::unique_lock<std::mutex> lock(mutex_);
363
364 while (true) {
// The predicate lambda is rebuilt on every pass, so it captures the current
// value of seen_generation.
365 task_ready_.wait(lock, [this, seen_generation]() {
366 return stop_ || generation_ != seen_generation;
367 });
368
369 if (stop_) return;
370
371 const size_t local_generation = generation_;
372 seen_generation = local_generation;
// Tasks run without the mutex held.
373 lock.unlock();
374
375 while (true) {
// Claim the next unprocessed index.
376 const size_t task_index = next_task_.fetch_add(1);
377 if (task_index >= task_count_)
378 break;
379
380 try {
381 current_task_(task_index);
382 }
383 catch (...) {
// Only the first exception is kept; pushing next_task_ to task_count_ makes
// later claims fall out of the loop above.
384 lock.lock();
385 if (!task_exception_) {
386 task_exception_ = std::current_exception();
387 next_task_.store(task_count_);
388 }
389 lock.unlock();
390 break;
391 }
392 }
393
// Re-acquire the mutex; it stays held into the next wait().
394 lock.lock();
395 if (--active_workers_ == 0) {
396 completed_generation_ = local_generation;
397 task_done_.notify_one();
398 }
399 }
400 }
401
402 std::vector<std::thread> workers_;
403 std::mutex mutex_;
404 std::condition_variable task_ready_;
405 std::condition_variable task_done_;
406 std::function<void(size_t)> current_task_;
407 std::exception_ptr task_exception_ = nullptr;
408 std::atomic<size_t> next_task_{0};
409 size_t task_count_ = 0;
410 size_t active_workers_ = 0;
411 size_t generation_ = 0;
412 size_t completed_generation_ = 0;
413 bool stop_ = false;
414#else
415 void start_workers(size_t) {}
416 void stop_workers() {}
417#endif
418 };
419
420 class DataFrameCell : public CSVField {
421 public:
422 using CSVField::get;
423 using CSVField::get_sv;
424 using CSVField::is_float;
425 using CSVField::is_int;
426 using CSVField::is_null;
427 using CSVField::is_num;
428 using CSVField::is_str;
429 using CSVField::try_get;
430 using CSVField::type;
431
432 DataFrameCell() : CSVField(csv::string_view()), row(nullptr), row_overlay(nullptr), col_index(0), can_mutate(false) {}
433
434 DataFrameCell(const DataFrameCell& other)
436 row(other.row),
437 row_overlay(other.row_overlay),
438 col_index(other.col_index),
439 can_mutate(other.can_mutate),
440 owned_value_(other.owned_value_) {
441 this->refresh_value();
442 }
443
444 DataFrameCell(DataFrameCell&& other) noexcept
446 row(other.row),
447 row_overlay(other.row_overlay),
448 col_index(other.col_index),
449 can_mutate(other.can_mutate),
450 owned_value_(std::move(other.owned_value_)) {
451 this->refresh_value();
452 }
453
455 const CSVRow* _row,
456 RowOverlay* _row_overlay,
457 size_t _col_index
459 row(_row),
460 row_overlay(_row_overlay),
461 col_index(_col_index),
462 can_mutate(true) {
463 this->refresh_value();
464 }
465
467 const CSVRow* _row,
468 const RowOverlay* _row_overlay,
469 size_t _col_index
471 row(_row),
472 row_overlay(_row_overlay),
473 col_index(_col_index),
474 can_mutate(false) {
475 this->refresh_value();
476 }
477
478 DataFrameCell& operator=(const DataFrameCell& other) {
479 if (this != &other) {
480 row = other.row;
481 row_overlay = other.row_overlay;
482 col_index = other.col_index;
483 can_mutate = other.can_mutate;
484 owned_value_ = other.owned_value_;
485 this->refresh_value();
486 }
487
488 return *this;
489 }
490
491 DataFrameCell& operator=(DataFrameCell&& other) noexcept {
492 if (this != &other) {
493 row = other.row;
494 row_overlay = other.row_overlay;
495 col_index = other.col_index;
496 can_mutate = other.can_mutate;
497 owned_value_ = std::move(other.owned_value_);
498 this->refresh_value();
499 }
500
501 return *this;
502 }
503
504 DataFrameCell& operator=(csv::string_view value) {
505 return this->assign(std::string(value));
506 }
507
509 template<typename T = std::string>
510 T get() const {
511 return const_cast<DataFrameCell*>(this)->CSVField::template get<T>();
512 }
513
514 bool is_null() const noexcept {
515 return const_cast<DataFrameCell*>(this)->CSVField::is_null();
516 }
517
518 bool is_str() const noexcept {
519 return const_cast<DataFrameCell*>(this)->CSVField::is_str();
520 }
521
522 bool is_num() const noexcept {
523 return const_cast<DataFrameCell*>(this)->CSVField::is_num();
524 }
525
526 bool is_int() const noexcept {
527 return const_cast<DataFrameCell*>(this)->CSVField::is_int();
528 }
529
530 bool is_float() const noexcept {
531 return const_cast<DataFrameCell*>(this)->CSVField::is_float();
532 }
533
534 DataType type() const noexcept {
535 return const_cast<DataFrameCell*>(this)->CSVField::type();
536 }
537
538 template<typename T>
539 bool try_get(T& out) const noexcept {
540 return const_cast<DataFrameCell*>(this)->CSVField::template try_get<T>(out);
541 }
542
543 private:
544 void refresh_value() {
545 if (!row) {
546 CSVField::operator=(CSVField(csv::string_view()));
547 return;
548 }
549
550 if (row_overlay && row_overlay->try_get_copy(col_index, owned_value_)) {
551 CSVField::operator=(CSVField(csv::string_view(owned_value_)));
552 return;
553 }
554
555 owned_value_.clear();
556 CSVField::operator=(CSVField((*row)[col_index].template get<csv::string_view>()));
557 }
558
559 DataFrameCell& assign(std::string stored) {
560 if (!can_mutate || !row_overlay) {
561 throw std::runtime_error(internals::ERROR_CANNOT_EDIT_CONST_DF_CELL);
562 }
563
564 owned_value_ = stored;
565 const_cast<RowOverlay*>(row_overlay)->set(col_index, std::move(stored));
566 CSVField::operator=(CSVField(csv::string_view(owned_value_)));
567 return *this;
568 }
569
570 const CSVRow* row;
571 const RowOverlay* row_overlay;
572 size_t col_index;
573 bool can_mutate;
574 std::string owned_value_;
575 };
576
581 template<typename KeyType>
583 public:
585 DataFrameRow() : row(nullptr), frame(nullptr), row_index(0), row_overlay(nullptr), key_ptr(nullptr), can_mutate(false) {}
586
589 const CSVRow* _row,
590 DataFrame<KeyType>* _frame,
591 size_t _row_index,
592 RowOverlay* _edits,
593 const KeyType* _key
594 ) : row(_row), frame(_frame), row_index(_row_index), row_overlay(_edits), key_ptr(_key), can_mutate(true) {}
595
598 const CSVRow* _row,
599 const DataFrame<KeyType>* _frame,
600 size_t _row_index,
601 const RowOverlay* _edits,
602 const KeyType* _key
603 ) : row(_row), frame(_frame), row_index(_row_index), row_overlay(_edits), key_ptr(_key), can_mutate(false) {}
604
606 DataFrameCell operator[](const std::string& col) {
607 return this->make_cell(this->find_column(col));
608 }
609
612 return this->make_cell(n);
613 }
614
616 DataFrameCell operator[](const std::string& col) const {
617 return this->make_cell(this->find_column(col));
618 }
619
621 DataFrameCell operator[](size_t n) const {
622 return this->make_cell(n);
623 }
624
626 size_t size() const { return row->size(); }
627
629 bool empty() const { return row->empty(); }
630
632 const std::vector<std::string>& get_col_names() const { return row->get_col_names(); }
633
635 const CSVRow& get_underlying_row() const { return *row; }
636
638 const KeyType& key() const { return *key_ptr; }
639
644 bool erase() {
645 if (!can_mutate || !frame) {
646 throw std::runtime_error(internals::ERROR_CANNOT_ERASE_CONST_DF_ROW);
647 }
648
649 return const_cast<DataFrame<KeyType>*>(frame)->erase_at_index(row_index);
650 }
651
653 operator std::vector<std::string>() const {
654 std::vector<std::string> result;
655 result.reserve(row->size());
656
657 for (size_t i = 0; i < row->size(); i++) {
658 result.push_back(this->make_cell(i).template get<std::string>());
659 }
660 return result;
661 }
662
664 std::string to_json(const std::vector<std::string>& subset = {}) const {
665 const field_string_accessor field_at(this);
666 if (frame) {
667 return this->get_frame_json_converter().row_to_json(this->size(), field_at, subset);
668 }
669
670 return this->make_detached_json_converter().row_to_json(this->size(), field_at, subset);
671 }
672
674 std::string to_json_array(const std::vector<std::string>& subset = {}) const {
675 const field_string_accessor field_at(this);
676 if (frame) {
677 return this->get_frame_json_converter().row_to_json_array(this->size(), field_at, subset);
678 }
679
680 return this->make_detached_json_converter().row_to_json_array(this->size(), field_at, subset);
681 }
682
683 #ifdef CSV_HAS_CXX20
689 auto to_sv_range() const {
690 return std::views::iota(size_t{0}, this->size())
691 | std::views::transform([this](size_t i) { return this->make_cell(i).template get<std::string>(); });
692 }
693 #endif
694
695 private:
696 struct field_string_accessor {
697 explicit field_string_accessor(const DataFrameRow* owner) : owner(owner) {}
698
699 std::string operator()(size_t i) const {
700 return owner->make_cell(i).template get<std::string>();
701 }
702
703 const DataFrameRow* owner;
704 };
705
706 const internals::JsonConverter& get_frame_json_converter() const {
707 return frame->json_converter_.get_or_create([this]() {
708 return std::make_shared<internals::JsonConverter>(frame->columns());
709 });
710 }
711
712 internals::JsonConverter make_detached_json_converter() const {
713 return internals::JsonConverter(this->get_col_names());
714 }
715
716 DataFrameCell make_cell(size_t col_index) {
717 return can_mutate
718 ? DataFrameCell(row, const_cast<RowOverlay*>(row_overlay), col_index)
719 : DataFrameCell(row, row_overlay, col_index);
720 }
721
722 DataFrameCell make_cell(size_t col_index) const {
723 return DataFrameCell(row, row_overlay, col_index);
724 }
725
726 size_t find_column(const std::string& col) const {
727 if (frame) {
728 return frame->find_column(col);
729 }
730
731 const internals::ConstColNamesPtr col_names = row->col_names_ptr();
732 const int position = col_names->index_of(col);
733 if (position == CSV_NOT_FOUND) {
734 internals::throw_column_not_found_out_of_range(col);
735 }
736
737 return static_cast<size_t>(position);
738 }
739
740 const CSVRow* row;
741 const DataFrame<KeyType>* frame;
742 size_t row_index;
743 const RowOverlay* row_overlay;
744 const KeyType* key_ptr;
745 bool can_mutate;
746 };
747
749 template<typename KeyType>
751 public:
753 DataFrameCell operator()(const DataFrameColumn<KeyType>* owner, size_t row_index) const {
754 return owner->operator[](row_index);
755 }
756 };
757
758 using iterator = internals::indexed_proxy_iterator<const DataFrameColumn<KeyType>, DataFrameCell, cell_accessor>;
759 using const_iterator = iterator;
760
761 DataFrameColumn() : frame_(nullptr), col_index_(0) {}
762
763 DataFrameColumn(const DataFrame<KeyType>* frame, size_t col_index)
764 : frame_(frame), col_index_(col_index) {}
765
767 const std::string& name() const {
768 return (*frame_->col_names_)[col_index_];
769 }
770
772 size_t index() const noexcept {
773 return col_index_;
774 }
775
777 size_t size() const noexcept {
778 return frame_->size();
779 }
780
782 bool empty() const noexcept {
783 return this->size() == 0;
784 }
785
787 DataFrameCell operator[](size_t row_index) const {
788 const auto& row = frame_->rows.at(row_index);
789 const auto* row_edits = frame_->find_row_edits(row_index);
790 return DataFrameCell(&row, row_edits, col_index_);
791 }
792
799 csv::string_view get_sv(size_t row_index) const {
800 const auto& row = frame_->rows.at(row_index);
801 const auto* row_edits = frame_->find_row_edits(row_index);
802 csv::string_view edited_value;
803 if (row_edits && row_edits->try_get_view(col_index_, edited_value)) {
804 return edited_value;
805 }
806
807 return row[col_index_].template get<csv::string_view>();
808 }
809
811 template<typename T = std::string>
812 std::vector<T> to_vector() const {
813 std::vector<T> values;
814 values.reserve(this->size());
815
816 for (size_t row_index = 0; row_index < this->size(); ++row_index) {
817 values.push_back((*this)[row_index].template get<T>());
818 }
819
820 return values;
821 }
822
824 operator std::vector<std::string>() const {
825 return this->to_vector<std::string>();
826 }
827
828 #ifdef CSV_HAS_CXX20
833 auto to_sv_range() const {
834 return std::views::iota(size_t{0}, this->size())
835 | std::views::transform([this](size_t row_index) {
836 return (*this)[row_index].template get<std::string>();
837 });
838 }
839 #endif
840
842 iterator begin() const { return iterator(this, 0); }
843 iterator end() const { return iterator(this, this->size()); }
844 const_iterator cbegin() const { return const_iterator(this, 0); }
845 const_iterator cend() const { return const_iterator(this, this->size()); }
846
847 private:
848 const DataFrame<KeyType>* frame_;
849 size_t col_index_;
850 };
851
852 template<typename KeyType = std::string>
853 class DataFrame {
854 public:
855 friend class DataFrameRow<KeyType>;
856 friend class DataFrameColumn<KeyType>;
857 using row_type = DataFrameRow<KeyType>;
858 using column_type = DataFrameColumn<KeyType>;
859
861 DataFrameRow<KeyType> operator()(DataFrame<KeyType>* owner, size_t row_index) const {
862 return owner->make_row_proxy(row_index);
863 }
864 };
865
867 DataFrameRow<KeyType> operator()(const DataFrame<KeyType>* owner, size_t row_index) const {
868 return owner->make_const_row_proxy(row_index);
869 }
870 };
871
873 using iterator = internals::indexed_proxy_iterator<DataFrame<KeyType>, DataFrameRow<KeyType>, mutable_row_accessor>;
874
876 using const_iterator = internals::indexed_proxy_iterator<const DataFrame<KeyType>, DataFrameRow<KeyType>, const_row_accessor>;
877
878 static_assert(
880 "DataFrame<KeyType> requires KeyType to be hashable (std::hash<KeyType> specialization required)."
881 );
882
883 static_assert(
885 "DataFrame<KeyType> requires KeyType to be equality comparable (operator== required)."
886 );
887
888 static_assert(
889 std::is_default_constructible<KeyType>::value,
890 "DataFrame<KeyType> requires KeyType to be default-constructible."
891 );
892
894
896 DataFrame() = default;
897
902 explicit DataFrame(CSVReader& reader) {
903 this->init_unkeyed_from_reader(reader);
904 }
905
907 explicit DataFrame(std::vector<CSVRow> rows) {
908 this->init_unkeyed_from_rows(rows);
909 }
910
915 explicit DataFrame(CSVReader& reader, const DataFrameOptions& options) {
916 this->init_from_reader(reader, options);
917 }
918
924 csv::string_view filename,
925 const DataFrameOptions& options,
927 ) {
928 CSVReader reader(filename, format);
929 this->init_from_reader(reader, options);
930 }
931
937 CSVReader& reader,
938 const std::string& _key_column,
939 DuplicateKeyPolicy policy = DuplicateKeyPolicy::OVERWRITE,
940 bool throw_on_missing_key = true
941 ) : DataFrame(
942 reader,
944 .set_key_column(_key_column)
945 .set_duplicate_key_policy(policy)
946 .set_throw_on_missing_key(throw_on_missing_key)
947 ) {}
948
953 template<
954 typename KeyFunc,
955 csv::enable_if_t<csv::is_invocable_returning<KeyFunc, KeyType, const CSVRow&>::value, int> = 0
956 >
958 CSVReader& reader,
959 KeyFunc key_func,
960 DuplicateKeyPolicy policy = DuplicateKeyPolicy::OVERWRITE
961 ) : col_names_(reader.col_names_ptr()) {
962 this->is_keyed = true;
963 this->build_from_key_function(reader, key_func, policy);
964 }
965
967 template<
968 typename KeyFunc,
969 csv::enable_if_t<csv::is_invocable_returning<KeyFunc, KeyType, const CSVRow&>::value, int> = 0
970 >
972 CSVReader& reader,
973 KeyFunc key_func,
974 const DataFrameOptions& options
975 ) : DataFrame(reader, key_func, options.get_duplicate_key_policy()) {}
976
978 size_t size() const noexcept {
979 return rows.size();
980 }
981
983 bool empty() const noexcept {
984 return rows.empty();
985 }
986
988 size_t n_rows() const noexcept { return rows.size(); }
989
991 size_t n_cols() const noexcept { return col_names_->size(); }
992
994 bool has_column(const std::string& name) const {
995 return this->index_of(name) != CSV_NOT_FOUND;
996 }
997
999 int index_of(const std::string& name) const {
1000 return this->col_names_->index_of(name);
1001 }
1002
1004 const std::vector<std::string>& columns() const noexcept { return this->col_names_->get_col_names(); }
1005
1011 DataFrame selected_rows(const std::vector<std::uint8_t>& include_rows) const {
1012 if (include_rows.size() != this->rows.size()) {
1013 throw std::invalid_argument("selected row mask size must match DataFrame row count");
1014 }
1015
1016 std::vector<CSVRow> selected;
1017 selected.reserve(this->rows.size());
1018 for (size_t row_index = 0; row_index < this->rows.size(); ++row_index) {
1019 if (include_rows[row_index]) {
1020 selected.push_back(this->rows[row_index]);
1021 }
1022 }
1023
1024 return DataFrame(std::move(selected));
1025 }
1026
1028 DataFrameColumn<KeyType> column_view(size_t col_index) const {
1029 if (col_index >= this->n_cols()) {
1030 internals::throw_column_index_out_of_range();
1031 }
1032
1033 return DataFrameColumn<KeyType>(this, col_index);
1034 }
1035
1037 DataFrameColumn<KeyType> column_view(const std::string& name) const {
1038 return this->column_view(this->find_column(name));
1039 }
1040
1050 template<typename K = KeyType,
1051 csv::enable_if_t<!std::is_integral<K>::value, int> = 0>
1052 DataFrameRow<KeyType> operator[](size_t i) {
1053 static_assert(std::is_same<K, KeyType>::value,
1054 "Do not explicitly instantiate this template. Use at(size_t) for positional access.");
1055 return this->at(i);
1056 }
1057
1060 template<typename K = KeyType,
1061 csv::enable_if_t<!std::is_integral<K>::value, int> = 0>
1062 DataFrameRow<KeyType> operator[](size_t i) const {
1063 static_assert(std::is_same<K, KeyType>::value,
1064 "Do not explicitly instantiate this template. Use at(size_t) for positional access.");
1065 return this->at(i);
1066 }
1067
1073 DataFrameRow<KeyType> at(size_t i) {
1074 const auto& row = rows.at(i);
1075 auto* row_edits = this->ensure_row_edits(i);
1076 return DataFrameRow<KeyType>(&row, this, i, row_edits, this->key_ptr_at(i));
1077 }
1078
1080 DataFrameRow<KeyType> at(size_t i) const {
1081 const auto& row = rows.at(i);
1082 const RowOverlay* row_edits = this->find_row_edits(i);
1083 return DataFrameRow<KeyType>(&row, this, i, row_edits, this->key_ptr_at(i));
1084 }
1085
1092 DataFrameRow<KeyType> operator[](const KeyType& key) {
1093 this->require_keyed_frame();
1094 auto position = this->position_of(key);
1095 return DataFrameRow<KeyType>(&rows.at(position), this, position, this->ensure_row_edits(position), this->key_ptr_at(position));
1096 }
1097
1099 DataFrameRow<KeyType> operator[](const KeyType& key) const {
1100 this->require_keyed_frame();
1101 auto position = this->position_of(key);
1102 const RowOverlay* row_edits = this->find_row_edits(position);
1103 return DataFrameRow<KeyType>(&rows.at(position), this, position, row_edits, this->key_ptr_at(position));
1104 }
1105
1111 bool contains(const KeyType& key) const {
1112 this->require_keyed_frame();
1113 this->ensure_key_index();
1114 return key_index->find(key) != key_index->end();
1115 }
1116
1124 template<typename T = std::string>
1125 std::vector<T> column(const std::string& name) const {
1126 const size_t col_idx = this->find_column(name);
1127 std::vector<T> values;
1128
1129 values.reserve(rows.size());
1130 for (size_t row_index = 0; row_index < rows.size(); ++row_index) {
1131 values.push_back(this->at(row_index)[col_idx].template get<T>());
1132 }
1133
1134 return values;
1135 }
1136
1150 template<typename State, typename Fn>
1152 DataFrameExecutor& executor,
1153 std::vector<State>& states,
1154 Fn&& fn
1155 ) const {
1156 if (states.size() != this->n_cols()) {
1157 throw std::invalid_argument(internals::ERROR_COLUMN_APPLY_STATE_COUNT);
1158 }
1159
1160 executor.parallel_for(this->n_cols(), [this, &states, &fn](size_t column_index) {
1161 fn(this->column_view(column_index), states[column_index]);
1162 });
1163 }
1164
1173 template<typename State, typename Fn>
1175 DataFrameExecutor& executor,
1176 const std::vector<size_t>& column_indices,
1177 std::vector<State>& states,
1178 Fn&& fn
1179 ) const {
1180 if (states.size() != column_indices.size()) {
1181 throw std::invalid_argument(internals::ERROR_COLUMN_APPLY_SUBSET_STATE_COUNT);
1182 }
1183
1184 this->validate_selected_columns(column_indices);
1185
1186 executor.parallel_for(column_indices.size(), [this, &column_indices, &states, &fn](size_t selected_index) {
1187 const size_t column_index = column_indices[selected_index];
1188 fn(this->column_view(column_index), states[selected_index]);
1189 });
1190 }
1191
1198 template<typename Fn>
1200 DataFrameExecutor& executor,
1201 Fn&& fn
1202 ) const {
1203 executor.parallel_for(this->n_cols(), [this, &fn](size_t column_index) {
1204 fn(this->column_view(column_index));
1205 });
1206 }
1207
1215 template<typename Fn>
1217 DataFrameExecutor& executor,
1218 const std::vector<size_t>& column_indices,
1219 Fn&& fn
1220 ) const {
1221 this->validate_selected_columns(column_indices);
1222
1223 executor.parallel_for(column_indices.size(), [this, &column_indices, &fn](size_t selected_index) {
1224 fn(this->column_view(column_indices[selected_index]));
1225 });
1226 }
1227
1233 template<
1234 typename GroupFunc,
1235 typename GroupKey = invoke_result_t<GroupFunc, DataFrameRow<KeyType>>,
1236 csv::enable_if_t<
1239 int
1240 > = 0
1241 >
1242 std::unordered_map<GroupKey, std::vector<size_t>> group_by(GroupFunc group_func) const {
1243 std::unordered_map<GroupKey, std::vector<size_t>> grouped;
1244
1245 for (size_t i = 0; i < rows.size(); i++) {
1246 GroupKey group_key = group_func(this->at(i));
1247 grouped[group_key].push_back(i);
1248 }
1249
1250 return grouped;
1251 }
1252
1258 std::unordered_map<std::string, std::vector<size_t>> group_by(const std::string& name) const {
1259 const size_t col_idx = this->find_column(name);
1260 std::unordered_map<std::string, std::vector<size_t>> grouped;
1261
1262 for (size_t i = 0; i < rows.size(); i++) {
1263 grouped[this->at(i)[col_idx].template get<std::string>()].push_back(i);
1264 }
1265
1266 return grouped;
1267 }
1268
1270 iterator begin() { return iterator(this, 0); }
1271
1273 iterator end() { return iterator(this, this->size()); }
1274
1276 const_iterator begin() const { return const_iterator(this, 0); }
1277
1279 const_iterator end() const { return const_iterator(this, this->size()); }
1280
1282 const_iterator cbegin() const { return const_iterator(this, 0); }
1283
1285 const_iterator cend() const { return const_iterator(this, this->size()); }
1286
1287 private:
1289 bool is_keyed = false;
1290
1292 internals::ConstColNamesPtr col_names_ = std::make_shared<internals::ColNames>();
1293
1295 std::vector<CSVRow> rows;
1296
1298 std::vector<KeyType> keys_;
1299
1301 mutable std::unique_ptr<std::unordered_map<KeyType, size_t>> key_index;
1302 mutable internals::lazy_shared_ptr<internals::JsonConverter> json_converter_;
1303
1309 std::vector<RowOverlaySlot> edits;
1310 std::shared_ptr<std::mutex> edits_creation_lock_{ new std::mutex() };
1311
1313 void init_unkeyed_from_reader(CSVReader& reader) {
1314 this->assert_fresh_storage(false);
1315 this->is_keyed = false;
1316 this->col_names_ = reader.col_names_ptr();
1317
1318 std::vector<CSVRow> batch;
1319 while (reader.read_chunk(batch, dataframe_read_chunk_rows())) {
1320 this->append_unkeyed_batch(batch);
1321 }
1322 }
1323
1325 void init_unkeyed_from_rows(std::vector<CSVRow>& source_rows) {
1326 this->assert_fresh_storage(false);
1327 this->is_keyed = false;
1328 this->col_names_ = source_rows.empty()
1329 ? internals::ConstColNamesPtr(std::make_shared<internals::ColNames>())
1330 : source_rows.front().col_names_ptr();
1331 this->rows = std::move(source_rows);
1332 this->edits.resize(this->rows.size());
1333 }
1334
1336 void init_from_reader(CSVReader& reader, const DataFrameOptions& options) {
1337 this->assert_fresh_storage(false);
1338 this->is_keyed = true;
1339 this->col_names_ = reader.col_names_ptr();
1340 const std::string key_column = options.get_key_column();
1341
1342 if (key_column.empty())
1343 throw std::runtime_error(internals::ERROR_KEY_COLUMN_EMPTY);
1344
1345 if (this->col_names_->index_of(key_column) == CSV_NOT_FOUND)
1346 throw std::runtime_error(std::string(internals::ERROR_KEY_COLUMN_NOT_FOUND) + key_column);
1347
1348 const bool throw_on_missing_key = options.get_throw_on_missing_key();
1349
1350 this->build_from_key_function(
1351 reader,
1352 [key_column, throw_on_missing_key](const CSVRow& row) -> KeyType {
1353 try {
1354 return row[key_column].template get<KeyType>();
1355 }
1356 catch (const std::exception& e) {
1357 if (throw_on_missing_key) {
1358 throw std::runtime_error(internals::ERROR_KEY_COLUMN_VALUE + std::string(e.what()));
1359 }
1360
1361 return KeyType();
1362 }
1363 },
1364 options.get_duplicate_key_policy()
1365 );
1366 }
1367
1369 template<typename KeyFunc>
1370 void build_from_key_function(
1371 CSVReader& reader,
1372 KeyFunc key_func,
1373 DuplicateKeyPolicy policy
1374 ) {
1375 std::unordered_map<KeyType, size_t> key_to_pos;
1376 this->assert_fresh_storage(true);
1377
1378 std::vector<CSVRow> batch;
1379 while (reader.read_chunk(batch, dataframe_read_chunk_rows())) {
1380 this->append_keyed_batch(batch, key_func, policy, key_to_pos);
1381 }
1382 }
1383
/// Number of rows requested from CSVReader::read_chunk() per batch while loading a frame.
static size_t dataframe_read_chunk_rows() noexcept {
    const size_t rows_per_chunk = 50000;
    return rows_per_chunk;
}
1387
/**
 * Make room in a vector-like container for a number of upcoming appends.
 *
 * Does nothing when nothing is to be appended or the current capacity is
 * already sufficient. Otherwise the capacity is at least doubled.
 *
 * @param container  Container exposing size(), capacity() and reserve().
 * @param additional Number of elements about to be appended.
 */
template<typename Container>
static void reserve_for_append(Container& container, size_t additional) {
    const size_t capacity_now = container.capacity();
    const size_t needed = container.size() + additional;

    if (additional == 0 || needed <= capacity_now) {
        return;
    }

    // Grow geometrically instead of reserving exactly one batch ahead:
    // DataFrame construction appends fixed-size read_chunk() batches, so an
    // exact reserve would reallocate on every batch of a large input.
    size_t target = capacity_now * 2;
    if (capacity_now == 0) {
        target = additional;
    }

    // Fall back to the exact requirement when doubling is not enough or the
    // multiplication wrapped around.
    if (target < needed || target < capacity_now) {
        target = needed;
    }

    container.reserve(target);
}
1410
1411 void append_unkeyed_batch(std::vector<CSVRow>& batch) {
1412 reserve_for_append(rows, batch.size());
1413 reserve_for_append(edits, batch.size());
1414
1415 for (auto& row : batch) {
1416 rows.push_back(std::move(row));
1417 edits.emplace_back();
1418 }
1419 }
1420
1421 template<typename KeyFunc>
1422 void append_keyed_batch(
1423 std::vector<CSVRow>& batch,
1424 KeyFunc& key_func,
1425 DuplicateKeyPolicy policy,
1426 std::unordered_map<KeyType, size_t>& key_to_pos
1427 ) {
1428 reserve_for_append(rows, batch.size());
1429 reserve_for_append(keys_, batch.size());
1430 reserve_for_append(edits, batch.size());
1431
1432 for (auto& row : batch) {
1433 KeyType key = key_func(row);
1434
1435 auto existing = key_to_pos.find(key);
1436 if (existing != key_to_pos.end()) {
1437 if (policy == DuplicateKeyPolicy::THROW)
1438 throw std::runtime_error(internals::ERROR_DUPLICATE_KEY);
1439
1440 if (policy == DuplicateKeyPolicy::OVERWRITE)
1441 rows[existing->second] = std::move(row);
1442
1443 continue;
1444 }
1445
1446 rows.push_back(std::move(row));
1447 keys_.push_back(key);
1448 edits.emplace_back();
1449 key_to_pos[key] = rows.size() - 1;
1450 }
1451 }
1452
1454 size_t find_column(const std::string& name) const {
1455 return index_of(name) != CSV_NOT_FOUND ? static_cast<size_t>(index_of(name)) :
1456 throw std::out_of_range(std::string(internals::ERROR_COLUMN_NOT_FOUND) + name);
1457 }
1458
1460 const RowOverlay* find_row_edits(size_t row_index) const {
1461 return edits.at(row_index).get();
1462 }
1463
1465 RowOverlay* ensure_row_edits(size_t row_index) {
1466 RowOverlaySlot& slot = edits.at(row_index);
1467 RowOverlay* overlay = slot.get();
1468 if (overlay) {
1469 return overlay;
1470 }
1471
1472 std::lock_guard<std::mutex> lock(*edits_creation_lock_);
1473 return slot.ensure();
1474 }
1475
1476 DataFrameRow<KeyType> make_row_proxy(size_t row_index) {
1477 const auto& row = rows.at(row_index);
1478 return DataFrameRow<KeyType>(&row, this, row_index, this->ensure_row_edits(row_index), this->key_ptr_at(row_index));
1479 }
1480
1481 DataFrameRow<KeyType> make_const_row_proxy(size_t row_index) const {
1482 const auto& row = rows.at(row_index);
1483 return DataFrameRow<KeyType>(&row, this, row_index, this->find_row_edits(row_index), this->key_ptr_at(row_index));
1484 }
1485
1486 void erase_row_edits(size_t row_index) {
1487 if (row_index < edits.size()) {
1488 edits.erase(edits.begin() + row_index);
1489 }
1490 }
1491
1492 bool erase_at_index(size_t row_index) {
1493 if (row_index >= rows.size()) {
1494 return false;
1495 }
1496
1497 this->erase_row_edits(row_index);
1498 rows.erase(rows.begin() + row_index);
1499 if (this->is_keyed) {
1500 keys_.erase(keys_.begin() + row_index);
1501 }
1502 this->invalidate_key_index();
1503 return true;
1504 }
1505
1506 void validate_selected_columns(const std::vector<size_t>& column_indices) const {
1507 for (size_t column_index : column_indices) {
1508 if (column_index >= this->n_cols()) {
1509 throw std::out_of_range(internals::ERROR_COLUMN_APPLY_INVALID_INDEX);
1510 }
1511 }
1512 }
1513
1515 void require_keyed_frame() const {
1516 if (!is_keyed)
1517 throw std::runtime_error(internals::ERROR_UNKEYED_DATA_FRAME);
1518 }
1519
1521 void invalidate_key_index() {
1522 key_index.reset();
1523 }
1524
1526 void assert_fresh_storage(bool expected_is_keyed) const {
1527 CSV_DEBUG_ASSERT(this->rows.empty());
1528 CSV_DEBUG_ASSERT(this->keys_.empty());
1529 CSV_DEBUG_ASSERT(this->edits.empty());
1530 CSV_DEBUG_ASSERT(this->key_index.get() == nullptr);
1531 CSV_DEBUG_ASSERT(this->json_converter_.get() == nullptr);
1532 CSV_DEBUG_ASSERT(this->is_keyed == expected_is_keyed);
1533 }
1534
1536 void ensure_key_index() const {
1537 if (key_index) return;
1538
1539 key_index = std::unique_ptr<std::unordered_map<KeyType, size_t>>(
1540 new std::unordered_map<KeyType, size_t>()
1541 );
1542
1543 for (size_t i = 0; i < rows.size(); i++) {
1544 (*key_index)[keys_[i]] = i;
1545 }
1546 }
1547
1549 size_t position_of(const KeyType& key) const {
1550 this->ensure_key_index();
1551 auto it = key_index->find(key);
1552 return it == key_index->end() ? throw std::out_of_range(internals::ERROR_KEY_NOT_FOUND)
1553 : it->second;
1554 }
1555
1556 const KeyType* key_ptr_at(size_t row_index) const {
1557 return this->is_keyed ? &keys_.at(row_index) : nullptr;
1558 }
1559 };
1560
#if defined(CSV_HAS_CXX20)
    // Compile-time check, active only when CSV_HAS_CXX20 is defined, that
    // DataFrame<> satisfies internals::csv_write_rows_input_range.
    static_assert(
        internals::csv_write_rows_input_range<DataFrame<>>,
        "DataFrame must remain compatible with csv::DelimWriter::write_rows()."
    );
#endif
1567}
Data type representing individual CSV values.
Definition csv_row.hpp:114
DataType type() noexcept
Return the type of the underlying CSV data.
Definition csv_row.hpp:372
bool is_str() noexcept
Returns true if field is a non-numeric, non-empty string.
Definition csv_row.hpp:350
constexpr CSVField(csv::string_view _sv) noexcept
Constructs a CSVField from a string_view.
Definition csv_row.hpp:117
T get()
Returns the value cast to the requested type, performing type checking beforehand.
Definition csv_row.hpp:180
bool try_get(T &out) noexcept
Non-throwing equivalent of get().
Definition csv_row.hpp:225
bool is_null() noexcept
Returns true if field is an empty string or string of whitespace characters.
Definition csv_row.hpp:347
CONSTEXPR csv::string_view get_sv() const noexcept
Return a string view over the field's contents.
Definition csv_row.hpp:344
bool is_float() noexcept
Returns true if field is a floating point value.
Definition csv_row.hpp:363
bool is_num() noexcept
Returns true if field is an integer or float.
Definition csv_row.hpp:353
bool is_int() noexcept
Returns true if field is an integer.
Definition csv_row.hpp:358
Stores information about how to parse a CSV file.
static CSVFormat guess_csv()
CSVFormat preset for delimiter inference with header/n_cols inference enabled.
Main class for parsing CSVs from files and in-memory sources.
Data structure for representing CSV rows.
Definition csv_row.hpp:544
CONSTEXPR bool empty() const noexcept
Indicates whether row is empty or not.
Definition csv_row.hpp:562
const std::vector< std::string > & get_col_names() const
Retrieve this row's associated column names.
Definition csv_row.hpp:583
CONSTEXPR size_t size() const noexcept
Return the number of fields in this row.
Definition csv_row.hpp:565
internals::ConstColNamesPtr col_names_ptr() const noexcept
Internal accessor for preserving resolved column-name lookup policy across helper types.
Definition csv_row.hpp:588
T get() const
Const-friendly read access for proxy use in column iteration and callbacks.
Lightweight non-owning view over one DataFrame column.
csv::string_view get_sv(size_t row_index) const
Access a visible cell value as a string_view without materializing a DataFrameCell.
DataFrameCell operator[](size_t row_index) const
Access a visible cell value by row index.
std::vector< T > to_vector() const
Materialize this column as a vector of converted values.
size_t index() const noexcept
Zero-based column position.
size_t size() const noexcept
Number of rows in the parent batch.
const std::string & name() const
Column name.
auto to_sv_range() const
Convert this DataFrameColumn into a std::ranges::input_range of strings.
bool empty() const noexcept
Whether the parent batch contains no rows.
iterator begin() const
Iterate over visible cells in this column.
Persistent execution backend for batch-oriented DataFrame column work.
Allows configuration of DataFrame behavior.
DuplicateKeyPolicy
Policy for handling duplicate keys when creating a keyed DataFrame.
Proxy class that wraps a CSVRow and intercepts field access to check for edits.
DataFrameCell operator[](const std::string &col)
Access a field by column name, preserving edit support.
std::string to_json(const std::vector< std::string > &subset={}) const
Convert to JSON.
DataFrameRow(const CSVRow *_row, const DataFrame< KeyType > *_frame, size_t _row_index, const RowOverlay *_edits, const KeyType *_key)
Construct a read-only DataFrameRow wrapper.
auto to_sv_range() const
Convert this DataFrameRow into a std::ranges::input_range of strings, respecting the sparse overlay (...
DataFrameRow(const CSVRow *_row, DataFrame< KeyType > *_frame, size_t _row_index, RowOverlay *_edits, const KeyType *_key)
Construct a mutable DataFrameRow wrapper.
DataFrameCell operator[](size_t n)
Access a field by position, preserving edit support.
DataFrameRow()
Default constructor (creates an unbound proxy).
const std::vector< std::string > & get_col_names() const
Get column names.
DataFrameCell operator[](size_t n) const
Access a field by position, checking edits first.
DataFrameCell operator[](const std::string &col) const
Access a field by column name, checking edits first.
std::string to_json_array(const std::vector< std::string > &subset={}) const
Convert to JSON array.
size_t size() const
Get the number of fields in the row.
const KeyType & key() const
Get the key for this row (only valid for keyed DataFrames).
bool empty() const
Check if the row is empty.
const CSVRow & get_underlying_row() const
Get the underlying CSVRow for compatibility.
bool erase()
Delete this row from the parent DataFrame.
bool empty() const noexcept
Check if the DataFrame is empty (has no rows).
DataFrame(CSVReader &reader, const std::string &_key_column, DuplicateKeyPolicy policy=DuplicateKeyPolicy::OVERWRITE, bool throw_on_missing_key=true)
Construct a keyed DataFrame using a column name as the key.
internals::indexed_proxy_iterator< const DataFrame< KeyType >, DataFrameRow< KeyType >, const_row_accessor > const_iterator
Row-wise const iterator over DataFrameRow entries.
DataFrameRow< KeyType > operator[](size_t i)
Access a row by position (unchecked).
DataFrame(CSVReader &reader, const DataFrameOptions &options)
Construct a keyed DataFrame from a CSV reader with options.
bool has_column(const std::string &name) const
Check if a column exists in the DataFrame.
iterator end()
Get iterator past the last row.
DataFrameRow< KeyType > operator[](const KeyType &key)
Access a row by its key.
const_iterator cend() const
Get const iterator past the last row (explicit).
std::vector< T > column(const std::string &name) const
Extract all values from a column with type conversion.
size_t n_cols() const noexcept
Get the number of columns in the DataFrame.
internals::indexed_proxy_iterator< DataFrame< KeyType >, DataFrameRow< KeyType >, mutable_row_accessor > iterator
Row-wise iterator over DataFrameRow entries.
std::unordered_map< GroupKey, std::vector< size_t > > group_by(GroupFunc group_func) const
Group row positions using an arbitrary grouping function.
iterator begin()
Get iterator to the first row.
DataFrame(CSVReader &reader, KeyFunc key_func, DuplicateKeyPolicy policy=DuplicateKeyPolicy::OVERWRITE)
Construct a keyed DataFrame using a custom key function.
DataFrame(csv::string_view filename, const DataFrameOptions &options, CSVFormat format=CSVFormat::guess_csv())
Construct a keyed DataFrame directly from a CSV file.
void column_parallel_apply(DataFrameExecutor &executor, const std::vector< size_t > &column_indices, std::vector< State > &states, Fn &&fn) const
Apply a batch-oriented function to a selected subset of columns, potentially in parallel.
DataFrame(CSVReader &reader, KeyFunc key_func, const DataFrameOptions &options)
Construct a keyed DataFrame using a custom key function with options.
DataFrame selected_rows(const std::vector< std::uint8_t > &include_rows) const
Build an unkeyed DataFrame containing rows whose corresponding mask entry is true.
size_t n_rows() const noexcept
Get the number of rows in the DataFrame.
const std::vector< std::string > & columns() const noexcept
Get the column names in order.
const_iterator cbegin() const
Get const iterator to the first row (explicit).
void column_parallel_apply(DataFrameExecutor &executor, std::vector< State > &states, Fn &&fn) const
Apply a batch-oriented function to each column, potentially in parallel.
DataFrameRow< KeyType > operator[](size_t i) const
Access a row by position (unchecked, const version).
void column_parallel_apply(DataFrameExecutor &executor, const std::vector< size_t > &column_indices, Fn &&fn) const
Apply a read-only batch function to a selected subset of columns, potentially in parallel.
const_iterator begin() const
Get const iterator to the first row.
DataFrameRow< KeyType > at(size_t i) const
Access a row by position with bounds checking (const version).
DataFrameRow< KeyType > operator[](const KeyType &key) const
Access a row by its key (const version).
DataFrameColumn< KeyType > column_view(const std::string &name) const
Access a column view by name.
bool contains(const KeyType &key) const
Check if a key exists in the DataFrame.
const_iterator end() const
Get const iterator past the last row.
int index_of(const std::string &name) const
Get the index of a column by name.
DataFrameRow< KeyType > at(size_t i)
Access a row by position with bounds checking.
void column_parallel_apply(DataFrameExecutor &executor, Fn &&fn) const
Apply a read-only batch function to each column, potentially in parallel.
DataFrame(std::vector< CSVRow > rows)
Construct an unkeyed DataFrame from an existing batch of rows.
size_t size() const noexcept
Get the number of rows in the DataFrame.
std::unordered_map< std::string, std::vector< size_t > > group_by(const std::string &name) const
Group row positions by the value of a column.
DataFrameColumn< KeyType > column_view(size_t col_index) const
Access a column view by position.
DataFrame()=default
Construct an empty DataFrame.
DataFrame(CSVReader &reader)
Construct an unkeyed DataFrame from a CSV reader.
Shared exception message templates and throw helpers.
Defines functionality needed for basic CSV parsing.
Internal JSON serialization helpers for row-like CSV data.
The all encompassing namespace.
DataType
Enumerates the different CSV field types recognized by this library.
Definition data_type.hpp:14
CONSTEXPR_14 csv::string_view CSVField::get< csv::string_view >()
Retrieve a view over this field's string.
Definition csv_row.hpp:767
std::string CSVField::get< std::string >()
Retrieve this field's original string.
Definition csv_row.hpp:757
constexpr int CSV_NOT_FOUND
Integer indicating a requested column wasn't found.
Definition common.hpp:479
std::string_view string_view
The string_view class used by this library.
Definition common.hpp:174
bool try_get_view(size_t col_index, csv::string_view &out) const
Return a view into an edited cell without copying.