10#include <unordered_map>
22 static auto test(
int) ->
decltype(
23 std::hash<U>{}(std::declval<const U&>()),
28 static std::false_type test(...);
31 static constexpr bool value =
decltype(test<T>(0))::value;
38 static auto test(
int) ->
decltype(
39 std::declval<const U&>() == std::declval<const U&>(),
44 static std::false_type test(...);
47 static constexpr bool value =
decltype(test<T>(0))::value;
64 this->duplicate_key_policy = value;
69 return this->duplicate_key_policy;
72 DataFrameOptions& set_key_column(
const std::string& value) {
73 this->key_column = value;
77 const std::string& get_key_column()
const {
78 return this->key_column;
81 DataFrameOptions& set_throw_on_missing_key(
bool value) {
82 this->throw_on_missing_key = value;
86 bool get_throw_on_missing_key()
const {
87 return this->throw_on_missing_key;
91 std::string key_column;
96 bool throw_on_missing_key =
true;
103 template<
typename KeyType>
112 const std::unordered_map<size_t, std::string>* _edits,
114 ) : row(_row), row_edits(_edits), key_ptr(_key) {}
120 auto it = std::find(col_names.begin(), col_names.end(), col);
121 if (it == col_names.end()) {
126 size_t col_index = std::distance(col_names.begin(), it);
129 auto edit_it = row_edits->find(col_index);
130 if (edit_it != row_edits->end()) {
155 const KeyType&
get_key()
const {
return *key_ptr; }
158 operator std::vector<std::string>()
const {
159 std::vector<std::string> result;
160 result.reserve(row->
size());
162 for (
size_t i = 0; i < row->
size(); i++) {
165 auto it = row_edits->find(i);
166 if (it != row_edits->end()) {
167 result.push_back(it->second);
178 std::string
to_json(
const std::vector<std::string>& subset = {})
const {
183 std::string
to_json_array(
const std::vector<std::string>& subset = {})
const {
192 return std::views::iota(
size_t{0}, this->
size())
193 | std::views::transform([
this](
size_t i) {
196 auto it = row_edits->find(i);
197 if (it != row_edits->end()) {
209 const std::unordered_map<size_t, std::string>* row_edits;
210 const KeyType* key_ptr;
213 template<
typename KeyType = std::
string>
223 using difference_type = std::ptrdiff_t;
226 using iterator_category = std::random_access_iterator_tag;
230 typename std::vector<row_entry>::iterator it,
231 const std::unordered_map<KeyType, std::unordered_map<size_t, std::string>>* edits
232 ) : iter(it), edits_map(edits) {}
235 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
237 auto it = edits_map->find(iter->first);
238 if (it != edits_map->end()) {
239 row_edits = &it->second;
252 iterator& operator++() { ++iter;
return *
this; }
253 iterator operator++(
int) {
auto tmp = *
this; ++iter;
return tmp; }
254 iterator& operator--() { --iter;
return *
this; }
255 iterator operator--(
int) {
auto tmp = *
this; --iter;
return tmp; }
257 iterator operator+(difference_type n)
const {
return iterator(iter + n, edits_map); }
258 iterator operator-(difference_type n)
const {
return iterator(iter - n, edits_map); }
259 difference_type operator-(
const iterator& other)
const {
return iter - other.iter; }
261 bool operator==(
const iterator& other)
const {
return iter == other.iter; }
262 bool operator!=(
const iterator& other)
const {
return iter != other.iter; }
265 typename std::vector<row_entry>::iterator iter;
266 const std::unordered_map<KeyType, std::unordered_map<size_t, std::string>>* edits_map =
nullptr;
274 using difference_type = std::ptrdiff_t;
277 using iterator_category = std::random_access_iterator_tag;
281 typename std::vector<row_entry>::const_iterator it,
282 const std::unordered_map<KeyType, std::unordered_map<size_t, std::string>>* edits
283 ) : iter(it), edits_map(edits) {}
286 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
288 auto it = edits_map->find(iter->first);
289 if (it != edits_map->end()) {
290 row_edits = &it->second;
304 const_iterator operator++(
int) {
auto tmp = *
this; ++iter;
return tmp; }
306 const_iterator operator--(
int) {
auto tmp = *
this; --iter;
return tmp; }
310 difference_type operator-(
const const_iterator& other)
const {
return iter - other.iter; }
312 bool operator==(
const const_iterator& other)
const {
return iter == other.iter; }
313 bool operator!=(
const const_iterator& other)
const {
return iter != other.iter; }
316 typename std::vector<row_entry>::const_iterator iter;
317 const std::unordered_map<KeyType, std::unordered_map<size_t, std::string>>* edits_map =
nullptr;
323 "DataFrame<KeyType> requires KeyType to be hashable (std::hash<KeyType> specialization required)."
328 "DataFrame<KeyType> requires KeyType to be equality comparable (operator== required)."
332 std::is_default_constructible<KeyType>::value,
333 "DataFrame<KeyType> requires KeyType to be default-constructible."
346 this->init_unkeyed_from_reader(reader);
354 this->init_from_reader(reader, options);
367 this->init_from_reader(reader, options);
376 const std::string& _key_column,
378 bool throw_on_missing_key =
true
382 .set_key_column(_key_column)
383 .set_duplicate_key_policy(policy)
384 .set_throw_on_missing_key(throw_on_missing_key)
393 typename ResultType = invoke_result_t<KeyFunc, const CSVRow&>,
394 csv::enable_if_t<std::is_convertible<ResultType, KeyType>::value,
int> = 0
401 this->is_keyed =
true;
402 this->build_from_key_function(reader, key_func, policy);
408 typename ResultType = invoke_result_t<KeyFunc, const CSVRow&>,
409 csv::enable_if_t<std::is_convertible<ResultType, KeyType>::value,
int> = 0
415 ) :
DataFrame(reader, key_func, options.get_duplicate_key_policy()) {}
428 size_t n_rows() const noexcept {
return rows.size(); }
431 size_t n_cols() const noexcept {
return col_names_.size(); }
435 return std::find(col_names_.begin(), col_names_.end(), name) != col_names_.end();
440 auto it = std::find(col_names_.begin(), col_names_.end(), name);
441 if (it == col_names_.end())
443 return static_cast<int>(std::distance(col_names_.begin(), it));
447 const std::vector<std::string>&
columns() const noexcept {
465 template<
typename K = KeyType,
466 csv::enable_if_t<!std::is_integral<K>::value,
int> = 0>
468 static_assert(std::is_same<K, KeyType>::value,
469 "Do not explicitly instantiate this template. Use iloc() for positional access.");
470 return this->
iloc(i);
475 template<
typename K = KeyType,
476 csv::enable_if_t<!std::is_integral<K>::value,
int> = 0>
478 static_assert(std::is_same<K, KeyType>::value,
479 "Do not explicitly instantiate this template. Use iloc() for positional access.");
480 return this->
iloc(i);
489 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
491 auto it = edits.find(rows.at(i).first);
492 if (it != edits.end()) row_edits = &it->second;
499 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
501 auto it = edits.find(rows.at(i).first);
502 if (it != edits.end()) row_edits = &it->second;
514 this->require_keyed_frame();
515 auto position = this->position_of(key);
516 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
517 auto it = edits.find(key);
518 if (it != edits.end()) row_edits = &it->second;
524 this->require_keyed_frame();
525 auto position = this->position_of(key);
526 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
527 auto it = edits.find(key);
528 if (it != edits.end()) row_edits = &it->second;
538 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
540 auto it = edits.find(rows.at(i).first);
541 if (it != edits.end()) row_edits = &it->second;
548 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
550 auto it = edits.find(rows.at(i).first);
551 if (it != edits.end()) row_edits = &it->second;
558 if (i >= rows.size()) {
561 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
563 auto it = edits.find(rows[i].first);
564 if (it != edits.end()) row_edits = &it->second;
572 if (i >= rows.size()) {
575 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
577 auto it = edits.find(rows[i].first);
578 if (it != edits.end()) row_edits = &it->second;
591 this->require_keyed_frame();
592 return rows.at(i).first;
601 this->require_keyed_frame();
602 this->ensure_key_index();
603 return key_index->find(key) != key_index->end();
613 this->require_keyed_frame();
614 auto position = this->position_of(key);
615 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
616 auto it = edits.find(key);
617 if (it != edits.end()) row_edits = &it->second;
623 this->require_keyed_frame();
624 auto position = this->position_of(key);
625 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
626 auto it = edits.find(key);
627 if (it != edits.end()) row_edits = &it->second;
637 this->require_keyed_frame();
638 this->ensure_key_index();
639 auto it = key_index->find(key);
640 if (it == key_index->end()) {
643 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
644 auto edit_it = edits.find(key);
645 if (edit_it != edits.end()) row_edits = &edit_it->second;
652 this->require_keyed_frame();
653 this->ensure_key_index();
654 auto it = key_index->find(key);
655 if (it == key_index->end()) {
658 const std::unordered_map<size_t, std::string>* row_edits =
nullptr;
659 auto edit_it = edits.find(key);
660 if (edit_it != edits.end()) row_edits = &edit_it->second;
671 std::string
get(
const KeyType& key,
const std::string&
column)
const {
672 this->require_keyed_frame();
674 auto col_names = (*this)[key].get_col_names();
675 auto col_it = std::find(col_names.begin(), col_names.end(),
column);
676 if (col_it == col_names.end()) {
677 throw std::out_of_range(
"Column '" +
column +
"' not found");
679 size_t col_idx = std::distance(col_names.begin(), col_it);
681 auto row_edits = this->edits.find(key);
682 if (row_edits != this->edits.end()) {
683 auto value = row_edits->second.find(col_idx);
684 if (value != row_edits->second.end()) {
685 return value->second;
698 void set(
const KeyType& key,
const std::string&
column,
const std::string& value) {
699 this->require_keyed_frame();
700 size_t row_idx = this->position_of(key);
703 auto col_names = rows[row_idx].second.get_col_names();
704 auto it = std::find(col_names.begin(), col_names.end(),
column);
705 if (it == col_names.end()) {
706 throw std::out_of_range(
"Column '" +
column +
"' not found");
708 size_t col_idx = std::distance(col_names.begin(), it);
710 edits[key][col_idx] = value;
719 this->require_keyed_frame();
720 this->ensure_key_index();
722 auto it = key_index->find(key);
723 if (it == key_index->end()) {
727 rows.erase(rows.begin() + it->second);
729 this->invalidate_key_index();
735 if (i >= rows.size())
return false;
736 if (is_keyed) edits.erase(rows[i].first);
738 rows.erase(rows.begin() + i);
739 this->invalidate_key_index();
749 void set_at(
size_t i,
const std::string&
column,
const std::string& value) {
751 throw std::runtime_error(
"This DataFrame was created without a key column.");
753 if (i >= rows.size()) {
754 throw std::out_of_range(
"Row index out of bounds.");
758 auto col_names = rows[i].second.get_col_names();
759 auto it = std::find(col_names.begin(), col_names.end(),
column);
760 if (it == col_names.end()) {
761 throw std::out_of_range(
"Column '" +
column +
"' not found");
763 size_t col_idx = std::distance(col_names.begin(), it);
765 edits[rows[i].first][col_idx] = value;
775 template<
typename T = std::
string>
776 std::vector<T>
column(
const std::string& name)
const {
777 auto col_it = std::find(col_names_.begin(), col_names_.end(), name);
778 if (col_it == col_names_.end()) {
779 throw std::runtime_error(
"Column not found: " + name);
781 size_t col_idx = std::distance(col_names_.begin(), col_it);
783 std::vector<T> values;
784 values.reserve(rows.size());
786 for (
const auto& entry : rows) {
787 auto row_edits = this->edits.find(entry.first);
788 if (row_edits != this->edits.end()) {
789 auto value = row_edits->second.find(col_idx);
790 if (value != row_edits->second.end()) {
793 values.push_back(edited_field.template get<T>());
798 values.push_back(entry.second[name].template get<T>());
811 typename GroupKey = invoke_result_t<GroupFunc, const CSVRow&>,
818 std::unordered_map<GroupKey, std::vector<size_t>>
group_by(GroupFunc group_func)
const {
819 std::unordered_map<GroupKey, std::vector<size_t>> grouped;
821 for (
size_t i = 0; i < rows.size(); i++) {
822 GroupKey group_key = group_func(rows[i].second);
823 grouped[group_key].push_back(i);
834 std::unordered_map<std::string, std::vector<size_t>>
group_by(
835 const std::string& name,
836 bool use_edits =
true
838 auto col_it = std::find(col_names_.begin(), col_names_.end(), name);
839 if (col_it == col_names_.end()) {
840 throw std::runtime_error(
"Column not found: " + name);
842 size_t col_idx = std::distance(col_names_.begin(), col_it);
844 std::unordered_map<std::string, std::vector<size_t>> grouped;
846 for (
size_t i = 0; i < rows.size(); i++) {
847 std::string group_key;
848 bool has_group_key =
false;
851 auto row_edits = this->edits.find(rows[i].first);
852 if (row_edits != this->edits.end()) {
853 auto edited_value = row_edits->second.find(col_idx);
854 if (edited_value != row_edits->second.end()) {
855 group_key = edited_value->second;
856 has_group_key =
true;
861 if (!has_group_key) {
865 grouped[group_key].push_back(i);
891 std::string key_column;
894 bool is_keyed =
false;
897 std::vector<std::string> col_names_;
900 std::vector<row_entry> rows;
903 mutable std::unique_ptr<std::unordered_map<KeyType, size_t>> key_index;
909 std::unordered_map<KeyType, std::unordered_map<size_t, std::string>> edits;
912 void init_unkeyed_from_reader(
CSVReader& reader) {
914 for (
auto& row : reader) {
915 rows.push_back(
row_entry{KeyType(), row});
920 void init_from_reader(CSVReader& reader,
const DataFrameOptions& options) {
921 this->is_keyed =
true;
922 this->key_column = options.get_key_column();
923 this->col_names_ = reader.get_col_names();
925 if (key_column.empty()) {
926 throw std::runtime_error(
"Key column cannot be empty.");
929 if (std::find(col_names_.begin(), col_names_.end(), key_column) == col_names_.end()) {
930 throw std::runtime_error(
"Key column not found: " + key_column);
933 const bool throw_on_missing_key = options.get_throw_on_missing_key();
935 this->build_from_key_function(
937 [
this, throw_on_missing_key](
const CSVRow& row) -> KeyType {
939 return row[this->key_column].template get<KeyType>();
941 catch (
const std::exception& e) {
942 if (throw_on_missing_key) {
943 throw std::runtime_error(
"Error retrieving key column value: " + std::string(e.what()));
949 options.get_duplicate_key_policy()
954 template<
typename KeyFunc>
955 void build_from_key_function(
958 DuplicateKeyPolicy policy
960 std::unordered_map<KeyType, size_t> key_to_pos;
962 for (
auto& row : reader) {
963 KeyType key = key_func(row);
965 auto existing = key_to_pos.find(key);
966 if (existing != key_to_pos.end()) {
967 if (policy == DuplicateKeyPolicy::THROW) {
968 throw std::runtime_error(
"Duplicate key encountered.");
971 if (policy == DuplicateKeyPolicy::OVERWRITE) {
972 rows[existing->second].second = row;
979 key_to_pos[key] = rows.size() - 1;
984 void require_keyed_frame()
const {
986 throw std::runtime_error(
"This DataFrame was created without a key column.");
991 void invalidate_key_index() {
996 void ensure_key_index()
const {
1001 key_index = std::unique_ptr<std::unordered_map<KeyType, size_t>>(
1002 new std::unordered_map<KeyType, size_t>()
1005 for (
size_t i = 0; i < rows.size(); i++) {
1006 (*key_index)[rows[i].first] = i;
1011 size_t position_of(
const KeyType& key)
const {
1012 this->ensure_key_index();
1013 auto it = key_index->find(key);
1014 if (it == key_index->end()) {
1015 throw std::out_of_range(
"Key not found.");
Data type representing individual CSV values.
Main class for parsing CSVs from files and in-memory sources.
std::vector< std::string > get_col_names() const
Return the CSV's column names as a vector of strings.
Data structure for representing CSV rows.
std::string to_json(const std::vector< std::string > &subset={}) const
Convert a CSV row to a JSON object, i.e. `{"col1":"value1","col2":"value2"}`.
CONSTEXPR bool empty() const noexcept
Indicates whether row is empty or not.
std::string to_json_array(const std::vector< std::string > &subset={}) const
Convert a CSV row to a JSON array, i.e. `["value1","value2",...]`.
CONSTEXPR size_t size() const noexcept
Return the number of fields in this row.
std::vector< std::string > get_col_names() const
Retrieve this row's associated column names.
Allows configuration of DataFrame behavior.
DuplicateKeyPolicy
Policy for handling duplicate keys when creating a keyed DataFrame.
Proxy class that wraps a CSVRow and intercepts field access to check for edits.
std::string to_json(const std::vector< std::string > &subset={}) const
Convert to JSON.
std::vector< std::string > get_col_names() const
Get column names.
CSVField operator[](size_t n) const
Access a field by position (positional access never checks edits).
auto to_sv_range() const
Convert this DataFrameRow into a std::ranges::input_range of string_views, respecting the sparse overlay of edits.
CSVField operator[](const std::string &col) const
Access a field by column name, checking edits first.
DataFrameRow()
Default constructor (creates an unbound proxy).
std::string to_json_array(const std::vector< std::string > &subset={}) const
Convert to JSON array.
size_t size() const
Get the number of fields in the row.
bool empty() const
Check if the row is empty.
const CSVRow & get_underlying_row() const
Get the underlying CSVRow for compatibility.
const KeyType & get_key() const
Get the key for this row (only valid for keyed DataFrames).
DataFrameRow(const CSVRow *_row, const std::unordered_map< size_t, std::string > *_edits, const KeyType *_key)
Construct a DataFrameRow wrapper.
Row-wise const iterator over DataFrameRow entries.
Row-wise iterator over DataFrameRow entries.
std::unordered_map< std::string, std::vector< size_t > > group_by(const std::string &name, bool use_edits=true) const
Group row positions by the value of a column.
bool empty() const noexcept
Check if the DataFrame is empty (has no rows).
DataFrame(CSVReader &reader, const std::string &_key_column, DuplicateKeyPolicy policy=DuplicateKeyPolicy::OVERWRITE, bool throw_on_missing_key=true)
Construct a keyed DataFrame using a column name as the key.
DataFrameRow< KeyType > operator[](size_t i)
Access a row by position (unchecked).
bool erase_row_at(size_t i)
Remove a row by its position.
DataFrame(CSVReader &reader, const DataFrameOptions &options)
Construct a keyed DataFrame from a CSV reader with options.
bool has_column(const std::string &name) const
Check if a column exists in the DataFrame.
iterator end()
Get iterator past the last row.
DataFrameRow< KeyType > operator[](const KeyType &key)
Access a row by its key.
std::string get(const KeyType &key, const std::string &column) const
Get a cell value as a string, accounting for edits.
const_iterator cend() const
Get const iterator past the last row (explicit).
std::vector< T > column(const std::string &name) const
Extract all values from a column with type conversion.
size_t n_cols() const noexcept
Get the number of columns in the DataFrame.
const KeyType & key_at(size_t i) const
Get the key for a row at a given position.
std::unordered_map< GroupKey, std::vector< size_t > > group_by(GroupFunc group_func) const
Group row positions using an arbitrary grouping function.
iterator begin()
Get iterator to the first row.
DataFrame(CSVReader &reader, KeyFunc key_func, const DataFrameOptions &options)
Construct a keyed DataFrame using a custom key function with options.
bool erase_row(const KeyType &key)
Remove a row by its key.
DataFrameRow< KeyType > iloc(size_t i)
Access a row by position (iloc-style, pandas naming).
DataFrame(csv::string_view filename, const DataFrameOptions &options, CSVFormat format=CSVFormat::guess_csv())
Construct a keyed DataFrame directly from a CSV file.
void set_at(size_t i, const std::string &column, const std::string &value)
Set a cell value by position (stored in edit overlay).
size_t n_rows() const noexcept
Get the number of rows in the DataFrame.
const std::vector< std::string > & columns() const noexcept
Get the column names in order.
const_iterator cbegin() const
Get const iterator to the first row (explicit).
DataFrameRow< KeyType > at(const KeyType &key) const
Access a row by its key with bounds checking (const version).
DataFrameRow< KeyType > operator[](size_t i) const
Access a row by position (unchecked, const version).
bool try_get(size_t i, DataFrameRow< KeyType > &out)
Attempt to access a row by position without throwing.
DataFrameRow< KeyType > at(const KeyType &key)
Access a row by its key with bounds checking.
const_iterator begin() const
Get const iterator to the first row.
DataFrameRow< KeyType > at(size_t i) const
Access a row by position with bounds checking (const version).
DataFrameRow< KeyType > operator[](const KeyType &key) const
Access a row by its key (const version).
bool try_get(const KeyType &key, DataFrameRow< KeyType > &out)
Attempt to access a row by key without throwing.
DataFrameRow< KeyType > iloc(size_t i) const
Access a row by position (const version).
std::pair< KeyType, CSVRow > row_entry
Type alias for internal row storage: pair of key and CSVRow.
bool contains(const KeyType &key) const
Check if a key exists in the DataFrame.
const_iterator end() const
Get const iterator past the last row.
int index_of(const std::string &name) const
Get the index of a column by name.
DataFrameRow< KeyType > at(size_t i)
Access a row by position with bounds checking.
DataFrame(CSVReader &reader, KeyFunc key_func, DuplicateKeyPolicy policy=DuplicateKeyPolicy::OVERWRITE)
Construct a keyed DataFrame using a custom key function.
bool try_get(const KeyType &key, DataFrameRow< KeyType > &out) const
Attempt to access a row by key without throwing (const version).
void set(const KeyType &key, const std::string &column, const std::string &value)
Set a cell value (stored in edit overlay).
const std::string & key_name() const noexcept
Get the name of the key column (empty string if unkeyed).
size_t size() const noexcept
Get the number of rows in the DataFrame.
bool try_get(size_t i, DataFrameRow< KeyType > &out) const
Attempt to access a row by position without throwing (const version).
DataFrame()=default
Construct an empty DataFrame.
DataFrame(CSVReader &reader)
Construct an unkeyed DataFrame from a CSV reader.
Defines functionality needed for basic CSV parsing.
The all encompassing namespace.
CONSTEXPR_14 csv::string_view CSVField::get< csv::string_view >()
Retrieve a view over this field's string.
std::string CSVField::get< std::string >()
Retrieve this field's original string.
std::vector< std::string > get_col_names(csv::string_view filename, const CSVFormat &format=CSVFormat::guess_csv())
Get the column names of a CSV file using just the first 500KB.
constexpr int CSV_NOT_FOUND
Integer indicating a requested column wasn't found.
nonstd::string_view string_view
The string_view class used by this library.