10#include <unordered_map>
22 static auto test(
int) ->
decltype(
23 std::hash<U>{}(std::declval<const U&>()),
28 static std::false_type test(...);
31 static constexpr bool value =
decltype(
test<T>(0))::value;
38 static auto test(
int) ->
decltype(
39 std::declval<const U&>() == std::declval<const U&>(),
44 static std::false_type test(...);
47 static constexpr bool value =
decltype(
test<T>(0))::value;
64 this->duplicate_key_policy = value;
69 return this->duplicate_key_policy;
72 DataFrameOptions& set_key_column(
const std::string& value) {
73 this->key_column = value;
77 const std::string& get_key_column()
const {
78 return this->key_column;
81 DataFrameOptions& set_throw_on_missing_key(
bool value) {
82 this->throw_on_missing_key = value;
86 bool get_throw_on_missing_key()
const {
87 return this->throw_on_missing_key;
91 std::string key_column;
96 bool throw_on_missing_key =
true;
103 template<
typename KeyType>
112 const std::unordered_map<std::string, std::string>* _edits,
114 ) : row(_row), row_edits(_edits), key_ptr(_key) {}
124 auto it = row_edits->find(col);
125 if (it != row_edits->end()) {
150 const KeyType&
get_key()
const {
return *key_ptr; }
153 operator std::vector<std::string>()
const {
154 std::vector<std::string> result;
155 result.reserve(row->
size());
158 for (
size_t i = 0; i < row->
size(); i++) {
160 if (row_edits && i < col_names.size()) {
161 auto it = row_edits->find(col_names[i]);
162 if (it != row_edits->end()) {
163 result.push_back(it->second);
174 std::string
to_json(
const std::vector<std::string>& subset = {})
const {
179 std::string
to_json_array(
const std::vector<std::string>& subset = {})
const {
185 const std::unordered_map<std::string, std::string>* row_edits;
186 const KeyType* key_ptr;
189 template<
typename KeyType = std::
string>
199 using difference_type = std::ptrdiff_t;
202 using iterator_category = std::random_access_iterator_tag;
206 typename std::vector<row_entry>::iterator it,
207 const std::unordered_map<KeyType, std::unordered_map<std::string, std::string>>* edits
208 ) : iter(it), edits_map(edits) {}
211 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
213 auto it = edits_map->find(iter->first);
214 if (it != edits_map->end()) {
215 row_edits = &it->second;
228 iterator& operator++() { ++iter;
return *
this; }
229 iterator operator++(
int) {
auto tmp = *
this; ++iter;
return tmp; }
230 iterator& operator--() { --iter;
return *
this; }
231 iterator operator--(
int) {
auto tmp = *
this; --iter;
return tmp; }
233 iterator operator+(difference_type n)
const {
return iterator(iter + n, edits_map); }
234 iterator operator-(difference_type n)
const {
return iterator(iter - n, edits_map); }
235 difference_type operator-(
const iterator& other)
const {
return iter - other.iter; }
237 bool operator==(
const iterator& other)
const {
return iter == other.iter; }
238 bool operator!=(
const iterator& other)
const {
return iter != other.iter; }
241 typename std::vector<row_entry>::iterator iter;
242 const std::unordered_map<KeyType, std::unordered_map<std::string, std::string>>* edits_map =
nullptr;
250 using difference_type = std::ptrdiff_t;
253 using iterator_category = std::random_access_iterator_tag;
257 typename std::vector<row_entry>::const_iterator it,
258 const std::unordered_map<KeyType, std::unordered_map<std::string, std::string>>* edits
259 ) : iter(it), edits_map(edits) {}
262 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
264 auto it = edits_map->find(iter->first);
265 if (it != edits_map->end()) {
266 row_edits = &it->second;
280 const_iterator operator++(
int) {
auto tmp = *
this; ++iter;
return tmp; }
282 const_iterator operator--(
int) {
auto tmp = *
this; --iter;
return tmp; }
286 difference_type operator-(
const const_iterator& other)
const {
return iter - other.iter; }
288 bool operator==(
const const_iterator& other)
const {
return iter == other.iter; }
289 bool operator!=(
const const_iterator& other)
const {
return iter != other.iter; }
292 typename std::vector<row_entry>::const_iterator iter;
293 const std::unordered_map<KeyType, std::unordered_map<std::string, std::string>>* edits_map =
nullptr;
299 "DataFrame<KeyType> requires KeyType to be hashable (std::hash<KeyType> specialization required)."
304 "DataFrame<KeyType> requires KeyType to be equality comparable (operator== required)."
308 std::is_default_constructible<KeyType>::value,
309 "DataFrame<KeyType> requires KeyType to be default-constructible."
322 this->init_unkeyed_from_reader(reader);
333 this->init_from_reader(reader, options);
350 this->init_from_reader(reader, options);
364 const std::string& _key_column,
366 bool throw_on_missing_key =
true
370 .set_key_column(_key_column)
371 .set_duplicate_key_policy(policy)
372 .set_throw_on_missing_key(throw_on_missing_key)
385 typename ResultType = invoke_result_t<KeyFunc, const CSVRow&>,
386 csv::enable_if_t<std::is_convertible<ResultType, KeyType>::value,
int> = 0
393 this->is_keyed =
true;
394 this->build_from_key_function(reader, key_func, policy);
406 typename ResultType = invoke_result_t<KeyFunc, const CSVRow&>,
407 csv::enable_if_t<std::is_convertible<ResultType, KeyType>::value,
int> = 0
413 ) :
DataFrame(reader, key_func, options.get_duplicate_key_policy()) {}
426 size_t n_rows() const noexcept {
return rows.size(); }
429 size_t n_cols() const noexcept {
return col_names.size(); }
438 return std::find(col_names.begin(), col_names.end(), name) != col_names.end();
448 auto it = std::find(col_names.begin(), col_names.end(), name);
449 if (it == col_names.end())
451 return static_cast<int>(std::distance(col_names.begin(), it));
455 const std::vector<std::string>&
columns() const noexcept {
475 template<
typename K = KeyType,
476 csv::enable_if_t<!std::is_integral<K>::value,
int> = 0>
478 static_assert(std::is_same<K, KeyType>::value,
479 "Do not explicitly instantiate this template. Use iloc() for positional access.");
480 return this->
iloc(i);
485 template<
typename K = KeyType,
486 csv::enable_if_t<!std::is_integral<K>::value,
int> = 0>
488 static_assert(std::is_same<K, KeyType>::value,
489 "Do not explicitly instantiate this template. Use iloc() for positional access.");
490 return this->
iloc(i);
501 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
503 auto it = edits.find(rows.at(i).first);
504 if (it != edits.end()) row_edits = &it->second;
511 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
513 auto it = edits.find(rows.at(i).first);
514 if (it != edits.end()) row_edits = &it->second;
528 this->require_keyed_frame();
529 auto position = this->position_of(key);
530 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
531 auto it = edits.find(key);
532 if (it != edits.end()) row_edits = &it->second;
538 this->require_keyed_frame();
539 auto position = this->position_of(key);
540 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
541 auto it = edits.find(key);
542 if (it != edits.end()) row_edits = &it->second;
554 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
556 auto it = edits.find(rows.at(i).first);
557 if (it != edits.end()) row_edits = &it->second;
564 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
566 auto it = edits.find(rows.at(i).first);
567 if (it != edits.end()) row_edits = &it->second;
580 if (i >= rows.size()) {
583 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
585 auto it = edits.find(rows[i].first);
586 if (it != edits.end()) row_edits = &it->second;
594 if (i >= rows.size()) {
597 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
599 auto it = edits.find(rows[i].first);
600 if (it != edits.end()) row_edits = &it->second;
615 this->require_keyed_frame();
616 return rows.at(i).first;
627 this->require_keyed_frame();
628 this->ensure_key_index();
629 return key_index->find(key) != key_index->end();
641 this->require_keyed_frame();
642 auto position = this->position_of(key);
643 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
644 auto it = edits.find(key);
645 if (it != edits.end()) row_edits = &it->second;
651 this->require_keyed_frame();
652 auto position = this->position_of(key);
653 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
654 auto it = edits.find(key);
655 if (it != edits.end()) row_edits = &it->second;
668 this->require_keyed_frame();
669 this->ensure_key_index();
670 auto it = key_index->find(key);
671 if (it == key_index->end()) {
674 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
675 auto edit_it = edits.find(key);
676 if (edit_it != edits.end()) row_edits = &edit_it->second;
683 this->require_keyed_frame();
684 this->ensure_key_index();
685 auto it = key_index->find(key);
686 if (it == key_index->end()) {
689 const std::unordered_map<std::string, std::string>* row_edits =
nullptr;
690 auto edit_it = edits.find(key);
691 if (edit_it != edits.end()) row_edits = &edit_it->second;
705 std::string
get(
const KeyType& key,
const std::string&
column)
const {
706 this->require_keyed_frame();
708 auto row_edits = this->edits.find(key);
709 if (row_edits != this->edits.end()) {
710 auto value = row_edits->second.find(
column);
711 if (value != row_edits->second.end()) {
712 return value->second;
728 void set(
const KeyType& key,
const std::string&
column,
const std::string& value) {
729 this->require_keyed_frame();
730 (void)this->position_of(key);
731 edits[key][
column] = value;
742 this->require_keyed_frame();
743 this->ensure_key_index();
745 auto it = key_index->find(key);
746 if (it == key_index->end()) {
750 rows.erase(rows.begin() + it->second);
752 this->invalidate_key_index();
763 if (i >= rows.size())
return false;
764 if (is_keyed) edits.erase(rows[i].first);
766 rows.erase(rows.begin() + i);
767 this->invalidate_key_index();
780 void set_at(
size_t i,
const std::string&
column,
const std::string& value) {
782 throw std::runtime_error(
"This DataFrame was created without a key column.");
784 if (i >= rows.size()) {
785 throw std::out_of_range(
"Row index out of bounds.");
787 edits[rows[i].first][
column] = value;
799 template<
typename T = std::
string>
800 std::vector<T>
column(
const std::string& name)
const {
801 if (std::find(col_names.begin(), col_names.end(), name) == col_names.end()) {
802 throw std::runtime_error(
"Column not found: " + name);
805 std::vector<T> values;
806 values.reserve(rows.size());
808 for (
const auto& entry : rows) {
809 auto row_edits = this->edits.find(entry.first);
810 if (row_edits != this->edits.end()) {
811 auto value = row_edits->second.find(name);
812 if (value != row_edits->second.end()) {
815 values.push_back(edited_field.template get<T>());
820 values.push_back(entry.second[name].template get<T>());
835 typename GroupKey = invoke_result_t<GroupFunc, const CSVRow&>,
842 std::unordered_map<GroupKey, std::vector<size_t>>
group_by(GroupFunc group_func)
const {
843 std::unordered_map<GroupKey, std::vector<size_t>> grouped;
845 for (
size_t i = 0; i < rows.size(); i++) {
846 GroupKey group_key = group_func(rows[i].second);
847 grouped[group_key].push_back(i);
861 std::unordered_map<std::string, std::vector<size_t>>
group_by(
862 const std::string& name,
863 bool use_edits =
true
865 if (std::find(col_names.begin(), col_names.end(), name) == col_names.end()) {
866 throw std::runtime_error(
"Column not found: " + name);
869 std::unordered_map<std::string, std::vector<size_t>> grouped;
871 for (
size_t i = 0; i < rows.size(); i++) {
872 std::string group_key;
873 bool has_group_key =
false;
876 auto row_edits = this->edits.find(rows[i].first);
877 if (row_edits != this->edits.end()) {
878 auto edited_value = row_edits->second.find(name);
879 if (edited_value != row_edits->second.end()) {
880 group_key = edited_value->second;
881 has_group_key =
true;
886 if (!has_group_key) {
890 grouped[group_key].push_back(i);
916 std::string key_column;
919 bool is_keyed =
false;
922 std::vector<std::string> col_names;
925 std::vector<row_entry> rows;
928 mutable std::unique_ptr<std::unordered_map<KeyType, size_t>> key_index;
934 std::unordered_map<KeyType, std::unordered_map<std::string, std::string>> edits;
937 void init_unkeyed_from_reader(
CSVReader& reader) {
939 for (
auto& row : reader) {
940 rows.push_back(
row_entry{KeyType(), row});
945 void init_from_reader(CSVReader& reader,
const DataFrameOptions& options) {
946 this->is_keyed =
true;
947 this->key_column = options.get_key_column();
948 this->col_names = reader.get_col_names();
950 if (key_column.empty()) {
951 throw std::runtime_error(
"Key column cannot be empty.");
954 if (std::find(col_names.begin(), col_names.end(), key_column) == col_names.end()) {
955 throw std::runtime_error(
"Key column not found: " + key_column);
958 const bool throw_on_missing_key = options.get_throw_on_missing_key();
960 this->build_from_key_function(
962 [
this, throw_on_missing_key](
const CSVRow& row) -> KeyType {
964 return row[this->key_column].template get<KeyType>();
966 catch (
const std::exception& e) {
967 if (throw_on_missing_key) {
968 throw std::runtime_error(
"Error retrieving key column value: " + std::string(e.what()));
974 options.get_duplicate_key_policy()
979 template<
typename KeyFunc>
980 void build_from_key_function(
983 DuplicateKeyPolicy policy
985 std::unordered_map<KeyType, size_t> key_to_pos;
987 for (
auto& row : reader) {
988 KeyType key = key_func(row);
990 auto existing = key_to_pos.find(key);
991 if (existing != key_to_pos.end()) {
992 if (policy == DuplicateKeyPolicy::THROW) {
993 throw std::runtime_error(
"Duplicate key encountered.");
996 if (policy == DuplicateKeyPolicy::OVERWRITE) {
997 rows[existing->second].second = row;
1004 key_to_pos[key] = rows.size() - 1;
1009 void require_keyed_frame()
const {
1011 throw std::runtime_error(
"This DataFrame was created without a key column.");
1016 void invalidate_key_index() {
1021 void ensure_key_index()
const {
1026 key_index = std::unique_ptr<std::unordered_map<KeyType, size_t>>(
1027 new std::unordered_map<KeyType, size_t>()
1030 for (
size_t i = 0; i < rows.size(); i++) {
1031 (*key_index)[rows[i].first] = i;
1036 size_t position_of(
const KeyType& key)
const {
1037 this->ensure_key_index();
1038 auto it = key_index->find(key);
1039 if (it == key_index->end()) {
1040 throw std::out_of_range(
"Key not found.");
Data type representing individual CSV values.
Main class for parsing CSVs from files and in-memory sources.
std::vector< std::string > get_col_names() const
Return the CSV's column names as a vector of strings.
Data structure for representing CSV rows.
std::string to_json(const std::vector< std::string > &subset={}) const
Convert a CSV row to a JSON object, i.e. `{"col1":"value1","col2":"value2"}`.
CONSTEXPR bool empty() const noexcept
Indicates whether row is empty or not.
std::string to_json_array(const std::vector< std::string > &subset={}) const
Convert a CSV row to a JSON array, i.e. `["value1","value2",...]`.
CONSTEXPR size_t size() const noexcept
Return the number of fields in this row.
std::vector< std::string > get_col_names() const
Retrieve this row's associated column names.
Allows configuration of DataFrame behavior.
DuplicateKeyPolicy
Policy for handling duplicate keys when creating a keyed DataFrame.
Proxy class that wraps a CSVRow and intercepts field access to check for edits.
std::string to_json(const std::vector< std::string > &subset={}) const
Convert to JSON.
std::vector< std::string > get_col_names() const
Get column names.
CSVField operator[](size_t n) const
Access a field by position (positional access never checks edits).
DataFrameRow(const CSVRow *_row, const std::unordered_map< std::string, std::string > *_edits, const KeyType *_key)
Construct a DataFrameRow wrapper.
CSVField operator[](const std::string &col) const
Access a field by column name, checking edits first.
DataFrameRow()
Default constructor (creates an unbound proxy).
std::string to_json_array(const std::vector< std::string > &subset={}) const
Convert to JSON array.
size_t size() const
Get the number of fields in the row.
bool empty() const
Check if the row is empty.
const CSVRow & get_underlying_row() const
Get the underlying CSVRow for compatibility.
const KeyType & get_key() const
Get the key for this row (only valid for keyed DataFrames).
Row-wise const iterator over DataFrameRow entries.
Row-wise iterator over DataFrameRow entries.
std::unordered_map< std::string, std::vector< size_t > > group_by(const std::string &name, bool use_edits=true) const
Group row positions by the value of a column.
bool empty() const noexcept
Check if the DataFrame is empty (has no rows).
DataFrame(CSVReader &reader, const std::string &_key_column, DuplicateKeyPolicy policy=DuplicateKeyPolicy::OVERWRITE, bool throw_on_missing_key=true)
Construct a keyed DataFrame using a column name as the key.
DataFrameRow< KeyType > operator[](size_t i)
Access a row by position (unchecked).
bool erase_row_at(size_t i)
Remove a row by its position.
DataFrame(CSVReader &reader, const DataFrameOptions &options)
Construct a keyed DataFrame from a CSV reader with options.
bool has_column(const std::string &name) const
Check if a column exists in the DataFrame.
iterator end()
Get iterator past the last row.
DataFrameRow< KeyType > operator[](const KeyType &key)
Access a row by its key.
std::string get(const KeyType &key, const std::string &column) const
Get a cell value as a string, accounting for edits.
const_iterator cend() const
Get const iterator past the last row (explicit).
std::vector< T > column(const std::string &name) const
Extract all values from a column with type conversion.
size_t n_cols() const noexcept
Get the number of columns in the DataFrame.
const KeyType & key_at(size_t i) const
Get the key for a row at a given position.
std::unordered_map< GroupKey, std::vector< size_t > > group_by(GroupFunc group_func) const
Group row positions using an arbitrary grouping function.
iterator begin()
Get iterator to the first row.
DataFrame(CSVReader &reader, KeyFunc key_func, const DataFrameOptions &options)
Construct a keyed DataFrame using a custom key function with options.
bool erase_row(const KeyType &key)
Remove a row by its key.
DataFrameRow< KeyType > iloc(size_t i)
Access a row by position (iloc-style, pandas naming).
DataFrame(csv::string_view filename, const DataFrameOptions &options, CSVFormat format=CSVFormat::guess_csv())
Construct a keyed DataFrame directly from a CSV file.
void set_at(size_t i, const std::string &column, const std::string &value)
Set a cell value by position (stored in edit overlay).
size_t n_rows() const noexcept
Get the number of rows in the DataFrame.
const std::vector< std::string > & columns() const noexcept
Get the column names in order.
const_iterator cbegin() const
Get const iterator to the first row (explicit).
DataFrameRow< KeyType > at(const KeyType &key) const
Access a row by its key with bounds checking (const version).
DataFrameRow< KeyType > operator[](size_t i) const
Access a row by position (unchecked, const version).
bool try_get(size_t i, DataFrameRow< KeyType > &out)
Attempt to access a row by position without throwing.
DataFrameRow< KeyType > at(const KeyType &key)
Access a row by its key with bounds checking.
const_iterator begin() const
Get const iterator to the first row.
DataFrameRow< KeyType > at(size_t i) const
Access a row by position with bounds checking (const version).
DataFrameRow< KeyType > operator[](const KeyType &key) const
Access a row by its key (const version).
bool try_get(const KeyType &key, DataFrameRow< KeyType > &out)
Attempt to access a row by key without throwing.
DataFrameRow< KeyType > iloc(size_t i) const
Access a row by position (const version).
std::pair< KeyType, CSVRow > row_entry
Type alias for internal row storage: pair of key and CSVRow.
bool contains(const KeyType &key) const
Check if a key exists in the DataFrame.
const_iterator end() const
Get const iterator past the last row.
int index_of(const std::string &name) const
Get the index of a column by name.
DataFrameRow< KeyType > at(size_t i)
Access a row by position with bounds checking.
DataFrame(CSVReader &reader, KeyFunc key_func, DuplicateKeyPolicy policy=DuplicateKeyPolicy::OVERWRITE)
Construct a keyed DataFrame using a custom key function.
bool try_get(const KeyType &key, DataFrameRow< KeyType > &out) const
Attempt to access a row by key without throwing (const version).
void set(const KeyType &key, const std::string &column, const std::string &value)
Set a cell value (stored in edit overlay).
const std::string & key_name() const noexcept
Get the name of the key column (empty string if unkeyed).
size_t size() const noexcept
Get the number of rows in the DataFrame.
bool try_get(size_t i, DataFrameRow< KeyType > &out) const
Attempt to access a row by position without throwing (const version).
DataFrame()=default
Construct an empty DataFrame.
DataFrame(CSVReader &reader)
Construct an unkeyed DataFrame from a CSV reader.
Defines functionality needed for basic CSV parsing.
CSV_CONST CONSTEXPR_17 OutArray arrayToDefault(T &&value)
Helper constexpr function to initialize an array with all the elements set to value.
The all encompassing namespace.
std::vector< std::string > get_col_names(csv::string_view filename, CSVFormat format)
Return a CSV's column names.
std::string CSVField::get< std::string >()
Retrieve this field's original string.
constexpr int CSV_NOT_FOUND
Integer indicating a requested column wasn't found.
nonstd::string_view string_view
The string_view class used by this library.