17 reader(filename, format) {
23 reader(stream, format) {
29 std::vector<long double> ret;
30 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
31 ret.push_back(this->rolling_means[i]);
38 std::vector<long double> ret;
39 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
40 ret.push_back(this->rolling_vars[i]/(this->n[i] - 1));
47 std::vector<long double> ret;
48 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
49 ret.push_back(this->mins[i]);
56 std::vector<long double> ret;
57 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
58 ret.push_back(this->maxes[i]);
65 std::vector<FreqCount> ret;
66 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
67 ret.push_back(this->counts[i]);
74 std::vector<TypeCount> ret;
75 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
76 ret.push_back(this->dtypes[i]);
85 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
88 rolling_means.push_back(0);
89 rolling_vars.push_back(0);
98 std::vector<std::thread> pool;
99 for (
size_t i = 0; i < this->get_col_names().size(); i++)
100 pool.push_back(std::thread(&CSVStat::calc_worker,
this, i));
103 for (
auto& th : pool)
106 for (
size_t i = 0; i < this->get_col_names().size(); i++) {
107 this->calc_worker(i);
111 this->records.clear();
115 constexpr size_t CALC_CHUNK_SIZE = 5000;
117 for (
auto& row : reader) {
118 this->records.push_back(std::move(row));
121 if (this->records.size() == CALC_CHUNK_SIZE) {
126 if (!this->records.empty()) {
// Per-column worker: walks the buffered rows in this->records and updates the
// statistics kept for column index i.
// NOTE(review): this is a listing excerpt -- the leading numbers are original
// source line numbers and several lines (closing braces, the iterator advance)
// are not shown here.
//
// @param i  Index of the column to process; used to index records, counts, etc.
// @throws std::runtime_error when a row's length differs from the column count
//         and the reader's variable-column policy is THROW.
131 CSV_INLINE void CSVStat::calc_worker(
const size_t &i) {
134 auto current_record = this->records.begin();
// `processed` counts loop iterations; the loop ends when the iterator reaches
// records.end().
136 for (
size_t processed = 0; current_record != this->records.end(); processed++) {
// Only rows whose field count matches the number of column names are analysed.
137 if (current_record->size() == this->get_col_names().size()) {
138 auto current_field = (*current_record)[i];
// Frequency counting is limited: it runs for the first 1000 iterations, or
// for as long as this column has at most 500 distinct tracked values.
141 if (processed < 1000 || this->counts[i].size() <= 500)
142 this->count(current_field, i);
// Record the field's data type for this column.
144 this->dtype(current_field, i);
// Numeric fields additionally feed the running variance and min/max trackers.
147 if (current_field.is_num()) {
148 long double x_n = current_field.get<
long double>();
151 this->variance(x_n, i);
152 this->min_max(x_n, i);
// NOTE(review): presumably this `else if` pairs with the row-length check
// above (the intervening closing braces are elided) -- confirm against the
// full source.
155 else if (this->reader.
get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) {
156 throw std::runtime_error(
"Line has different length than the others " + std::string(current_record->raw_str()));
// Tallies the data type of one field in the per-column type counter.
//
// @param data  Field whose type() is recorded.
// @param i     Column index selecting which entry of this->dtypes to update.
//
// NOTE(review): lines between the visible statements are elided in this
// listing; the increment and the insert are presumably the two arms of an
// if/else -- confirm against the full source.
163 CSV_INLINE void CSVStat::dtype(CSVField& data,
const size_t &i) {
165 auto type = data.type();
// Is this type already tracked for column i?
166 if (this->dtypes[i].find(type) !=
167 this->dtypes[i].end()) {
// Already tracked: bump its count.
169 this->dtypes[i][type]++;
// Not tracked yet: start its count at 1.
172 this->dtypes[i].insert(std::make_pair(type, 1));
// Tallies the string value of one field in the per-column frequency counter.
//
// @param data  Field whose value is read as std::string and counted.
// @param i     Column index selecting which entry of this->counts to update.
//
// NOTE(review): lines between the visible statements are elided in this
// listing; the increment and the insert are presumably the two arms of an
// if/else -- confirm against the full source.
176 CSV_INLINE void CSVStat::count(CSVField& data,
const size_t &i) {
178 auto item = data.get<std::string>();
// Is this value already tracked for column i?
180 if (this->counts[i].find(item) !=
181 this->counts[i].end()) {
// Already tracked: bump its count.
183 this->counts[i][item]++;
// Not tracked yet: start its count at 1.
186 this->counts[i].insert(std::make_pair(item, 1));
// Updates the running minimum and maximum of column i with a new value.
//
// @param x_n  Newly observed numeric value.
// @param i    Column index into this->mins and this->maxes.
//
// NOTE(review): the statements following the two mins[i] tests (original
// lines 193 and 198) are elided in this listing; presumably they assign
// mins[i] = x_n, mirroring the maxes[i] handling -- confirm.
190 CSV_INLINE void CSVStat::min_max(
const long double &x_n,
const size_t &i) {
// A NaN entry is treated as "no value yet" and is seeded from x_n.
192 if (std::isnan(this->mins[i]))
194 if (std::isnan(this->maxes[i]))
195 this->maxes[i] = x_n;
// Otherwise compare against the current extremes.
197 if (x_n < this->mins[i])
199 else if (x_n > this->maxes[i])
200 this->maxes[i] = x_n;
// Incrementally updates the running mean and the running sum of squared
// deviations for column i with a new value.
//
// @param x_n  Newly observed numeric value.
// @param i    Column index into rolling_means, rolling_vars and n.
//
// NOTE(review): original lines 208-212 and 215 are elided in this listing --
// presumably they declare delta/delta2, increment current_n, and provide the
// `else` that separates the first-observation case from the general update.
// The visible update appears to follow Welford's online algorithm -- confirm.
203 CSV_INLINE void CSVStat::variance(
const long double &x_n,
const size_t &i) {
// References, so the assignments below write straight into the member vectors.
205 long double& current_rolling_mean = this->rolling_means[i];
206 long double& current_rolling_var = this->rolling_vars[i];
207 long double& current_n = this->n[i];
// First observation: the mean is the value itself.
213 if (current_n == 1) {
214 current_rolling_mean = x_n;
// General case: delta uses the old mean, delta2 uses the updated mean; their
// product is accumulated into the rolling variance term.
216 delta = x_n - current_rolling_mean;
217 current_rolling_mean += delta/current_n;
218 delta2 = x_n - current_rolling_mean;
219 current_rolling_var += delta*delta2;
230 std::unordered_map<std::string, DataType> csv_dtypes;
232 auto col_names = stat.get_col_names();
235 for (
size_t i = 0; i < stat.get_col_names().size(); i++) {
237 auto& col_name = col_names[i];
CSVFormat get_format() const
Return the format of the original raw CSV.
Class for calculating statistics from CSV files and in-memory sources.
std::vector< long double > get_mean() const
Return current means.
std::vector< long double > get_variance() const
Return current variances.
CSVStat(csv::string_view filename, CSVFormat format=CSVFormat::guess_csv())
Calculate statistics for an arbitrarily large file.
std::vector< long double > get_mins() const
Return current mins.
std::vector< TypeCount > get_dtypes() const
Get data type counts for each column.
std::vector< long double > get_maxes() const
Return current maxes.
std::vector< FreqCount > get_counts() const
Get counts for each column.
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Calculates statistics from CSV files.
The all-encompassing namespace.
@ CSV_INT64
64-bit integer (long long on MSVC/GCC)
@ CSV_DOUBLE
Floating point value.
@ CSV_INT16
16-bit integer (short on MSVC/GCC)
@ CSV_INT32
32-bit integer (int on MSVC/GCC)
@ CSV_STRING
Non-numeric string.
std::unordered_map< std::string, DataType > csv_data_types(const std::string &filename)
Useful for uploading CSV files to SQL databases.
nonstd::string_view string_view
The string_view class used by this library.