Vince's CSV Parser
Loading...
Searching...
No Matches
csv_stat.cpp
Go to the documentation of this file.
1
#include <exception>
#include <string>
#include "csv_stat.hpp"
7
8namespace csv {
14 reader(filename, format) {
15 this->calc();
16 }
17
19 CSV_INLINE CSVStat::CSVStat(std::stringstream& stream, CSVFormat format) :
20 reader(stream, format) {
21 this->calc();
22 }
23
25 CSV_INLINE std::vector<long double> CSVStat::get_mean() const {
26 std::vector<long double> ret;
27 for (size_t i = 0; i < this->get_col_names().size(); i++) {
28 ret.push_back(this->rolling_means[i]);
29 }
30 return ret;
31 }
32
34 CSV_INLINE std::vector<long double> CSVStat::get_variance() const {
35 std::vector<long double> ret;
36 for (size_t i = 0; i < this->get_col_names().size(); i++) {
37 ret.push_back(this->rolling_vars[i]/(this->n[i] - 1));
38 }
39 return ret;
40 }
41
43 CSV_INLINE std::vector<long double> CSVStat::get_mins() const {
44 std::vector<long double> ret;
45 for (size_t i = 0; i < this->get_col_names().size(); i++) {
46 ret.push_back(this->mins[i]);
47 }
48 return ret;
49 }
50
52 CSV_INLINE std::vector<long double> CSVStat::get_maxes() const {
53 std::vector<long double> ret;
54 for (size_t i = 0; i < this->get_col_names().size(); i++) {
55 ret.push_back(this->maxes[i]);
56 }
57 return ret;
58 }
59
61 CSV_INLINE std::vector<CSVStat::FreqCount> CSVStat::get_counts() const {
62 std::vector<FreqCount> ret;
63 for (size_t i = 0; i < this->get_col_names().size(); i++) {
64 ret.push_back(this->counts[i]);
65 }
66 return ret;
67 }
68
70 CSV_INLINE std::vector<CSVStat::TypeCount> CSVStat::get_dtypes() const {
71 std::vector<TypeCount> ret;
72 for (size_t i = 0; i < this->get_col_names().size(); i++) {
73 ret.push_back(this->dtypes[i]);
74 }
75 return ret;
76 }
77
78 CSV_INLINE void CSVStat::calc_chunk() {
80 if (dtypes.empty()) {
82 for (size_t i = 0; i < this->get_col_names().size(); i++) {
83 dtypes.push_back({});
84 counts.push_back({});
85 rolling_means.push_back(0);
86 rolling_vars.push_back(0);
87 mins.push_back(NAN);
88 maxes.push_back(NAN);
89 n.push_back(0);
90 }
91 }
92
93 // Start threads
94 std::vector<std::thread> pool;
95 for (size_t i = 0; i < this->get_col_names().size(); i++)
96 pool.push_back(std::thread(&CSVStat::calc_worker, this, i));
97
98 // Block until done
99 for (auto& th : pool)
100 th.join();
101
102 this->records.clear();
103 }
104
105 CSV_INLINE void CSVStat::calc() {
106 constexpr size_t CALC_CHUNK_SIZE = 5000;
107
108 for (auto& row : reader) {
109 this->records.push_back(std::move(row));
110
112 if (this->records.size() == CALC_CHUNK_SIZE) {
113 calc_chunk();
114 }
115 }
116
117 if (!this->records.empty()) {
118 calc_chunk();
119 }
120 }
121
122 CSV_INLINE void CSVStat::calc_worker(const size_t &i) {
128 auto current_record = this->records.begin();
129
130 for (size_t processed = 0; current_record != this->records.end(); processed++) {
131 if (current_record->size() == this->get_col_names().size()) {
132 auto current_field = (*current_record)[i];
133
134 // Optimization: Don't count() if there's too many distinct values in the first 1000 rows
135 if (processed < 1000 || this->counts[i].size() <= 500)
136 this->count(current_field, i);
137
138 this->dtype(current_field, i);
139
140 // Numeric Stuff
141 if (current_field.is_num()) {
142 long double x_n = current_field.get<long double>();
143
144 // This actually calculates mean AND variance
145 this->variance(x_n, i);
146 this->min_max(x_n, i);
147 }
148 }
149 else if (this->reader.get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) {
150 throw std::runtime_error("Line has different length than the others " + internals::format_row(*current_record));
151 }
152
153 ++current_record;
154 }
155 }
156
157 CSV_INLINE void CSVStat::dtype(CSVField& data, const size_t &i) {
163 auto type = data.type();
164 if (this->dtypes[i].find(type) !=
165 this->dtypes[i].end()) {
166 // Increment count
167 this->dtypes[i][type]++;
168 } else {
169 // Initialize count
170 this->dtypes[i].insert(std::make_pair(type, 1));
171 }
172 }
173
174 CSV_INLINE void CSVStat::count(CSVField& data, const size_t &i) {
180 auto item = data.get<std::string>();
181
182 if (this->counts[i].find(item) !=
183 this->counts[i].end()) {
184 // Increment count
185 this->counts[i][item]++;
186 } else {
187 // Initialize count
188 this->counts[i].insert(std::make_pair(item, 1));
189 }
190 }
191
192 CSV_INLINE void CSVStat::min_max(const long double &x_n, const size_t &i) {
197 if (std::isnan(this->mins[i]))
198 this->mins[i] = x_n;
199 if (std::isnan(this->maxes[i]))
200 this->maxes[i] = x_n;
201
202 if (x_n < this->mins[i])
203 this->mins[i] = x_n;
204 else if (x_n > this->maxes[i])
205 this->maxes[i] = x_n;
206 }
207
208 CSV_INLINE void CSVStat::variance(const long double &x_n, const size_t &i) {
214 long double& current_rolling_mean = this->rolling_means[i];
215 long double& current_rolling_var = this->rolling_vars[i];
216 long double& current_n = this->n[i];
217 long double delta;
218 long double delta2;
219
220 current_n++;
221
222 if (current_n == 1) {
223 current_rolling_mean = x_n;
224 } else {
225 delta = x_n - current_rolling_mean;
226 current_rolling_mean += delta/current_n;
227 delta2 = x_n - current_rolling_mean;
228 current_rolling_var += delta*delta2;
229 }
230 }
231
240 CSV_INLINE std::unordered_map<std::string, DataType> csv_data_types(const std::string& filename) {
241 CSVStat stat(filename);
242 std::unordered_map<std::string, DataType> csv_dtypes;
243
244 auto col_names = stat.get_col_names();
245 auto temp = stat.get_dtypes();
246
247 for (size_t i = 0; i < stat.get_col_names().size(); i++) {
248 auto& col = temp[i];
249 auto& col_name = col_names[i];
250
251 if (col[DataType::CSV_STRING])
252 csv_dtypes[col_name] = DataType::CSV_STRING;
253 else if (col[DataType::CSV_INT64])
254 csv_dtypes[col_name] = DataType::CSV_INT64;
255 else if (col[DataType::CSV_INT32])
256 csv_dtypes[col_name] = DataType::CSV_INT32;
257 else if (col[DataType::CSV_INT16])
258 csv_dtypes[col_name] = DataType::CSV_INT16;
259 else if (col[DataType::CSV_INT8])
260 csv_dtypes[col_name] = DataType::CSV_INT8;
261 else
262 csv_dtypes[col_name] = DataType::CSV_DOUBLE;
263 }
264
265 return csv_dtypes;
266 }
267}
Stores information about how to parse a CSV file.
CSVFormat get_format() const
Return the format of the original raw CSV.
Class for calculating statistics from CSV files and in-memory sources.
Definition csv_stat.hpp:18
std::vector< long double > get_mean() const
Return current means.
Definition csv_stat.cpp:25
std::vector< long double > get_variance() const
Return current variances.
Definition csv_stat.cpp:34
CSVStat(csv::string_view filename, CSVFormat format=CSVFormat::guess_csv())
Calculate statistics for an arbitrarily large file.
Definition csv_stat.cpp:13
std::vector< long double > get_mins() const
Return current mins.
Definition csv_stat.cpp:43
std::vector< TypeCount > get_dtypes() const
Get data type counts for each column.
Definition csv_stat.cpp:70
std::vector< long double > get_maxes() const
Return current maxes.
Definition csv_stat.cpp:52
std::vector< FreqCount > get_counts() const
Get counts for each column.
Definition csv_stat.cpp:61
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition common.hpp:26
Calculates statistics from CSV files.
std::string format_row(const std::vector< std::string > &row, csv::string_view delim)
Definition csv_reader.cpp:9
The all encompassing namespace.
@ CSV_INT64
64-bit integer (long long on MSVC/GCC)
@ CSV_DOUBLE
Floating point value.
@ CSV_INT16
16-bit integer (short on MSVC/GCC)
@ CSV_INT32
32-bit integer (int on MSVC/GCC)
@ CSV_INT8
8-bit integer
@ CSV_STRING
Non-numeric string.
std::unordered_map< std::string, DataType > csv_data_types(const std::string &filename)
Useful for uploading CSV files to SQL databases.
Definition csv_stat.cpp:240
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:99