Vince's CSV Parser
Loading...
Searching...
No Matches
csv_stat.cpp
Go to the documentation of this file.
1
#include <string>
#if CSV_ENABLE_THREADS
#include <exception>
#include <thread>
#endif
#include "csv_stat.hpp"
10
11namespace csv {
17 reader(filename, format) {
18 this->calc();
19 }
20
22 CSV_INLINE CSVStat::CSVStat(std::stringstream& stream, CSVFormat format) :
23 reader(stream, format) {
24 this->calc();
25 }
26
28 CSV_INLINE std::vector<long double> CSVStat::get_mean() const {
29 std::vector<long double> ret;
30 for (size_t i = 0; i < this->get_col_names().size(); i++) {
31 ret.push_back(this->rolling_means[i]);
32 }
33 return ret;
34 }
35
37 CSV_INLINE std::vector<long double> CSVStat::get_variance() const {
38 std::vector<long double> ret;
39 for (size_t i = 0; i < this->get_col_names().size(); i++) {
40 ret.push_back(this->rolling_vars[i]/(this->n[i] - 1));
41 }
42 return ret;
43 }
44
46 CSV_INLINE std::vector<long double> CSVStat::get_mins() const {
47 std::vector<long double> ret;
48 for (size_t i = 0; i < this->get_col_names().size(); i++) {
49 ret.push_back(this->mins[i]);
50 }
51 return ret;
52 }
53
55 CSV_INLINE std::vector<long double> CSVStat::get_maxes() const {
56 std::vector<long double> ret;
57 for (size_t i = 0; i < this->get_col_names().size(); i++) {
58 ret.push_back(this->maxes[i]);
59 }
60 return ret;
61 }
62
64 CSV_INLINE std::vector<CSVStat::FreqCount> CSVStat::get_counts() const {
65 std::vector<FreqCount> ret;
66 for (size_t i = 0; i < this->get_col_names().size(); i++) {
67 ret.push_back(this->counts[i]);
68 }
69 return ret;
70 }
71
73 CSV_INLINE std::vector<CSVStat::TypeCount> CSVStat::get_dtypes() const {
74 std::vector<TypeCount> ret;
75 for (size_t i = 0; i < this->get_col_names().size(); i++) {
76 ret.push_back(this->dtypes[i]);
77 }
78 return ret;
79 }
80
81 CSV_INLINE void CSVStat::calc_chunk() {
83 if (dtypes.empty()) {
85 for (size_t i = 0; i < this->get_col_names().size(); i++) {
86 dtypes.push_back({});
87 counts.push_back({});
88 rolling_means.push_back(0);
89 rolling_vars.push_back(0);
90 mins.push_back(NAN);
91 maxes.push_back(NAN);
92 n.push_back(0);
93 }
94 }
95
96#if CSV_ENABLE_THREADS
97 // Start threads
98 std::vector<std::thread> pool;
99 for (size_t i = 0; i < this->get_col_names().size(); i++)
100 pool.push_back(std::thread(&CSVStat::calc_worker, this, i));
101
102 // Block until done
103 for (auto& th : pool)
104 th.join();
105#else
106 for (size_t i = 0; i < this->get_col_names().size(); i++) {
107 this->calc_worker(i);
108 }
109#endif
110
111 this->records.clear();
112 }
113
114 CSV_INLINE void CSVStat::calc() {
115 constexpr size_t CALC_CHUNK_SIZE = 5000;
116
117 for (auto& row : reader) {
118 this->records.push_back(std::move(row));
119
121 if (this->records.size() == CALC_CHUNK_SIZE) {
122 calc_chunk();
123 }
124 }
125
126 if (!this->records.empty()) {
127 calc_chunk();
128 }
129 }
130
131 CSV_INLINE void CSVStat::calc_worker(const size_t &i) {
132 // Worker routine for CSVStat::calc() that processes one column.
133
134 auto current_record = this->records.begin();
135
136 for (size_t processed = 0; current_record != this->records.end(); processed++) {
137 if (current_record->size() == this->get_col_names().size()) {
138 auto current_field = (*current_record)[i];
139
140 // Optimization: Don't count() if there's too many distinct values in the first 1000 rows
141 if (processed < 1000 || this->counts[i].size() <= 500)
142 this->count(current_field, i);
143
144 this->dtype(current_field, i);
145
146 // Numeric Stuff
147 if (current_field.is_num()) {
148 long double x_n = current_field.get<long double>();
149
150 // This actually calculates mean AND variance
151 this->variance(x_n, i);
152 this->min_max(x_n, i);
153 }
154 }
155 else if (this->reader.get_format().get_variable_column_policy() == VariableColumnPolicy::THROW) {
156 throw std::runtime_error("Line has different length than the others " + std::string(current_record->raw_str()));
157 }
158
159 ++current_record;
160 }
161 }
162
163 CSV_INLINE void CSVStat::dtype(CSVField& data, const size_t &i) {
164 // Update the type counter for one field.
165 auto type = data.type();
166 if (this->dtypes[i].find(type) !=
167 this->dtypes[i].end()) {
168 // Increment count
169 this->dtypes[i][type]++;
170 } else {
171 // Initialize count
172 this->dtypes[i].insert(std::make_pair(type, 1));
173 }
174 }
175
176 CSV_INLINE void CSVStat::count(CSVField& data, const size_t &i) {
177 // Update the frequency counter for one field.
178 auto item = data.get<std::string>();
179
180 if (this->counts[i].find(item) !=
181 this->counts[i].end()) {
182 // Increment count
183 this->counts[i][item]++;
184 } else {
185 // Initialize count
186 this->counts[i].insert(std::make_pair(item, 1));
187 }
188 }
189
190 CSV_INLINE void CSVStat::min_max(const long double &x_n, const size_t &i) {
191 // Update the current minimum and maximum for one column.
192 if (std::isnan(this->mins[i]))
193 this->mins[i] = x_n;
194 if (std::isnan(this->maxes[i]))
195 this->maxes[i] = x_n;
196
197 if (x_n < this->mins[i])
198 this->mins[i] = x_n;
199 else if (x_n > this->maxes[i])
200 this->maxes[i] = x_n;
201 }
202
203 CSV_INLINE void CSVStat::variance(const long double &x_n, const size_t &i) {
204 // Update the rolling mean and variance for one column using Welford's algorithm.
205 long double& current_rolling_mean = this->rolling_means[i];
206 long double& current_rolling_var = this->rolling_vars[i];
207 long double& current_n = this->n[i];
208 long double delta;
209 long double delta2;
210
211 current_n++;
212
213 if (current_n == 1) {
214 current_rolling_mean = x_n;
215 } else {
216 delta = x_n - current_rolling_mean;
217 current_rolling_mean += delta/current_n;
218 delta2 = x_n - current_rolling_mean;
219 current_rolling_var += delta*delta2;
220 }
221 }
222
228 CSV_INLINE std::unordered_map<std::string, DataType> csv_data_types(const std::string& filename) {
229 CSVStat stat(filename);
230 std::unordered_map<std::string, DataType> csv_dtypes;
231
232 auto col_names = stat.get_col_names();
233 auto temp = stat.get_dtypes();
234
235 for (size_t i = 0; i < stat.get_col_names().size(); i++) {
236 auto& col = temp[i];
237 auto& col_name = col_names[i];
238
239 if (col[DataType::CSV_STRING])
240 csv_dtypes[col_name] = DataType::CSV_STRING;
241 else if (col[DataType::CSV_INT64])
242 csv_dtypes[col_name] = DataType::CSV_INT64;
243 else if (col[DataType::CSV_INT32])
244 csv_dtypes[col_name] = DataType::CSV_INT32;
245 else if (col[DataType::CSV_INT16])
246 csv_dtypes[col_name] = DataType::CSV_INT16;
247 else if (col[DataType::CSV_INT8])
248 csv_dtypes[col_name] = DataType::CSV_INT8;
249 else
250 csv_dtypes[col_name] = DataType::CSV_DOUBLE;
251 }
252
253 return csv_dtypes;
254 }
255}
Stores information about how to parse a CSV file.
CSVFormat get_format() const
Return the format of the original raw CSV.
Class for calculating statistics from CSV files and in-memory sources.
Definition csv_stat.hpp:18
std::vector< long double > get_mean() const
Return current means.
Definition csv_stat.cpp:28
std::vector< long double > get_variance() const
Return current variances.
Definition csv_stat.cpp:37
CSVStat(csv::string_view filename, CSVFormat format=CSVFormat::guess_csv())
Calculate statistics for an arbitrarily large file.
Definition csv_stat.cpp:16
std::vector< long double > get_mins() const
Return current mins.
Definition csv_stat.cpp:46
std::vector< TypeCount > get_dtypes() const
Get data type counts for each column.
Definition csv_stat.cpp:73
std::vector< long double > get_maxes() const
Return current maxes.
Definition csv_stat.cpp:55
std::vector< FreqCount > get_counts() const
Get counts for each column.
Definition csv_stat.cpp:64
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single header version.
Definition common.hpp:26
Calculates statistics from CSV files.
The all encompassing namespace.
@ CSV_INT64
64-bit integer (long long on MSVC/GCC)
@ CSV_DOUBLE
Floating point value.
@ CSV_INT16
16-bit integer (short on MSVC/GCC)
@ CSV_INT32
32-bit integer (int on MSVC/GCC)
@ CSV_INT8
8-bit integer
@ CSV_STRING
Non-numeric string.
std::unordered_map< std::string, DataType > csv_data_types(const std::string &filename)
Useful for uploading CSV files to SQL databases.
Definition csv_stat.cpp:228
nonstd::string_view string_view
The string_view class used by this library.
Definition common.hpp:135