Vince's CSV Parser
Loading...
Searching...
No Matches
csv_utility.cpp
1#include "csv_utility.hpp"
2#include "data_frame.hpp"
3
4namespace csv {
5 CSV_INLINE std::unordered_map<std::string, DataType> csv_data_types(CSVReader& reader) {
6 std::unordered_map<std::string, DataType> csv_dtypes;
7 const auto col_names = reader.get_col_names();
8 std::vector<std::unordered_map<DataType, size_t>> type_counts(col_names.size());
9 constexpr size_t TYPE_CHUNK_SIZE = 5000;
10
11 chunk_parallel_apply(reader, type_counts,
12 [](DataFrame<>::column_type column, std::unordered_map<DataType, size_t>& counts) {
13 for (size_t row_index = 0; row_index < column.size(); ++row_index) {
14 counts[internals::data_type(column.get_sv(row_index))]++;
15 }
16 },
17 TYPE_CHUNK_SIZE
18 );
19
20 for (size_t i = 0; i < col_names.size(); i++) {
21 auto& col = type_counts[i];
22 auto& col_name = col_names[i];
23
24 if (col[DataType::CSV_STRING])
25 csv_dtypes[col_name] = DataType::CSV_STRING;
26 else if (col[DataType::CSV_INT64])
27 csv_dtypes[col_name] = DataType::CSV_INT64;
28 else if (col[DataType::CSV_INT32])
29 csv_dtypes[col_name] = DataType::CSV_INT32;
30 else if (col[DataType::CSV_INT16])
31 csv_dtypes[col_name] = DataType::CSV_INT16;
32 else if (col[DataType::CSV_INT8])
33 csv_dtypes[col_name] = DataType::CSV_INT8;
34 else if (col[DataType::CSV_BOOL])
35 csv_dtypes[col_name] = DataType::CSV_BOOL;
36 else if (col[DataType::CSV_TIMESTAMP])
37 csv_dtypes[col_name] = DataType::CSV_TIMESTAMP;
38 else if (col[DataType::CSV_NULL])
39 csv_dtypes[col_name] = DataType::CSV_NULL;
40 else
41 csv_dtypes[col_name] = DataType::CSV_DOUBLE;
42 }
43
44 return csv_dtypes;
45 }
46}
Main class for parsing CSVs from files and in-memory sources.
Lightweight non-owning view over one DataFrame column.
csv::string_view get_sv(size_t row_index) const
Access a visible cell value as a string_view without materializing a DataFrameCell.
size_t size() const noexcept
Number of rows in the parent batch.
#define CSV_INLINE
Helper macro which should be #defined as "inline" in the single-header version.
Definition common.hpp:31
The all-encompassing namespace.
@ CSV_TIMESTAMP
Timestamp value.
@ CSV_INT64
64-bit integer
@ CSV_DOUBLE
Floating point value.
@ CSV_BOOL
Boolean value.
@ CSV_NULL
Empty string.
@ CSV_INT16
16-bit integer
@ CSV_INT32
32-bit integer
@ CSV_INT8
8-bit integer
@ CSV_STRING
Non-scalar string.
void chunk_parallel_apply(CSVReader &reader, DataFrameExecutor &executor, std::vector< State > &states, Fn &&fn, size_t chunk_size=50000)
Apply a per-column batch function over a CSVReader using a reusable executor.
std::unordered_map< std::string, DataType > csv_data_types(CSVReader &reader)
Infer SQL-friendly column data types from an existing CSVReader.