Vince's CSV Parser
Loading...
Searching...
No Matches
csv_utility.hpp
1#pragma once
2#include "common.hpp"
3#include "csv_exceptions.hpp"
4#include "csv_format.hpp"
5#include "csv_reader.hpp"
6#include "data_frame.hpp"
7#include "data_type.hpp"
8#include "string_view_stream.hpp"
9
10#include <memory>
11#include <sstream>
12#include <string>
13#include <type_traits>
14#include <unordered_map>
15#include <utility>
16
17namespace csv {
19 struct CSVFileInfo {
20 std::string filename;
21 std::vector<std::string> col_names;
22 char delim;
23 size_t n_rows;
24 size_t n_cols;
26 internals::SpeculativeParseDiagnostics speculative_diagnostics;
27 };
28
33
43 std::unique_ptr<std::istream> ss(new std::stringstream(std::string(in)));
44 return CSVReader(std::move(ss), format);
45 }
46
57 std::unique_ptr<std::istream> stream(new internals::StringViewStream(in));
58 return CSVReader(std::move(stream), format);
59 }
60
63 CSVFormat format;
64 format.header_row(-1);
65 return parse(in, format);
66 }
67
76 inline CSVReader operator ""_csv(const char* in, size_t n) {
77 return parse_unsafe(csv::string_view(in, n));
78 }
79
85 inline CSVReader operator ""_csv_no_header(const char* in, size_t n) {
86 CSVFormat format;
87 format.header_row(-1);
88 return parse_unsafe(csv::string_view(in, n), format);
89 }
91
94
99 std::unordered_map<std::string, DataType> csv_data_types(CSVReader& reader);
100
117 template<
118 typename... ReaderArgs,
119 csv::enable_if_t<std::is_constructible<CSVReader, ReaderArgs...>::value, int> = 0
120 >
121 inline std::unordered_map<std::string, DataType> csv_data_types(ReaderArgs&&... reader_args) {
122 CSVReader reader(std::forward<ReaderArgs>(reader_args)...);
123 return csv_data_types(reader);
124 }
125
138 template<typename State, typename Fn>
140 CSVReader& reader,
141 DataFrameExecutor& executor,
142 std::vector<State>& states,
143 Fn&& fn,
144 size_t chunk_size = 50000
145 ) {
146 if (chunk_size == 0) {
147 throw std::invalid_argument(internals::ERROR_CHUNK_PARALLEL_APPLY_ZERO);
148 }
149
150 std::vector<CSVRow> rows;
151 while (reader.read_chunk(rows, chunk_size)) {
152 DataFrame<> batch(std::move(rows));
153 batch.column_parallel_apply(executor, states, std::forward<Fn>(fn));
154 }
155 }
156
162 template<typename State, typename Fn>
164 CSVReader& reader,
165 std::vector<State>& states,
166 Fn&& fn,
167 size_t chunk_size = 50000
168 ) {
169 DataFrameExecutor executor;
170 chunk_parallel_apply(reader, executor, states, std::forward<Fn>(fn), chunk_size);
171 }
172
176 inline CSVFileInfo get_file_info(const std::string& filename) {
177 CSVFormat reader_format = CSVFormat::guess_csv();
178
179 CSVReader reader(filename, reader_format);
180 CSVFormat format = reader.get_format();
181
182 std::vector<CSVRow> rows;
183 while (reader.read_chunk(rows, 50000)) {}
184
185 return {
186 filename,
187 reader.get_col_names(),
188 format.get_delim(),
189 reader.n_rows(),
190 reader.get_col_names().size(),
191 reader.parse_worker_count(),
192 reader.speculative_diagnostics()
193 };
194 }
195
197 inline std::vector<std::string> get_col_names(
198 csv::string_view filename,
199 const CSVFormat& format = CSVFormat::guess_csv()) {
200 auto head = internals::parser::get_csv_head(filename);
201 return parse_unsafe(head, format).get_col_names();
202 }
203
205 inline long long get_col_pos(csv::string_view filename, csv::string_view col_name,
206 const CSVFormat& format = CSVFormat::guess_csv()) {
207 auto col_names = get_col_names(filename, format);
208 return col_names.empty() ? CSV_NOT_FOUND :
209 std::distance(col_names.begin(), std::find(col_names.begin(), col_names.end(), col_name));
210 }
212}
Stores information about how to parse a CSV file.
static CSVFormat guess_csv()
CSVFormat preset for delimiter inference with header/n_cols inference enabled.
CSVFormat & header_row(int row)
Sets the header row.
Main class for parsing CSVs from files and in-memory sources.
const std::vector< std::string > & get_col_names() const
Return the active column names in CSV order.
Persistent execution backend for batch-oriented DataFrame column work.
Lightweight istream over csv::string_view with zero copy.
A standalone header file containing shared code.
Shared exception message templates and throw helpers.
Defines an object used to store CSV format settings.
Defines functionality needed for basic CSV parsing.
CSV scalar type classification adapter.
The all-encompassing namespace.
long long get_col_pos(csv::string_view filename, csv::string_view col_name, const CSVFormat &format=CSVFormat::guess_csv())
Find the position of a column in a CSV file or CSV_NOT_FOUND otherwise.
CSVReader parse_unsafe(csv::string_view in, CSVFormat format=CSVFormat::guess_csv())
Parse CSV from an in-memory view with zero copy.
CSVFileInfo get_file_info(const std::string &filename)
Get basic information about a CSV file.
void chunk_parallel_apply(CSVReader &reader, DataFrameExecutor &executor, std::vector< State > &states, Fn &&fn, size_t chunk_size=50000)
Apply a per-column batch function over a CSVReader using a reusable executor.
CSVReader parse(csv::string_view in, const CSVFormat &format=CSVFormat::guess_csv())
Parse CSV from a string view, copying the input into an owned buffer.
std::vector< std::string > get_col_names(csv::string_view filename, const CSVFormat &format=CSVFormat::guess_csv())
Get the column names of a CSV file using just the first 500KB.
CSVReader parse_no_header(csv::string_view in)
Parses a CSV string with no headers.
constexpr int CSV_NOT_FOUND
Integer indicating a requested column wasn't found.
Definition common.hpp:479
std::unordered_map< std::string, DataType > csv_data_types(CSVReader &reader)
Infer SQL-friendly column data types from an existing CSVReader.
std::string_view string_view
The string_view class used by this library.
Definition common.hpp:174
Returned by get_file_info()
size_t n_cols
Number of columns in a CSV.
std::vector< std::string > col_names
CSV column names.
size_t parse_worker_count
Number of parser worker threads used.
char delim
Delimiting character.
std::string filename
Filename.
size_t n_rows
Number of rows in a file.