Vince's CSV Parser
Loading...
Searching...
No Matches
csv Namespace Reference

The all-encompassing namespace. More...

Classes

class  CSVField
 Data type representing individual CSV values. More...
 
struct  CSVFileInfo
 Returned by get_file_info() More...
 
class  CSVFormat
 Stores information about how to parse a CSV file. More...
 
struct  CSVGuessResult
 Stores the inferred format of a CSV file. More...
 
class  CSVReader
 Main class for parsing CSVs from files and in-memory sources. More...
 
class  CSVRow
 Data structure for representing CSV rows. More...
 
class  DataFrame
 
class  DataFrameCell
 
class  DataFrameColumn
 Lightweight non-owning view over one DataFrame column. More...
 
class  DataFrameExecutor
 Persistent execution backend for batch-oriented DataFrame column work. More...
 
class  DataFrameOptions
 Allows configuration of DataFrame behavior. More...
 
class  DataFrameRow
 Proxy class that wraps a CSVRow and intercepts field access to check for edits. More...
 
class  DelimWriter
 Class for writing delimiter separated values files. More...
 
struct  is_invocable_returning
 
struct  is_invocable_returning_impl
 
struct  is_invocable_returning_impl< F, ReturnType, void_t< invoke_result_t< F, Args... > >, Args... >
 
struct  RowOverlay
 
struct  RowOverlaySlot
 

Typedefs

using string_view = std::string_view
 The string_view class used by this library.
 
template<bool B, class T = void>
using enable_if_t = typename std::enable_if< B, T >::type
 
template<typename F , typename... Args>
using invoke_result_t = typename std::invoke_result< F, Args... >::type
 
template<typename... Ts>
using void_t = void
 

Enumerations

enum class  VariableColumnPolicy { THROW = -1 , IGNORE_ROW = 0 , KEEP = 1 , KEEP_NON_EMPTY = 2 }
 Determines how to handle rows that are shorter or longer than the majority. More...
 
enum class  ColumnNamePolicy { EXACT = 0 , CASE_INSENSITIVE = 1 }
 Determines how column name lookups are performed. More...
 
enum class  CSVConversionError {
  None = 0 , NotANumber , Overflow , FloatToInt ,
  NegativeToUnsigned
}
 Non-throwing CSVField conversion result. More...
 
enum class  DataType {
  UNKNOWN = classify_scalar::scalar_invalid , CSV_NULL = classify_scalar::scalar_null , CSV_STRING = classify_scalar::scalar_string , CSV_BOOL = classify_scalar::scalar_bool ,
  CSV_INT8 = classify_scalar::scalar_int8 , CSV_INT16 = classify_scalar::scalar_int16 , CSV_INT32 = classify_scalar::scalar_int32 , CSV_INT64 = classify_scalar::scalar_int64 ,
  CSV_BIGINT = classify_scalar::scalar_bigint , CSV_DOUBLE = classify_scalar::scalar_float , CSV_TIMESTAMP = classify_scalar::scalar_timestamp , scalar_custom_begin = classify_scalar::scalar_custom_begin - 1
}
 Enumerates the different CSV field types recognized by this library. More...
 

Functions

 CSVRow::operator std::vector< std::string > () const
 
 CSV_NON_NULL (2) CSVRow
 
const char * csv_conversion_error_message (CSVConversionError error) noexcept
 Return a stable human-readable description for a CSVConversionError.
 
template<>
std::string CSVField::get< std::string > ()
 Retrieve this field's original string.
 
template<>
CONSTEXPR_14 csv::string_view CSVField::get< csv::string_view > ()
 Retrieve a view over this field's string.
 
template<>
bool CSVField::try_get< std::string > (std::string &out) noexcept
 Non-throwing retrieval of field as std::string.
 
template<>
CONSTEXPR_14 bool CSVField::try_get< csv::string_view > (csv::string_view &out) noexcept
 Non-throwing retrieval of field as csv::string_view.
 
Utility Functions
std::unordered_map< std::string, DataType > csv_data_types (CSVReader &reader)
 Infer SQL-friendly column data types from an existing CSVReader.
 
template<typename... ReaderArgs, csv::enable_if_t< std::is_constructible< CSVReader, ReaderArgs... >::value, int > = 0>
std::unordered_map< std::string, DataType > csv_data_types (ReaderArgs &&... reader_args)
 Infer SQL-friendly column data types from any CSVReader constructor input.
 
template<typename State , typename Fn >
void chunk_parallel_apply (CSVReader &reader, DataFrameExecutor &executor, std::vector< State > &states, Fn &&fn, size_t chunk_size=50000)
 Apply a per-column batch function over a CSVReader using a reusable executor.
 
template<typename State , typename Fn >
void chunk_parallel_apply (CSVReader &reader, std::vector< State > &states, Fn &&fn, size_t chunk_size=50000)
 Apply a per-column batch function over a CSVReader with a temporary executor.
 
CSVFileInfo get_file_info (const std::string &filename)
 Get basic information about a CSV file.
 
std::vector< std::string > get_col_names (csv::string_view filename, const CSVFormat &format=CSVFormat::guess_csv())
 Get the column names of a CSV file using just the first 500KB.
 
long long get_col_pos (csv::string_view filename, csv::string_view col_name, const CSVFormat &format=CSVFormat::guess_csv())
 Find the position of a column in a CSV file, or return CSV_NOT_FOUND if the column does not exist.
 
Shorthand Parsing Functions

Convenience functions for parsing small strings

CSVReader parse (csv::string_view in, const CSVFormat &format=CSVFormat::guess_csv())
 Parse CSV from a string view, copying the input into an owned buffer.
 
CSVReader parse_unsafe (csv::string_view in, CSVFormat format=CSVFormat::guess_csv())
 Parse CSV from an in-memory view with zero copy.
 
CSVReader parse_no_header (csv::string_view in)
 Parses a CSV string with no headers.
 
CSVReader operator""_csv (const char *in, size_t n)
 Parse an RFC 4180 CSV string.
 
CSVReader operator""_csv_no_header (const char *in, size_t n)
 A shorthand for csv::parse_no_header().
 

Variables

constexpr int CSV_NOT_FOUND = -1
 Integer indicating a requested column wasn't found.
 
constexpr unsigned CHAR_OFFSET = std::numeric_limits<char>::is_signed ? 128 : 0
 Offset to convert char into array index.
 

CSV Writing

template<class OutputStream >
using CSVWriter = DelimWriter< OutputStream, ',', '"'>
 An alias for csv::DelimWriter for writing standard CSV files.
 
template<class OutputStream >
using TSVWriter = DelimWriter< OutputStream, '\t', '"'>
 Class for writing tab-separated values files.
 
template<class OutputStream >
CSVWriter< OutputStream > make_csv_writer (OutputStream &out, bool quote_minimal=true)
 Return a csv::CSVWriter over the output stream.
 
template<class OutputStream >
TSVWriter< OutputStream > make_tsv_writer (OutputStream &out, bool quote_minimal=true)
 Return a csv::TSVWriter over the output stream.
 

Detailed Description

The all-encompassing namespace.

Typedef Documentation

◆ CSVWriter

template<class OutputStream >
using csv::CSVWriter = DelimWriter<OutputStream, ',', '"'>

An alias for csv::DelimWriter for writing standard CSV files.

See also
csv::DelimWriter::operator<<()
Note
Use csv::make_csv_writer() to instantiate this class over an actual output stream.

Definition at line 664 of file csv_writer.hpp.

◆ enable_if_t

template<bool B, class T = void>
using csv::enable_if_t = typename std::enable_if<B, T>::type

Definition at line 202 of file common.hpp.

◆ invoke_result_t

template<typename F , typename... Args>
using csv::invoke_result_t = typename std::invoke_result<F, Args...>::type

Definition at line 215 of file common.hpp.

◆ string_view

The string_view class used by this library.

Definition at line 174 of file common.hpp.

◆ TSVWriter

template<class OutputStream >
using csv::TSVWriter = DelimWriter<OutputStream, '\t', '"'>

Class for writing tab-separated values files.

See also
csv::DelimWriter::write_row()
csv::DelimWriter::operator<<()
Note
Use csv::make_tsv_writer() to instantiate this class over an actual output stream.

Definition at line 675 of file csv_writer.hpp.

◆ void_t

template<typename... Ts>
using csv::void_t = void

Definition at line 222 of file common.hpp.

Enumeration Type Documentation

◆ ColumnNamePolicy

enum class csv::ColumnNamePolicy
strong

Determines how column name lookups are performed.

Enumerator
EXACT 

Case-sensitive match (default)

CASE_INSENSITIVE 

Case-insensitive match.

Definition at line 34 of file csv_format.hpp.

◆ CSVConversionError

enum class csv::CSVConversionError
strong

Non-throwing CSVField conversion result.

Returned by CSVField::as() inside std::expected, and used internally by CSVField::get() and CSVField::try_get() to keep throwing and non-throwing conversions on the same rules.

See also
csv_conversion_error_message()
Enumerator
None 

Conversion succeeded.

NotANumber 

The field is not compatible with the requested target type.

Overflow 

The parsed value does not fit in the requested target type.

FloatToInt 

A floating point field was requested as an integral type.

NegativeToUnsigned 

A negative value was requested as an unsigned type.

Definition at line 72 of file csv_row.hpp.

◆ DataType

enum class csv::DataType
strong

Enumerates the different CSV field types recognized by this library.

Enumerator
CSV_NULL 

Empty string.

CSV_STRING 

Non-scalar string.

CSV_BOOL 

Boolean value.

CSV_INT8 

8-bit integer

CSV_INT16 

16-bit integer

CSV_INT32 

32-bit integer

CSV_INT64 

64-bit integer

CSV_BIGINT 

Integer too large to fit in 64 bits.

CSV_DOUBLE 

Floating point value.

CSV_TIMESTAMP 

Timestamp value.

Definition at line 14 of file data_type.hpp.

◆ VariableColumnPolicy

enum class csv::VariableColumnPolicy
strong

Determines how to handle rows that are shorter or longer than the majority.

Definition at line 26 of file csv_format.hpp.

Function Documentation

◆ chunk_parallel_apply() [1/2]

template<typename State , typename Fn >
void csv::chunk_parallel_apply ( CSVReader &  reader,
DataFrameExecutor &  executor,
std::vector< State > &  states,
Fn &&  fn,
size_t  chunk_size = 50000 
)
inline

Apply a per-column batch function over a CSVReader using a reusable executor.

Reads the source in chunks, promotes each chunk into a temporary DataFrame, and applies fn(column, states[column.index()]).

Callbacks may treat each batch DataFrame as read-mostly, and sparse overlay cell edits are synchronized at row granularity. If you need more involved batch orchestration, use CSVReader::read_chunk() and construct a batch-scoped DataFrame yourself.

Exceptions
std::invalid_argument if chunk_size == 0

Definition at line 139 of file csv_utility.hpp.

◆ chunk_parallel_apply() [2/2]

template<typename State , typename Fn >
void csv::chunk_parallel_apply ( CSVReader &  reader,
std::vector< State > &  states,
Fn &&  fn,
size_t  chunk_size = 50000 
)
inline

Apply a per-column batch function over a CSVReader with a temporary executor.

This is the convenience overload for the common case where callers do not need to reuse worker threads across multiple reader pipelines.

Definition at line 163 of file csv_utility.hpp.

◆ csv_conversion_error_message()

const char * csv::csv_conversion_error_message ( CSVConversionError  error)
inline noexcept

Return a stable human-readable description for a CSVConversionError.

Definition at line 102 of file csv_row.hpp.

◆ csv_data_types() [1/2]

std::unordered_map< std::string, DataType > csv::csv_data_types ( CSVReader &  reader)

Infer SQL-friendly column data types from an existing CSVReader.

This consumes rows from reader using the chunked ETL path and returns one inferred DataType per column name.

Definition at line 5 of file csv_utility.cpp.

◆ csv_data_types() [2/2]

template<typename... ReaderArgs, csv::enable_if_t< std::is_constructible< CSVReader, ReaderArgs... >::value, int > = 0>
std::unordered_map< std::string, DataType > csv::csv_data_types ( ReaderArgs &&...  reader_args)
inline

Infer SQL-friendly column data types from any CSVReader constructor input.

This convenience overload forwards its arguments directly to CSVReader, so it supports filenames, std::istream sources, owned streams, and custom CSVFormat combinations without additional wrapper code.

Example
std::istringstream input("name,age\nAlice,30\nBob,41\n");
CSVFormat format;
format.delimiter(',').header_row(0);
auto dtypes = csv::csv_data_types(input, format);
Stores information about how to parse a CSV file.
CSVFormat & delimiter(char delim)
Sets the delimiter of the CSV file.
CSVFormat & header_row(int row)
Sets the header row.
std::unordered_map< std::string, DataType > csv_data_types(CSVReader &reader)
Infer SQL-friendly column data types from an existing CSVReader.

Definition at line 121 of file csv_utility.hpp.

◆ CSV_NON_NULL()

csv::CSV_NON_NULL ( )

Definition at line 207 of file csv_row.cpp.

◆ CSVField::get< csv::string_view >()

template<>
CONSTEXPR_14 csv::string_view csv::CSVField::get< csv::string_view > ( )

Retrieve a view over this field's string.

Warning
This string_view is only guaranteed to be valid as long as this CSVRow is still alive.

Definition at line 767 of file csv_row.hpp.

◆ CSVField::get< std::string >()

template<>
std::string csv::CSVField::get< std::string > ( )
inline

Retrieve this field's original string.

Definition at line 757 of file csv_row.hpp.

◆ CSVField::try_get< csv::string_view >()

template<>
CONSTEXPR_14 bool csv::CSVField::try_get< csv::string_view > ( csv::string_view &  out)
noexcept

Non-throwing retrieval of field as csv::string_view.

Definition at line 789 of file csv_row.hpp.

◆ CSVField::try_get< std::string >()

template<>
bool csv::CSVField::try_get< std::string > ( std::string &  out)
inline noexcept

Non-throwing retrieval of field as std::string.

Definition at line 782 of file csv_row.hpp.

◆ CSVRow::operator std::vector< std::string >()

csv::CSVRow::operator std::vector< std::string > ( ) const

Definition at line 58 of file csv_row.cpp.

◆ get_col_names()

std::vector< std::string > csv::get_col_names ( csv::string_view  filename,
const CSVFormat &  format = CSVFormat::guess_csv() 
)
inline

Get the column names of a CSV file using just the first 500KB.

Definition at line 197 of file csv_utility.hpp.

◆ get_col_pos()

long long csv::get_col_pos ( csv::string_view  filename,
csv::string_view  col_name,
const CSVFormat &  format = CSVFormat::guess_csv() 
)
inline

Find the position of a column in a CSV file, or return CSV_NOT_FOUND if the column does not exist.

Definition at line 205 of file csv_utility.hpp.

◆ get_file_info()

CSVFileInfo csv::get_file_info ( const std::string &  filename)
inline

Get basic information about a CSV file.

#include "csv.hpp"
#include <iostream>
int main(int argc, char** argv) {
using namespace csv;
if (argc < 2) {
std::cout << "Usage: " << argv[0] << " [file]" << std::endl;
exit(1);
}
std::string file = argv[1];
auto info = get_file_info(file);
std::cout << file << std::endl << "Columns: ";
for (size_t i = 0; i < info.col_names.size(); i++) {
if (i) std::cout << ", ";
std::cout << info.col_names[i];
}
std::cout << std::endl
<< "Dimensions: " << info.n_rows << " rows x " << info.n_cols << " columns" << std::endl
<< "Delimiter: " << info.delim << std::endl;
return 0;
}
The all-encompassing namespace.
CSVFileInfo get_file_info(const std::string &filename)
Get basic information about a CSV file.

Definition at line 176 of file csv_utility.hpp.

◆ make_csv_writer()

template<class OutputStream >
CSVWriter< OutputStream > csv::make_csv_writer ( OutputStream &  out,
bool  quote_minimal = true 
)
inline

Return a csv::CSVWriter over the output stream.

Definition at line 679 of file csv_writer.hpp.

◆ make_tsv_writer()

template<class OutputStream >
TSVWriter< OutputStream > csv::make_tsv_writer ( OutputStream &  out,
bool  quote_minimal = true 
)
inline

Return a csv::TSVWriter over the output stream.

Definition at line 685 of file csv_writer.hpp.

◆ operator""_csv()

CSVReader csv::operator""_csv ( const char *  in,
size_t  n 
)
inline

Parse an RFC 4180 CSV string.

String literals have static storage duration, so the zero-copy path is safe here.

Example
TEST_CASE( "Test Escaped Comma", "[read_csv_comma]" ) {
auto rows = "A,B,C\r\n" // Header row
"123,\"234,345\",456\r\n"
"1,2,3\r\n"
"1,2,3"_csv;
CSVRow row;
rows.read_row(row);
REQUIRE( vector<string>(row) ==
vector<string>({"123", "234,345", "456"}));
}
Data structure for representing CSV rows.
Definition csv_row.hpp:544

Definition at line 76 of file csv_utility.hpp.

◆ operator""_csv_no_header()

CSVReader csv::operator""_csv_no_header ( const char *  in,
size_t  n 
)
inline

A shorthand for csv::parse_no_header().

String literals have static storage duration, so the zero-copy path is safe here.

Definition at line 85 of file csv_utility.hpp.

◆ parse()

CSVReader csv::parse ( csv::string_view  in,
const CSVFormat &  format = CSVFormat::guess_csv() 
)
inline

Parse CSV from a string view, copying the input into an owned buffer.

Safe for any string_view regardless of the caller's ownership of the underlying memory.

Example
TEST_CASE( "Test Escaped Quote", "[read_csv_quote]" ) {
// Per RFC 4180, escaped quotes should be doubled up
auto csv_string = GENERATE(as<std::string> {},
(
"A,B,C\r\n" // Header row
"123,\"234\"\"345\",456\r\n"
"123,\"234\"345\",456\r\n" // Unescaped single quote (not strictly valid)
"123,\"234\"345\",\"456\"" // Quoted field at the end
"123, \"234\"345\",\"456\"" // Quoted field w/ leading whitespace
),
(
"\"A\",\"B\",\"C\"\r\n" // Header row
"123,\"234\"\"345\",456\r\n"
"123,\"234\"345\",456\r\n" // Unescaped single quote (not strictly valid)
"123,\"234\"345\",\"456\"" // Quoted field at the end
"123,\"234\"345\",\"456\"" // Quoted field w/ leading whitespace
)
);
SECTION("Escaped Quote") {
auto rows = parse(csv_string);
REQUIRE(rows.get_col_names() == vector<string>({ "A", "B", "C" }));
// Expected Results: Double " is an escape for a single "
vector<string> correct_row = { "123", "234\"345", "456" };
for (auto& row : rows) {
REQUIRE(vector<string>(row) == correct_row);
}
}
}
CSVReader parse(csv::string_view in, const CSVFormat &format=CSVFormat::guess_csv())
Parse CSV from a string view, copying the input into an owned buffer.

Definition at line 42 of file csv_utility.hpp.

◆ parse_no_header()

CSVReader csv::parse_no_header ( csv::string_view  in)
inline

Parses a CSV string with no headers.

Definition at line 62 of file csv_utility.hpp.

◆ parse_unsafe()

CSVReader csv::parse_unsafe ( csv::string_view  in,
CSVFormat  format = CSVFormat::guess_csv() 
)
inline

Parse CSV from an in-memory view with zero copy.

WARNING: Non-owning path. The caller must ensure in's backing memory remains valid and immutable while the reader may request additional rows from the source stream.

Rows already obtained from the reader remain valid, but unread rows still depend on the source view staying alive.

Definition at line 56 of file csv_utility.hpp.

Variable Documentation

◆ CHAR_OFFSET

constexpr unsigned csv::CHAR_OFFSET = std::numeric_limits<char>::is_signed ? 128 : 0
constexpr

Offset to convert char into array index.

Definition at line 482 of file common.hpp.

◆ CSV_NOT_FOUND

constexpr int csv::CSV_NOT_FOUND = -1
constexpr

Integer indicating a requested column wasn't found.

Definition at line 479 of file common.hpp.